From 7d09a8d01cc96ae793099139748ca36b7f69f8dd Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Sat, 13 Jun 2026 20:37:26 +0000 Subject: [PATCH 1/6] spec: support qwen3.5 & 3.6 eagle3 draft --- src/models/qwen35.cpp | 2 ++ src/models/qwen35moe.cpp | 2 ++ tools/server/server-context.cpp | 11 +++++++++++ 3 files changed, 15 insertions(+) diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 6783d98ec204..d8ffe43ae76c 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -156,6 +156,8 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index eb5e9a406a15..7b0876cbb04b 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -179,6 +179,8 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 31280d63c4ea..b526008096d5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2977,6 +2977,17 @@ struct server_context_impl { bool do_reset = it == slot.prompt.checkpoints.rend(); + // eagle3 draft is one position behind the target due to deferred boundary), so it + // can't resume from a checkpoint restored on a recurrent/hybrid target; re-process fully instead. 
+ const bool spec_eagle3 = std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3) != params_base.speculative.types.end(); + if (!do_reset && spec_eagle3 && + (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL || + ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS)) { + SLT_WRN(slot, "%s", "eagle3 draft cannot resume from a recurrent/hybrid checkpoint, forcing full re-processing\n"); + do_reset = true; + } + if (!do_reset) { // restore the context checkpoint it->load_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); From df3bc6db4f4a456d53fa756907241579ce2042c9 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Wed, 17 Jun 2026 21:46:56 +0200 Subject: [PATCH 2/6] eagle3: Add deferred boundary checkpoints restore support for hybrid models --- common/common.cpp | 4 ++- common/common.h | 5 ++- common/speculative.cpp | 57 +++++++++++++++++++++++++++++++++ common/speculative.h | 4 +++ tools/server/server-context.cpp | 15 +++------ 5 files changed, 72 insertions(+), 13 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b01772e1cbfe..a66fc8c5da12 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2034,7 +2034,7 @@ bool common_prompt_batch_decode( } size_t common_prompt_checkpoint::size() const { - return data_tgt.size() + data_dft.size(); + return data_tgt.size() + data_dft.size() + data_dft_boundary_g_embd.size() * sizeof(float); } bool common_prompt_checkpoint::empty() const { @@ -2049,6 +2049,7 @@ void common_prompt_checkpoint::clear() { data_tgt.clear(); data_dft.clear(); + data_dft_boundary_g_embd.clear(); } void common_prompt_checkpoint::update_pos( @@ -2138,4 +2139,5 @@ void common_prompt_checkpoint::clear_tgt() { void common_prompt_checkpoint::clear_dft() { data_dft.clear(); + data_dft_boundary_g_embd.clear(); } diff --git a/common/common.h b/common/common.h index 040b9cf23312..d023b6d8c059 100644 --- a/common/common.h +++ b/common/common.h @@ -363,7 
+363,7 @@ struct common_params_speculative { uint32_t need_n_rs_seq() const { bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) { - return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP; + return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3; }); return needs_rs_seq ? draft.n_max : 0u; @@ -1065,6 +1065,9 @@ struct common_prompt_checkpoint { std::vector data_tgt; std::vector data_dft; + // eagle3: deferred-boundary g_embd row stashed with the checkpoint + std::vector data_dft_boundary_g_embd; + size_t size() const; bool empty() const; diff --git a/common/speculative.cpp b/common/speculative.cpp index 6f387f2cfc13..9ca4955a626c 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -161,6 +161,10 @@ struct common_speculative_impl { virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0; + // eagle3: deferred-boundary g_embd stash for checkpoints (default: none) + virtual bool get_deferred_boundary(llama_seq_id /*seq_id*/, std::vector & /*g_out*/) const { return false; } + virtual void set_deferred_boundary(llama_seq_id /*seq_id*/, llama_pos /*pos*/, const std::vector & /*g*/) {} + // true if this implementation requires the target context to extract post-norm embeddings virtual bool need_embd() const = 0; @@ -841,6 +845,35 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { (size_t) n_embd_dec * sizeof(float)); } + // we only need to stash the deferred boundary's g_embd row for recurrent/hybrid targets: + // their single-position checkpoints drop it on restore + bool need_boundary_stash() const { + const llama_model * model_tgt = llama_get_model(params.ctx_tgt); + return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt); + } + + bool get_deferred_boundary(llama_seq_id seq_id, std::vector & g_out) const override { + if (!need_boundary_stash()) { + return false; + } + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || 
pending_pos_last[seq_id] < 0) { + return false; + } + g_out = pending_g_last[seq_id]; + return true; + } + + void set_deferred_boundary(llama_seq_id seq_id, llama_pos pos, const std::vector & g) override { + if (!need_boundary_stash()) { + return; + } + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || (int32_t) g.size() != n_embd_dec) { + return; + } + pending_pos_last[seq_id] = pos; + pending_g_last[seq_id] = g; + } + bool need_embd() const override { return false; } @@ -2118,6 +2151,30 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u } } +bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector & g_out) { + if (spec == nullptr) { + return false; + } + + for (auto & impl : spec->impls) { + if (impl->get_deferred_boundary(seq_id, g_out)) { + return true; + } + } + + return false; +} + +void common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos pos, const std::vector & g) { + if (spec == nullptr) { + return; + } + + for (auto & impl : spec->impls) { + impl->set_deferred_boundary(seq_id, pos, g); + } +} + void common_speculative_print_stats(const common_speculative * spec) { if (spec == nullptr) { return; diff --git a/common/speculative.h b/common/speculative.h index bf76ad709e26..24d943203a52 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -68,6 +68,10 @@ void common_speculative_draft(common_speculative * spec); // informs the speculative context that n_accepted tokens were accepted by the target model void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted); +// eagle3: deferred-boundary g_embd stash for checkpoints (no-op for other draft types) +bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector & g_out); +void common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos boundary_pos, const 
std::vector & g); + // print statistics about the speculative decoding void common_speculative_print_stats(const common_speculative * spec); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b526008096d5..818a0766c7a4 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2157,6 +2157,8 @@ struct server_context_impl { cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + // stash the draft's deferred boundary with the checkpoint (only eagle3 needs it; no-op otherwise) + common_speculative_get_deferred_boundary(spec.get(), slot.id, cur.data_dft_boundary_g_embd); SLT_INF(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", @@ -2977,21 +2979,12 @@ struct server_context_impl { bool do_reset = it == slot.prompt.checkpoints.rend(); - // eagle3 draft is one position behind the target due to deferred boundary), so it - // can't resume from a checkpoint restored on a recurrent/hybrid target; re-process fully instead. 
- const bool spec_eagle3 = std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), - COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3) != params_base.speculative.types.end(); - if (!do_reset && spec_eagle3 && - (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL || - ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS)) { - SLT_WRN(slot, "%s", "eagle3 draft cannot resume from a recurrent/hybrid checkpoint, forcing full re-processing\n"); - do_reset = true; - } - if (!do_reset) { // restore the context checkpoint it->load_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + // restore the draft's deferred boundary (only eagle3 needs it; no-op otherwise) + common_speculative_set_deferred_boundary(spec.get(), slot.id, it->pos_max, it->data_dft_boundary_g_embd); pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max)); n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens); From 714be987c46be84f48b8d88bae3418cde4c03d16 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Thu, 18 Jun 2026 17:34:21 +0200 Subject: [PATCH 3/6] apply suggestions Co-authored-by: Georgi Gerganov --- common/speculative.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/speculative.h b/common/speculative.h index 24d943203a52..c58fac3cc6d0 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -68,9 +68,9 @@ void common_speculative_draft(common_speculative * spec); // informs the speculative context that n_accepted tokens were accepted by the target model void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted); -// eagle3: deferred-boundary g_embd stash for checkpoints (no-op for other draft types) -bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector & g_out); -void 
common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos boundary_pos, const std::vector & g); +// (optional) get/set internal state +bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector & data); +void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector & data); // print statistics about the speculative decoding void common_speculative_print_stats(const common_speculative * spec); From 46b9dc91cd32644ee62afca2c246de9f63100493 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Thu, 18 Jun 2026 15:57:49 +0000 Subject: [PATCH 4/6] spec: adapt to API change --- common/common.cpp | 6 +++--- common/common.h | 5 +++-- common/speculative.cpp | 38 ++++++++++++++++++++++----------- tools/server/server-context.cpp | 8 +++---- 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index a66fc8c5da12..643d93db3738 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2034,7 +2034,7 @@ bool common_prompt_batch_decode( } size_t common_prompt_checkpoint::size() const { - return data_tgt.size() + data_dft.size() + data_dft_boundary_g_embd.size() * sizeof(float); + return data_tgt.size() + data_dft.size() + data_spec_state.size(); } bool common_prompt_checkpoint::empty() const { @@ -2049,7 +2049,7 @@ void common_prompt_checkpoint::clear() { data_tgt.clear(); data_dft.clear(); - data_dft_boundary_g_embd.clear(); + data_spec_state.clear(); } void common_prompt_checkpoint::update_pos( @@ -2139,5 +2139,5 @@ void common_prompt_checkpoint::clear_tgt() { void common_prompt_checkpoint::clear_dft() { data_dft.clear(); - data_dft_boundary_g_embd.clear(); + data_spec_state.clear(); } diff --git a/common/common.h b/common/common.h index d023b6d8c059..b92dfcce3329 100644 --- a/common/common.h +++ b/common/common.h @@ -1065,8 +1065,9 @@ struct common_prompt_checkpoint { std::vector data_tgt; std::vector data_dft; - // 
eagle3: deferred-boundary g_embd row stashed with the checkpoint - std::vector data_dft_boundary_g_embd; + // (optional) speculative-decoding implementation state stashed with the checkpoint + // (e.g. eagle3's deferred-boundary g_embd row) + std::vector data_spec_state; size_t size() const; diff --git a/common/speculative.cpp b/common/speculative.cpp index 9ca4955a626c..04495a2c6f78 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -161,9 +161,9 @@ struct common_speculative_impl { virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0; - // eagle3: deferred-boundary g_embd stash for checkpoints (default: none) - virtual bool get_deferred_boundary(llama_seq_id /*seq_id*/, std::vector & /*g_out*/) const { return false; } - virtual void set_deferred_boundary(llama_seq_id /*seq_id*/, llama_pos /*pos*/, const std::vector & /*g*/) {} + // (optional) serialize/restore per-seq internal state (e.g. eagle3's deferred boundary). + virtual bool get_state(llama_seq_id /*seq_id*/, std::vector & /*data*/) const { return false; } + virtual void set_state(llama_seq_id /*seq_id*/, const std::vector & /*data*/) {} // true if this implementation requires the target context to extract post-norm embeddings virtual bool need_embd() const = 0; @@ -852,26 +852,40 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt); } - bool get_deferred_boundary(llama_seq_id seq_id, std::vector & g_out) const override { + bool get_state(llama_seq_id seq_id, std::vector & data) const override { if (!need_boundary_stash()) { return false; } if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || pending_pos_last[seq_id] < 0) { return false; } - g_out = pending_g_last[seq_id]; + + const llama_pos pos = pending_pos_last[seq_id]; + const std::vector & g = pending_g_last[seq_id]; + + data.resize(sizeof(llama_pos) + g.size() * sizeof(float)); + 
std::memcpy(data.data(), &pos, sizeof(llama_pos)); + std::memcpy(data.data() + sizeof(llama_pos), g.data(), g.size() * sizeof(float)); return true; } - void set_deferred_boundary(llama_seq_id seq_id, llama_pos pos, const std::vector & g) override { + void set_state(llama_seq_id seq_id, const std::vector & data) override { if (!need_boundary_stash()) { return; } - if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || (int32_t) g.size() != n_embd_dec) { + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { return; } + if (data.size() != sizeof(llama_pos) + (size_t) n_embd_dec * sizeof(float)) { + return; + } + + llama_pos pos = -1; + std::memcpy(&pos, data.data(), sizeof(llama_pos)); + pending_pos_last[seq_id] = pos; - pending_g_last[seq_id] = g; + pending_g_last[seq_id].resize(n_embd_dec); + std::memcpy(pending_g_last[seq_id].data(), data.data() + sizeof(llama_pos), (size_t) n_embd_dec * sizeof(float)); } bool need_embd() const override { @@ -2151,13 +2165,13 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u } } -bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector & g_out) { +bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector & data) { if (spec == nullptr) { return false; } for (auto & impl : spec->impls) { - if (impl->get_deferred_boundary(seq_id, g_out)) { + if (impl->get_state(seq_id, data)) { return true; } } @@ -2165,13 +2179,13 @@ bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_s return false; } -void common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos pos, const std::vector & g) { +void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector & data) { if (spec == nullptr) { return; } for (auto & impl : spec->impls) { - impl->set_deferred_boundary(seq_id, pos, g); + impl->set_state(seq_id, data); } } diff --git 
a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 818a0766c7a4..287c9bee38ba 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2157,8 +2157,8 @@ struct server_context_impl { cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - // stash the draft's deferred boundary with the checkpoint (only eagle3 needs it; no-op otherwise) - common_speculative_get_deferred_boundary(spec.get(), slot.id, cur.data_dft_boundary_g_embd); + // stash the draft's speculative state with the checkpoint + common_speculative_get_state(spec.get(), slot.id, cur.data_spec_state); SLT_INF(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", @@ -2983,8 +2983,8 @@ struct server_context_impl { // restore the context checkpoint it->load_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - // restore the draft's deferred boundary (only eagle3 needs it; no-op otherwise) - common_speculative_set_deferred_boundary(spec.get(), slot.id, it->pos_max, it->data_dft_boundary_g_embd); + // restore the draft's speculative state + common_speculative_set_state(spec.get(), slot.id, it->data_spec_state); pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max)); n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens); From 20f542371dc2b7897b8821f6f397bf48816ce44a Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Thu, 18 Jun 2026 16:19:42 +0000 Subject: [PATCH 5/6] spec: fix naming --- common/common.cpp | 6 +++--- common/common.h | 2 +- tools/server/server-context.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 643d93db3738..f3f114f68245 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2034,7 
+2034,7 @@ bool common_prompt_batch_decode( } size_t common_prompt_checkpoint::size() const { - return data_tgt.size() + data_dft.size() + data_spec_state.size(); + return data_tgt.size() + data_dft.size() + data_spec.size(); } bool common_prompt_checkpoint::empty() const { @@ -2049,7 +2049,7 @@ void common_prompt_checkpoint::clear() { data_tgt.clear(); data_dft.clear(); - data_spec_state.clear(); + data_spec.clear(); } void common_prompt_checkpoint::update_pos( @@ -2139,5 +2139,5 @@ void common_prompt_checkpoint::clear_tgt() { void common_prompt_checkpoint::clear_dft() { data_dft.clear(); - data_spec_state.clear(); + data_spec.clear(); } diff --git a/common/common.h b/common/common.h index b92dfcce3329..535a4ed335ad 100644 --- a/common/common.h +++ b/common/common.h @@ -1067,7 +1067,7 @@ struct common_prompt_checkpoint { // (optional) speculative-decoding implementation state stashed with the checkpoint // (e.g. eagle3's deferred-boundary g_embd row) - std::vector data_spec_state; + std::vector data_spec; size_t size() const; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 287c9bee38ba..59048889e09f 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2158,7 +2158,7 @@ struct server_context_impl { cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); // stash the draft's speculative state with the checkpoint - common_speculative_get_state(spec.get(), slot.id, cur.data_spec_state); + common_speculative_get_state(spec.get(), slot.id, cur.data_spec); SLT_INF(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", @@ -2984,7 +2984,7 @@ struct server_context_impl { it->load_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); // restore the draft's speculative state - 
common_speculative_set_state(spec.get(), slot.id, it->data_spec_state); + common_speculative_set_state(spec.get(), slot.id, it->data_spec); pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max)); n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens); From 25ce9a70efdc5ec1759115f42dc5430dbc7ab93f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 19 Jun 2026 13:08:27 +0300 Subject: [PATCH 6/6] cont : add TODO --- common/speculative.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/speculative.cpp b/common/speculative.cpp index 04495a2c6f78..9c20585dc3e3 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -2165,6 +2165,7 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u } } +// TODO: support the case where more than one speculative implementation has state to save/restore bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector & data) { if (spec == nullptr) { return false;