From 7d09a8d01cc96ae793099139748ca36b7f69f8dd Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Sat, 13 Jun 2026 20:37:26 +0000
Subject: [PATCH 1/6] spec: support qwen3.5 & 3.6 eagle3 draft

---
 src/models/qwen35.cpp           |  2 ++
 src/models/qwen35moe.cpp        |  2 ++
 tools/server/server-context.cpp | 11 +++++++++++
 3 files changed, 15 insertions(+)

diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index 6783d98ec204..d8ffe43ae76c 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -156,6 +156,8 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
 
     // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         ggml_tensor * inpSA = inpL;
 
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index eb5e9a406a15..7b0876cbb04b 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -179,6 +179,8 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
 
     // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         ggml_tensor * inpSA = inpL;
 
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 31280d63c4ea..b526008096d5 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2977,6 +2977,17 @@ struct server_context_impl {
 
                                     bool do_reset = it == slot.prompt.checkpoints.rend();
 
+                                    // eagle3 draft is one position behind the target due to deferred boundary), so it
+                                    // can't resume from a checkpoint restored on a recurrent/hybrid target; re-process fully instead.
+                                    const bool spec_eagle3 = std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(),
+                                                                       COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3) != params_base.speculative.types.end();
+                                    if (!do_reset && spec_eagle3 &&
+                                            (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL ||
+                                             ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS)) {
+                                        SLT_WRN(slot, "%s", "eagle3 draft cannot resume from a recurrent/hybrid checkpoint, forcing full re-processing\n");
+                                        do_reset = true;
+                                    }
+
                                     if (!do_reset) {
                                         // restore the context checkpoint
                                         it->load_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);

From df3bc6db4f4a456d53fa756907241579ce2042c9 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Wed, 17 Jun 2026 21:46:56 +0200
Subject: [PATCH 2/6] eagle3: add deferred-boundary checkpoint restore support
 for hybrid models

---
 common/common.cpp               |  4 ++-
 common/common.h                 |  5 ++-
 common/speculative.cpp          | 57 +++++++++++++++++++++++++++++++++
 common/speculative.h            |  4 +++
 tools/server/server-context.cpp | 15 +++------
 5 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index b01772e1cbfe..a66fc8c5da12 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2034,7 +2034,7 @@ bool common_prompt_batch_decode(
 }
 
 size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size();
+    return data_tgt.size() + data_dft.size() + data_dft_boundary_g_embd.size() * sizeof(float);
 }
 
 bool common_prompt_checkpoint::empty() const {
@@ -2049,6 +2049,7 @@ void common_prompt_checkpoint::clear() {
 
     data_tgt.clear();
     data_dft.clear();
+    data_dft_boundary_g_embd.clear();
 }
 
 void common_prompt_checkpoint::update_pos(
@@ -2138,4 +2139,5 @@ void common_prompt_checkpoint::clear_tgt() {
 
 void common_prompt_checkpoint::clear_dft() {
     data_dft.clear();
+    data_dft_boundary_g_embd.clear();
 }
diff --git a/common/common.h b/common/common.h
index 040b9cf23312..d023b6d8c059 100644
--- a/common/common.h
+++ b/common/common.h
@@ -363,7 +363,7 @@ struct common_params_speculative {
 
     uint32_t need_n_rs_seq() const {
         bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
-            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
+            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3;
         });
 
         return needs_rs_seq ? draft.n_max : 0u;
@@ -1065,6 +1065,9 @@ struct common_prompt_checkpoint {
     std::vector<uint8_t> data_tgt;
     std::vector<uint8_t> data_dft;
 
+    // eagle3: deferred-boundary g_embd row stashed with the checkpoint
+    std::vector<float> data_dft_boundary_g_embd;
+
     size_t size() const;
 
     bool empty() const;
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 6f387f2cfc13..9ca4955a626c 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -161,6 +161,10 @@ struct common_speculative_impl {
 
     virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;
 
+    // eagle3: deferred-boundary g_embd stash for checkpoints (default: none)
+    virtual bool get_deferred_boundary(llama_seq_id /*seq_id*/, std::vector<float> & /*g_out*/) const { return false; }
+    virtual void set_deferred_boundary(llama_seq_id /*seq_id*/, llama_pos /*pos*/, const std::vector<float> & /*g*/) {}
+
     // true if this implementation requires the target context to extract post-norm embeddings
     virtual bool need_embd() const = 0;
 
@@ -841,6 +845,35 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
                     (size_t) n_embd_dec * sizeof(float));
     }
 
+    // we only need to stash the deferred boundary's g_embd row for recurrent/hybrid targets:
+    // their single-position checkpoints drop it on restore
+    bool need_boundary_stash() const {
+        const llama_model * model_tgt = llama_get_model(params.ctx_tgt);
+        return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt);
+    }
+
+    bool get_deferred_boundary(llama_seq_id seq_id, std::vector<float> & g_out) const override {
+        if (!need_boundary_stash()) {
+            return false;
+        }
+        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || pending_pos_last[seq_id] < 0) {
+            return false;
+        }
+        g_out = pending_g_last[seq_id];
+        return true;
+    }
+
+    void set_deferred_boundary(llama_seq_id seq_id, llama_pos pos, const std::vector<float> & g) override {
+        if (!need_boundary_stash()) {
+            return;
+        }
+        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || (int32_t) g.size() != n_embd_dec) {
+            return;
+        }
+        pending_pos_last[seq_id] = pos;
+        pending_g_last[seq_id]   = g;
+    }
+
     bool need_embd() const override {
         return false;
     }
@@ -2118,6 +2151,30 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
     }
 }
 
+bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector<float> & g_out) {
+    if (spec == nullptr) {
+        return false;
+    }
+
+    for (auto & impl : spec->impls) {
+        if (impl->get_deferred_boundary(seq_id, g_out)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+void common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos pos, const std::vector<float> & g) {
+    if (spec == nullptr) {
+        return;
+    }
+
+    for (auto & impl : spec->impls) {
+        impl->set_deferred_boundary(seq_id, pos, g);
+    }
+}
+
 void common_speculative_print_stats(const common_speculative * spec) {
     if (spec == nullptr) {
         return;
diff --git a/common/speculative.h b/common/speculative.h
index bf76ad709e26..24d943203a52 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -68,6 +68,10 @@ void common_speculative_draft(common_speculative * spec);
 // informs the speculative context that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);
 
+// eagle3: deferred-boundary g_embd stash for checkpoints (no-op for other draft types)
+bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector<float> & g_out);
+void common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos boundary_pos, const std::vector<float> & g);
+
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index b526008096d5..818a0766c7a4 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2157,6 +2157,8 @@ struct server_context_impl {
 
         cur.update_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
         cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+        // stash the draft's deferred boundary with the checkpoint (only eagle3 needs it; no-op otherwise)
+        common_speculative_get_deferred_boundary(spec.get(), slot.id, cur.data_dft_boundary_g_embd);
 
         SLT_INF(slot,
                 "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
@@ -2977,21 +2979,12 @@ struct server_context_impl {
 
                                     bool do_reset = it == slot.prompt.checkpoints.rend();
 
-                                    // eagle3 draft is one position behind the target due to deferred boundary), so it
-                                    // can't resume from a checkpoint restored on a recurrent/hybrid target; re-process fully instead.
-                                    const bool spec_eagle3 = std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(),
-                                                                       COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3) != params_base.speculative.types.end();
-                                    if (!do_reset && spec_eagle3 &&
-                                            (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL ||
-                                             ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS)) {
-                                        SLT_WRN(slot, "%s", "eagle3 draft cannot resume from a recurrent/hybrid checkpoint, forcing full re-processing\n");
-                                        do_reset = true;
-                                    }
-
                                     if (!do_reset) {
                                         // restore the context checkpoint
                                         it->load_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
                                         it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+                                        // restore the draft's deferred boundary (only eagle3 needs it; no-op otherwise)
+                                        common_speculative_set_deferred_boundary(spec.get(), slot.id, it->pos_max, it->data_dft_boundary_g_embd);
 
                                         pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
                                         n_past   = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);

From 714be987c46be84f48b8d88bae3418cde4c03d16 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Thu, 18 Jun 2026 17:34:21 +0200
Subject: [PATCH 3/6] apply suggestions

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 common/speculative.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/common/speculative.h b/common/speculative.h
index 24d943203a52..c58fac3cc6d0 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -68,9 +68,9 @@ void common_speculative_draft(common_speculative * spec);
 // informs the speculative context that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);
 
-// eagle3: deferred-boundary g_embd stash for checkpoints (no-op for other draft types)
-bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector<float> & g_out);
-void common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos boundary_pos, const std::vector<float> & g);
+// (optional) get/set internal state
+bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data);
+void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data);
 
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);

From 46b9dc91cd32644ee62afca2c246de9f63100493 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Thu, 18 Jun 2026 15:57:49 +0000
Subject: [PATCH 4/6] spec: adapt to API change

---
 common/common.cpp               |  6 +++---
 common/common.h                 |  5 +++--
 common/speculative.cpp          | 38 ++++++++++++++++++++++-----------
 tools/server/server-context.cpp |  8 +++----
 4 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index a66fc8c5da12..643d93db3738 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2034,7 +2034,7 @@ bool common_prompt_batch_decode(
 }
 
 size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size() + data_dft_boundary_g_embd.size() * sizeof(float);
+    return data_tgt.size() + data_dft.size() + data_spec_state.size();
 }
 
 bool common_prompt_checkpoint::empty() const {
@@ -2049,7 +2049,7 @@ void common_prompt_checkpoint::clear() {
 
     data_tgt.clear();
     data_dft.clear();
-    data_dft_boundary_g_embd.clear();
+    data_spec_state.clear();
 }
 
 void common_prompt_checkpoint::update_pos(
@@ -2139,5 +2139,5 @@ void common_prompt_checkpoint::clear_tgt() {
 
 void common_prompt_checkpoint::clear_dft() {
     data_dft.clear();
-    data_dft_boundary_g_embd.clear();
+    data_spec_state.clear();
 }
diff --git a/common/common.h b/common/common.h
index d023b6d8c059..b92dfcce3329 100644
--- a/common/common.h
+++ b/common/common.h
@@ -1065,8 +1065,9 @@ struct common_prompt_checkpoint {
     std::vector<uint8_t> data_tgt;
     std::vector<uint8_t> data_dft;
 
-    // eagle3: deferred-boundary g_embd row stashed with the checkpoint
-    std::vector<float> data_dft_boundary_g_embd;
+    // (optional) speculative-decoding implementation state stashed with the checkpoint
+    // (e.g. eagle3's deferred-boundary g_embd row)
+    std::vector<uint8_t> data_spec_state;
 
     size_t size() const;
 
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 9ca4955a626c..04495a2c6f78 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -161,9 +161,9 @@ struct common_speculative_impl {
 
     virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;
 
-    // eagle3: deferred-boundary g_embd stash for checkpoints (default: none)
-    virtual bool get_deferred_boundary(llama_seq_id /*seq_id*/, std::vector<float> & /*g_out*/) const { return false; }
-    virtual void set_deferred_boundary(llama_seq_id /*seq_id*/, llama_pos /*pos*/, const std::vector<float> & /*g*/) {}
+    // (optional) serialize/restore per-seq internal state (e.g. eagle3's deferred boundary).
+    virtual bool get_state(llama_seq_id /*seq_id*/, std::vector<uint8_t> & /*data*/) const { return false; }
+    virtual void set_state(llama_seq_id /*seq_id*/, const std::vector<uint8_t> & /*data*/) {}
 
     // true if this implementation requires the target context to extract post-norm embeddings
     virtual bool need_embd() const = 0;
@@ -852,26 +852,40 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt);
     }
 
-    bool get_deferred_boundary(llama_seq_id seq_id, std::vector<float> & g_out) const override {
+    bool get_state(llama_seq_id seq_id, std::vector<uint8_t> & data) const override {
         if (!need_boundary_stash()) {
             return false;
         }
         if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || pending_pos_last[seq_id] < 0) {
             return false;
         }
-        g_out = pending_g_last[seq_id];
+
+        const llama_pos          pos = pending_pos_last[seq_id];
+        const std::vector<float> & g = pending_g_last[seq_id];
+
+        data.resize(sizeof(llama_pos) + g.size() * sizeof(float));
+        std::memcpy(data.data(),                     &pos,     sizeof(llama_pos));
+        std::memcpy(data.data() + sizeof(llama_pos), g.data(), g.size() * sizeof(float));
         return true;
     }
 
-    void set_deferred_boundary(llama_seq_id seq_id, llama_pos pos, const std::vector<float> & g) override {
+    void set_state(llama_seq_id seq_id, const std::vector<uint8_t> & data) override {
         if (!need_boundary_stash()) {
             return;
         }
-        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || (int32_t) g.size() != n_embd_dec) {
+        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
             return;
         }
+        if (data.size() != sizeof(llama_pos) + (size_t) n_embd_dec * sizeof(float)) {
+            return;
+        }
+
+        llama_pos pos = -1;
+        std::memcpy(&pos, data.data(), sizeof(llama_pos));
+
         pending_pos_last[seq_id] = pos;
-        pending_g_last[seq_id]   = g;
+        pending_g_last[seq_id].resize(n_embd_dec);
+        std::memcpy(pending_g_last[seq_id].data(), data.data() + sizeof(llama_pos), (size_t) n_embd_dec * sizeof(float));
     }
 
     bool need_embd() const override {
@@ -2151,13 +2165,13 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
     }
 }
 
-bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector<float> & g_out) {
+bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data) {
     if (spec == nullptr) {
         return false;
     }
 
     for (auto & impl : spec->impls) {
-        if (impl->get_deferred_boundary(seq_id, g_out)) {
+        if (impl->get_state(seq_id, data)) {
             return true;
         }
     }
@@ -2165,13 +2179,13 @@ bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_s
     return false;
 }
 
-void common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos pos, const std::vector<float> & g) {
+void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data) {
     if (spec == nullptr) {
         return;
     }
 
     for (auto & impl : spec->impls) {
-        impl->set_deferred_boundary(seq_id, pos, g);
+        impl->set_state(seq_id, data);
     }
 }
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 818a0766c7a4..287c9bee38ba 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2157,8 +2157,8 @@ struct server_context_impl {
 
         cur.update_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
         cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-        // stash the draft's deferred boundary with the checkpoint (only eagle3 needs it; no-op otherwise)
-        common_speculative_get_deferred_boundary(spec.get(), slot.id, cur.data_dft_boundary_g_embd);
+        // stash the draft's speculative state with the checkpoint
+        common_speculative_get_state(spec.get(), slot.id, cur.data_spec_state);
 
         SLT_INF(slot,
                 "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
@@ -2983,8 +2983,8 @@ struct server_context_impl {
                                         // restore the context checkpoint
                                         it->load_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
                                         it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-                                        // restore the draft's deferred boundary (only eagle3 needs it; no-op otherwise)
-                                        common_speculative_set_deferred_boundary(spec.get(), slot.id, it->pos_max, it->data_dft_boundary_g_embd);
+                                        // restore the draft's speculative state
+                                        common_speculative_set_state(spec.get(), slot.id, it->data_spec_state);
 
                                         pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
                                         n_past   = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);

From 20f542371dc2b7897b8821f6f397bf48816ce44a Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Thu, 18 Jun 2026 16:19:42 +0000
Subject: [PATCH 5/6] spec: fix naming

---
 common/common.cpp               | 6 +++---
 common/common.h                 | 2 +-
 tools/server/server-context.cpp | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 643d93db3738..f3f114f68245 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2034,7 +2034,7 @@ bool common_prompt_batch_decode(
 }
 
 size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size() + data_spec_state.size();
+    return data_tgt.size() + data_dft.size() + data_spec.size();
 }
 
 bool common_prompt_checkpoint::empty() const {
@@ -2049,7 +2049,7 @@ void common_prompt_checkpoint::clear() {
 
     data_tgt.clear();
     data_dft.clear();
-    data_spec_state.clear();
+    data_spec.clear();
 }
 
 void common_prompt_checkpoint::update_pos(
@@ -2139,5 +2139,5 @@ void common_prompt_checkpoint::clear_tgt() {
 
 void common_prompt_checkpoint::clear_dft() {
     data_dft.clear();
-    data_spec_state.clear();
+    data_spec.clear();
 }
diff --git a/common/common.h b/common/common.h
index b92dfcce3329..535a4ed335ad 100644
--- a/common/common.h
+++ b/common/common.h
@@ -1067,7 +1067,7 @@ struct common_prompt_checkpoint {
 
     // (optional) speculative-decoding implementation state stashed with the checkpoint
     // (e.g. eagle3's deferred-boundary g_embd row)
-    std::vector<uint8_t> data_spec_state;
+    std::vector<uint8_t> data_spec;
 
     size_t size() const;
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 287c9bee38ba..59048889e09f 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2158,7 +2158,7 @@ struct server_context_impl {
         cur.update_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
         cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
         // stash the draft's speculative state with the checkpoint
-        common_speculative_get_state(spec.get(), slot.id, cur.data_spec_state);
+        common_speculative_get_state(spec.get(), slot.id, cur.data_spec);
 
         SLT_INF(slot,
                 "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
@@ -2984,7 +2984,7 @@ struct server_context_impl {
                                         it->load_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
                                         it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
                                         // restore the draft's speculative state
-                                        common_speculative_set_state(spec.get(), slot.id, it->data_spec_state);
+                                        common_speculative_set_state(spec.get(), slot.id, it->data_spec);
 
                                         pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
                                         n_past   = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);

From 25ce9a70efdc5ec1759115f42dc5430dbc7ab93f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 19 Jun 2026 13:08:27 +0300
Subject: [PATCH 6/6] cont : add TODO

---
 common/speculative.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 04495a2c6f78..9c20585dc3e3 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -2165,6 +2165,7 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
     }
 }
 
+// TODO: support the case of more than one speculative implementation having a state
 bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data) {
     if (spec == nullptr) {
         return false;