From 95ccb191ab119dc5020a5ed6599c943e258ed0f2 Mon Sep 17 00:00:00 2001
From: Sanjana Brahmbhatt <90378084+SanjanaB123@users.noreply.github.com>
Date: Wed, 13 May 2026 16:35:30 -0400
Subject: [PATCH] fix(embedding): set kv_unified=True when embedding=True to
 enable batch processing (#2217)

* fix(embedding): set kv_unified=True when embedding=True to enable batch processing

* chore: update changelog for batch embedding fix

---------

Co-authored-by: abetlen
---
 CHANGELOG.md       | 2 ++
 llama_cpp/llama.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 645fd8005..900176ea1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
+
 ## [0.3.23]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 2afa4c8e9..75c74b41f 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -402,7 +402,7 @@ def __init__(
             self.n_batch,
             llama_cpp.llama_max_parallel_sequences(),
         )
-
+        self.context_params.kv_unified = True
         self._ctx = self._stack.enter_context(
             contextlib.closing(
                 internals.LlamaContext(