From 95ccb191ab119dc5020a5ed6599c943e258ed0f2 Mon Sep 17 00:00:00 2001
From: Sanjana Brahmbhatt <90378084+SanjanaB123@users.noreply.github.com>
Date: Wed, 13 May 2026 16:35:30 -0400
Subject: [PATCH] fix(embedding): set kv_unified=True when embedding=True to
 enable batch processing (#2217)

* fix(embedding): set kv_unified=True when embedding=True to enable batch processing

* chore: update changelog for batch embedding fix

---------

Co-authored-by: abetlen
---
 CHANGELOG.md       | 2 ++
 llama_cpp/llama.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 645fd8005..900176ea1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- fix: Enable unified KV cache for embedding contexts to preserve full per-sequence context in batch embedding calls by @SanjanaB123 in #2217
+
 ## [0.3.23]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 2afa4c8e9..75c74b41f 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -402,7 +402,7 @@ def __init__(
             self.n_batch,
             llama_cpp.llama_max_parallel_sequences(),
         )
-
+        self.context_params.kv_unified = True
         self._ctx = self._stack.enter_context(
             contextlib.closing(
                 internals.LlamaContext(