diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5031e5808..a783fab42 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- feat: Update llama.cpp to ggerganov/llama.cpp@5d6f18a63 and sync Python bindings
+- feat: Update llama.cpp to ggerganov/llama.cpp@7d442abf
 - fix: Correct batched embedding outputs for multi-sequence `embed()` calls by @Anai-Guo in #2205
 - fix: Configure embedding contexts with enough sequence slots for batched `embed()` calls
+- fix: Mark all embedding input tokens as outputs to avoid llama.cpp override warnings by @Anai-Guo in #2212
 
 ## [0.3.22]
 
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 752c25dd3..2afa4c8e9 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1040,7 +1040,13 @@ def embed(
 
         # get pooling information
         pooling_type = self.pooling_type()
-        logits_all = pooling_type == llama_cpp.LLAMA_POOLING_TYPE_NONE
+        # In embedding mode every input token must be marked as an output, regardless of
+        # pooling type. llama.cpp would otherwise override per-token `logits[i]` and emit
+        # "embeddings required but some input tokens were not marked as outputs ->
+        # overriding" once per input. Pooling NONE vs MEAN/CLS only changes how the
+        # per-token outputs are read back (see decode_batch below), not whether they are
+        # produced. See abetlen/llama-cpp-python#2208.
+        logits_all = True
 
         if self.context_params.embeddings is False:
             raise RuntimeError(
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 5d6f18a63..7d442abf5 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 5d6f18a6387a7066fe387233f2ca6f113cb209fb
+Subproject commit 7d442abf5c6244117fd5a1dc51a5d19f00792491
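
For context on the `logits_all` change above, here is a minimal usage sketch (not part of the patch) of how the pooling type affects what `embed()` returns in llama-cpp-python. The model path is a placeholder; any embedding-capable GGUF model should behave the same, and `embedding` and `pooling_type` are existing `Llama` constructor parameters.

```python
# Sketch only: illustrates why per-token outputs are needed for every pooling mode.
# "models/embedding-model.gguf" is a placeholder path, not part of this PR.
import llama_cpp

# Pooling NONE: embed() returns one vector per input token, so every token's
# output must have been produced during decode.
llm_none = llama_cpp.Llama(
    model_path="models/embedding-model.gguf",
    embedding=True,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_NONE,
    verbose=False,
)
per_token = llm_none.embed("hello world")  # list of per-token vectors

# Pooling MEAN (CLS is analogous): embed() returns a single pooled vector per
# input, but the per-token embeddings being pooled over are still computed,
# which is why the patch marks all input tokens as outputs unconditionally.
llm_mean = llama_cpp.Llama(
    model_path="models/embedding-model.gguf",
    embedding=True,
    pooling_type=llama_cpp.LLAMA_POOLING_TYPE_MEAN,
    verbose=False,
)
pooled = llm_mean.embed("hello world")  # single pooled vector
```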