Skip to content

Commit 3d5aedb

Browse files
author
Brendan Gray
committed
v1.8.45: Fix node-llama-cpp API misuse causing CONTEXT_OVERFLOW
Fix G1: Correct lastEvaluationContextWindow structure - was passing raw lastEvaluation object instead of {history, minimumOverlapPercentageToPreventContextShift}. This silently disabled KV cache reuse, causing full re-tokenization every turn. Fix G2: Correct contextShift metadata key - was using wrong key name 'lastEvaluationContextWindowHistory' instead of 'lastEvaluationMetadata'. Fix G3: Add pre-generation diagnostic logging for chatHistory state. Fix G4: Strip think/thought segment objects from cleanHistory before storing. node-llama-cpp preserves hidden thought segments in cleanHistory even when budgets.thoughtTokens=0, causing token count inflation across turns.
1 parent 5881495 commit 3d5aedb

2 files changed

Lines changed: 34 additions & 5 deletions

File tree

main/llmEngine.js

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,25 @@ class LLMEngine extends EventEmitter {
156156
this._lastCompactDropped = droppedCount;
157157
}
158158

159+
/**
160+
* Strip think/thought segment objects from cleanHistory model responses.
161+
* node-llama-cpp includes segment objects (type: "segment", segmentType: "thought")
162+
* in cleanHistory even when budgets.thoughtTokens = 0. These accumulate across
163+
* turns and inflate the history token count on re-tokenization. This method
164+
* preserves only string (visible) content in model responses.
165+
*/
166+
_stripThinkSegments(history) {
167+
if (!Array.isArray(history)) return history;
168+
return history.map(entry => {
169+
if (entry.type !== 'model' || !Array.isArray(entry.response)) return entry;
170+
const filtered = entry.response.filter(item =>
171+
typeof item === 'string' || (item && item.type === 'segment' && item.segmentType !== 'thought')
172+
);
173+
if (filtered.length === entry.response.length) return entry;
174+
return { ...entry, response: filtered };
175+
});
176+
}
177+
159178
/**
 * Thin instance-method wrapper: forwards `text` to the `sanitizeResponse`
 * function in scope and returns its result unchanged.
 * NOTE(review): `sanitizeResponse` is defined outside this view — confirm its
 * contract (expected input type and what it strips) at its definition.
 */
_sanitizeResponse(text) {
160179
return sanitizeResponse(text);
161180
}
@@ -653,6 +672,13 @@ class LLMEngine extends EventEmitter {
653672
// Compact history if too long
654673
this._compactHistory();
655674

675+
// Diagnostic: actual chatHistory size before generation
676+
const _histChars = this.chatHistory.reduce((s, h) => {
677+
if (h.type === 'model') return s + JSON.stringify(h.response).length;
678+
return s + (h.text?.length || 0);
679+
}, 0);
680+
console.log(`[LLM DIAG] Pre-gen: entries=${this.chatHistory.length}, chars=${_histChars}, kvReuse=${this._kvReuseCooldown <= 0 && !!this.lastEvaluation}, seqPos=${this.sequence?.nextTokenIndex || 0}`);
681+
656682
// Stall watchdog — two-phase: longer timeout for prompt eval (first token),
657683
// shorter timeout for generation stalls (between tokens)
658684
const PROMPT_EVAL_TIMEOUT_MS = (this.modelInfo?.gpuMode === false) ? STALL_TIMEOUT_CPU_MS : STALL_TIMEOUT_GPU_MS;
@@ -811,11 +837,11 @@ class LLMEngine extends EventEmitter {
811837
});
812838
if (retryResult?.lastEvaluation) {
813839
this.lastEvaluation = retryResult.lastEvaluation;
814-
this.chatHistory = retryResult.lastEvaluation.cleanHistory || this.chatHistory;
840+
this.chatHistory = this._stripThinkSegments(retryResult.lastEvaluation.cleanHistory) || this.chatHistory;
815841
}
816842
} else if (result?.lastEvaluation) {
817843
this.lastEvaluation = result.lastEvaluation;
818-
this.chatHistory = result.lastEvaluation.cleanHistory || this.chatHistory;
844+
this.chatHistory = this._stripThinkSegments(result.lastEvaluation.cleanHistory) || this.chatHistory;
819845
}
820846

821847
if (this._kvReuseCooldown > 0) this._kvReuseCooldown--;
@@ -898,9 +924,12 @@ class LLMEngine extends EventEmitter {
898924
lastTokens: params.lastTokensPenaltyCount,
899925
},
900926
seed: params.seed !== -1 ? params.seed : undefined,
901-
lastEvaluationContextWindow: useKvCache ? this.lastEvaluation : undefined,
927+
lastEvaluationContextWindow: useKvCache ? {
928+
history: this.lastEvaluation?.contextWindow,
929+
minimumOverlapPercentageToPreventContextShift: 0.5,
930+
} : undefined,
902931
contextShift: useKvCache ? {
903-
lastEvaluationContextWindowHistory: this.lastEvaluation?.contextShiftMetadata,
932+
lastEvaluationMetadata: this.lastEvaluation?.contextShiftMetadata,
904933
} : undefined,
905934
budgets,
906935
signal: this.abortController?.signal,

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "guide-ide",
3-
"version": "1.8.44",
3+
"version": "1.8.45",
44
"description": "guIDE - AI-Powered Offline IDE with local LLM, RAG, MCP tools, browser automation, and integrated terminal",
55
"author": {
66
"name": "Brendan Gray",

0 commit comments

Comments
 (0)