From 733bb36f71a200fbb4be0a4ca11b1d810d82dde9 Mon Sep 17 00:00:00 2001
From: Kevin Courbet
Date: Fri, 20 Feb 2026 20:28:45 +0100
Subject: [PATCH] fix: prevent reranker context overflow on large chunks

Increase RERANK_CONTEXT_SIZE from 2048 to 8192 and add a truncation
safety net so the reranker no longer crashes when individual chunks
exceed the context window.

The Qwen3-Reranker model supports larger contexts, and 8192 tokens uses
only ~4 GB of VRAM with flash attention (vs ~960 MB at 2048), which is a
reasonable trade-off for robustness.

Additionally, before passing documents to rankAll(), estimate each
document's token count (chars/4) and truncate any document that would
exceed the context size minus the query and template overhead. This
ensures the reranker never crashes, even on unexpectedly large chunks.
---
 src/llm.ts | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/llm.ts b/src/llm.ts
index 46c62957..d6e3a6cb 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -725,8 +725,10 @@ export class LlamaCpp implements LLM {
    */
   // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
   // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
-  // Use 2048 for safety margin. Still 17× less than auto (40960).
-  private static readonly RERANK_CONTEXT_SIZE = 2048;
+  // Use 8192 to handle edge cases where chunks (especially code or non-ASCII
+  // text) exceed expected token counts. VRAM cost is modest (~4 GB with flash
+  // attention). Still 5× less than auto (40960).
+  private static readonly RERANK_CONTEXT_SIZE = 8192;
 
   private async ensureRerankContexts(): Promise>[]> {
     if (this.rerankContexts.length === 0) {
@@ -1038,8 +1040,20 @@ export class LlamaCpp implements LLM {
       textToDoc.set(doc.text, { file: doc.file, index });
     });
 
-    // Extract just the text for ranking
-    const texts = documents.map((doc) => doc.text);
+    // Extract just the text for ranking, truncating any that would exceed
+    // the reranker context size. The Qwen3 template adds ~200 tokens of
+    // overhead (system prompt, tags, etc.) plus the query itself.
+    const overheadTokens = 200 + Math.ceil(query.length / 4);
+    const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - overheadTokens;
+    // chars/4 is a conservative token-count estimate
+    const maxDocChars = maxDocTokens * 4;
+
+    const texts = documents.map((doc) => {
+      if (doc.text.length > maxDocChars) {
+        return doc.text.slice(0, maxDocChars);
+      }
+      return doc.text;
+    });
 
     // Split documents across contexts for parallel evaluation.
     // Each context has its own sequence with a lock, so parallelism comes
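
Note (illustrative, not part of the patch): the truncation heuristic can be
exercised in isolation. The sketch below restates it as a standalone
TypeScript helper; the names truncateForRerank and TEMPLATE_OVERHEAD_TOKENS
are hypothetical, the constants mirror the values in the patch, and the
tokens ≈ chars/4 rule is the same rough approximation used above, not an
exact tokenizer count.

    // Hypothetical standalone sketch of the patch's truncation heuristic.
    const RERANK_CONTEXT_SIZE = 8192;     // matches the new constant in the patch
    const TEMPLATE_OVERHEAD_TOKENS = 200; // rough Qwen3 reranker template overhead

    function truncateForRerank(query: string, docText: string): string {
      // tokens ≈ chars / 4 is a rough approximation, not a real tokenizer count
      const overheadTokens = TEMPLATE_OVERHEAD_TOKENS + Math.ceil(query.length / 4);
      const maxDocChars = (RERANK_CONTEXT_SIZE - overheadTokens) * 4;
      return docText.length > maxDocChars ? docText.slice(0, maxDocChars) : docText;
    }

    // Example: a 100,000-character document with a 24-character query is cut to
    // (8192 - 200 - 6) * 4 = 31,944 characters; shorter documents pass through unchanged.
    truncateForRerank("how does reranking work?", "x".repeat(100_000)).length; // 31944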