From 733bb36f71a200fbb4be0a4ca11b1d810d82dde9 Mon Sep 17 00:00:00 2001
From: Kevin Courbet
Date: Fri, 20 Feb 2026 20:28:45 +0100
Subject: [PATCH] fix: prevent reranker context overflow on large chunks

Increase RERANK_CONTEXT_SIZE from 2048 to 8192 and add a truncation
safety net so the reranker no longer crashes when individual chunks
exceed the context window.

The Qwen3-Reranker model supports larger contexts, and 8192 tokens uses
only ~4 GB of VRAM with flash attention (vs ~960 MB at 2048), which is a
reasonable trade-off for robustness.

Additionally, before passing documents to rankAll(), estimate each
document's token count (chars/4) and truncate any document that would
exceed the context size minus the query and template overhead. This
ensures the reranker never crashes, even on unexpectedly large chunks.
---
 src/llm.ts | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/llm.ts b/src/llm.ts
index 46c62957..d6e3a6cb 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -725,8 +725,10 @@ export class LlamaCpp implements LLM {
    */
   // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
   // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
-  // Use 2048 for safety margin. Still 17× less than auto (40960).
-  private static readonly RERANK_CONTEXT_SIZE = 2048;
+  // Use 8192 to handle edge cases where chunks (especially code or non-ASCII
+  // text) exceed expected token counts. VRAM cost is modest (~4 GB with flash
+  // attention). Still 5× less than auto (40960).
+  private static readonly RERANK_CONTEXT_SIZE = 8192;
 
   private async ensureRerankContexts(): Promise>[]> {
     if (this.rerankContexts.length === 0) {
@@ -1038,8 +1040,20 @@ export class LlamaCpp implements LLM {
       textToDoc.set(doc.text, { file: doc.file, index });
     });
 
-    // Extract just the text for ranking
-    const texts = documents.map((doc) => doc.text);
+    // Extract just the text for ranking, truncating any that would exceed
+    // the reranker context size. The Qwen3 template adds ~200 tokens of
+    // overhead (system prompt, tags, etc.) plus the query itself.
+    const overheadTokens = 200 + Math.ceil(query.length / 4);
+    const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - overheadTokens;
+    // chars/4 is a conservative token-count estimate
+    const maxDocChars = maxDocTokens * 4;
+
+    const texts = documents.map((doc) => {
+      if (doc.text.length > maxDocChars) {
+        return doc.text.slice(0, maxDocChars);
+      }
+      return doc.text;
+    });
 
     // Split documents across contexts for parallel evaluation.
     // Each context has its own sequence with a lock, so parallelism comes
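
Note (illustrative, not part of the patch): the truncation heuristic can be
exercised in isolation. The sketch below restates it as a standalone
TypeScript helper; the names truncateForRerank and TEMPLATE_OVERHEAD_TOKENS
are hypothetical, the constants mirror the values in the patch, and the
tokens ≈ chars/4 rule is the same rough approximation used above, not an
exact tokenizer count.

    // Hypothetical standalone sketch of the patch's truncation heuristic.
    const RERANK_CONTEXT_SIZE = 8192;     // matches the new constant in the patch
    const TEMPLATE_OVERHEAD_TOKENS = 200; // rough Qwen3 reranker template overhead

    function truncateForRerank(query: string, docText: string): string {
      // tokens ≈ chars / 4 is a rough approximation, not a real tokenizer count
      const overheadTokens = TEMPLATE_OVERHEAD_TOKENS + Math.ceil(query.length / 4);
      const maxDocChars = (RERANK_CONTEXT_SIZE - overheadTokens) * 4;
      return docText.length > maxDocChars ? docText.slice(0, maxDocChars) : docText;
    }

    // Example: a 100,000-character document with a 24-character query is cut to
    // (8192 - 200 - 6) * 4 = 31,944 characters; shorter documents pass through unchanged.
    truncateForRerank("how does reranking work?", "x".repeat(100_000)).length; // 31944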