From 3ca40cbd252663198f543d20285245e5be683cfd Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 00:09:26 +0000 Subject: [PATCH 01/20] refactor: add default LLM seam for backend transition --- src/llm.ts | 21 +++++++++++++++++++++ src/mcp.ts | 4 ++-- src/qmd.ts | 4 ++-- src/store.ts | 14 +++++++------- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/llm.ts b/src/llm.ts index 46c62957..26348cde 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -298,6 +298,11 @@ export interface LLM { */ embed(text: string, options?: EmbedOptions): Promise; + /** + * Get embeddings for multiple texts + */ + embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>; + /** * Generate text completion */ @@ -1385,6 +1390,14 @@ export function getDefaultLlamaCpp(): LlamaCpp { return defaultLlamaCpp; } +/** + * Get the default LLM backend instance. + * Currently this is LlamaCpp; kept as a separate seam for future backends. + */ +export function getDefaultLLM(): LLM { + return getDefaultLlamaCpp(); +} + /** * Set a custom default LlamaCpp instance (useful for testing) */ @@ -1402,3 +1415,11 @@ export async function disposeDefaultLlamaCpp(): Promise { defaultLlamaCpp = null; } } + +/** + * Dispose the default LLM backend instance. + * Currently aliases LlamaCpp disposal. + */ +export async function disposeDefaultLLM(): Promise { + await disposeDefaultLlamaCpp(); +} diff --git a/src/mcp.ts b/src/mcp.ts index 323f4698..9cedcb71 100644 --- a/src/mcp.ts +++ b/src/mcp.ts @@ -24,7 +24,7 @@ import { } from "./store.js"; import type { Store, StructuredSubSearch } from "./store.js"; import { getCollection, getGlobalContext, getDefaultCollectionNames } from "./collections.js"; -import { disposeDefaultLlamaCpp } from "./llm.js"; +import { disposeDefaultLLM } from "./llm.js"; // ============================================================================= // Types for structured content @@ -717,7 +717,7 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole await transport.close(); httpServer.close(); store.close(); - await disposeDefaultLlamaCpp(); + await disposeDefaultLLM(); }; process.on("SIGTERM", async () => { diff --git a/src/qmd.ts b/src/qmd.ts index d57b7e8c..9b870ba1 100755 --- a/src/qmd.ts +++ b/src/qmd.ts @@ -70,7 +70,7 @@ import { createStore, getDefaultDbPath, } from "./store.js"; -import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; +import { disposeDefaultLLM, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; import { formatSearchResults, formatDocuments, @@ -2828,7 +2828,7 @@ if (fileURLToPath(import.meta.url) === process.argv[1] || process.argv[1]?.endsW } if (cli.command !== "mcp") { - await disposeDefaultLlamaCpp(); + await disposeDefaultLLM(); process.exit(0); } diff --git a/src/store.ts b/src/store.ts index ff08c2a2..d39b30c8 100644 --- a/src/store.ts +++ b/src/store.ts @@ -17,8 +17,8 @@ import picomatch from "picomatch"; import { createHash } from "crypto"; import { realpathSync, statSync, mkdirSync } from "node:fs"; import { - LlamaCpp, getDefaultLlamaCpp, + getDefaultLLM, formatQueryForEmbedding, formatDocForEmbedding, type RerankDocument, @@ -2234,7 +2234,7 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi const formattedText = 
isQuery ? formatQueryForEmbedding(text) : formatDocForEmbedding(text); const result = session ? await session.embed(formattedText, { model, isQuery }) - : await getDefaultLlamaCpp().embed(formattedText, { model, isQuery }); + : await getDefaultLLM().embed(formattedText, { model, isQuery }); return result?.embedding || null; } @@ -2299,8 +2299,8 @@ export async function expandQuery(query: string, model: string = DEFAULT_QUERY_M } } - const llm = getDefaultLlamaCpp(); - // Note: LlamaCpp uses hardcoded model, model parameter is ignored + const llm = getDefaultLLM(); + // Note: current local backend uses a configured default model; `model` may be ignored. const results = await llm.expandQuery(query); // Map Queryable[] → ExpandedQuery[] (same shape, decoupled from llm.ts internals). @@ -2337,9 +2337,9 @@ export async function rerank(query: string, documents: { file: string; text: str } } - // Rerank uncached documents using LlamaCpp + // Rerank uncached documents using the configured LLM backend if (uncachedDocs.length > 0) { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); const rerankResult = await llm.rerank(query, uncachedDocs, { model }); // Cache results — use original doc.text for cache key (result.file lacks chunk text) @@ -2973,7 +2973,7 @@ export async function hybridQuery( } // Batch embed all vector queries in a single call - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); From 75c1383ea8b61254c734e46f62c9857e0848b205 Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 17:46:41 +0000 Subject: [PATCH 02/20] feat: add API embeddings backend with contract and live tests --- src/api.ts | 153 ++++++++++++++++++++++++++++++++++++++ src/llm.ts | 21 +++++- test/api.contract.test.ts | 100 +++++++++++++++++++++++++ test/api.live.test.ts | 19 +++++ 4 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 src/api.ts create mode 100644 test/api.contract.test.ts create mode 100644 test/api.live.test.ts diff --git a/src/api.ts b/src/api.ts new file mode 100644 index 00000000..3393a5c0 --- /dev/null +++ b/src/api.ts @@ -0,0 +1,153 @@ +/** + * api.ts - API-backed LLM implementation (incremental rollout) + * + * Current phase: embeddings via OpenAI-compatible /v1/embeddings. + * Other capabilities can delegate to a fallback backend. + */ + +import type { + LLM, + EmbedOptions, + EmbeddingResult, + GenerateOptions, + GenerateResult, + ModelInfo, + Queryable, + RerankDocument, + RerankOptions, + RerankResult, +} from "./llm.js"; + +const DEFAULT_API_BASE_URL = "https://api.openai.com/v1"; +const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; + +type OpenAIEmbeddingResponse = { + data?: Array<{ embedding?: number[] }>; +}; + +export type ApiLLMConfig = { + baseUrl?: string; + apiKey?: string; + embedModel?: string; + fallbackLLM?: LLM; +}; + +/** + * API-backed LLM implementation. + * Embeddings are remote; other methods delegate to fallback when provided. 
+ */ +export class ApiLLM implements LLM { + private readonly baseUrl: string; + private readonly apiKey: string; + private readonly embedModel: string; + private readonly fallbackLLM?: LLM; + + constructor(config: ApiLLMConfig = {}) { + this.baseUrl = ( + config.baseUrl + || process.env.QMD_API_BASE_URL + || process.env.OPENAI_BASE_URL + || DEFAULT_API_BASE_URL + ).replace(/\/+$/, ""); + this.apiKey = config.apiKey || process.env.QMD_API_KEY || process.env.OPENAI_API_KEY || ""; + this.embedModel = config.embedModel || process.env.QMD_API_EMBED_MODEL || process.env.OPENAI_EMBED_MODEL || DEFAULT_EMBED_MODEL; + this.fallbackLLM = config.fallbackLLM; + } + + private getHeaders(): Record { + return { + "Content-Type": "application/json", + "Authorization": `Bearer ${this.apiKey}`, + }; + } + + private getFallback(method: string): LLM { + if (!this.fallbackLLM) { + throw new Error(`ApiLLM.${method} is not implemented without fallback backend`); + } + return this.fallbackLLM; + } + + private async requestEmbeddings(texts: string[], modelOverride?: string): Promise { + if (!this.apiKey) { + console.error("ApiLLM embedding error: missing API key (set QMD_API_KEY or OPENAI_API_KEY)"); + return null; + } + + const model = modelOverride || this.embedModel; + try { + const resp = await fetch(`${this.baseUrl}/embeddings`, { + method: "POST", + headers: this.getHeaders(), + body: JSON.stringify({ + model, + input: texts, + }), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + console.error(`ApiLLM embedding error: ${resp.status} ${resp.statusText} ${body}`.trim()); + return null; + } + return await resp.json() as OpenAIEmbeddingResponse; + } catch (error) { + console.error("ApiLLM embedding error:", error); + return null; + } + } + + async embed(text: string, options: EmbedOptions = {}): Promise { + const response = await this.requestEmbeddings([text], options.model); + const vector = response?.data?.[0]?.embedding; + if (!vector || !Array.isArray(vector)) return null; + + return { + embedding: vector, + model: options.model || this.embedModel, + }; + } + + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + if (texts.length === 0) return []; + + const response = await this.requestEmbeddings(texts); + if (!response?.data || !Array.isArray(response.data)) { + return texts.map(() => null); + } + + const results: (EmbeddingResult | null)[] = []; + for (let i = 0; i < texts.length; i++) { + const vector = response.data[i]?.embedding; + if (!vector || !Array.isArray(vector)) { + results.push(null); + } else { + results.push({ + embedding: vector, + model: this.embedModel, + }); + } + } + return results; + } + + async generate(prompt: string, options: GenerateOptions = {}): Promise { + return this.getFallback("generate").generate(prompt, options); + } + + async modelExists(model: string): Promise { + return { name: model, exists: true }; + } + + async expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise { + return this.getFallback("expandQuery").expandQuery(query, options); + } + + async rerank(query: string, documents: RerankDocument[], options: RerankOptions = {}): Promise { + return this.getFallback("rerank").rerank(query, documents, options); + } + + async dispose(): Promise { + // No API client resources to dispose in this implementation. 
+ } +} + diff --git a/src/llm.ts b/src/llm.ts index 26348cde..115e16e9 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -18,6 +18,7 @@ import { import { homedir } from "os"; import { join } from "path"; import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs"; +import { ApiLLM } from "./api.js"; // ============================================================================= // Embedding Formatting Functions @@ -1379,6 +1380,7 @@ export function canUnloadLLM(): boolean { // ============================================================================= let defaultLlamaCpp: LlamaCpp | null = null; +let defaultApiLLM: ApiLLM | null = null; /** * Get the default LlamaCpp instance (creates one if needed) @@ -1395,7 +1397,18 @@ export function getDefaultLlamaCpp(): LlamaCpp { * Currently this is LlamaCpp; kept as a separate seam for future backends. */ export function getDefaultLLM(): LLM { - return getDefaultLlamaCpp(); + const backend = (process.env.QMD_LLM_BACKEND || "local").toLowerCase(); + if (backend !== "api") { + return getDefaultLlamaCpp(); + } + + if (!defaultApiLLM) { + defaultApiLLM = new ApiLLM({ + // During phased rollout, non-embedding methods can delegate to local backend. + fallbackLLM: getDefaultLlamaCpp(), + }); + } + return defaultApiLLM; } /** @@ -1403,6 +1416,8 @@ export function getDefaultLLM(): LLM { */ export function setDefaultLlamaCpp(llm: LlamaCpp | null): void { defaultLlamaCpp = llm; + // Clear API wrapper so it can rebuild with the new fallback instance. + defaultApiLLM = null; } /** @@ -1421,5 +1436,9 @@ export async function disposeDefaultLlamaCpp(): Promise { * Currently aliases LlamaCpp disposal. */ export async function disposeDefaultLLM(): Promise { + if (defaultApiLLM) { + await defaultApiLLM.dispose(); + defaultApiLLM = null; + } await disposeDefaultLlamaCpp(); } diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts new file mode 100644 index 00000000..ccd7a5bc --- /dev/null +++ b/test/api.contract.test.ts @@ -0,0 +1,100 @@ +import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; +import { ApiLLM } from "../src/api.js"; + +describe("ApiLLM Embeddings (contract)", () => { + const fetchMock = vi.fn(); + const originalFetch = globalThis.fetch; + const originalQmdApiKey = process.env.QMD_API_KEY; + const originalOpenAiApiKey = process.env.OPENAI_API_KEY; + + beforeEach(() => { + fetchMock.mockReset(); + (globalThis as { fetch: typeof fetch }).fetch = fetchMock as unknown as typeof fetch; + }); + + afterEach(() => { + (globalThis as { fetch: typeof fetch }).fetch = originalFetch; + process.env.QMD_API_KEY = originalQmdApiKey; + process.env.OPENAI_API_KEY = originalOpenAiApiKey; + }); + + test("embed sends OpenAI-compatible /embeddings request and parses response", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + data: [{ embedding: [0.1, 0.2, 0.3] }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + baseUrl: "https://example.test/v1", + apiKey: "test-key", + embedModel: "test-embed-model", + }); + + const result = await llm.embed("hello"); + + expect(result).not.toBeNull(); + expect(result?.embedding).toEqual([0.1, 0.2, 0.3]); + expect(result?.model).toBe("test-embed-model"); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://example.test/v1/embeddings"); + expect(init?.method).toBe("POST"); + 
expect(init?.headers).toEqual({ + "Content-Type": "application/json", + "Authorization": "Bearer test-key", + }); + expect(JSON.parse(String(init?.body))).toEqual({ + model: "test-embed-model", + input: ["hello"], + }); + }); + + test("embedBatch returns one result per input and null for missing vectors", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + data: [ + { embedding: [1, 2] }, + {}, + { embedding: [3, 4] }, + ], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + baseUrl: "https://example.test/v1", + apiKey: "test-key", + embedModel: "test-embed-model", + }); + + const results = await llm.embedBatch(["a", "b", "c"]); + expect(results).toHaveLength(3); + expect(results[0]?.embedding).toEqual([1, 2]); + expect(results[1]).toBeNull(); + expect(results[2]?.embedding).toEqual([3, 4]); + }); + + test("embed returns null and avoids fetch when API key is missing", async () => { + process.env.QMD_API_KEY = ""; + process.env.OPENAI_API_KEY = ""; + const consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); + + const llm = new ApiLLM({ + baseUrl: "https://example.test/v1", + apiKey: "", + embedModel: "test-embed-model", + }); + + const result = await llm.embed("hello"); + expect(result).toBeNull(); + expect(fetchMock).not.toHaveBeenCalled(); + consoleErrorSpy.mockRestore(); + }); +}); diff --git a/test/api.live.test.ts b/test/api.live.test.ts new file mode 100644 index 00000000..2ae89d79 --- /dev/null +++ b/test/api.live.test.ts @@ -0,0 +1,19 @@ +import { describe, expect, test } from "vitest"; +import { ApiLLM } from "../src/api.js"; + +describe.skipIf(!process.env.OPENAI_API_KEY)("ApiLLM Embeddings (live)", () => { + test("OpenAI /v1/embeddings returns a non-empty vector", async () => { + const llm = new ApiLLM({ + baseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + apiKey: process.env.OPENAI_API_KEY, + embedModel: process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", + }); + + const result = await llm.embed("QMD embedding live test"); + expect(result).not.toBeNull(); + expect(Array.isArray(result?.embedding)).toBe(true); + expect(result!.embedding.length).toBeGreaterThan(10); + expect(Number.isFinite(result!.embedding[0])).toBe(true); + }, 30000); +}); + From eb6bac84c2ec9bd353d178ed99abc6541893e490 Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 18:01:16 +0000 Subject: [PATCH 03/20] feat: add API rerank backend with contract and live tests --- src/api.ts | 134 +++++++++++++++++++++++++++++++------- test/api.contract.test.ts | 96 ++++++++++++++++++++++++--- test/api.live.test.ts | 25 ++++++- 3 files changed, 222 insertions(+), 33 deletions(-) diff --git a/src/api.ts b/src/api.ts index 3393a5c0..c335d654 100644 --- a/src/api.ts +++ b/src/api.ts @@ -1,8 +1,8 @@ /** * api.ts - API-backed LLM implementation (incremental rollout) * - * Current phase: embeddings via OpenAI-compatible /v1/embeddings. - * Other capabilities can delegate to a fallback backend. + * Current phase: embeddings (/v1/embeddings) and rerank (/v1/rerank). + * Query expansion/generation can delegate to a fallback backend. 
*/ import type { @@ -18,46 +18,68 @@ import type { RerankResult, } from "./llm.js"; -const DEFAULT_API_BASE_URL = "https://api.openai.com/v1"; +const DEFAULT_EMBED_BASE_URL = "https://api.openai.com/v1"; const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; +const DEFAULT_RERANK_BASE_URL = "https://api.cohere.com/v1"; +const DEFAULT_RERANK_MODEL = "rerank-v3.5"; type OpenAIEmbeddingResponse = { data?: Array<{ embedding?: number[] }>; }; +type CohereRerankResponse = { + results?: Array<{ index?: number; relevance_score?: number }>; +}; + export type ApiLLMConfig = { - baseUrl?: string; - apiKey?: string; + embedBaseUrl?: string; + embedApiKey?: string; embedModel?: string; + rerankBaseUrl?: string; + rerankApiKey?: string; + rerankModel?: string; fallbackLLM?: LLM; }; /** * API-backed LLM implementation. - * Embeddings are remote; other methods delegate to fallback when provided. + * Embeddings/reranking are remote; query expansion/generation can fallback. */ export class ApiLLM implements LLM { - private readonly baseUrl: string; - private readonly apiKey: string; + private readonly embedBaseUrl: string; + private readonly embedApiKey: string; private readonly embedModel: string; + private readonly rerankBaseUrl: string; + private readonly rerankApiKey: string; + private readonly rerankModel: string; private readonly fallbackLLM?: LLM; constructor(config: ApiLLMConfig = {}) { - this.baseUrl = ( - config.baseUrl + const normalizedEmbedBaseUrl = ( + config.embedBaseUrl || process.env.QMD_API_BASE_URL || process.env.OPENAI_BASE_URL - || DEFAULT_API_BASE_URL + || DEFAULT_EMBED_BASE_URL ).replace(/\/+$/, ""); - this.apiKey = config.apiKey || process.env.QMD_API_KEY || process.env.OPENAI_API_KEY || ""; + this.embedBaseUrl = normalizedEmbedBaseUrl; + + this.embedApiKey = config.embedApiKey || process.env.QMD_API_KEY || process.env.OPENAI_API_KEY || ""; this.embedModel = config.embedModel || process.env.QMD_API_EMBED_MODEL || process.env.OPENAI_EMBED_MODEL || DEFAULT_EMBED_MODEL; + this.rerankBaseUrl = ( + config.rerankBaseUrl + || process.env.QMD_API_RERANK_BASE_URL + || process.env.COHERE_BASE_URL + || (process.env.COHERE_API_KEY ? DEFAULT_RERANK_BASE_URL : normalizedEmbedBaseUrl) + ).replace(/\/+$/, ""); + this.rerankApiKey = config.rerankApiKey || process.env.QMD_API_RERANK_KEY || process.env.COHERE_API_KEY || this.embedApiKey; + this.rerankModel = config.rerankModel || process.env.QMD_API_RERANK_MODEL || process.env.COHERE_RERANK_MODEL || DEFAULT_RERANK_MODEL; this.fallbackLLM = config.fallbackLLM; } - private getHeaders(): Record { + private getHeaders(apiKey: string): Record { return { "Content-Type": "application/json", - "Authorization": `Bearer ${this.apiKey}`, + "Authorization": `Bearer ${apiKey}`, }; } @@ -68,17 +90,33 @@ export class ApiLLM implements LLM { return this.fallbackLLM; } + private isLikelyLocalModel(model: string): boolean { + const lower = model.toLowerCase(); + return ( + model.startsWith("hf:") + || lower.includes(".gguf") + || lower === "embeddinggemma" + || lower.includes("qwen3-reranker") + || lower.startsWith("expedientfalcon/") + ); + } + + private resolveModel(modelOverride: string | undefined, configuredModel: string): string { + if (!modelOverride) return configuredModel; + return this.isLikelyLocalModel(modelOverride) ? 
configuredModel : modelOverride; + } + private async requestEmbeddings(texts: string[], modelOverride?: string): Promise { - if (!this.apiKey) { + if (!this.embedApiKey) { console.error("ApiLLM embedding error: missing API key (set QMD_API_KEY or OPENAI_API_KEY)"); return null; } - const model = modelOverride || this.embedModel; + const model = this.resolveModel(modelOverride, this.embedModel); try { - const resp = await fetch(`${this.baseUrl}/embeddings`, { + const resp = await fetch(`${this.embedBaseUrl}/embeddings`, { method: "POST", - headers: this.getHeaders(), + headers: this.getHeaders(this.embedApiKey), body: JSON.stringify({ model, input: texts, @@ -97,13 +135,14 @@ export class ApiLLM implements LLM { } async embed(text: string, options: EmbedOptions = {}): Promise { - const response = await this.requestEmbeddings([text], options.model); + const model = this.resolveModel(options.model, this.embedModel); + const response = await this.requestEmbeddings([text], model); const vector = response?.data?.[0]?.embedding; if (!vector || !Array.isArray(vector)) return null; return { embedding: vector, - model: options.model || this.embedModel, + model, }; } @@ -143,11 +182,62 @@ export class ApiLLM implements LLM { } async rerank(query: string, documents: RerankDocument[], options: RerankOptions = {}): Promise { - return this.getFallback("rerank").rerank(query, documents, options); + if (!this.rerankApiKey) { + throw new Error("ApiLLM rerank error: missing API key (set QMD_API_RERANK_KEY or COHERE_API_KEY)"); + } + if (documents.length === 0) { + return { results: [], model: this.resolveModel(options.model, this.rerankModel) }; + } + + const model = this.resolveModel(options.model, this.rerankModel); + + let response: CohereRerankResponse; + try { + const resp = await fetch(`${this.rerankBaseUrl}/rerank`, { + method: "POST", + headers: this.getHeaders(this.rerankApiKey), + body: JSON.stringify({ + model, + query, + documents: documents.map((doc) => doc.text), + top_n: documents.length, + }), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + throw new Error(`ApiLLM rerank error: ${resp.status} ${resp.statusText} ${body}`.trim()); + } + response = await resp.json() as CohereRerankResponse; + } catch (error) { + const detail = error instanceof Error ? error.message : String(error); + throw new Error(`ApiLLM rerank request failed: ${detail}`); + } + + if (!Array.isArray(response.results)) { + throw new Error("ApiLLM rerank error: invalid response (missing results array)"); + } + + const scoreByIndex = new Map(); + for (const item of response.results) { + if (typeof item.index !== "number" || typeof item.relevance_score !== "number") continue; + scoreByIndex.set(item.index, item.relevance_score); + } + + const results = documents + .map((doc, index) => ({ + file: doc.file, + score: scoreByIndex.get(index) ?? 0, + index, + })) + .sort((a, b) => b.score - a.score); + + return { + results, + model, + }; } async dispose(): Promise { // No API client resources to dispose in this implementation. 
} } - diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index ccd7a5bc..087164ee 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -1,11 +1,13 @@ import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; import { ApiLLM } from "../src/api.js"; -describe("ApiLLM Embeddings (contract)", () => { +describe("ApiLLM (contract)", () => { const fetchMock = vi.fn(); const originalFetch = globalThis.fetch; const originalQmdApiKey = process.env.QMD_API_KEY; const originalOpenAiApiKey = process.env.OPENAI_API_KEY; + const originalQmdApiRerankKey = process.env.QMD_API_RERANK_KEY; + const originalCohereApiKey = process.env.COHERE_API_KEY; beforeEach(() => { fetchMock.mockReset(); @@ -16,9 +18,11 @@ describe("ApiLLM Embeddings (contract)", () => { (globalThis as { fetch: typeof fetch }).fetch = originalFetch; process.env.QMD_API_KEY = originalQmdApiKey; process.env.OPENAI_API_KEY = originalOpenAiApiKey; + process.env.QMD_API_RERANK_KEY = originalQmdApiRerankKey; + process.env.COHERE_API_KEY = originalCohereApiKey; }); - test("embed sends OpenAI-compatible /embeddings request and parses response", async () => { + test("embed sends OpenAI-compatible /embeddings request, normalizes model, and parses response", async () => { fetchMock.mockResolvedValue( new Response( JSON.stringify({ @@ -29,12 +33,12 @@ describe("ApiLLM Embeddings (contract)", () => { ); const llm = new ApiLLM({ - baseUrl: "https://example.test/v1", - apiKey: "test-key", + embedBaseUrl: "https://example.test/v1", + embedApiKey: "test-key", embedModel: "test-embed-model", }); - const result = await llm.embed("hello"); + const result = await llm.embed("hello", { model: "embeddinggemma" }); expect(result).not.toBeNull(); expect(result?.embedding).toEqual([0.1, 0.2, 0.3]); @@ -69,8 +73,8 @@ describe("ApiLLM Embeddings (contract)", () => { ); const llm = new ApiLLM({ - baseUrl: "https://example.test/v1", - apiKey: "test-key", + embedBaseUrl: "https://example.test/v1", + embedApiKey: "test-key", embedModel: "test-embed-model", }); @@ -87,8 +91,8 @@ describe("ApiLLM Embeddings (contract)", () => { const consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); const llm = new ApiLLM({ - baseUrl: "https://example.test/v1", - apiKey: "", + embedBaseUrl: "https://example.test/v1", + embedApiKey: "", embedModel: "test-embed-model", }); @@ -97,4 +101,78 @@ describe("ApiLLM Embeddings (contract)", () => { expect(fetchMock).not.toHaveBeenCalled(); consoleErrorSpy.mockRestore(); }); + + test("rerank sends Cohere-compatible /rerank request and maps response by index", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + results: [ + { index: 1, relevance_score: 0.91 }, + { index: 0, relevance_score: 0.24 }, + ], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "embed-key", + rerankBaseUrl: "https://rerank.test/v1", + rerankApiKey: "rerank-key", + rerankModel: "rerank-v3.5", + }); + + const result = await llm.rerank( + "capital of france", + [ + { file: "a.md", text: "Berlin is the capital of Germany." }, + { file: "b.md", text: "Paris is the capital of France." 
}, + ], + { model: "ExpedientFalcon/qwen3-reranker:0.6b-q8_0" } + ); + + expect(result.model).toBe("rerank-v3.5"); + expect(result.results).toEqual([ + { file: "b.md", score: 0.91, index: 1 }, + { file: "a.md", score: 0.24, index: 0 }, + ]); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://rerank.test/v1/rerank"); + expect(init?.method).toBe("POST"); + expect(init?.headers).toEqual({ + "Content-Type": "application/json", + "Authorization": "Bearer rerank-key", + }); + expect(JSON.parse(String(init?.body))).toEqual({ + model: "rerank-v3.5", + query: "capital of france", + documents: [ + "Berlin is the capital of Germany.", + "Paris is the capital of France.", + ], + top_n: 2, + }); + }); + + test("rerank throws and avoids fetch when rerank API key is missing", async () => { + process.env.QMD_API_KEY = ""; + process.env.OPENAI_API_KEY = ""; + process.env.QMD_API_RERANK_KEY = ""; + process.env.COHERE_API_KEY = ""; + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "", + rerankApiKey: "", + rerankModel: "rerank-v3.5", + }); + + await expect( + llm.rerank("q", [{ file: "doc.md", text: "t" }]) + ).rejects.toThrow("missing API key"); + expect(fetchMock).not.toHaveBeenCalled(); + }); }); diff --git a/test/api.live.test.ts b/test/api.live.test.ts index 2ae89d79..a28b3483 100644 --- a/test/api.live.test.ts +++ b/test/api.live.test.ts @@ -4,8 +4,8 @@ import { ApiLLM } from "../src/api.js"; describe.skipIf(!process.env.OPENAI_API_KEY)("ApiLLM Embeddings (live)", () => { test("OpenAI /v1/embeddings returns a non-empty vector", async () => { const llm = new ApiLLM({ - baseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", - apiKey: process.env.OPENAI_API_KEY, + embedBaseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + embedApiKey: process.env.OPENAI_API_KEY, embedModel: process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", }); @@ -17,3 +17,24 @@ describe.skipIf(!process.env.OPENAI_API_KEY)("ApiLLM Embeddings (live)", () => { }, 30000); }); +describe.skipIf(!process.env.COHERE_API_KEY)("ApiLLM Rerank (live)", () => { + test("Cohere /v1/rerank returns ranked documents with finite scores", async () => { + const llm = new ApiLLM({ + rerankBaseUrl: process.env.COHERE_BASE_URL || "https://api.cohere.com/v1", + rerankApiKey: process.env.COHERE_API_KEY, + rerankModel: process.env.COHERE_RERANK_MODEL || "rerank-v3.5", + }); + + const docs = [ + { file: "france.md", text: "Paris is the capital city of France." }, + { file: "pets.md", text: "Cats and dogs are common household pets." }, + { file: "germany.md", text: "Berlin is the capital city of Germany." 
}, + ]; + + const result = await llm.rerank("What is the capital of France?", docs); + expect(result.results.length).toBe(3); + expect(result.results[0]!.file).toBe("france.md"); + expect(Number.isFinite(result.results[0]!.score)).toBe(true); + expect(result.results[0]!.score).toBeGreaterThanOrEqual(result.results[1]!.score); + }, 30000); +}); From ff184573b5203de9422e6c3eb991cea9eacab47c Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 18:20:56 +0000 Subject: [PATCH 04/20] feat: add API query expansion with strict-json toggle and normalize env vars to QMD_{EMBED|CHAT|RERANK}_* --- src/api.ts | 255 ++++++++++++++++++++++++++++++++++++-- test/api.contract.test.ts | 114 +++++++++++++++-- test/api.live.test.ts | 54 ++++++-- 3 files changed, 396 insertions(+), 27 deletions(-) diff --git a/src/api.ts b/src/api.ts index c335d654..d8d489ab 100644 --- a/src/api.ts +++ b/src/api.ts @@ -1,8 +1,9 @@ /** * api.ts - API-backed LLM implementation (incremental rollout) * - * Current phase: embeddings (/v1/embeddings) and rerank (/v1/rerank). - * Query expansion/generation can delegate to a fallback backend. + * Current phase: embeddings (/v1/embeddings), query expansion (/v1/chat/completions), + * and rerank (/v1/rerank). + * Text generation can delegate to a fallback backend. */ import type { @@ -12,6 +13,7 @@ import type { GenerateOptions, GenerateResult, ModelInfo, + QueryType, Queryable, RerankDocument, RerankOptions, @@ -20,6 +22,8 @@ import type { const DEFAULT_EMBED_BASE_URL = "https://api.openai.com/v1"; const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; +const DEFAULT_CHAT_BASE_URL = "https://api.openai.com/v1"; +const DEFAULT_CHAT_MODEL = "gpt-4o-mini"; const DEFAULT_RERANK_BASE_URL = "https://api.cohere.com/v1"; const DEFAULT_RERANK_MODEL = "rerank-v3.5"; @@ -31,10 +35,22 @@ type CohereRerankResponse = { results?: Array<{ index?: number; relevance_score?: number }>; }; +type OpenAIChatResponse = { + choices?: Array<{ + message?: { + content?: string | Array<{ type?: string; text?: string }>; + }; + }>; +}; + export type ApiLLMConfig = { embedBaseUrl?: string; embedApiKey?: string; embedModel?: string; + chatBaseUrl?: string; + chatApiKey?: string; + chatModel?: string; + strictJsonOutput?: boolean; rerankBaseUrl?: string; rerankApiKey?: string; rerankModel?: string; @@ -43,12 +59,16 @@ export type ApiLLMConfig = { /** * API-backed LLM implementation. - * Embeddings/reranking are remote; query expansion/generation can fallback. + * Embeddings/query-expansion/reranking are remote; text generation can fallback. 
*/ export class ApiLLM implements LLM { private readonly embedBaseUrl: string; private readonly embedApiKey: string; private readonly embedModel: string; + private readonly chatBaseUrl: string; + private readonly chatApiKey: string; + private readonly chatModel: string; + private readonly strictJsonOutput: boolean; private readonly rerankBaseUrl: string; private readonly rerankApiKey: string; private readonly rerankModel: string; @@ -57,25 +77,78 @@ export class ApiLLM implements LLM { constructor(config: ApiLLMConfig = {}) { const normalizedEmbedBaseUrl = ( config.embedBaseUrl - || process.env.QMD_API_BASE_URL + || process.env.QMD_EMBED_BASE_URL + || process.env.QMD_API_BASE_URL // Legacy alias || process.env.OPENAI_BASE_URL || DEFAULT_EMBED_BASE_URL ).replace(/\/+$/, ""); this.embedBaseUrl = normalizedEmbedBaseUrl; - this.embedApiKey = config.embedApiKey || process.env.QMD_API_KEY || process.env.OPENAI_API_KEY || ""; - this.embedModel = config.embedModel || process.env.QMD_API_EMBED_MODEL || process.env.OPENAI_EMBED_MODEL || DEFAULT_EMBED_MODEL; + this.embedApiKey = + config.embedApiKey + || process.env.QMD_EMBED_API_KEY + || process.env.QMD_API_KEY // Legacy alias + || process.env.OPENAI_API_KEY + || ""; + this.embedModel = + config.embedModel + || process.env.QMD_EMBED_MODEL + || process.env.QMD_API_EMBED_MODEL // Legacy alias + || process.env.OPENAI_EMBED_MODEL + || DEFAULT_EMBED_MODEL; + this.chatBaseUrl = ( + config.chatBaseUrl + || process.env.QMD_CHAT_BASE_URL + || process.env.QMD_API_CHAT_BASE_URL // Legacy alias + || process.env.OPENAI_BASE_URL + || DEFAULT_CHAT_BASE_URL + ).replace(/\/+$/, ""); + this.chatApiKey = + config.chatApiKey + || process.env.QMD_CHAT_API_KEY + || process.env.QMD_API_CHAT_KEY // Legacy alias + || process.env.OPENAI_API_KEY + || this.embedApiKey; + this.chatModel = + config.chatModel + || process.env.QMD_CHAT_MODEL + || process.env.QMD_API_CHAT_MODEL // Legacy alias + || process.env.OPENAI_CHAT_MODEL + || DEFAULT_CHAT_MODEL; + this.strictJsonOutput = config.strictJsonOutput ?? this.parseBooleanEnv( + process.env.QMD_CHAT_STRICT_JSON_OUTPUT ?? process.env.QMD_API_STRICT_JSON_OUTPUT, // Legacy alias + false + ); this.rerankBaseUrl = ( config.rerankBaseUrl - || process.env.QMD_API_RERANK_BASE_URL + || process.env.QMD_RERANK_BASE_URL + || process.env.QMD_API_RERANK_BASE_URL // Legacy alias || process.env.COHERE_BASE_URL || (process.env.COHERE_API_KEY ? 
DEFAULT_RERANK_BASE_URL : normalizedEmbedBaseUrl) ).replace(/\/+$/, ""); - this.rerankApiKey = config.rerankApiKey || process.env.QMD_API_RERANK_KEY || process.env.COHERE_API_KEY || this.embedApiKey; - this.rerankModel = config.rerankModel || process.env.QMD_API_RERANK_MODEL || process.env.COHERE_RERANK_MODEL || DEFAULT_RERANK_MODEL; + this.rerankApiKey = + config.rerankApiKey + || process.env.QMD_RERANK_API_KEY + || process.env.QMD_API_RERANK_KEY // Legacy alias + || process.env.COHERE_API_KEY + || this.embedApiKey; + this.rerankModel = + config.rerankModel + || process.env.QMD_RERANK_MODEL + || process.env.QMD_API_RERANK_MODEL // Legacy alias + || process.env.COHERE_RERANK_MODEL + || DEFAULT_RERANK_MODEL; this.fallbackLLM = config.fallbackLLM; } + private parseBooleanEnv(value: string | undefined, fallback: boolean): boolean { + if (value === undefined) return fallback; + const normalized = value.trim().toLowerCase(); + if (["1", "true", "yes", "on"].includes(normalized)) return true; + if (["0", "false", "no", "off"].includes(normalized)) return false; + return fallback; + } + private getHeaders(apiKey: string): Record { return { "Content-Type": "application/json", @@ -106,9 +179,124 @@ export class ApiLLM implements LLM { return this.isLikelyLocalModel(modelOverride) ? configuredModel : modelOverride; } + private extractChatContent(response: OpenAIChatResponse): string { + const content = response.choices?.[0]?.message?.content; + if (typeof content === "string") return content; + if (Array.isArray(content)) { + return content + .filter(part => part.type === "text" && typeof part.text === "string") + .map(part => part.text as string) + .join("\n"); + } + return ""; + } + + private parseExpandedQueries(content: string, strictJson: boolean): Queryable[] { + const trimmed = content.trim(); + if (!trimmed) { + throw new Error("ApiLLM expandQuery error: empty model output"); + } + + // Try strict JSON shape first: [{ type, text }, ...] or { queries: [...] } + try { + const parsed = JSON.parse(trimmed) as unknown; + const asArray = + Array.isArray(parsed) ? parsed : ( + typeof parsed === "object" + && parsed !== null + && Array.isArray((parsed as { queries?: unknown }).queries) + ? (parsed as { queries: unknown[] }).queries + : null + ); + if (asArray) { + const queries = asArray + .map(item => { + if (typeof item !== "object" || item === null) return null; + const type = (item as { type?: unknown }).type; + const text = (item as { text?: unknown }).text; + if ( + (type === "lex" || type === "vec" || type === "hyde") + && typeof text === "string" + && text.trim().length > 0 + ) { + return { type: type as QueryType, text: text.trim() }; + } + return null; + }) + .filter((q): q is Queryable => q !== null); + if (queries.length > 0) return queries; + } + } catch { + if (strictJson) { + throw new Error("ApiLLM expandQuery error: strict JSON output is enabled, but response was not valid JSON"); + } + } + if (strictJson) { + throw new Error("ApiLLM expandQuery error: strict JSON output is enabled, but response shape was invalid"); + } + + // Line format: "lex: ...", "vec: ...", "hyde: ..." 
+ const fromLines = trimmed + .split("\n") + .map(line => line.trim()) + .filter(Boolean) + .map(line => { + const match = line.match(/^(lex|vec|hyde)\s*:\s*(.+)$/i); + if (!match) return null; + const type = match[1]!.toLowerCase() as QueryType; + const text = match[2]!.trim(); + if (!text) return null; + return { type, text }; + }) + .filter((q): q is Queryable => q !== null); + + if (fromLines.length > 0) return fromLines; + throw new Error("ApiLLM expandQuery error: could not parse query expansions"); + } + + private async requestChatCompletions( + messages: Array<{ role: "system" | "user"; content: string }>, + options?: { model?: string; strictJson?: boolean } + ): Promise { + if (!this.chatApiKey) { + throw new Error("ApiLLM chat error: missing API key (set QMD_CHAT_API_KEY or OPENAI_API_KEY)"); + } + const model = options?.model || this.chatModel; + const strictJson = options?.strictJson ?? this.strictJsonOutput; + + let response: OpenAIChatResponse; + try { + const payload: Record = { + model, + messages, + temperature: 0.2, + }; + + const resp = await fetch(`${this.chatBaseUrl}/chat/completions`, { + method: "POST", + headers: this.getHeaders(this.chatApiKey), + body: JSON.stringify(payload), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + throw new Error(`ApiLLM chat error: ${resp.status} ${resp.statusText} ${body}`.trim()); + } + response = await resp.json() as OpenAIChatResponse; + } catch (error) { + const detail = error instanceof Error ? error.message : String(error); + throw new Error(`ApiLLM chat request failed: ${detail}`); + } + + const content = this.extractChatContent(response); + if (!content.trim()) { + throw new Error("ApiLLM chat error: empty response content"); + } + return content; + } + private async requestEmbeddings(texts: string[], modelOverride?: string): Promise { if (!this.embedApiKey) { - console.error("ApiLLM embedding error: missing API key (set QMD_API_KEY or OPENAI_API_KEY)"); + console.error("ApiLLM embedding error: missing API key (set QMD_EMBED_API_KEY or OPENAI_API_KEY)"); return null; } @@ -178,12 +366,55 @@ export class ApiLLM implements LLM { } async expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise { - return this.getFallback("expandQuery").expandQuery(query, options); + const includeLexical = options?.includeLexical ?? true; + const strictJson = this.strictJsonOutput; + const formatInstruction = strictJson + ? "Return ONLY valid JSON as an array of objects: [{\"type\":\"lex|vec|hyde\",\"text\":\"...\"}, ...]. No markdown." + : "Return one query per line in format: type: text, where type is lex, vec, or hyde."; + const lexicalInstruction = includeLexical + ? "Include at least one lex query." + : "Do not include any lex queries."; + + const systemPrompt = [ + "You expand search queries for hybrid retrieval.", + "Produce useful variations for lexical and semantic search.", + formatInstruction, + ].join(" "); + + const userPrompt = [ + `Original query: ${query}`, + options?.context ? `Context: ${options.context}` : "", + lexicalInstruction, + "Return 2-4 total items. Keep each text concise and relevant.", + "Allowed types: lex, vec, hyde.", + ].filter(Boolean).join("\n"); + + const content = await this.requestChatCompletions( + [ + { role: "system", content: systemPrompt }, + { role: "user", content: userPrompt }, + ], + { model: this.chatModel, strictJson } + ); + + const parsed = this.parseExpandedQueries(content, strictJson); + const filteredByLex = includeLexical ? 
parsed : parsed.filter(q => q.type !== "lex"); + const deduped = Array.from(new Map( + filteredByLex + .map(q => ({ ...q, text: q.text.trim() })) + .filter(q => q.text.length > 0) + .map(q => [`${q.type}|${q.text.toLowerCase()}`, q] as const) + ).values()); + + if (deduped.length === 0) { + throw new Error("ApiLLM expandQuery error: no valid expansions produced"); + } + return deduped; } async rerank(query: string, documents: RerankDocument[], options: RerankOptions = {}): Promise { if (!this.rerankApiKey) { - throw new Error("ApiLLM rerank error: missing API key (set QMD_API_RERANK_KEY or COHERE_API_KEY)"); + throw new Error("ApiLLM rerank error: missing API key (set QMD_RERANK_API_KEY or COHERE_API_KEY)"); } if (documents.length === 0) { return { results: [], model: this.resolveModel(options.model, this.rerankModel) }; diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index 087164ee..28d1331b 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -4,9 +4,13 @@ import { ApiLLM } from "../src/api.js"; describe("ApiLLM (contract)", () => { const fetchMock = vi.fn(); const originalFetch = globalThis.fetch; - const originalQmdApiKey = process.env.QMD_API_KEY; + const originalQmdEmbedApiKey = process.env.QMD_EMBED_API_KEY; const originalOpenAiApiKey = process.env.OPENAI_API_KEY; - const originalQmdApiRerankKey = process.env.QMD_API_RERANK_KEY; + const originalQmdChatApiKey = process.env.QMD_CHAT_API_KEY; + const originalQmdChatStrictJsonOutput = process.env.QMD_CHAT_STRICT_JSON_OUTPUT; + const originalOpenAiChatModel = process.env.OPENAI_CHAT_MODEL; + const originalQmdChatModel = process.env.QMD_CHAT_MODEL; + const originalQmdRerankApiKey = process.env.QMD_RERANK_API_KEY; const originalCohereApiKey = process.env.COHERE_API_KEY; beforeEach(() => { @@ -16,9 +20,13 @@ describe("ApiLLM (contract)", () => { afterEach(() => { (globalThis as { fetch: typeof fetch }).fetch = originalFetch; - process.env.QMD_API_KEY = originalQmdApiKey; + process.env.QMD_EMBED_API_KEY = originalQmdEmbedApiKey; process.env.OPENAI_API_KEY = originalOpenAiApiKey; - process.env.QMD_API_RERANK_KEY = originalQmdApiRerankKey; + process.env.QMD_CHAT_API_KEY = originalQmdChatApiKey; + process.env.QMD_CHAT_STRICT_JSON_OUTPUT = originalQmdChatStrictJsonOutput; + process.env.OPENAI_CHAT_MODEL = originalOpenAiChatModel; + process.env.QMD_CHAT_MODEL = originalQmdChatModel; + process.env.QMD_RERANK_API_KEY = originalQmdRerankApiKey; process.env.COHERE_API_KEY = originalCohereApiKey; }); @@ -86,7 +94,7 @@ describe("ApiLLM (contract)", () => { }); test("embed returns null and avoids fetch when API key is missing", async () => { - process.env.QMD_API_KEY = ""; + process.env.QMD_EMBED_API_KEY = ""; process.env.OPENAI_API_KEY = ""; const consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); @@ -158,9 +166,9 @@ describe("ApiLLM (contract)", () => { }); test("rerank throws and avoids fetch when rerank API key is missing", async () => { - process.env.QMD_API_KEY = ""; + process.env.QMD_EMBED_API_KEY = ""; process.env.OPENAI_API_KEY = ""; - process.env.QMD_API_RERANK_KEY = ""; + process.env.QMD_RERANK_API_KEY = ""; process.env.COHERE_API_KEY = ""; const llm = new ApiLLM({ @@ -175,4 +183,96 @@ describe("ApiLLM (contract)", () => { ).rejects.toThrow("missing API key"); expect(fetchMock).not.toHaveBeenCalled(); }); + + test("expandQuery accepts line format when strict JSON is disabled (default)", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + choices: [{ 
+ message: { + content: "lex: api auth docs\nvec: api authentication guide\nhyde: A guide to API authentication setup", + }, + }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + const result = await llm.expandQuery("api auth docs"); + expect(result).toEqual([ + { type: "lex", text: "api auth docs" }, + { type: "vec", text: "api authentication guide" }, + { type: "hyde", text: "A guide to API authentication setup" }, + ]); + + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://chat.example.test/v1/chat/completions"); + expect(init?.method).toBe("POST"); + expect(init?.headers).toEqual({ + "Content-Type": "application/json", + "Authorization": "Bearer chat-key", + }); + }); + + test("expandQuery uses strict JSON mode from env and parses JSON output", async () => { + process.env.QMD_CHAT_STRICT_JSON_OUTPUT = "true"; + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + choices: [{ + message: { + content: JSON.stringify([ + { type: "lex", text: "api auth docs" }, + { type: "vec", text: "api authentication guide" }, + ]), + }, + }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + const result = await llm.expandQuery("api auth docs", { includeLexical: false }); + expect(result).toEqual([ + { type: "vec", text: "api authentication guide" }, + ]); + }); + + test("expandQuery rejects line output when strict JSON mode is enabled", async () => { + process.env.QMD_CHAT_STRICT_JSON_OUTPUT = "true"; + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + choices: [{ + message: { + content: "vec: api authentication guide", + }, + }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + await expect( + llm.expandQuery("api auth docs") + ).rejects.toThrow("strict JSON output is enabled"); + }); }); diff --git a/test/api.live.test.ts b/test/api.live.test.ts index a28b3483..3800fe5a 100644 --- a/test/api.live.test.ts +++ b/test/api.live.test.ts @@ -1,12 +1,12 @@ import { describe, expect, test } from "vitest"; import { ApiLLM } from "../src/api.js"; -describe.skipIf(!process.env.OPENAI_API_KEY)("ApiLLM Embeddings (live)", () => { +describe.skipIf(!(process.env.QMD_EMBED_API_KEY || process.env.OPENAI_API_KEY))("ApiLLM Embeddings (live)", () => { test("OpenAI /v1/embeddings returns a non-empty vector", async () => { const llm = new ApiLLM({ - embedBaseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", - embedApiKey: process.env.OPENAI_API_KEY, - embedModel: process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", + embedBaseUrl: process.env.QMD_EMBED_BASE_URL || process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + embedApiKey: process.env.QMD_EMBED_API_KEY || process.env.OPENAI_API_KEY, + embedModel: process.env.QMD_EMBED_MODEL || process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", }); const result = await llm.embed("QMD embedding live test"); @@ -17,12 +17,50 @@ describe.skipIf(!process.env.OPENAI_API_KEY)("ApiLLM Embeddings (live)", () => { }, 30000); }); -describe.skipIf(!process.env.COHERE_API_KEY)("ApiLLM Rerank 
(live)", () => { +describe.skipIf(!(process.env.QMD_CHAT_API_KEY || process.env.OPENAI_API_KEY))("ApiLLM Query Expansion (live)", () => { + const chatBaseUrl = process.env.QMD_CHAT_BASE_URL || process.env.OPENAI_BASE_URL || "https://api.openai.com/v1"; + const chatApiKey = process.env.QMD_CHAT_API_KEY || process.env.OPENAI_API_KEY; + const chatModel = process.env.QMD_CHAT_MODEL || process.env.OPENAI_CHAT_MODEL || "gpt-4o-mini"; + + test("chat completions expands query with non-strict output mode", async () => { + const llm = new ApiLLM({ + chatBaseUrl, + chatApiKey, + chatModel, + strictJsonOutput: false, + }); + + const result = await llm.expandQuery("how to authenticate API requests"); + expect(result.length).toBeGreaterThanOrEqual(1); + for (const item of result) { + expect(["lex", "vec", "hyde"]).toContain(item.type); + expect(item.text.length).toBeGreaterThan(0); + } + }, 30000); + + test("chat completions expands query with strict JSON output mode", async () => { + const llm = new ApiLLM({ + chatBaseUrl, + chatApiKey, + chatModel, + strictJsonOutput: true, + }); + + const result = await llm.expandQuery("how to authenticate API requests", { includeLexical: false }); + expect(result.length).toBeGreaterThanOrEqual(1); + for (const item of result) { + expect(["vec", "hyde"]).toContain(item.type); + expect(item.text.length).toBeGreaterThan(0); + } + }, 30000); +}); + +describe.skipIf(!(process.env.QMD_RERANK_API_KEY || process.env.COHERE_API_KEY))("ApiLLM Rerank (live)", () => { test("Cohere /v1/rerank returns ranked documents with finite scores", async () => { const llm = new ApiLLM({ - rerankBaseUrl: process.env.COHERE_BASE_URL || "https://api.cohere.com/v1", - rerankApiKey: process.env.COHERE_API_KEY, - rerankModel: process.env.COHERE_RERANK_MODEL || "rerank-v3.5", + rerankBaseUrl: process.env.QMD_RERANK_BASE_URL || process.env.COHERE_BASE_URL || "https://api.cohere.com/v1", + rerankApiKey: process.env.QMD_RERANK_API_KEY || process.env.COHERE_API_KEY, + rerankModel: process.env.QMD_RERANK_MODEL || process.env.COHERE_RERANK_MODEL || "rerank-v3.5", }); const docs = [ From 04fadb8c56abe0c88a01515f9ef23836db7d29e6 Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 18:32:04 +0000 Subject: [PATCH 05/20] chore: enforce QMD_ runtime envs and provider-only envs for live API tests --- src/api.ts | 22 ++-------------------- test/api.contract.test.ts | 9 --------- test/api.live.test.ts | 24 ++++++++++++------------ 3 files changed, 14 insertions(+), 41 deletions(-) diff --git a/src/api.ts b/src/api.ts index d8d489ab..8ddb7e65 100644 --- a/src/api.ts +++ b/src/api.ts @@ -78,8 +78,6 @@ export class ApiLLM implements LLM { const normalizedEmbedBaseUrl = ( config.embedBaseUrl || process.env.QMD_EMBED_BASE_URL - || process.env.QMD_API_BASE_URL // Legacy alias - || process.env.OPENAI_BASE_URL || DEFAULT_EMBED_BASE_URL ).replace(/\/+$/, ""); this.embedBaseUrl = normalizedEmbedBaseUrl; @@ -87,56 +85,40 @@ export class ApiLLM implements LLM { this.embedApiKey = config.embedApiKey || process.env.QMD_EMBED_API_KEY - || process.env.QMD_API_KEY // Legacy alias - || process.env.OPENAI_API_KEY || ""; this.embedModel = config.embedModel || process.env.QMD_EMBED_MODEL - || process.env.QMD_API_EMBED_MODEL // Legacy alias - || process.env.OPENAI_EMBED_MODEL || DEFAULT_EMBED_MODEL; this.chatBaseUrl = ( config.chatBaseUrl || process.env.QMD_CHAT_BASE_URL - || process.env.QMD_API_CHAT_BASE_URL // Legacy alias - || process.env.OPENAI_BASE_URL || DEFAULT_CHAT_BASE_URL ).replace(/\/+$/, ""); 
this.chatApiKey = config.chatApiKey || process.env.QMD_CHAT_API_KEY - || process.env.QMD_API_CHAT_KEY // Legacy alias - || process.env.OPENAI_API_KEY || this.embedApiKey; this.chatModel = config.chatModel || process.env.QMD_CHAT_MODEL - || process.env.QMD_API_CHAT_MODEL // Legacy alias - || process.env.OPENAI_CHAT_MODEL || DEFAULT_CHAT_MODEL; this.strictJsonOutput = config.strictJsonOutput ?? this.parseBooleanEnv( - process.env.QMD_CHAT_STRICT_JSON_OUTPUT ?? process.env.QMD_API_STRICT_JSON_OUTPUT, // Legacy alias + process.env.QMD_CHAT_STRICT_JSON_OUTPUT, false ); this.rerankBaseUrl = ( config.rerankBaseUrl || process.env.QMD_RERANK_BASE_URL - || process.env.QMD_API_RERANK_BASE_URL // Legacy alias - || process.env.COHERE_BASE_URL - || (process.env.COHERE_API_KEY ? DEFAULT_RERANK_BASE_URL : normalizedEmbedBaseUrl) + || DEFAULT_RERANK_BASE_URL ).replace(/\/+$/, ""); this.rerankApiKey = config.rerankApiKey || process.env.QMD_RERANK_API_KEY - || process.env.QMD_API_RERANK_KEY // Legacy alias - || process.env.COHERE_API_KEY || this.embedApiKey; this.rerankModel = config.rerankModel || process.env.QMD_RERANK_MODEL - || process.env.QMD_API_RERANK_MODEL // Legacy alias - || process.env.COHERE_RERANK_MODEL || DEFAULT_RERANK_MODEL; this.fallbackLLM = config.fallbackLLM; } diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index 28d1331b..633fa250 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -5,13 +5,10 @@ describe("ApiLLM (contract)", () => { const fetchMock = vi.fn(); const originalFetch = globalThis.fetch; const originalQmdEmbedApiKey = process.env.QMD_EMBED_API_KEY; - const originalOpenAiApiKey = process.env.OPENAI_API_KEY; const originalQmdChatApiKey = process.env.QMD_CHAT_API_KEY; const originalQmdChatStrictJsonOutput = process.env.QMD_CHAT_STRICT_JSON_OUTPUT; - const originalOpenAiChatModel = process.env.OPENAI_CHAT_MODEL; const originalQmdChatModel = process.env.QMD_CHAT_MODEL; const originalQmdRerankApiKey = process.env.QMD_RERANK_API_KEY; - const originalCohereApiKey = process.env.COHERE_API_KEY; beforeEach(() => { fetchMock.mockReset(); @@ -21,13 +18,10 @@ describe("ApiLLM (contract)", () => { afterEach(() => { (globalThis as { fetch: typeof fetch }).fetch = originalFetch; process.env.QMD_EMBED_API_KEY = originalQmdEmbedApiKey; - process.env.OPENAI_API_KEY = originalOpenAiApiKey; process.env.QMD_CHAT_API_KEY = originalQmdChatApiKey; process.env.QMD_CHAT_STRICT_JSON_OUTPUT = originalQmdChatStrictJsonOutput; - process.env.OPENAI_CHAT_MODEL = originalOpenAiChatModel; process.env.QMD_CHAT_MODEL = originalQmdChatModel; process.env.QMD_RERANK_API_KEY = originalQmdRerankApiKey; - process.env.COHERE_API_KEY = originalCohereApiKey; }); test("embed sends OpenAI-compatible /embeddings request, normalizes model, and parses response", async () => { @@ -95,7 +89,6 @@ describe("ApiLLM (contract)", () => { test("embed returns null and avoids fetch when API key is missing", async () => { process.env.QMD_EMBED_API_KEY = ""; - process.env.OPENAI_API_KEY = ""; const consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); const llm = new ApiLLM({ @@ -167,9 +160,7 @@ describe("ApiLLM (contract)", () => { test("rerank throws and avoids fetch when rerank API key is missing", async () => { process.env.QMD_EMBED_API_KEY = ""; - process.env.OPENAI_API_KEY = ""; process.env.QMD_RERANK_API_KEY = ""; - process.env.COHERE_API_KEY = ""; const llm = new ApiLLM({ embedBaseUrl: "https://example.test/v1", diff --git a/test/api.live.test.ts 
b/test/api.live.test.ts index 3800fe5a..f647b564 100644 --- a/test/api.live.test.ts +++ b/test/api.live.test.ts @@ -1,12 +1,12 @@ import { describe, expect, test } from "vitest"; import { ApiLLM } from "../src/api.js"; -describe.skipIf(!(process.env.QMD_EMBED_API_KEY || process.env.OPENAI_API_KEY))("ApiLLM Embeddings (live)", () => { +describe.skipIf(!process.env.OPENAI_API_KEY)("ApiLLM Embeddings (live)", () => { test("OpenAI /v1/embeddings returns a non-empty vector", async () => { const llm = new ApiLLM({ - embedBaseUrl: process.env.QMD_EMBED_BASE_URL || process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", - embedApiKey: process.env.QMD_EMBED_API_KEY || process.env.OPENAI_API_KEY, - embedModel: process.env.QMD_EMBED_MODEL || process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", + embedBaseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + embedApiKey: process.env.OPENAI_API_KEY, + embedModel: process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", }); const result = await llm.embed("QMD embedding live test"); @@ -17,10 +17,10 @@ describe.skipIf(!(process.env.QMD_EMBED_API_KEY || process.env.OPENAI_API_KEY))( }, 30000); }); -describe.skipIf(!(process.env.QMD_CHAT_API_KEY || process.env.OPENAI_API_KEY))("ApiLLM Query Expansion (live)", () => { - const chatBaseUrl = process.env.QMD_CHAT_BASE_URL || process.env.OPENAI_BASE_URL || "https://api.openai.com/v1"; - const chatApiKey = process.env.QMD_CHAT_API_KEY || process.env.OPENAI_API_KEY; - const chatModel = process.env.QMD_CHAT_MODEL || process.env.OPENAI_CHAT_MODEL || "gpt-4o-mini"; +describe.skipIf(!process.env.OPENAI_API_KEY)("ApiLLM Query Expansion (live)", () => { + const chatBaseUrl = process.env.OPENAI_BASE_URL || "https://api.openai.com/v1"; + const chatApiKey = process.env.OPENAI_API_KEY; + const chatModel = process.env.OPENAI_CHAT_MODEL || "gpt-4o-mini"; test("chat completions expands query with non-strict output mode", async () => { const llm = new ApiLLM({ @@ -55,12 +55,12 @@ describe.skipIf(!(process.env.QMD_CHAT_API_KEY || process.env.OPENAI_API_KEY))(" }, 30000); }); -describe.skipIf(!(process.env.QMD_RERANK_API_KEY || process.env.COHERE_API_KEY))("ApiLLM Rerank (live)", () => { +describe.skipIf(!process.env.COHERE_API_KEY)("ApiLLM Rerank (live)", () => { test("Cohere /v1/rerank returns ranked documents with finite scores", async () => { const llm = new ApiLLM({ - rerankBaseUrl: process.env.QMD_RERANK_BASE_URL || process.env.COHERE_BASE_URL || "https://api.cohere.com/v1", - rerankApiKey: process.env.QMD_RERANK_API_KEY || process.env.COHERE_API_KEY, - rerankModel: process.env.QMD_RERANK_MODEL || process.env.COHERE_RERANK_MODEL || "rerank-v3.5", + rerankBaseUrl: process.env.COHERE_BASE_URL || "https://api.cohere.com/v1", + rerankApiKey: process.env.COHERE_API_KEY, + rerankModel: process.env.COHERE_RERANK_MODEL || "rerank-v3.5", }); const docs = [ From 8fab053ab720a290c4e75e95efa69154949e4ad8 Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 18:58:15 +0000 Subject: [PATCH 06/20] test: loop live embedding/chat tests across OpenAI and Cohere compatibility; keep rerank Cohere-only --- src/api.ts | 82 +++++++++++++++++++++---------- test/api.live.test.ts | 109 ++++++++++++++++++++++++------------------ 2 files changed, 120 insertions(+), 71 deletions(-) diff --git a/src/api.ts b/src/api.ts index 8ddb7e65..29872466 100644 --- a/src/api.ts +++ b/src/api.ts @@ -179,9 +179,8 @@ export class ApiLLM implements LLM { throw new Error("ApiLLM expandQuery error: empty model output"); } - 
// Try strict JSON shape first: [{ type, text }, ...] or { queries: [...] } - try { - const parsed = JSON.parse(trimmed) as unknown; + const parseQueryArray = (raw: string): Queryable[] | null => { + const parsed = JSON.parse(raw) as unknown; const asArray = Array.isArray(parsed) ? parsed : ( typeof parsed === "object" @@ -190,31 +189,64 @@ export class ApiLLM implements LLM { ? (parsed as { queries: unknown[] }).queries : null ); - if (asArray) { - const queries = asArray - .map(item => { - if (typeof item !== "object" || item === null) return null; - const type = (item as { type?: unknown }).type; - const text = (item as { text?: unknown }).text; - if ( - (type === "lex" || type === "vec" || type === "hyde") - && typeof text === "string" - && text.trim().length > 0 - ) { - return { type: type as QueryType, text: text.trim() }; - } - return null; - }) - .filter((q): q is Queryable => q !== null); - if (queries.length > 0) return queries; + if (!asArray) return null; + + const queries = asArray + .map(item => { + if (typeof item !== "object" || item === null) return null; + const type = (item as { type?: unknown }).type; + const text = (item as { text?: unknown }).text; + if ( + (type === "lex" || type === "vec" || type === "hyde") + && typeof text === "string" + && text.trim().length > 0 + ) { + return { type: type as QueryType, text: text.trim() }; + } + return null; + }) + .filter((q): q is Queryable => q !== null); + return queries.length > 0 ? queries : null; + }; + + const parseJsonWithWrappers = (raw: string): Queryable[] | null => { + const candidates: string[] = [raw]; + + const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/i); + if (fenceMatch?.[1]) { + candidates.push(fenceMatch[1].trim()); + } + + const firstArray = raw.indexOf("["); + const lastArray = raw.lastIndexOf("]"); + if (firstArray !== -1 && lastArray > firstArray) { + candidates.push(raw.slice(firstArray, lastArray + 1)); } - } catch { - if (strictJson) { - throw new Error("ApiLLM expandQuery error: strict JSON output is enabled, but response was not valid JSON"); + + const firstObject = raw.indexOf("{"); + const lastObject = raw.lastIndexOf("}"); + if (firstObject !== -1 && lastObject > firstObject) { + candidates.push(raw.slice(firstObject, lastObject + 1)); + } + + for (const candidate of candidates) { + try { + const parsed = parseQueryArray(candidate); + if (parsed) return parsed; + } catch { + // Try next candidate + } } + return null; + }; + + // Try strict JSON shape first: [{ type, text }, ...] or { queries: [...] } + const parsedFromJson = parseJsonWithWrappers(trimmed); + if (parsedFromJson) { + return parsedFromJson; } if (strictJson) { - throw new Error("ApiLLM expandQuery error: strict JSON output is enabled, but response shape was invalid"); + throw new Error("ApiLLM expandQuery error: strict JSON output is enabled, but response did not contain valid JSON queries"); } // Line format: "lex: ...", "vec: ...", "hyde: ..." 
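// In addition to the plain line format noted above, the parseJsonWithWrappers helper
// added in this patch is meant to tolerate JSON replies wrapped in markdown fences.
// A minimal illustrative sketch of that path follows; the reply text is a hypothetical
// example, not captured provider output.
const fencedReply = '```json\n[{"type":"vec","text":"api authentication guide"}]\n```';
const fenceInner = fencedReply.match(/```(?:json)?\s*([\s\S]*?)\s*```/i)?.[1]?.trim() ?? fencedReply;
const fencedQueries = JSON.parse(fenceInner) as Array<{ type: string; text: string }>;
// fencedQueries → [{ type: "vec", text: "api authentication guide" }]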
@@ -223,7 +255,7 @@ export class ApiLLM implements LLM { .map(line => line.trim()) .filter(Boolean) .map(line => { - const match = line.match(/^(lex|vec|hyde)\s*:\s*(.+)$/i); + const match = line.match(/^(?:[-*•\d\.\)\s]*)?(lex|vec|hyde)\s*:\s*(.+)$/i); if (!match) return null; const type = match[1]!.toLowerCase() as QueryType; const text = match[2]!.trim(); diff --git a/test/api.live.test.ts b/test/api.live.test.ts index f647b564..c17605e0 100644 --- a/test/api.live.test.ts +++ b/test/api.live.test.ts @@ -1,58 +1,75 @@ import { describe, expect, test } from "vitest"; import { ApiLLM } from "../src/api.js"; -describe.skipIf(!process.env.OPENAI_API_KEY)("ApiLLM Embeddings (live)", () => { - test("OpenAI /v1/embeddings returns a non-empty vector", async () => { - const llm = new ApiLLM({ - embedBaseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", - embedApiKey: process.env.OPENAI_API_KEY, - embedModel: process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", - }); +const compatibilityProviders = [ + { + name: "OpenAI", + key: process.env.OPENAI_API_KEY || "", + baseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + embedModel: process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", + chatModel: process.env.OPENAI_CHAT_MODEL || "gpt-4o-mini", + }, + { + name: "Cohere", + key: process.env.COHERE_API_KEY || "", + baseUrl: process.env.COHERE_COMPAT_BASE_URL || "https://api.cohere.ai/compatibility/v1", + embedModel: process.env.COHERE_EMBED_MODEL || "embed-v4.0", + chatModel: process.env.COHERE_CHAT_MODEL || "command-a-03-2025", + }, +]; - const result = await llm.embed("QMD embedding live test"); - expect(result).not.toBeNull(); - expect(Array.isArray(result?.embedding)).toBe(true); - expect(result!.embedding.length).toBeGreaterThan(10); - expect(Number.isFinite(result!.embedding[0])).toBe(true); - }, 30000); +describe("ApiLLM Embeddings (live)", () => { + for (const provider of compatibilityProviders) { + test.skipIf(!provider.key)(`${provider.name} /v1/embeddings returns a non-empty vector`, async () => { + const llm = new ApiLLM({ + embedBaseUrl: provider.baseUrl, + embedApiKey: provider.key, + embedModel: provider.embedModel, + }); + + const result = await llm.embed(`QMD embedding live test (${provider.name})`); + expect(result).not.toBeNull(); + expect(Array.isArray(result?.embedding)).toBe(true); + expect(result!.embedding.length).toBeGreaterThan(10); + expect(Number.isFinite(result!.embedding[0])).toBe(true); + }, 30000); + } }); -describe.skipIf(!process.env.OPENAI_API_KEY)("ApiLLM Query Expansion (live)", () => { - const chatBaseUrl = process.env.OPENAI_BASE_URL || "https://api.openai.com/v1"; - const chatApiKey = process.env.OPENAI_API_KEY; - const chatModel = process.env.OPENAI_CHAT_MODEL || "gpt-4o-mini"; +describe("ApiLLM Query Expansion (live)", () => { + for (const provider of compatibilityProviders) { + test.skipIf(!provider.key)(`${provider.name} chat completions expands query with non-strict output mode`, async () => { + const llm = new ApiLLM({ + chatBaseUrl: provider.baseUrl, + chatApiKey: provider.key, + chatModel: provider.chatModel, + strictJsonOutput: false, + }); - test("chat completions expands query with non-strict output mode", async () => { - const llm = new ApiLLM({ - chatBaseUrl, - chatApiKey, - chatModel, - strictJsonOutput: false, - }); + const result = await llm.expandQuery("how to authenticate API requests"); + expect(result.length).toBeGreaterThanOrEqual(1); + for (const item of result) { + expect(["lex", "vec", 
"hyde"]).toContain(item.type); + expect(item.text.length).toBeGreaterThan(0); + } + }, 30000); - const result = await llm.expandQuery("how to authenticate API requests"); - expect(result.length).toBeGreaterThanOrEqual(1); - for (const item of result) { - expect(["lex", "vec", "hyde"]).toContain(item.type); - expect(item.text.length).toBeGreaterThan(0); - } - }, 30000); + test.skipIf(!provider.key)(`${provider.name} chat completions expands query with strict JSON output mode`, async () => { + const llm = new ApiLLM({ + chatBaseUrl: provider.baseUrl, + chatApiKey: provider.key, + chatModel: provider.chatModel, + strictJsonOutput: true, + }); - test("chat completions expands query with strict JSON output mode", async () => { - const llm = new ApiLLM({ - chatBaseUrl, - chatApiKey, - chatModel, - strictJsonOutput: true, - }); - - const result = await llm.expandQuery("how to authenticate API requests", { includeLexical: false }); - expect(result.length).toBeGreaterThanOrEqual(1); - for (const item of result) { - expect(["vec", "hyde"]).toContain(item.type); - expect(item.text.length).toBeGreaterThan(0); - } - }, 30000); + const result = await llm.expandQuery("how to authenticate API requests", { includeLexical: false }); + expect(result.length).toBeGreaterThanOrEqual(1); + for (const item of result) { + expect(["vec", "hyde"]).toContain(item.type); + expect(item.text.length).toBeGreaterThan(0); + } + }, 30000); + } }); describe.skipIf(!process.env.COHERE_API_KEY)("ApiLLM Rerank (live)", () => { From bc427217aa468a7dffc97f46b078d3f5f9b756a6 Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 20:04:31 +0000 Subject: [PATCH 07/20] feat(api): add Voyage embeddings/rerank compatibility and expand contract/live provider tests --- src/api.ts | 37 +++++++++++----- test/api.contract.test.ts | 47 ++++++++++++++++++++ test/api.live.test.ts | 92 +++++++++++++++++++++++++++++---------- 3 files changed, 144 insertions(+), 32 deletions(-) diff --git a/src/api.ts b/src/api.ts index 29872466..37018140 100644 --- a/src/api.ts +++ b/src/api.ts @@ -31,8 +31,9 @@ type OpenAIEmbeddingResponse = { data?: Array<{ embedding?: number[] }>; }; -type CohereRerankResponse = { +type RerankResponse = { results?: Array<{ index?: number; relevance_score?: number }>; + data?: Array<{ index?: number; relevance_score?: number }>; }; type OpenAIChatResponse = { @@ -145,6 +146,15 @@ export class ApiLLM implements LLM { return this.fallbackLLM; } + private usesVoyageRerankApi(): boolean { + try { + const hostname = new URL(this.rerankBaseUrl).hostname.toLowerCase(); + return hostname === "api.voyageai.com" || hostname.endsWith(".voyageai.com"); + } catch { + return this.rerankBaseUrl.toLowerCase().includes("voyageai.com"); + } + } + private isLikelyLocalModel(model: string): boolean { const lower = model.toLowerCase(); return ( @@ -273,7 +283,7 @@ export class ApiLLM implements LLM { options?: { model?: string; strictJson?: boolean } ): Promise { if (!this.chatApiKey) { - throw new Error("ApiLLM chat error: missing API key (set QMD_CHAT_API_KEY or OPENAI_API_KEY)"); + throw new Error("ApiLLM chat error: missing API key (set QMD_CHAT_API_KEY)"); } const model = options?.model || this.chatModel; const strictJson = options?.strictJson ?? 
this.strictJsonOutput; @@ -310,7 +320,7 @@ export class ApiLLM implements LLM { private async requestEmbeddings(texts: string[], modelOverride?: string): Promise { if (!this.embedApiKey) { - console.error("ApiLLM embedding error: missing API key (set QMD_EMBED_API_KEY or OPENAI_API_KEY)"); + console.error("ApiLLM embedding error: missing API key (set QMD_EMBED_API_KEY)"); return null; } @@ -428,7 +438,7 @@ export class ApiLLM implements LLM { async rerank(query: string, documents: RerankDocument[], options: RerankOptions = {}): Promise { if (!this.rerankApiKey) { - throw new Error("ApiLLM rerank error: missing API key (set QMD_RERANK_API_KEY or COHERE_API_KEY)"); + throw new Error("ApiLLM rerank error: missing API key (set QMD_RERANK_API_KEY)"); } if (documents.length === 0) { return { results: [], model: this.resolveModel(options.model, this.rerankModel) }; @@ -436,7 +446,8 @@ export class ApiLLM implements LLM { const model = this.resolveModel(options.model, this.rerankModel); - let response: CohereRerankResponse; + let response: RerankResponse; + const topCountField = this.usesVoyageRerankApi() ? "top_k" : "top_n"; try { const resp = await fetch(`${this.rerankBaseUrl}/rerank`, { method: "POST", @@ -445,25 +456,31 @@ export class ApiLLM implements LLM { model, query, documents: documents.map((doc) => doc.text), - top_n: documents.length, + [topCountField]: documents.length, }), }); if (!resp.ok) { const body = await resp.text().catch(() => ""); throw new Error(`ApiLLM rerank error: ${resp.status} ${resp.statusText} ${body}`.trim()); } - response = await resp.json() as CohereRerankResponse; + response = await resp.json() as RerankResponse; } catch (error) { const detail = error instanceof Error ? error.message : String(error); throw new Error(`ApiLLM rerank request failed: ${detail}`); } - if (!Array.isArray(response.results)) { - throw new Error("ApiLLM rerank error: invalid response (missing results array)"); + const responseResults = Array.isArray(response.results) + ? response.results + : Array.isArray(response.data) + ? response.data + : null; + + if (!Array.isArray(responseResults)) { + throw new Error("ApiLLM rerank error: invalid response (missing results/data array)"); } const scoreByIndex = new Map(); - for (const item of response.results) { + for (const item of responseResults) { if (typeof item.index !== "number" || typeof item.relevance_score !== "number") continue; scoreByIndex.set(item.index, item.relevance_score); } diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index 633fa250..90ccfd38 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -158,6 +158,53 @@ describe("ApiLLM (contract)", () => { }); }); + test("rerank sends Voyage-compatible top_k and accepts data response shape", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + data: [ + { index: 0, relevance_score: 0.12 }, + { index: 1, relevance_score: 0.95 }, + ], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + rerankBaseUrl: "https://api.voyageai.com/v1", + rerankApiKey: "voyage-key", + rerankModel: "rerank-2.5-lite", + }); + + const result = await llm.rerank( + "capital of france", + [ + { file: "a.md", text: "Berlin is the capital of Germany." }, + { file: "b.md", text: "Paris is the capital of France." 
}, + ] + ); + + expect(result.model).toBe("rerank-2.5-lite"); + expect(result.results).toEqual([ + { file: "b.md", score: 0.95, index: 1 }, + { file: "a.md", score: 0.12, index: 0 }, + ]); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://api.voyageai.com/v1/rerank"); + expect(JSON.parse(String(init?.body))).toEqual({ + model: "rerank-2.5-lite", + query: "capital of france", + documents: [ + "Berlin is the capital of Germany.", + "Paris is the capital of France.", + ], + top_k: 2, + }); + }); + test("rerank throws and avoids fetch when rerank API key is missing", async () => { process.env.QMD_EMBED_API_KEY = ""; process.env.QMD_RERANK_API_KEY = ""; diff --git a/test/api.live.test.ts b/test/api.live.test.ts index c17605e0..df8b0cbf 100644 --- a/test/api.live.test.ts +++ b/test/api.live.test.ts @@ -1,25 +1,56 @@ import { describe, expect, test } from "vitest"; import { ApiLLM } from "../src/api.js"; -const compatibilityProviders = [ +const embeddingProviders = [ { name: "OpenAI", key: process.env.OPENAI_API_KEY || "", baseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", embedModel: process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", - chatModel: process.env.OPENAI_CHAT_MODEL || "gpt-4o-mini", + }, + { + name: "OpenRouter", + key: process.env.OPENROUTER_API_KEY || "", + baseUrl: process.env.OPENROUTER_BASE_URL || "https://openrouter.ai/api/v1", + embedModel: process.env.OPENROUTER_EMBED_MODEL || "openai/text-embedding-3-small", }, { name: "Cohere", key: process.env.COHERE_API_KEY || "", baseUrl: process.env.COHERE_COMPAT_BASE_URL || "https://api.cohere.ai/compatibility/v1", embedModel: process.env.COHERE_EMBED_MODEL || "embed-v4.0", + }, + { + name: "Voyage", + key: process.env.VOYAGE_API_KEY || "", + baseUrl: process.env.VOYAGE_BASE_URL || "https://api.voyageai.com/v1", + embedModel: process.env.VOYAGE_EMBED_MODEL || "voyage-3.5-lite", + }, +]; + +const chatProviders = [ + { + name: "OpenAI", + key: process.env.OPENAI_API_KEY || "", + baseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + chatModel: process.env.OPENAI_CHAT_MODEL || "gpt-4o-mini", + }, + { + name: "OpenRouter", + key: process.env.OPENROUTER_API_KEY || "", + baseUrl: process.env.OPENROUTER_BASE_URL || "https://openrouter.ai/api/v1", + chatModel: process.env.OPENROUTER_CHAT_MODEL || "openai/gpt-4o-mini", + }, + { + name: "Cohere", + key: process.env.COHERE_API_KEY || "", + baseUrl: process.env.COHERE_COMPAT_BASE_URL || "https://api.cohere.ai/compatibility/v1", chatModel: process.env.COHERE_CHAT_MODEL || "command-a-03-2025", }, ]; describe("ApiLLM Embeddings (live)", () => { - for (const provider of compatibilityProviders) { + for (const provider of embeddingProviders) { test.skipIf(!provider.key)(`${provider.name} /v1/embeddings returns a non-empty vector`, async () => { const llm = new ApiLLM({ embedBaseUrl: provider.baseUrl, @@ -37,7 +68,7 @@ describe("ApiLLM Embeddings (live)", () => { }); describe("ApiLLM Query Expansion (live)", () => { - for (const provider of compatibilityProviders) { + for (const provider of chatProviders) { test.skipIf(!provider.key)(`${provider.name} chat completions expands query with non-strict output mode`, async () => { const llm = new ApiLLM({ chatBaseUrl: provider.baseUrl, @@ -72,24 +103,41 @@ describe("ApiLLM Query Expansion (live)", () => { } }); -describe.skipIf(!process.env.COHERE_API_KEY)("ApiLLM Rerank (live)", () => { - test("Cohere /v1/rerank returns ranked documents with 
finite scores", async () => { - const llm = new ApiLLM({ - rerankBaseUrl: process.env.COHERE_BASE_URL || "https://api.cohere.com/v1", - rerankApiKey: process.env.COHERE_API_KEY, - rerankModel: process.env.COHERE_RERANK_MODEL || "rerank-v3.5", - }); +const rerankProviders = [ + { + name: "Cohere", + key: process.env.COHERE_API_KEY || "", + baseUrl: process.env.COHERE_BASE_URL || "https://api.cohere.com/v1", + rerankModel: process.env.COHERE_RERANK_MODEL || "rerank-v3.5", + }, + { + name: "Voyage", + key: process.env.VOYAGE_API_KEY || "", + baseUrl: process.env.VOYAGE_BASE_URL || "https://api.voyageai.com/v1", + rerankModel: process.env.VOYAGE_RERANK_MODEL || "rerank-2.5-lite", + }, +]; + +describe("ApiLLM Rerank (live)", () => { + for (const provider of rerankProviders) { + test.skipIf(!provider.key)(`${provider.name} /v1/rerank returns ranked documents with finite scores`, async () => { + const llm = new ApiLLM({ + rerankBaseUrl: provider.baseUrl, + rerankApiKey: provider.key, + rerankModel: provider.rerankModel, + }); - const docs = [ - { file: "france.md", text: "Paris is the capital city of France." }, - { file: "pets.md", text: "Cats and dogs are common household pets." }, - { file: "germany.md", text: "Berlin is the capital city of Germany." }, - ]; + const docs = [ + { file: "france.md", text: "Paris is the capital city of France." }, + { file: "pets.md", text: "Cats and dogs are common household pets." }, + { file: "germany.md", text: "Berlin is the capital city of Germany." }, + ]; - const result = await llm.rerank("What is the capital of France?", docs); - expect(result.results.length).toBe(3); - expect(result.results[0]!.file).toBe("france.md"); - expect(Number.isFinite(result.results[0]!.score)).toBe(true); - expect(result.results[0]!.score).toBeGreaterThanOrEqual(result.results[1]!.score); - }, 30000); + const result = await llm.rerank("What is the capital of France?", docs); + expect(result.results.length).toBe(3); + expect(result.results[0]!.file).toBe("france.md"); + expect(Number.isFinite(result.results[0]!.score)).toBe(true); + expect(result.results[0]!.score).toBeGreaterThanOrEqual(result.results[1]!.score); + }, 30000); + } }); From 9e13578a02c35b32b94adb3867765f1800e5be6e Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 20:14:50 +0000 Subject: [PATCH 08/20] refactor(api): remove API-mode local fallback and make generate fail explicitly --- src/api.ts | 18 +++++------------- src/llm.ts | 9 +++------ test/api.contract.test.ts | 9 +++++++++ 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/src/api.ts b/src/api.ts index 37018140..1341ecb5 100644 --- a/src/api.ts +++ b/src/api.ts @@ -3,7 +3,7 @@ * * Current phase: embeddings (/v1/embeddings), query expansion (/v1/chat/completions), * and rerank (/v1/rerank). - * Text generation can delegate to a fallback backend. + * Text generation is intentionally unsupported in this backend for now. */ import type { @@ -55,12 +55,11 @@ export type ApiLLMConfig = { rerankBaseUrl?: string; rerankApiKey?: string; rerankModel?: string; - fallbackLLM?: LLM; }; /** * API-backed LLM implementation. - * Embeddings/query-expansion/reranking are remote; text generation can fallback. + * Embeddings/query-expansion/reranking are remote; text generation is unsupported. 
*/ export class ApiLLM implements LLM { private readonly embedBaseUrl: string; @@ -73,7 +72,6 @@ export class ApiLLM implements LLM { private readonly rerankBaseUrl: string; private readonly rerankApiKey: string; private readonly rerankModel: string; - private readonly fallbackLLM?: LLM; constructor(config: ApiLLMConfig = {}) { const normalizedEmbedBaseUrl = ( @@ -121,7 +119,6 @@ export class ApiLLM implements LLM { config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL; - this.fallbackLLM = config.fallbackLLM; } private parseBooleanEnv(value: string | undefined, fallback: boolean): boolean { @@ -139,13 +136,6 @@ export class ApiLLM implements LLM { }; } - private getFallback(method: string): LLM { - if (!this.fallbackLLM) { - throw new Error(`ApiLLM.${method} is not implemented without fallback backend`); - } - return this.fallbackLLM; - } - private usesVoyageRerankApi(): boolean { try { const hostname = new URL(this.rerankBaseUrl).hostname.toLowerCase(); @@ -382,7 +372,9 @@ export class ApiLLM implements LLM { } async generate(prompt: string, options: GenerateOptions = {}): Promise { - return this.getFallback("generate").generate(prompt, options); + void prompt; + void options; + throw new Error("ApiLLM generate is not implemented for API backend (use QMD_LLM_BACKEND=local)"); } async modelExists(model: string): Promise { diff --git a/src/llm.ts b/src/llm.ts index 115e16e9..08c000ec 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -1394,7 +1394,7 @@ export function getDefaultLlamaCpp(): LlamaCpp { /** * Get the default LLM backend instance. - * Currently this is LlamaCpp; kept as a separate seam for future backends. + * Selects local or API backend based on QMD_LLM_BACKEND. */ export function getDefaultLLM(): LLM { const backend = (process.env.QMD_LLM_BACKEND || "local").toLowerCase(); @@ -1403,10 +1403,7 @@ export function getDefaultLLM(): LLM { } if (!defaultApiLLM) { - defaultApiLLM = new ApiLLM({ - // During phased rollout, non-embedding methods can delegate to local backend. - fallbackLLM: getDefaultLlamaCpp(), - }); + defaultApiLLM = new ApiLLM(); } return defaultApiLLM; } @@ -1416,7 +1413,7 @@ export function getDefaultLLM(): LLM { */ export function setDefaultLlamaCpp(llm: LlamaCpp | null): void { defaultLlamaCpp = llm; - // Clear API wrapper so it can rebuild with the new fallback instance. + // Clear API wrapper so backend singletons can be rebuilt deterministically in tests. 
defaultApiLLM = null; } diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index 90ccfd38..dec94b8d 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -103,6 +103,15 @@ describe("ApiLLM (contract)", () => { consoleErrorSpy.mockRestore(); }); + test("generate fails explicitly for API backend", async () => { + const llm = new ApiLLM({}); + + await expect( + llm.generate("hello") + ).rejects.toThrow("not implemented for API backend"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + test("rerank sends Cohere-compatible /rerank request and maps response by index", async () => { fetchMock.mockResolvedValue( new Response( From b3a763c75263607d3faccb01bfdd83a00ecea9cd Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 21:28:00 +0000 Subject: [PATCH 09/20] refactor(api): simplify query expansion to line-format parsing only --- src/api.ts | 98 ++------------------------------------- test/api.contract.test.ts | 36 ++------------ test/api.live.test.ts | 19 +------- 3 files changed, 9 insertions(+), 144 deletions(-) diff --git a/src/api.ts b/src/api.ts index 1341ecb5..e7b5c47c 100644 --- a/src/api.ts +++ b/src/api.ts @@ -51,7 +51,6 @@ export type ApiLLMConfig = { chatBaseUrl?: string; chatApiKey?: string; chatModel?: string; - strictJsonOutput?: boolean; rerankBaseUrl?: string; rerankApiKey?: string; rerankModel?: string; @@ -68,7 +67,6 @@ export class ApiLLM implements LLM { private readonly chatBaseUrl: string; private readonly chatApiKey: string; private readonly chatModel: string; - private readonly strictJsonOutput: boolean; private readonly rerankBaseUrl: string; private readonly rerankApiKey: string; private readonly rerankModel: string; @@ -102,10 +100,6 @@ export class ApiLLM implements LLM { config.chatModel || process.env.QMD_CHAT_MODEL || DEFAULT_CHAT_MODEL; - this.strictJsonOutput = config.strictJsonOutput ?? this.parseBooleanEnv( - process.env.QMD_CHAT_STRICT_JSON_OUTPUT, - false - ); this.rerankBaseUrl = ( config.rerankBaseUrl || process.env.QMD_RERANK_BASE_URL @@ -121,14 +115,6 @@ export class ApiLLM implements LLM { || DEFAULT_RERANK_MODEL; } - private parseBooleanEnv(value: string | undefined, fallback: boolean): boolean { - if (value === undefined) return fallback; - const normalized = value.trim().toLowerCase(); - if (["1", "true", "yes", "on"].includes(normalized)) return true; - if (["0", "false", "no", "off"].includes(normalized)) return false; - return fallback; - } - private getHeaders(apiKey: string): Record { return { "Content-Type": "application/json", @@ -173,82 +159,12 @@ export class ApiLLM implements LLM { return ""; } - private parseExpandedQueries(content: string, strictJson: boolean): Queryable[] { + private parseExpandedQueries(content: string): Queryable[] { const trimmed = content.trim(); if (!trimmed) { throw new Error("ApiLLM expandQuery error: empty model output"); } - const parseQueryArray = (raw: string): Queryable[] | null => { - const parsed = JSON.parse(raw) as unknown; - const asArray = - Array.isArray(parsed) ? parsed : ( - typeof parsed === "object" - && parsed !== null - && Array.isArray((parsed as { queries?: unknown }).queries) - ? 
(parsed as { queries: unknown[] }).queries - : null - ); - if (!asArray) return null; - - const queries = asArray - .map(item => { - if (typeof item !== "object" || item === null) return null; - const type = (item as { type?: unknown }).type; - const text = (item as { text?: unknown }).text; - if ( - (type === "lex" || type === "vec" || type === "hyde") - && typeof text === "string" - && text.trim().length > 0 - ) { - return { type: type as QueryType, text: text.trim() }; - } - return null; - }) - .filter((q): q is Queryable => q !== null); - return queries.length > 0 ? queries : null; - }; - - const parseJsonWithWrappers = (raw: string): Queryable[] | null => { - const candidates: string[] = [raw]; - - const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/i); - if (fenceMatch?.[1]) { - candidates.push(fenceMatch[1].trim()); - } - - const firstArray = raw.indexOf("["); - const lastArray = raw.lastIndexOf("]"); - if (firstArray !== -1 && lastArray > firstArray) { - candidates.push(raw.slice(firstArray, lastArray + 1)); - } - - const firstObject = raw.indexOf("{"); - const lastObject = raw.lastIndexOf("}"); - if (firstObject !== -1 && lastObject > firstObject) { - candidates.push(raw.slice(firstObject, lastObject + 1)); - } - - for (const candidate of candidates) { - try { - const parsed = parseQueryArray(candidate); - if (parsed) return parsed; - } catch { - // Try next candidate - } - } - return null; - }; - - // Try strict JSON shape first: [{ type, text }, ...] or { queries: [...] } - const parsedFromJson = parseJsonWithWrappers(trimmed); - if (parsedFromJson) { - return parsedFromJson; - } - if (strictJson) { - throw new Error("ApiLLM expandQuery error: strict JSON output is enabled, but response did not contain valid JSON queries"); - } - // Line format: "lex: ...", "vec: ...", "hyde: ..." const fromLines = trimmed .split("\n") @@ -270,13 +186,12 @@ export class ApiLLM implements LLM { private async requestChatCompletions( messages: Array<{ role: "system" | "user"; content: string }>, - options?: { model?: string; strictJson?: boolean } + options?: { model?: string } ): Promise { if (!this.chatApiKey) { throw new Error("ApiLLM chat error: missing API key (set QMD_CHAT_API_KEY)"); } const model = options?.model || this.chatModel; - const strictJson = options?.strictJson ?? this.strictJsonOutput; let response: OpenAIChatResponse; try { @@ -383,10 +298,7 @@ export class ApiLLM implements LLM { async expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise { const includeLexical = options?.includeLexical ?? true; - const strictJson = this.strictJsonOutput; - const formatInstruction = strictJson - ? "Return ONLY valid JSON as an array of objects: [{\"type\":\"lex|vec|hyde\",\"text\":\"...\"}, ...]. No markdown." - : "Return one query per line in format: type: text, where type is lex, vec, or hyde."; + const formatInstruction = "Return one query per line in format: type: text, where type is lex, vec, or hyde."; const lexicalInstruction = includeLexical ? "Include at least one lex query." : "Do not include any lex queries."; @@ -410,10 +322,10 @@ export class ApiLLM implements LLM { { role: "system", content: systemPrompt }, { role: "user", content: userPrompt }, ], - { model: this.chatModel, strictJson } + { model: this.chatModel } ); - const parsed = this.parseExpandedQueries(content, strictJson); + const parsed = this.parseExpandedQueries(content); const filteredByLex = includeLexical ? 
parsed : parsed.filter(q => q.type !== "lex"); const deduped = Array.from(new Map( filteredByLex diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index dec94b8d..49e5a073 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -6,7 +6,6 @@ describe("ApiLLM (contract)", () => { const originalFetch = globalThis.fetch; const originalQmdEmbedApiKey = process.env.QMD_EMBED_API_KEY; const originalQmdChatApiKey = process.env.QMD_CHAT_API_KEY; - const originalQmdChatStrictJsonOutput = process.env.QMD_CHAT_STRICT_JSON_OUTPUT; const originalQmdChatModel = process.env.QMD_CHAT_MODEL; const originalQmdRerankApiKey = process.env.QMD_RERANK_API_KEY; @@ -19,7 +18,6 @@ describe("ApiLLM (contract)", () => { (globalThis as { fetch: typeof fetch }).fetch = originalFetch; process.env.QMD_EMBED_API_KEY = originalQmdEmbedApiKey; process.env.QMD_CHAT_API_KEY = originalQmdChatApiKey; - process.env.QMD_CHAT_STRICT_JSON_OUTPUT = originalQmdChatStrictJsonOutput; process.env.QMD_CHAT_MODEL = originalQmdChatModel; process.env.QMD_RERANK_API_KEY = originalQmdRerankApiKey; }); @@ -231,7 +229,7 @@ describe("ApiLLM (contract)", () => { expect(fetchMock).not.toHaveBeenCalled(); }); - test("expandQuery accepts line format when strict JSON is disabled (default)", async () => { + test("expandQuery accepts line format output", async () => { fetchMock.mockResolvedValue( new Response( JSON.stringify({ @@ -267,8 +265,7 @@ describe("ApiLLM (contract)", () => { }); }); - test("expandQuery uses strict JSON mode from env and parses JSON output", async () => { - process.env.QMD_CHAT_STRICT_JSON_OUTPUT = "true"; + test("expandQuery rejects JSON-only output", async () => { fetchMock.mockResolvedValue( new Response( JSON.stringify({ @@ -291,35 +288,8 @@ describe("ApiLLM (contract)", () => { chatModel: "gpt-4o-mini", }); - const result = await llm.expandQuery("api auth docs", { includeLexical: false }); - expect(result).toEqual([ - { type: "vec", text: "api authentication guide" }, - ]); - }); - - test("expandQuery rejects line output when strict JSON mode is enabled", async () => { - process.env.QMD_CHAT_STRICT_JSON_OUTPUT = "true"; - fetchMock.mockResolvedValue( - new Response( - JSON.stringify({ - choices: [{ - message: { - content: "vec: api authentication guide", - }, - }], - }), - { status: 200, headers: { "Content-Type": "application/json" } } - ) - ); - - const llm = new ApiLLM({ - chatBaseUrl: "https://chat.example.test/v1", - chatApiKey: "chat-key", - chatModel: "gpt-4o-mini", - }); - await expect( llm.expandQuery("api auth docs") - ).rejects.toThrow("strict JSON output is enabled"); + ).rejects.toThrow("could not parse query expansions"); }); }); diff --git a/test/api.live.test.ts b/test/api.live.test.ts index df8b0cbf..35cdbf06 100644 --- a/test/api.live.test.ts +++ b/test/api.live.test.ts @@ -69,12 +69,11 @@ describe("ApiLLM Embeddings (live)", () => { describe("ApiLLM Query Expansion (live)", () => { for (const provider of chatProviders) { - test.skipIf(!provider.key)(`${provider.name} chat completions expands query with non-strict output mode`, async () => { + test.skipIf(!provider.key)(`${provider.name} chat completions expands query with line output mode`, async () => { const llm = new ApiLLM({ chatBaseUrl: provider.baseUrl, chatApiKey: provider.key, chatModel: provider.chatModel, - strictJsonOutput: false, }); const result = await llm.expandQuery("how to authenticate API requests"); @@ -84,22 +83,6 @@ describe("ApiLLM Query Expansion (live)", () => { 
expect(item.text.length).toBeGreaterThan(0); } }, 30000); - - test.skipIf(!provider.key)(`${provider.name} chat completions expands query with strict JSON output mode`, async () => { - const llm = new ApiLLM({ - chatBaseUrl: provider.baseUrl, - chatApiKey: provider.key, - chatModel: provider.chatModel, - strictJsonOutput: true, - }); - - const result = await llm.expandQuery("how to authenticate API requests", { includeLexical: false }); - expect(result.length).toBeGreaterThanOrEqual(1); - for (const item of result) { - expect(["vec", "hyde"]).toContain(item.type); - expect(item.text.length).toBeGreaterThan(0); - } - }, 30000); } }); From 178a82d045cc8eee2936dc08fc0f25abb3a6dc06 Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 23:05:12 +0000 Subject: [PATCH 10/20] refactor(api): simplify query expansion parsing, align endpoint error handling, and make QMD_LLM_BACKEND validation strict --- src/api.ts | 44 ++++++++++++++++++---------------- src/llm.ts | 18 +++++++++----- test/api.contract.test.ts | 50 ++++++++++++++++++++++++++++++++------- 3 files changed, 77 insertions(+), 35 deletions(-) diff --git a/src/api.ts b/src/api.ts index e7b5c47c..b77beaa4 100644 --- a/src/api.ts +++ b/src/api.ts @@ -3,6 +3,9 @@ * * Current phase: embeddings (/v1/embeddings), query expansion (/v1/chat/completions), * and rerank (/v1/rerank). + * Query expansion currently prompts model for line-format output ("lex|vec|hyde: ..."), + * but does not use constrained output. Possibly upgrade to structured output. + * This path works in current provider-gated tests but is not extensively battle-tested yet. * Text generation is intentionally unsupported in this backend for now. */ @@ -72,13 +75,12 @@ export class ApiLLM implements LLM { private readonly rerankModel: string; constructor(config: ApiLLMConfig = {}) { - const normalizedEmbedBaseUrl = ( + // Embedding API config + this.embedBaseUrl = ( config.embedBaseUrl || process.env.QMD_EMBED_BASE_URL || DEFAULT_EMBED_BASE_URL ).replace(/\/+$/, ""); - this.embedBaseUrl = normalizedEmbedBaseUrl; - this.embedApiKey = config.embedApiKey || process.env.QMD_EMBED_API_KEY @@ -87,6 +89,7 @@ export class ApiLLM implements LLM { config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL; + // Chat API config this.chatBaseUrl = ( config.chatBaseUrl || process.env.QMD_CHAT_BASE_URL @@ -100,6 +103,7 @@ export class ApiLLM implements LLM { config.chatModel || process.env.QMD_CHAT_MODEL || DEFAULT_CHAT_MODEL; + // Rerank API config this.rerankBaseUrl = ( config.rerankBaseUrl || process.env.QMD_RERANK_BASE_URL @@ -161,9 +165,7 @@ export class ApiLLM implements LLM { private parseExpandedQueries(content: string): Queryable[] { const trimmed = content.trim(); - if (!trimmed) { - throw new Error("ApiLLM expandQuery error: empty model output"); - } + if (!trimmed) return []; // Line format: "lex: ...", "vec: ...", "hyde: ..." 
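// A minimal illustrative sketch of the simplified line-format contract (the reply text
// below is assumed, not from the patch): each "type: text" line becomes one Queryable,
// and a reply with no parseable lines now yields [] instead of throwing.
const exampleReply = [
  "lex: api key authentication",
  "vec: how to authenticate API requests",
  "hyde: Authenticate by sending an Authorization: Bearer <token> header.",
].join("\n");
// parseExpandedQueries(exampleReply) is expected to return:
//   [{ type: "lex", text: "api key authentication" },
//    { type: "vec", text: "how to authenticate API requests" },
//    { type: "hyde", text: "Authenticate by sending an Authorization: Bearer <token> header." }]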
const fromLines = trimmed @@ -180,8 +182,7 @@ export class ApiLLM implements LLM { }) .filter((q): q is Queryable => q !== null); - if (fromLines.length > 0) return fromLines; - throw new Error("ApiLLM expandQuery error: could not parse query expansions"); + return fromLines; } private async requestChatCompletions( @@ -217,16 +218,12 @@ export class ApiLLM implements LLM { } const content = this.extractChatContent(response); - if (!content.trim()) { - throw new Error("ApiLLM chat error: empty response content"); - } return content; } private async requestEmbeddings(texts: string[], modelOverride?: string): Promise { if (!this.embedApiKey) { - console.error("ApiLLM embedding error: missing API key (set QMD_EMBED_API_KEY)"); - return null; + throw new Error("ApiLLM embedding error: missing API key (set QMD_EMBED_API_KEY)"); } const model = this.resolveModel(modelOverride, this.embedModel); @@ -298,15 +295,17 @@ export class ApiLLM implements LLM { async expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise { const includeLexical = options?.includeLexical ?? true; - const formatInstruction = "Return one query per line in format: type: text, where type is lex, vec, or hyde."; + const searchScope = includeLexical ? "lexical and semantic" : "semantic"; + const allowedTypes = includeLexical ? "lex, vec, or hyde" : "vec or hyde"; + const allowedTypesList = includeLexical ? "lex, vec, hyde" : "vec, hyde"; const lexicalInstruction = includeLexical ? "Include at least one lex query." : "Do not include any lex queries."; const systemPrompt = [ "You expand search queries for hybrid retrieval.", - "Produce useful variations for lexical and semantic search.", - formatInstruction, + `Produce useful variations for ${searchScope} search.`, + `Return one query per line in format: type: text, where type is ${allowedTypes}.`, ].join(" "); const userPrompt = [ @@ -314,7 +313,7 @@ export class ApiLLM implements LLM { options?.context ? `Context: ${options.context}` : "", lexicalInstruction, "Return 2-4 total items. Keep each text concise and relevant.", - "Allowed types: lex, vec, hyde.", + `Allowed types: ${allowedTypesList}.`, ].filter(Boolean).join("\n"); const content = await this.requestChatCompletions( @@ -325,6 +324,10 @@ export class ApiLLM implements LLM { { model: this.chatModel } ); + if (!content.trim()) { + return []; + } + const parsed = this.parseExpandedQueries(content); const filteredByLex = includeLexical ? parsed : parsed.filter(q => q.type !== "lex"); const deduped = Array.from(new Map( @@ -334,10 +337,11 @@ export class ApiLLM implements LLM { .map(q => [`${q.type}|${q.text.toLowerCase()}`, q] as const) ).values()); - if (deduped.length === 0) { - throw new Error("ApiLLM expandQuery error: no valid expansions produced"); + if (deduped.length > 0) { + return deduped; } - return deduped; + console.warn("ApiLLM expandQuery warning: no valid expansions produced; returning empty expansion set"); + return []; } async rerank(query: string, documents: RerankDocument[], options: RerankOptions = {}): Promise { diff --git a/src/llm.ts b/src/llm.ts index 08c000ec..2467123f 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -1397,15 +1397,21 @@ export function getDefaultLlamaCpp(): LlamaCpp { * Selects local or API backend based on QMD_LLM_BACKEND. 
*/ export function getDefaultLLM(): LLM { - const backend = (process.env.QMD_LLM_BACKEND || "local").toLowerCase(); - if (backend !== "api") { + const backend = process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; + if (backend === "local") { return getDefaultLlamaCpp(); } - if (!defaultApiLLM) { - defaultApiLLM = new ApiLLM(); + if (backend === "api") { + if (!defaultApiLLM) { + defaultApiLLM = new ApiLLM(); + } + return defaultApiLLM; } - return defaultApiLLM; + + throw new Error( + `Invalid QMD_LLM_BACKEND="${process.env.QMD_LLM_BACKEND}". Expected "local" or "api".` + ); } /** @@ -1413,7 +1419,7 @@ export function getDefaultLLM(): LLM { */ export function setDefaultLlamaCpp(llm: LlamaCpp | null): void { defaultLlamaCpp = llm; - // Clear API wrapper so backend singletons can be rebuilt deterministically in tests. + // Function appears unused - clearing defaultApiLLM probably right thing to do anyway? defaultApiLLM = null; } diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index 49e5a073..240292a1 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -85,9 +85,8 @@ describe("ApiLLM (contract)", () => { expect(results[2]?.embedding).toEqual([3, 4]); }); - test("embed returns null and avoids fetch when API key is missing", async () => { + test("embed throws and avoids fetch when API key is missing", async () => { process.env.QMD_EMBED_API_KEY = ""; - const consoleErrorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); const llm = new ApiLLM({ embedBaseUrl: "https://example.test/v1", @@ -95,10 +94,10 @@ describe("ApiLLM (contract)", () => { embedModel: "test-embed-model", }); - const result = await llm.embed("hello"); - expect(result).toBeNull(); + await expect( + llm.embed("hello") + ).rejects.toThrow("missing API key"); expect(fetchMock).not.toHaveBeenCalled(); - consoleErrorSpy.mockRestore(); }); test("generate fails explicitly for API backend", async () => { @@ -265,7 +264,38 @@ describe("ApiLLM (contract)", () => { }); }); - test("expandQuery rejects JSON-only output", async () => { + test("expandQuery throws and avoids fetch when chat API key is missing", async () => { + process.env.QMD_CHAT_API_KEY = ""; + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "", + chatModel: "gpt-4o-mini", + }); + + await expect( + llm.expandQuery("api auth docs") + ).rejects.toThrow("missing API key"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + test("expandQuery throws on chat request failure", async () => { + fetchMock.mockResolvedValue( + new Response("upstream error", { status: 503, statusText: "Service Unavailable" }) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + await expect( + llm.expandQuery("api auth docs") + ).rejects.toThrow("chat request failed"); + }); + + test("expandQuery returns empty expansion set when output is not parseable line format", async () => { fetchMock.mockResolvedValue( new Response( JSON.stringify({ @@ -288,8 +318,10 @@ describe("ApiLLM (contract)", () => { chatModel: "gpt-4o-mini", }); - await expect( - llm.expandQuery("api auth docs") - ).rejects.toThrow("could not parse query expansions"); + const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {}); + const result = await llm.expandQuery("api auth docs"); + expect(result).toEqual([]); + expect(warnSpy).toHaveBeenCalledTimes(1); + warnSpy.mockRestore(); }); }); From dc5f6e5f43b3f4853ac64f63208feeb199d9b75a 
Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 23:37:53 +0000 Subject: [PATCH 11/20] refactor(api): ignore per-call embed/rerank model overrides and always use configured API models --- src/api.ts | 51 ++++++++++++--------------------------- test/api.contract.test.ts | 8 +++--- 2 files changed, 19 insertions(+), 40 deletions(-) diff --git a/src/api.ts b/src/api.ts index b77beaa4..ef2e806f 100644 --- a/src/api.ts +++ b/src/api.ts @@ -127,6 +127,7 @@ export class ApiLLM implements LLM { } private usesVoyageRerankApi(): boolean { + // Voyage uses different result shape, if we support more providers maybe add env var selector try { const hostname = new URL(this.rerankBaseUrl).hostname.toLowerCase(); return hostname === "api.voyageai.com" || hostname.endsWith(".voyageai.com"); @@ -135,22 +136,6 @@ export class ApiLLM implements LLM { } } - private isLikelyLocalModel(model: string): boolean { - const lower = model.toLowerCase(); - return ( - model.startsWith("hf:") - || lower.includes(".gguf") - || lower === "embeddinggemma" - || lower.includes("qwen3-reranker") - || lower.startsWith("expedientfalcon/") - ); - } - - private resolveModel(modelOverride: string | undefined, configuredModel: string): string { - if (!modelOverride) return configuredModel; - return this.isLikelyLocalModel(modelOverride) ? configuredModel : modelOverride; - } - private extractChatContent(response: OpenAIChatResponse): string { const content = response.choices?.[0]?.message?.content; if (typeof content === "string") return content; @@ -186,18 +171,15 @@ export class ApiLLM implements LLM { } private async requestChatCompletions( - messages: Array<{ role: "system" | "user"; content: string }>, - options?: { model?: string } + messages: Array<{ role: "system" | "user"; content: string }> ): Promise { if (!this.chatApiKey) { throw new Error("ApiLLM chat error: missing API key (set QMD_CHAT_API_KEY)"); } - const model = options?.model || this.chatModel; - let response: OpenAIChatResponse; try { const payload: Record = { - model, + model: this.chatModel, messages, temperature: 0.2, }; @@ -221,18 +203,17 @@ export class ApiLLM implements LLM { return content; } - private async requestEmbeddings(texts: string[], modelOverride?: string): Promise { + private async requestEmbeddings(texts: string[]): Promise { if (!this.embedApiKey) { throw new Error("ApiLLM embedding error: missing API key (set QMD_EMBED_API_KEY)"); } - const model = this.resolveModel(modelOverride, this.embedModel); try { const resp = await fetch(`${this.embedBaseUrl}/embeddings`, { method: "POST", headers: this.getHeaders(this.embedApiKey), body: JSON.stringify({ - model, + model: this.embedModel, input: texts, }), }); @@ -249,14 +230,14 @@ export class ApiLLM implements LLM { } async embed(text: string, options: EmbedOptions = {}): Promise { - const model = this.resolveModel(options.model, this.embedModel); - const response = await this.requestEmbeddings([text], model); + void options; // Seems used for model override in local backend, ignoring here + const response = await this.requestEmbeddings([text]); const vector = response?.data?.[0]?.embedding; if (!vector || !Array.isArray(vector)) return null; return { embedding: vector, - model, + model: this.embedModel, }; } @@ -316,13 +297,10 @@ export class ApiLLM implements LLM { `Allowed types: ${allowedTypesList}.`, ].filter(Boolean).join("\n"); - const content = await this.requestChatCompletions( - [ - { role: "system", content: systemPrompt }, - { role: "user", content: userPrompt }, 
- ], - { model: this.chatModel } - ); + const content = await this.requestChatCompletions([ + { role: "system", content: systemPrompt }, + { role: "user", content: userPrompt }, + ]); if (!content.trim()) { return []; @@ -345,14 +323,15 @@ export class ApiLLM implements LLM { } async rerank(query: string, documents: RerankDocument[], options: RerankOptions = {}): Promise { + void options; // Seems used for model override in local backend, ignoring here if (!this.rerankApiKey) { throw new Error("ApiLLM rerank error: missing API key (set QMD_RERANK_API_KEY)"); } if (documents.length === 0) { - return { results: [], model: this.resolveModel(options.model, this.rerankModel) }; + return { results: [], model: this.rerankModel }; } - const model = this.resolveModel(options.model, this.rerankModel); + const model = this.rerankModel; let response: RerankResponse; const topCountField = this.usesVoyageRerankApi() ? "top_k" : "top_n"; diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index 240292a1..22a7a954 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -22,7 +22,7 @@ describe("ApiLLM (contract)", () => { process.env.QMD_RERANK_API_KEY = originalQmdRerankApiKey; }); - test("embed sends OpenAI-compatible /embeddings request, normalizes model, and parses response", async () => { + test("embed sends OpenAI-compatible /embeddings request, ignores per-call model override, and parses response", async () => { fetchMock.mockResolvedValue( new Response( JSON.stringify({ @@ -38,7 +38,7 @@ describe("ApiLLM (contract)", () => { embedModel: "test-embed-model", }); - const result = await llm.embed("hello", { model: "embeddinggemma" }); + const result = await llm.embed("hello", { model: "override-embed-model" }); expect(result).not.toBeNull(); expect(result?.embedding).toEqual([0.1, 0.2, 0.3]); @@ -109,7 +109,7 @@ describe("ApiLLM (contract)", () => { expect(fetchMock).not.toHaveBeenCalled(); }); - test("rerank sends Cohere-compatible /rerank request and maps response by index", async () => { + test("rerank sends Cohere-compatible /rerank request, ignores per-call model override, and maps response by index", async () => { fetchMock.mockResolvedValue( new Response( JSON.stringify({ @@ -136,7 +136,7 @@ describe("ApiLLM (contract)", () => { { file: "a.md", text: "Berlin is the capital of Germany." }, { file: "b.md", text: "Paris is the capital of France." 
}, ], - { model: "ExpedientFalcon/qwen3-reranker:0.6b-q8_0" } + { model: "override-rerank-model" } ); expect(result.model).toBe("rerank-v3.5"); From bd7b85c9af1fe20846d480f895c88e3e80b0382f Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Mon, 16 Feb 2026 23:51:42 +0000 Subject: [PATCH 12/20] refactor(api): drop model overrides and simplify chat/rerank error handling --- src/api.ts | 74 +++++++++++++++++---------------------- test/api.contract.test.ts | 2 +- 2 files changed, 34 insertions(+), 42 deletions(-) diff --git a/src/api.ts b/src/api.ts index ef2e806f..3699d279 100644 --- a/src/api.ts +++ b/src/api.ts @@ -176,29 +176,23 @@ export class ApiLLM implements LLM { if (!this.chatApiKey) { throw new Error("ApiLLM chat error: missing API key (set QMD_CHAT_API_KEY)"); } - let response: OpenAIChatResponse; - try { - const payload: Record = { - model: this.chatModel, - messages, - temperature: 0.2, - }; + const payload: Record = { + model: this.chatModel, + messages, + temperature: 0.2, + }; - const resp = await fetch(`${this.chatBaseUrl}/chat/completions`, { - method: "POST", - headers: this.getHeaders(this.chatApiKey), - body: JSON.stringify(payload), - }); - if (!resp.ok) { - const body = await resp.text().catch(() => ""); - throw new Error(`ApiLLM chat error: ${resp.status} ${resp.statusText} ${body}`.trim()); - } - response = await resp.json() as OpenAIChatResponse; - } catch (error) { - const detail = error instanceof Error ? error.message : String(error); - throw new Error(`ApiLLM chat request failed: ${detail}`); + const resp = await fetch(`${this.chatBaseUrl}/chat/completions`, { + method: "POST", + headers: this.getHeaders(this.chatApiKey), + body: JSON.stringify(payload), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + throw new Error(`ApiLLM chat error: ${resp.status} ${resp.statusText} ${body}`.trim()); } + const response = await resp.json() as OpenAIChatResponse; const content = this.extractChatContent(response); return content; } @@ -224,13 +218,14 @@ export class ApiLLM implements LLM { } return await resp.json() as OpenAIEmbeddingResponse; } catch (error) { + // Local backend seems to return null, so we do as well to keep consistent console.error("ApiLLM embedding error:", error); return null; } } async embed(text: string, options: EmbedOptions = {}): Promise { - void options; // Seems used for model override in local backend, ignoring here + void options; // Seems to be used for model override in local backend, ignoring here const response = await this.requestEmbeddings([text]); const vector = response?.data?.[0]?.embedding; if (!vector || !Array.isArray(vector)) return null; @@ -249,6 +244,7 @@ export class ApiLLM implements LLM { return texts.map(() => null); } + // Keep output index-aligned with inputs; missing/invalid embeddings become null. const results: (EmbeddingResult | null)[] = []; for (let i = 0; i < texts.length; i++) { const vector = response.data[i]?.embedding; @@ -267,10 +263,12 @@ export class ApiLLM implements LLM { async generate(prompt: string, options: GenerateOptions = {}): Promise { void prompt; void options; + // generate() doesn't seem to be called from anywhere in the codebase, so we just throw for now throw new Error("ApiLLM generate is not implemented for API backend (use QMD_LLM_BACKEND=local)"); } async modelExists(model: string): Promise { + // Used only in local backend tests? 
return { name: model, exists: true }; } @@ -333,28 +331,22 @@ export class ApiLLM implements LLM { const model = this.rerankModel; - let response: RerankResponse; const topCountField = this.usesVoyageRerankApi() ? "top_k" : "top_n"; - try { - const resp = await fetch(`${this.rerankBaseUrl}/rerank`, { - method: "POST", - headers: this.getHeaders(this.rerankApiKey), - body: JSON.stringify({ - model, - query, - documents: documents.map((doc) => doc.text), - [topCountField]: documents.length, - }), - }); - if (!resp.ok) { - const body = await resp.text().catch(() => ""); - throw new Error(`ApiLLM rerank error: ${resp.status} ${resp.statusText} ${body}`.trim()); - } - response = await resp.json() as RerankResponse; - } catch (error) { - const detail = error instanceof Error ? error.message : String(error); - throw new Error(`ApiLLM rerank request failed: ${detail}`); + const resp = await fetch(`${this.rerankBaseUrl}/rerank`, { + method: "POST", + headers: this.getHeaders(this.rerankApiKey), + body: JSON.stringify({ + model, + query, + documents: documents.map((doc) => doc.text), + [topCountField]: documents.length, + }), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + throw new Error(`ApiLLM rerank error: ${resp.status} ${resp.statusText} ${body}`.trim()); } + const response = await resp.json() as RerankResponse; const responseResults = Array.isArray(response.results) ? response.results diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index 22a7a954..b459bab1 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -292,7 +292,7 @@ describe("ApiLLM (contract)", () => { await expect( llm.expandQuery("api auth docs") - ).rejects.toThrow("chat request failed"); + ).rejects.toThrow("chat error: 503"); }); test("expandQuery returns empty expansion set when output is not parseable line format", async () => { From 277593b6292af86715e5190dc68a6b2356ca2da4 Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Tue, 17 Feb 2026 00:27:48 +0000 Subject: [PATCH 13/20] test: add mini comment --- test/api.live.test.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/api.live.test.ts b/test/api.live.test.ts index 35cdbf06..14786c59 100644 --- a/test/api.live.test.ts +++ b/test/api.live.test.ts @@ -1,6 +1,11 @@ import { describe, expect, test } from "vitest"; import { ApiLLM } from "../src/api.js"; +/** + * Live API tests (provider-gated by env vars). + * Required keys: OPENAI_API_KEY, OPENROUTER_API_KEY, COHERE_API_KEY, VOYAGE_API_KEY. + * Tests for a provider are skipped when that provider key is not set. 
+ */ const embeddingProviders = [ { name: "OpenAI", From 22326cf0b5a02bc39037c4d7becb080a3346a17e Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Tue, 17 Feb 2026 00:38:52 +0000 Subject: [PATCH 14/20] refactor(api): require explicit API key for chat/rerank providers --- src/api.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api.ts b/src/api.ts index 3699d279..e1027676 100644 --- a/src/api.ts +++ b/src/api.ts @@ -98,7 +98,7 @@ export class ApiLLM implements LLM { this.chatApiKey = config.chatApiKey || process.env.QMD_CHAT_API_KEY - || this.embedApiKey; + || ""; this.chatModel = config.chatModel || process.env.QMD_CHAT_MODEL @@ -112,7 +112,7 @@ export class ApiLLM implements LLM { this.rerankApiKey = config.rerankApiKey || process.env.QMD_RERANK_API_KEY - || this.embedApiKey; + || ""; this.rerankModel = config.rerankModel || process.env.QMD_RERANK_MODEL From 793a94d97168f73c48f95d092d9b9a6915739ca6 Mon Sep 17 00:00:00 2001 From: Pip The Molty Date: Tue, 17 Feb 2026 10:42:45 +0100 Subject: [PATCH 15/20] refactor(api): make tokenization optional and fallback to char-based chunking --- src/api.ts | 4 ++++ src/llm.ts | 16 ++++++++++++++++ src/store.ts | 19 ++++++++++++++----- test/store.helpers.unit.test.ts | 23 +++++++++++++++++++++++ 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/src/api.ts b/src/api.ts index e1027676..4b3e582f 100644 --- a/src/api.ts +++ b/src/api.ts @@ -170,6 +170,10 @@ export class ApiLLM implements LLM { return fromLines; } + canTokenize(): boolean { + return false; + } + private async requestChatCompletions( messages: Array<{ role: "system" | "user"; content: string }> ): Promise { diff --git a/src/llm.ts b/src/llm.ts index 2467123f..c44b9b0a 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -326,6 +326,18 @@ export interface LLM { */ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise; + /** + * Whether this backend supports tokenizer access. + * API backends may return false and omit tokenize(). + */ + canTokenize?(): boolean; + + /** + * Tokenize text when tokenizer access is available. + * API backend doesn't currently expose tokenization.
+ */ + tokenize?(text: string): Promise; + /** * Dispose of resources */ @@ -771,6 +783,10 @@ export class LlamaCpp implements LLM { // Tokenization // ========================================================================== + canTokenize(): boolean { + return true; + } + /** * Tokenize text using the embedding model's tokenizer * Returns tokenizer tokens (opaque type from node-llama-cpp) diff --git a/src/store.ts b/src/store.ts index d39b30c8..0d853c1e 100644 --- a/src/store.ts +++ b/src/store.ts @@ -17,7 +17,6 @@ import picomatch from "picomatch"; import { createHash } from "crypto"; import { realpathSync, statSync, mkdirSync } from "node:fs"; import { - getDefaultLlamaCpp, getDefaultLLM, formatQueryForEmbedding, formatDocForEmbedding, @@ -1427,7 +1426,7 @@ export async function chunkDocumentByTokens( overlapTokens: number = CHUNK_OVERLAP_TOKENS, windowTokens: number = CHUNK_WINDOW_TOKENS ): Promise<{ text: string; pos: number; tokens: number }[]> { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3) // If chunks exceed limit, they'll be re-split with actual ratio @@ -1437,13 +1436,23 @@ export async function chunkDocumentByTokens( const windowChars = windowTokens * avgCharsPerToken; // Chunk in character space with conservative estimate - let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars); + const charChunks = chunkDocument(content, maxChars, overlapChars, windowChars); + + // API backend doesn't expose tokenizer APIs; keep chunking approximate and avoid local model init. + if (!llm.canTokenize?.() || !llm.tokenize) { + return charChunks.map((chunk) => ({ + text: chunk.text, + pos: chunk.pos, + tokens: Math.max(1, Math.ceil(chunk.text.length / avgCharsPerToken)), + })); + } + const tokenize = llm.tokenize.bind(llm); // Tokenize and split any chunks that still exceed limit const results: { text: string; pos: number; tokens: number }[] = []; for (const chunk of charChunks) { - const tokens = await llm.tokenize(chunk.text); + const tokens = await tokenize(chunk.text); if (tokens.length <= maxTokens) { results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length }); @@ -1456,7 +1465,7 @@ export async function chunkDocumentByTokens( const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2)); for (const subChunk of subChunks) { - const subTokens = await llm.tokenize(subChunk.text); + const subTokens = await tokenize(subChunk.text); results.push({ text: subChunk.text, pos: chunk.pos + subChunk.pos, diff --git a/test/store.helpers.unit.test.ts b/test/store.helpers.unit.test.ts index 3303187c..868237de 100644 --- a/test/store.helpers.unit.test.ts +++ b/test/store.helpers.unit.test.ts @@ -15,6 +15,7 @@ import { normalizeDocid, isDocid, handelize, + chunkDocumentByTokens, } from "../src/store"; // ============================================================================= @@ -203,3 +204,25 @@ describe("handelize", () => { expect(isDocid("12345")).toBe(false); }); }); + +describe("Token Chunking Fallback", () => { + test("chunkDocumentByTokens uses char-based fallback when backend cannot tokenize", async () => { + const originalBackend = process.env.QMD_LLM_BACKEND; + process.env.QMD_LLM_BACKEND = "api"; + + try { + const content = "This is a document sentence. 
".repeat(400); + const chunks = await chunkDocumentByTokens(content, 120, 18, 40); + expect(chunks.length).toBeGreaterThan(1); + for (const chunk of chunks) { + expect(chunk.tokens).toBeGreaterThan(0); + } + } finally { + if (originalBackend === undefined) { + delete process.env.QMD_LLM_BACKEND; + } else { + process.env.QMD_LLM_BACKEND = originalBackend; + } + } + }); +}); From ff1878cb5429d8081118661b1e0798a4e39b2ace Mon Sep 17 00:00:00 2001 From: Pip The Molty Date: Tue, 17 Feb 2026 11:07:23 +0100 Subject: [PATCH 16/20] refactor(api): route withLLMSession through backend-specific sessions --- src/llm-session.ts | 114 ++++++++++++++++++++++++++++++++++++++ src/llm.ts | 30 +++++++--- test/api.contract.test.ts | 17 ++++++ 3 files changed, 153 insertions(+), 8 deletions(-) create mode 100644 src/llm-session.ts diff --git a/src/llm-session.ts b/src/llm-session.ts new file mode 100644 index 00000000..bb8b3ea0 --- /dev/null +++ b/src/llm-session.ts @@ -0,0 +1,114 @@ +import type { + LLM, + EmbedOptions, + EmbeddingResult, + ILLMSession, + LLMSessionOptions, + Queryable, + RerankDocument, + RerankOptions, + RerankResult, +} from "./llm.js"; + +/** + * Scoped session wrapper for non-local backends. + * Enforces release/abort semantics but delegates operations directly to the backend. + */ +export class PassthroughLLMSession implements ILLMSession { + private llm: LLM; + private released = false; + private abortController: AbortController; + private maxDurationTimer: ReturnType | null = null; + private name: string; + private createReleasedError: (message?: string) => Error; + + constructor( + llm: LLM, + options: LLMSessionOptions = {}, + createReleasedError: (message?: string) => Error = (message) => + new Error(message || "LLM session has been released or aborted") + ) { + this.llm = llm; + this.name = options.name || "unnamed"; + this.abortController = new AbortController(); + this.createReleasedError = createReleasedError; + + // Link external abort signal if provided + if (options.signal) { + if (options.signal.aborted) { + this.abortController.abort(options.signal.reason); + } else { + options.signal.addEventListener("abort", () => { + this.abortController.abort(options.signal!.reason); + }, { once: true }); + } + } + + // Set up max duration timer + const maxDuration = options.maxDuration ?? 
10 * 60 * 1000; // Default 10 minutes + if (maxDuration > 0) { + this.maxDurationTimer = setTimeout(() => { + this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`)); + }, maxDuration); + this.maxDurationTimer.unref(); // Don't keep process alive + } + } + + get isValid(): boolean { + return !this.released && !this.abortController.signal.aborted; + } + + get signal(): AbortSignal { + return this.abortController.signal; + } + + release(): void { + if (this.released) return; + this.released = true; + + if (this.maxDurationTimer) { + clearTimeout(this.maxDurationTimer); + this.maxDurationTimer = null; + } + + this.abortController.abort(new Error("Session released")); + } + + private async withOperation(fn: () => Promise): Promise { + if (!this.isValid) { + throw this.createReleasedError(); + } + + if (this.abortController.signal.aborted) { + throw this.createReleasedError( + this.abortController.signal.reason?.message || "Session aborted" + ); + } + + return await fn(); + } + + async embed(text: string, options?: EmbedOptions): Promise { + return this.withOperation(() => this.llm.embed(text, options)); + } + + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + return this.withOperation(() => this.llm.embedBatch(texts)); + } + + async expandQuery( + query: string, + options?: { context?: string; includeLexical?: boolean } + ): Promise { + return this.withOperation(() => this.llm.expandQuery(query, options)); + } + + async rerank( + query: string, + documents: RerankDocument[], + options?: RerankOptions + ): Promise { + return this.withOperation(() => this.llm.rerank(query, documents, options)); + } +} + diff --git a/src/llm.ts b/src/llm.ts index c44b9b0a..233d5658 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -19,6 +19,7 @@ import { homedir } from "os"; import { join } from "path"; import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs"; import { ApiLLM } from "./api.js"; +import { PassthroughLLMSession } from "./llm-session.js"; // ============================================================================= // Embedding Formatting Functions @@ -1346,8 +1347,7 @@ let defaultSessionManager: LLMSessionManager | null = null; /** * Get the session manager for the default LlamaCpp instance. 
*/ -function getSessionManager(): LLMSessionManager { - const llm = getDefaultLlamaCpp(); +function getSessionManager(llm: LlamaCpp = getDefaultLlamaCpp()): LLMSessionManager { if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) { defaultSessionManager = new LLMSessionManager(llm); } @@ -1372,13 +1372,27 @@ export async function withLLMSession( fn: (session: ILLMSession) => Promise, options?: LLMSessionOptions ): Promise { - const manager = getSessionManager(); - const session = new LLMSession(manager, options); + const llm = getDefaultLLM(); - try { - return await fn(session); - } finally { - session.release(); + if (llm instanceof LlamaCpp) { + const manager = getSessionManager(llm); + const session = new LLMSession(manager, options); + try { + return await fn(session); + } finally { + session.release(); + } + } else { + const session = new PassthroughLLMSession( + llm, + options, + (message?: string) => new SessionReleasedError(message) + ); + try { + return await fn(session); + } finally { + session.release(); + } } } diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts index b459bab1..63f696d4 100644 --- a/test/api.contract.test.ts +++ b/test/api.contract.test.ts @@ -1,5 +1,6 @@ import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; import { ApiLLM } from "../src/api.js"; +import { canUnloadLLM, withLLMSession } from "../src/llm.js"; describe("ApiLLM (contract)", () => { const fetchMock = vi.fn(); @@ -8,6 +9,7 @@ describe("ApiLLM (contract)", () => { const originalQmdChatApiKey = process.env.QMD_CHAT_API_KEY; const originalQmdChatModel = process.env.QMD_CHAT_MODEL; const originalQmdRerankApiKey = process.env.QMD_RERANK_API_KEY; + const originalQmdLlmBackend = process.env.QMD_LLM_BACKEND; beforeEach(() => { fetchMock.mockReset(); @@ -20,6 +22,7 @@ describe("ApiLLM (contract)", () => { process.env.QMD_CHAT_API_KEY = originalQmdChatApiKey; process.env.QMD_CHAT_MODEL = originalQmdChatModel; process.env.QMD_RERANK_API_KEY = originalQmdRerankApiKey; + process.env.QMD_LLM_BACKEND = originalQmdLlmBackend; }); test("embed sends OpenAI-compatible /embeddings request, ignores per-call model override, and parses response", async () => { @@ -324,4 +327,18 @@ describe("ApiLLM (contract)", () => { expect(warnSpy).toHaveBeenCalledTimes(1); warnSpy.mockRestore(); }); + + test("withLLMSession does not acquire local unload lock when backend is api", async () => { + process.env.QMD_LLM_BACKEND = "api"; + + const unloadBefore = canUnloadLLM(); + expect(unloadBefore).toBe(true); + + await withLLMSession(async (session) => { + expect(session.isValid).toBe(true); + expect(canUnloadLLM()).toBe(true); + }, { maxDuration: 1000, name: "api-contract-session" }); + + expect(canUnloadLLM()).toBe(true); + }); }); From 979bed2d980e4e878c63171972a56146218a8e93 Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Wed, 18 Feb 2026 13:39:26 +0100 Subject: [PATCH 17/20] refactor(api): centralize API backend default URLs and models in shared constants --- src/api-defaults.ts | 6 ++++++ src/api.ts | 27 ++++++++++++++------------- 2 files changed, 20 insertions(+), 13 deletions(-) create mode 100644 src/api-defaults.ts diff --git a/src/api-defaults.ts b/src/api-defaults.ts new file mode 100644 index 00000000..31925dbb --- /dev/null +++ b/src/api-defaults.ts @@ -0,0 +1,6 @@ +export const DEFAULT_API_EMBED_BASE_URL = "https://api.openai.com/v1"; +export const DEFAULT_API_EMBED_MODEL = "text-embedding-3-small"; +export const DEFAULT_API_CHAT_BASE_URL = 
"https://api.openai.com/v1"; +export const DEFAULT_API_CHAT_MODEL = "gpt-4o-mini"; +export const DEFAULT_API_RERANK_BASE_URL = "https://api.cohere.com/v1"; +export const DEFAULT_API_RERANK_MODEL = "rerank-v3.5"; diff --git a/src/api.ts b/src/api.ts index 4b3e582f..4603be1b 100644 --- a/src/api.ts +++ b/src/api.ts @@ -22,13 +22,14 @@ import type { RerankOptions, RerankResult, } from "./llm.js"; - -const DEFAULT_EMBED_BASE_URL = "https://api.openai.com/v1"; -const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; -const DEFAULT_CHAT_BASE_URL = "https://api.openai.com/v1"; -const DEFAULT_CHAT_MODEL = "gpt-4o-mini"; -const DEFAULT_RERANK_BASE_URL = "https://api.cohere.com/v1"; -const DEFAULT_RERANK_MODEL = "rerank-v3.5"; +import { + DEFAULT_API_CHAT_BASE_URL, + DEFAULT_API_CHAT_MODEL, + DEFAULT_API_EMBED_BASE_URL, + DEFAULT_API_EMBED_MODEL, + DEFAULT_API_RERANK_BASE_URL, + DEFAULT_API_RERANK_MODEL, +} from "./api-defaults.js"; type OpenAIEmbeddingResponse = { data?: Array<{ embedding?: number[] }>; @@ -79,7 +80,7 @@ export class ApiLLM implements LLM { this.embedBaseUrl = ( config.embedBaseUrl || process.env.QMD_EMBED_BASE_URL - || DEFAULT_EMBED_BASE_URL + || DEFAULT_API_EMBED_BASE_URL ).replace(/\/+$/, ""); this.embedApiKey = config.embedApiKey @@ -88,12 +89,12 @@ export class ApiLLM implements LLM { this.embedModel = config.embedModel || process.env.QMD_EMBED_MODEL - || DEFAULT_EMBED_MODEL; + || DEFAULT_API_EMBED_MODEL; // Chat API config this.chatBaseUrl = ( config.chatBaseUrl || process.env.QMD_CHAT_BASE_URL - || DEFAULT_CHAT_BASE_URL + || DEFAULT_API_CHAT_BASE_URL ).replace(/\/+$/, ""); this.chatApiKey = config.chatApiKey @@ -102,12 +103,12 @@ export class ApiLLM implements LLM { this.chatModel = config.chatModel || process.env.QMD_CHAT_MODEL - || DEFAULT_CHAT_MODEL; + || DEFAULT_API_CHAT_MODEL; // Rerank API config this.rerankBaseUrl = ( config.rerankBaseUrl || process.env.QMD_RERANK_BASE_URL - || DEFAULT_RERANK_BASE_URL + || DEFAULT_API_RERANK_BASE_URL ).replace(/\/+$/, ""); this.rerankApiKey = config.rerankApiKey @@ -116,7 +117,7 @@ export class ApiLLM implements LLM { this.rerankModel = config.rerankModel || process.env.QMD_RERANK_MODEL - || DEFAULT_RERANK_MODEL; + || DEFAULT_API_RERANK_MODEL; } private getHeaders(apiKey: string): Record { From ea7651da7761704e5da04c42a77cf163ed84609d Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Wed, 18 Feb 2026 14:01:50 +0100 Subject: [PATCH 18/20] fix(index): add API embedding scope guard with actionable mismatch errors and tests --- src/qmd.ts | 20 +++++ src/store.ts | 24 ++++++ src/vector-scope-guard.ts | 124 ++++++++++++++++++++++++++++ test/store.scope-guard.unit.test.ts | 102 +++++++++++++++++++++++ 4 files changed, 270 insertions(+) create mode 100644 src/vector-scope-guard.ts create mode 100644 test/store.scope-guard.unit.test.ts diff --git a/src/qmd.ts b/src/qmd.ts index 9b870ba1..2ebf8875 100755 --- a/src/qmd.ts +++ b/src/qmd.ts @@ -70,6 +70,11 @@ import { createStore, getDefaultDbPath, } from "./store.js"; +import { + clearApiEmbeddingScope, + getVectorScopeGuardMessage, + setApiEmbeddingScopeFromCurrentEnv, +} from "./vector-scope-guard.js"; import { disposeDefaultLLM, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; import { formatSearchResults, @@ -1533,11 +1538,23 @@ function renderProgressBar(percent: number, width: number = 30): string { async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: 
boolean = false): Promise { const db = getDb(); const now = new Date().toISOString(); + const backend = process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; + const isApiBackend = backend === "api"; + + if (!force) { + const guardMessage = getVectorScopeGuardMessage(db); + if (guardMessage) { + throw new Error(guardMessage); + } + } // If force, clear all vectors if (force) { console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`); clearAllEmbeddings(db); + if (!isApiBackend) { + clearApiEmbeddingScope(db); + } } // Find unique hashes that need embedding (from active documents) @@ -1615,6 +1632,9 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = throw new Error("Failed to get embedding dimensions from first chunk"); } ensureVecTable(db, firstResult.embedding.length); + if (isApiBackend) { + setApiEmbeddingScopeFromCurrentEnv(db); + } let chunksEmbedded = 0, errors = 0, bytesProcessed = 0; const startTime = Date.now(); diff --git a/src/store.ts b/src/store.ts index 0d853c1e..9a6046d7 100644 --- a/src/store.ts +++ b/src/store.ts @@ -37,6 +37,7 @@ import { loadConfig as collectionsLoadConfig, type NamedCollection, } from "./collections.js"; +import { getVectorScopeGuardMessage } from "./vector-scope-guard.js"; // ============================================================================= // Configuration @@ -674,6 +675,14 @@ function initializeDatabase(db: Database): void { ) `); + // API embedding scope metadata (used to guard mixed local/API vector usage). + db.exec(` + CREATE TABLE IF NOT EXISTS api_meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL + ) + `); + // Content vectors const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[]; const hasSeqColumn = cvInfo.some(col => col.name === 'seq'); @@ -2149,6 +2158,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle // ============================================================================= export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise { + const guardMessage = getVectorScopeGuardMessage(db); + if (guardMessage) { + throw new Error(guardMessage); + } + const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); if (!tableExists) return []; @@ -2909,6 +2923,11 @@ export async function hybridQuery( query: string, options?: HybridQueryOptions ): Promise { + const guardMessage = getVectorScopeGuardMessage(store.db); + if (guardMessage) { + throw new Error(guardMessage); + } + const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0; const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT; @@ -3122,6 +3141,11 @@ export async function vectorSearchQuery( query: string, options?: VectorSearchOptions ): Promise { + const guardMessage = getVectorScopeGuardMessage(store.db); + if (guardMessage) { + throw new Error(guardMessage); + } + const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 
0.3; const collection = options?.collection; diff --git a/src/vector-scope-guard.ts b/src/vector-scope-guard.ts new file mode 100644 index 00000000..42ef5dfb --- /dev/null +++ b/src/vector-scope-guard.ts @@ -0,0 +1,124 @@ +import type { Database } from "./db.js"; +import { + DEFAULT_API_EMBED_BASE_URL, + DEFAULT_API_EMBED_MODEL, +} from "./api-defaults.js"; + +export type ApiEmbeddingScope = { + embedBaseUrl: string; + embedModel: string; +}; + +function getConfiguredBackend(): string { + return process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; +} + +function resolveCurrentApiEmbeddingScopeFromEnv(): ApiEmbeddingScope { + const embedBaseUrl = ( + process.env.QMD_EMBED_BASE_URL?.trim() + || DEFAULT_API_EMBED_BASE_URL + ).replace(/\/+$/, ""); + const embedModel = process.env.QMD_EMBED_MODEL?.trim() || DEFAULT_API_EMBED_MODEL; + return { embedBaseUrl, embedModel }; +} + +function getApiMetaValue(db: Database, key: string): string | null { + try { + const row = db.prepare(`SELECT value FROM api_meta WHERE key = ?`).get(key) as { value: string } | null; + return row?.value || null; + } catch { + // Older DBs or test fixtures may not include api_meta. + return null; + } +} + +function setApiMetaValue(db: Database, key: string, value: string): void { + db.prepare(`INSERT OR REPLACE INTO api_meta (key, value) VALUES (?, ?)`).run(key, value); +} + +function hasAnyVectors(db: Database): boolean { + const cvCount = db.prepare(`SELECT COUNT(*) as c FROM content_vectors`).get() as { c: number }; + if (cvCount.c > 0) return true; + + const tableExists = db.prepare(` + SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec' + `).get(); + if (!tableExists) return false; + + try { + const vvCount = db.prepare(`SELECT COUNT(*) as c FROM vectors_vec`).get() as { c: number }; + return vvCount.c > 0; + } catch { + // If vec table exists but count fails, treat as non-empty/unknown for safety. 
+ return true; + } +} + +function formatApiScope(scope: ApiEmbeddingScope): string { + return `${scope.embedBaseUrl} | ${scope.embedModel}`; +} + +export function getStoredApiEmbeddingScope(db: Database): ApiEmbeddingScope | null { + const embedBaseUrl = getApiMetaValue(db, "embed_base_url"); + const embedModel = getApiMetaValue(db, "embed_model"); + if (!embedBaseUrl || !embedModel) return null; + return { embedBaseUrl, embedModel }; +} + +export function setApiEmbeddingScopeFromCurrentEnv(db: Database): void { + const scope = resolveCurrentApiEmbeddingScopeFromEnv(); + setApiMetaValue(db, "embed_base_url", scope.embedBaseUrl); + setApiMetaValue(db, "embed_model", scope.embedModel); +} + +export function clearApiEmbeddingScope(db: Database): void { + db.exec(`DELETE FROM api_meta`); +} + +export function getVectorScopeGuardMessage(db: Database): string | null { + const backend = getConfiguredBackend(); + const storedScope = getStoredApiEmbeddingScope(db); + + if (backend === "local") { + if (!storedScope) return null; + return [ + "Index is marked for API embeddings, but current backend is local.", + `Stored API embedding scope: ${formatApiScope(storedScope)}`, + "Choose one:", + " 1) Set QMD_LLM_BACKEND=api with matching embedding settings", + " 2) Use a different index via --index", + " 3) Run 'qmd embed -f' to clear vectors and remove API scope metadata", + ].join("\n"); + } + + if (backend === "api") { + const currentScope = resolveCurrentApiEmbeddingScopeFromEnv(); + + if (!storedScope) { + if (!hasAnyVectors(db)) return null; + return [ + "This index has vectors but no API scope metadata (legacy/ambiguous state).", + "Choose one:", + " 1) Use a different index via --index", + " 2) Run 'qmd embed -f' to reset vectors for the current API embedding scope", + ].join("\n"); + } + + const isMatch = storedScope.embedBaseUrl === currentScope.embedBaseUrl + && storedScope.embedModel === currentScope.embedModel; + if (isMatch) return null; + + return [ + "API embedding scope mismatch for this index.", + `Stored scope (in index db): ${formatApiScope(storedScope)}`, + `Current scope (from environment): ${formatApiScope(currentScope)}`, + "Choose one:", + " 1) Revert API embedding settings to match the stored scope", + " 2) Use a different index via --index", + " 3) Run 'qmd embed -f' to reset vectors for the current API embedding scope", + ].join("\n"); + } + + // Unknown backend values are validated elsewhere; don't block here. 
+ return null; +} diff --git a/test/store.scope-guard.unit.test.ts b/test/store.scope-guard.unit.test.ts new file mode 100644 index 00000000..56030deb --- /dev/null +++ b/test/store.scope-guard.unit.test.ts @@ -0,0 +1,102 @@ +import { afterEach, beforeEach, describe, expect, test } from "vitest"; +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { createStore, type Store } from "../src/store.js"; +import { + clearApiEmbeddingScope, + getVectorScopeGuardMessage, + setApiEmbeddingScopeFromCurrentEnv, +} from "../src/vector-scope-guard.js"; + +describe("Vector scope guard (API metadata)", () => { + let testDir: string; + let store: Store; + + const originalBackend = process.env.QMD_LLM_BACKEND; + const originalEmbedBaseUrl = process.env.QMD_EMBED_BASE_URL; + const originalEmbedModel = process.env.QMD_EMBED_MODEL; + + beforeEach(async () => { + testDir = await mkdtemp(join(tmpdir(), "qmd-scope-guard-")); + store = createStore(join(testDir, "index.sqlite")); + + delete process.env.QMD_LLM_BACKEND; + delete process.env.QMD_EMBED_BASE_URL; + delete process.env.QMD_EMBED_MODEL; + }); + + afterEach(async () => { + store.close(); + await rm(testDir, { recursive: true, force: true }); + + if (originalBackend === undefined) delete process.env.QMD_LLM_BACKEND; + else process.env.QMD_LLM_BACKEND = originalBackend; + + if (originalEmbedBaseUrl === undefined) delete process.env.QMD_EMBED_BASE_URL; + else process.env.QMD_EMBED_BASE_URL = originalEmbedBaseUrl; + + if (originalEmbedModel === undefined) delete process.env.QMD_EMBED_MODEL; + else process.env.QMD_EMBED_MODEL = originalEmbedModel; + }); + + test("local backend with no api metadata does not block vector paths", () => { + process.env.QMD_LLM_BACKEND = "local"; + const message = getVectorScopeGuardMessage(store.db); + expect(message).toBeNull(); + }); + + test("local backend blocks when api metadata exists", () => { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + process.env.QMD_LLM_BACKEND = "local"; + const message = getVectorScopeGuardMessage(store.db); + expect(message).toContain("current backend is local"); + expect(message).toContain("qmd embed -f"); + }); + + test("api backend blocks legacy vectors when api metadata is missing", () => { + process.env.QMD_LLM_BACKEND = "api"; + clearApiEmbeddingScope(store.db); + + store.ensureVecTable(3); + store.insertEmbedding( + "hash-1", + 0, + 0, + new Float32Array([0.1, 0.2, 0.3]), + "legacy-model", + new Date().toISOString() + ); + + const message = getVectorScopeGuardMessage(store.db); + expect(message).toContain("legacy/ambiguous"); + expect(message).toContain("qmd embed -f"); + }); + + test("api backend allows matching stored scope", () => { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + const message = getVectorScopeGuardMessage(store.db); + expect(message).toBeNull(); + }); + + test("api backend blocks mismatched stored scope", () => { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + process.env.QMD_EMBED_MODEL = 
"text-embedding-3-large"; + const message = getVectorScopeGuardMessage(store.db); + expect(message).toContain("scope mismatch"); + expect(message).toContain("Stored scope"); + expect(message).toContain("Current scope"); + }); +}); From e4c5edaac1f254cdb7f2791ed3e43144fc8974db Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Wed, 18 Feb 2026 19:32:04 +0100 Subject: [PATCH 19/20] feat(status): show API model/base-url configuration in status output --- src/qmd.ts | 85 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/src/qmd.ts b/src/qmd.ts index 2ebf8875..ecfe492e 100755 --- a/src/qmd.ts +++ b/src/qmd.ts @@ -75,7 +75,15 @@ import { getVectorScopeGuardMessage, setApiEmbeddingScopeFromCurrentEnv, } from "./vector-scope-guard.js"; -import { disposeDefaultLLM, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; +import { disposeDefaultLLM, getDefaultLLM, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; +import { + DEFAULT_API_CHAT_BASE_URL, + DEFAULT_API_CHAT_MODEL, + DEFAULT_API_EMBED_BASE_URL, + DEFAULT_API_EMBED_MODEL, + DEFAULT_API_RERANK_BASE_URL, + DEFAULT_API_RERANK_MODEL, +} from "./api-defaults.js"; import { formatSearchResults, formatDocuments, @@ -389,43 +397,62 @@ async function showStatus(): Promise { // Models { - // hf:org/repo/file.gguf → https://huggingface.co/org/repo - const hfLink = (uri: string) => { - const match = uri.match(/^hf:([^/]+\/[^/]+)\//); - return match ? `https://huggingface.co/${match[1]}` : uri; - }; console.log(`\n${c.bold}Models${c.reset}`); - console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); - console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); - console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); + const backend = process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; + if (backend === "api") { + const embedBaseUrl = (process.env.QMD_EMBED_BASE_URL || DEFAULT_API_EMBED_BASE_URL).replace(/\/+$/, ""); + const embedModel = process.env.QMD_EMBED_MODEL || DEFAULT_API_EMBED_MODEL; + const chatBaseUrl = (process.env.QMD_CHAT_BASE_URL || DEFAULT_API_CHAT_BASE_URL).replace(/\/+$/, ""); + const chatModel = process.env.QMD_CHAT_MODEL || DEFAULT_API_CHAT_MODEL; + const rerankBaseUrl = (process.env.QMD_RERANK_BASE_URL || DEFAULT_API_RERANK_BASE_URL).replace(/\/+$/, ""); + const rerankModel = process.env.QMD_RERANK_MODEL || DEFAULT_API_RERANK_MODEL; + + console.log(` Embedding: ${embedModel} ${c.dim}(${embedBaseUrl})${c.reset}`); + console.log(` Chat: ${chatModel} ${c.dim}(${chatBaseUrl})${c.reset}`); + console.log(` Reranking: ${rerankModel} ${c.dim}(${rerankBaseUrl})${c.reset}`); + } else { + // hf:org/repo/file.gguf → https://huggingface.co/org/repo + const hfLink = (uri: string) => { + const match = uri.match(/^hf:([^/]+\/[^/]+)\//); + return match ? 
`https://huggingface.co/${match[1]}` : uri; + }; + console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); + console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); + console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); + } } // Device / GPU info try { - const llm = getDefaultLlamaCpp(); - const device = await llm.getDeviceInfo(); - console.log(`\n${c.bold}Device${c.reset}`); - if (device.gpu) { - console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`); - if (device.gpuDevices.length > 0) { - // Deduplicate and count GPUs - const counts = new Map(); - for (const name of device.gpuDevices) { - counts.set(name, (counts.get(name) || 0) + 1); + const llm = getDefaultLLM(); + if (llm instanceof LlamaCpp) { + const device = await llm.getDeviceInfo(); + console.log(`\n${c.bold}Device${c.reset}`); + if (device.gpu) { + console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`); + if (device.gpuDevices.length > 0) { + // Deduplicate and count GPUs + const counts = new Map(); + for (const name of device.gpuDevices) { + counts.set(name, (counts.get(name) || 0) + 1); + } + const deviceStr = Array.from(counts.entries()) + .map(([name, count]) => count > 1 ? `${count}× ${name}` : name) + .join(', '); + console.log(` Devices: ${deviceStr}`); } - const deviceStr = Array.from(counts.entries()) - .map(([name, count]) => count > 1 ? `${count}× ${name}` : name) - .join(', '); - console.log(` Devices: ${deviceStr}`); - } - if (device.vram) { - console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`); + if (device.vram) { + console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`); + } + } else { + console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`); + console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`); } + console.log(` CPU: ${device.cpuCores} math cores`); } else { - console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`); - console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`); + console.log(`\n${c.bold}Device${c.reset}`); + console.log(` ${c.dim}Backend is API mode; local device probe skipped.${c.reset}`); } - console.log(` CPU: ${device.cpuCores} math cores`); } catch { // Don't fail status if LLM init fails } From 59aa0c2e1f65f22307e8479d69359d245d39abe0 Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Thu, 19 Feb 2026 13:02:26 +0100 Subject: [PATCH 20/20] fix(store): enforce vector scope guard in structuredSearch and use getDefaultLLM for embeddings --- src/store.ts | 7 ++++++- test/structured-search.test.ts | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/store.ts b/src/store.ts index 9a6046d7..bee35efb 100644 --- a/src/store.ts +++ b/src/store.ts @@ -3236,6 +3236,11 @@ export async function structuredSearch( searches: StructuredSubSearch[], options?: StructuredSearchOptions ): Promise { + const guardMessage = getVectorScopeGuardMessage(store.db); + if (guardMessage) { + throw new Error(guardMessage); + } + const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0; const candidateLimit = options?.candidateLimit ?? 
RERANK_CANDIDATE_LIMIT; @@ -3304,7 +3309,7 @@ export async function structuredSearch( if (hasVectors) { const vecSearches = searches.filter(s => s.type === 'vec' || s.type === 'hyde'); if (vecSearches.length > 0) { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); diff --git a/test/structured-search.test.ts b/test/structured-search.test.ts index 0a4c8c6d..e60dcd38 100644 --- a/test/structured-search.test.ts +++ b/test/structured-search.test.ts @@ -20,6 +20,10 @@ import { type Store, } from "../src/store.js"; import { disposeDefaultLlamaCpp } from "../src/llm.js"; +import { + clearApiEmbeddingScope, + setApiEmbeddingScopeFromCurrentEnv, +} from "../src/vector-scope-guard.js"; // ============================================================================= // parseStructuredQuery Tests (CLI Parser) @@ -317,6 +321,34 @@ describe("structuredSearch", () => { expect(r.score).toBeGreaterThanOrEqual(0.5); } }); + + test("applies API scope guard on structured query path", async () => { + const originalBackend = process.env.QMD_LLM_BACKEND; + const originalEmbedBaseUrl = process.env.QMD_EMBED_BASE_URL; + const originalEmbedModel = process.env.QMD_EMBED_MODEL; + + try { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + process.env.QMD_LLM_BACKEND = "local"; + await expect(structuredSearch(store, [{ type: "lex", query: "test" }])) + .rejects.toThrow("current backend is local"); + } finally { + clearApiEmbeddingScope(store.db); + + if (originalBackend === undefined) delete process.env.QMD_LLM_BACKEND; + else process.env.QMD_LLM_BACKEND = originalBackend; + + if (originalEmbedBaseUrl === undefined) delete process.env.QMD_EMBED_BASE_URL; + else process.env.QMD_EMBED_BASE_URL = originalEmbedBaseUrl; + + if (originalEmbedModel === undefined) delete process.env.QMD_EMBED_MODEL; + else process.env.QMD_EMBED_MODEL = originalEmbedModel; + } + }); }); // =============================================================================