diff --git a/src/api-defaults.ts b/src/api-defaults.ts new file mode 100644 index 0000000..31925db --- /dev/null +++ b/src/api-defaults.ts @@ -0,0 +1,6 @@ +export const DEFAULT_API_EMBED_BASE_URL = "https://api.openai.com/v1"; +export const DEFAULT_API_EMBED_MODEL = "text-embedding-3-small"; +export const DEFAULT_API_CHAT_BASE_URL = "https://api.openai.com/v1"; +export const DEFAULT_API_CHAT_MODEL = "gpt-4o-mini"; +export const DEFAULT_API_RERANK_BASE_URL = "https://api.cohere.com/v1"; +export const DEFAULT_API_RERANK_MODEL = "rerank-v3.5"; diff --git a/src/api.ts b/src/api.ts new file mode 100644 index 0000000..4603be1 --- /dev/null +++ b/src/api.ts @@ -0,0 +1,389 @@ +/** + * api.ts - API-backed LLM implementation (incremental rollout) + * + * Current phase: embeddings (/v1/embeddings), query expansion (/v1/chat/completions), + * and rerank (/v1/rerank). + * Query expansion currently prompts model for line-format output ("lex|vec|hyde: ..."), + * but does not use constrained output. Possibly upgrade to structured output. + * This path works in current provider-gated tests but is not extensively battle-tested yet. + * Text generation is intentionally unsupported in this backend for now. + */ + +import type { + LLM, + EmbedOptions, + EmbeddingResult, + GenerateOptions, + GenerateResult, + ModelInfo, + QueryType, + Queryable, + RerankDocument, + RerankOptions, + RerankResult, +} from "./llm.js"; +import { + DEFAULT_API_CHAT_BASE_URL, + DEFAULT_API_CHAT_MODEL, + DEFAULT_API_EMBED_BASE_URL, + DEFAULT_API_EMBED_MODEL, + DEFAULT_API_RERANK_BASE_URL, + DEFAULT_API_RERANK_MODEL, +} from "./api-defaults.js"; + +type OpenAIEmbeddingResponse = { + data?: Array<{ embedding?: number[] }>; +}; + +type RerankResponse = { + results?: Array<{ index?: number; relevance_score?: number }>; + data?: Array<{ index?: number; relevance_score?: number }>; +}; + +type OpenAIChatResponse = { + choices?: Array<{ + message?: { + content?: string | Array<{ type?: string; text?: string }>; + }; + }>; +}; + +export type ApiLLMConfig = { + embedBaseUrl?: string; + embedApiKey?: string; + embedModel?: string; + chatBaseUrl?: string; + chatApiKey?: string; + chatModel?: string; + rerankBaseUrl?: string; + rerankApiKey?: string; + rerankModel?: string; +}; + +/** + * API-backed LLM implementation. + * Embeddings/query-expansion/reranking are remote; text generation is unsupported. 
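+ *
+ * Minimal usage sketch (the key values are illustrative placeholders, not real credentials):
+ *   const llm = new ApiLLM({ embedApiKey: "sk-example", rerankApiKey: "rk-example" });
+ *   const emb = await llm.embed("hello world");                  // EmbeddingResult | null
+ *   const ranked = await llm.rerank("capital of france", [{ file: "a.md", text: "Paris ..." }]);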
+ */ +export class ApiLLM implements LLM { + private readonly embedBaseUrl: string; + private readonly embedApiKey: string; + private readonly embedModel: string; + private readonly chatBaseUrl: string; + private readonly chatApiKey: string; + private readonly chatModel: string; + private readonly rerankBaseUrl: string; + private readonly rerankApiKey: string; + private readonly rerankModel: string; + + constructor(config: ApiLLMConfig = {}) { + // Embedding API config + this.embedBaseUrl = ( + config.embedBaseUrl + || process.env.QMD_EMBED_BASE_URL + || DEFAULT_API_EMBED_BASE_URL + ).replace(/\/+$/, ""); + this.embedApiKey = + config.embedApiKey + || process.env.QMD_EMBED_API_KEY + || ""; + this.embedModel = + config.embedModel + || process.env.QMD_EMBED_MODEL + || DEFAULT_API_EMBED_MODEL; + // Chat API config + this.chatBaseUrl = ( + config.chatBaseUrl + || process.env.QMD_CHAT_BASE_URL + || DEFAULT_API_CHAT_BASE_URL + ).replace(/\/+$/, ""); + this.chatApiKey = + config.chatApiKey + || process.env.QMD_CHAT_API_KEY + || ""; + this.chatModel = + config.chatModel + || process.env.QMD_CHAT_MODEL + || DEFAULT_API_CHAT_MODEL; + // Rerank API config + this.rerankBaseUrl = ( + config.rerankBaseUrl + || process.env.QMD_RERANK_BASE_URL + || DEFAULT_API_RERANK_BASE_URL + ).replace(/\/+$/, ""); + this.rerankApiKey = + config.rerankApiKey + || process.env.QMD_RERANK_API_KEY + || ""; + this.rerankModel = + config.rerankModel + || process.env.QMD_RERANK_MODEL + || DEFAULT_API_RERANK_MODEL; + } + + private getHeaders(apiKey: string): Record { + return { + "Content-Type": "application/json", + "Authorization": `Bearer ${apiKey}`, + }; + } + + private usesVoyageRerankApi(): boolean { + // Voyage uses different result shape, if we support more providers maybe add env var selector + try { + const hostname = new URL(this.rerankBaseUrl).hostname.toLowerCase(); + return hostname === "api.voyageai.com" || hostname.endsWith(".voyageai.com"); + } catch { + return this.rerankBaseUrl.toLowerCase().includes("voyageai.com"); + } + } + + private extractChatContent(response: OpenAIChatResponse): string { + const content = response.choices?.[0]?.message?.content; + if (typeof content === "string") return content; + if (Array.isArray(content)) { + return content + .filter(part => part.type === "text" && typeof part.text === "string") + .map(part => part.text as string) + .join("\n"); + } + return ""; + } + + private parseExpandedQueries(content: string): Queryable[] { + const trimmed = content.trim(); + if (!trimmed) return []; + + // Line format: "lex: ...", "vec: ...", "hyde: ..." 
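+    // Example: "- vec: api authentication guide" → { type: "vec", text: "api authentication guide" }
+    // Leading bullets, numbering, and stray whitespace are tolerated by the regex below.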
+ const fromLines = trimmed + .split("\n") + .map(line => line.trim()) + .filter(Boolean) + .map(line => { + const match = line.match(/^(?:[-*•\d\.\)\s]*)?(lex|vec|hyde)\s*:\s*(.+)$/i); + if (!match) return null; + const type = match[1]!.toLowerCase() as QueryType; + const text = match[2]!.trim(); + if (!text) return null; + return { type, text }; + }) + .filter((q): q is Queryable => q !== null); + + return fromLines; + } + + canTokenize(): boolean { + return false; + } + + private async requestChatCompletions( + messages: Array<{ role: "system" | "user"; content: string }> + ): Promise { + if (!this.chatApiKey) { + throw new Error("ApiLLM chat error: missing API key (set QMD_CHAT_API_KEY)"); + } + const payload: Record = { + model: this.chatModel, + messages, + temperature: 0.2, + }; + + const resp = await fetch(`${this.chatBaseUrl}/chat/completions`, { + method: "POST", + headers: this.getHeaders(this.chatApiKey), + body: JSON.stringify(payload), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + throw new Error(`ApiLLM chat error: ${resp.status} ${resp.statusText} ${body}`.trim()); + } + + const response = await resp.json() as OpenAIChatResponse; + const content = this.extractChatContent(response); + return content; + } + + private async requestEmbeddings(texts: string[]): Promise { + if (!this.embedApiKey) { + throw new Error("ApiLLM embedding error: missing API key (set QMD_EMBED_API_KEY)"); + } + + try { + const resp = await fetch(`${this.embedBaseUrl}/embeddings`, { + method: "POST", + headers: this.getHeaders(this.embedApiKey), + body: JSON.stringify({ + model: this.embedModel, + input: texts, + }), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + console.error(`ApiLLM embedding error: ${resp.status} ${resp.statusText} ${body}`.trim()); + return null; + } + return await resp.json() as OpenAIEmbeddingResponse; + } catch (error) { + // Local backend seems to return null, so we do as well to keep consistent + console.error("ApiLLM embedding error:", error); + return null; + } + } + + async embed(text: string, options: EmbedOptions = {}): Promise { + void options; // Seems to be used for model override in local backend, ignoring here + const response = await this.requestEmbeddings([text]); + const vector = response?.data?.[0]?.embedding; + if (!vector || !Array.isArray(vector)) return null; + + return { + embedding: vector, + model: this.embedModel, + }; + } + + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + if (texts.length === 0) return []; + + const response = await this.requestEmbeddings(texts); + if (!response?.data || !Array.isArray(response.data)) { + return texts.map(() => null); + } + + // Keep output index-aligned with inputs; missing/invalid embeddings become null. + const results: (EmbeddingResult | null)[] = []; + for (let i = 0; i < texts.length; i++) { + const vector = response.data[i]?.embedding; + if (!vector || !Array.isArray(vector)) { + results.push(null); + } else { + results.push({ + embedding: vector, + model: this.embedModel, + }); + } + } + return results; + } + + async generate(prompt: string, options: GenerateOptions = {}): Promise { + void prompt; + void options; + // generate() doesn't seem to be called from anywhere in the codebase, so we just throw for now + throw new Error("ApiLLM generate is not implemented for API backend (use QMD_LLM_BACKEND=local)"); + } + + async modelExists(model: string): Promise { + // Used only in local backend tests? 
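+    // The API backend has no model registry to query, so any requested model is reported as existing.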
+ return { name: model, exists: true }; + } + + async expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise { + const includeLexical = options?.includeLexical ?? true; + const searchScope = includeLexical ? "lexical and semantic" : "semantic"; + const allowedTypes = includeLexical ? "lex, vec, or hyde" : "vec or hyde"; + const allowedTypesList = includeLexical ? "lex, vec, hyde" : "vec, hyde"; + const lexicalInstruction = includeLexical + ? "Include at least one lex query." + : "Do not include any lex queries."; + + const systemPrompt = [ + "You expand search queries for hybrid retrieval.", + `Produce useful variations for ${searchScope} search.`, + `Return one query per line in format: type: text, where type is ${allowedTypes}.`, + ].join(" "); + + const userPrompt = [ + `Original query: ${query}`, + options?.context ? `Context: ${options.context}` : "", + lexicalInstruction, + "Return 2-4 total items. Keep each text concise and relevant.", + `Allowed types: ${allowedTypesList}.`, + ].filter(Boolean).join("\n"); + + const content = await this.requestChatCompletions([ + { role: "system", content: systemPrompt }, + { role: "user", content: userPrompt }, + ]); + + if (!content.trim()) { + return []; + } + + const parsed = this.parseExpandedQueries(content); + const filteredByLex = includeLexical ? parsed : parsed.filter(q => q.type !== "lex"); + const deduped = Array.from(new Map( + filteredByLex + .map(q => ({ ...q, text: q.text.trim() })) + .filter(q => q.text.length > 0) + .map(q => [`${q.type}|${q.text.toLowerCase()}`, q] as const) + ).values()); + + if (deduped.length > 0) { + return deduped; + } + console.warn("ApiLLM expandQuery warning: no valid expansions produced; returning empty expansion set"); + return []; + } + + async rerank(query: string, documents: RerankDocument[], options: RerankOptions = {}): Promise { + void options; // Seems used for model override in local backend, ignoring here + if (!this.rerankApiKey) { + throw new Error("ApiLLM rerank error: missing API key (set QMD_RERANK_API_KEY)"); + } + if (documents.length === 0) { + return { results: [], model: this.rerankModel }; + } + + const model = this.rerankModel; + + const topCountField = this.usesVoyageRerankApi() ? "top_k" : "top_n"; + const resp = await fetch(`${this.rerankBaseUrl}/rerank`, { + method: "POST", + headers: this.getHeaders(this.rerankApiKey), + body: JSON.stringify({ + model, + query, + documents: documents.map((doc) => doc.text), + [topCountField]: documents.length, + }), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + throw new Error(`ApiLLM rerank error: ${resp.status} ${resp.statusText} ${body}`.trim()); + } + const response = await resp.json() as RerankResponse; + + const responseResults = Array.isArray(response.results) + ? response.results + : Array.isArray(response.data) + ? response.data + : null; + + if (!Array.isArray(responseResults)) { + throw new Error("ApiLLM rerank error: invalid response (missing results/data array)"); + } + + const scoreByIndex = new Map(); + for (const item of responseResults) { + if (typeof item.index !== "number" || typeof item.relevance_score !== "number") continue; + scoreByIndex.set(item.index, item.relevance_score); + } + + const results = documents + .map((doc, index) => ({ + file: doc.file, + score: scoreByIndex.get(index) ?? 
0, + index, + })) + .sort((a, b) => b.score - a.score); + + return { + results, + model, + }; + } + + async dispose(): Promise { + // No API client resources to dispose in this implementation. + } +} diff --git a/src/llm-session.ts b/src/llm-session.ts new file mode 100644 index 0000000..bb8b3ea --- /dev/null +++ b/src/llm-session.ts @@ -0,0 +1,114 @@ +import type { + LLM, + EmbedOptions, + EmbeddingResult, + ILLMSession, + LLMSessionOptions, + Queryable, + RerankDocument, + RerankOptions, + RerankResult, +} from "./llm.js"; + +/** + * Scoped session wrapper for non-local backends. + * Enforces release/abort semantics but delegates operations directly to the backend. + */ +export class PassthroughLLMSession implements ILLMSession { + private llm: LLM; + private released = false; + private abortController: AbortController; + private maxDurationTimer: ReturnType | null = null; + private name: string; + private createReleasedError: (message?: string) => Error; + + constructor( + llm: LLM, + options: LLMSessionOptions = {}, + createReleasedError: (message?: string) => Error = (message) => + new Error(message || "LLM session has been released or aborted") + ) { + this.llm = llm; + this.name = options.name || "unnamed"; + this.abortController = new AbortController(); + this.createReleasedError = createReleasedError; + + // Link external abort signal if provided + if (options.signal) { + if (options.signal.aborted) { + this.abortController.abort(options.signal.reason); + } else { + options.signal.addEventListener("abort", () => { + this.abortController.abort(options.signal!.reason); + }, { once: true }); + } + } + + // Set up max duration timer + const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes + if (maxDuration > 0) { + this.maxDurationTimer = setTimeout(() => { + this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`)); + }, maxDuration); + this.maxDurationTimer.unref(); // Don't keep process alive + } + } + + get isValid(): boolean { + return !this.released && !this.abortController.signal.aborted; + } + + get signal(): AbortSignal { + return this.abortController.signal; + } + + release(): void { + if (this.released) return; + this.released = true; + + if (this.maxDurationTimer) { + clearTimeout(this.maxDurationTimer); + this.maxDurationTimer = null; + } + + this.abortController.abort(new Error("Session released")); + } + + private async withOperation(fn: () => Promise): Promise { + if (!this.isValid) { + throw this.createReleasedError(); + } + + if (this.abortController.signal.aborted) { + throw this.createReleasedError( + this.abortController.signal.reason?.message || "Session aborted" + ); + } + + return await fn(); + } + + async embed(text: string, options?: EmbedOptions): Promise { + return this.withOperation(() => this.llm.embed(text, options)); + } + + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + return this.withOperation(() => this.llm.embedBatch(texts)); + } + + async expandQuery( + query: string, + options?: { context?: string; includeLexical?: boolean } + ): Promise { + return this.withOperation(() => this.llm.expandQuery(query, options)); + } + + async rerank( + query: string, + documents: RerankDocument[], + options?: RerankOptions + ): Promise { + return this.withOperation(() => this.llm.rerank(query, documents, options)); + } +} + diff --git a/src/llm.ts b/src/llm.ts index 46c6295..233d565 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -18,6 +18,8 @@ import { import { 
homedir } from "os"; import { join } from "path"; import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs"; +import { ApiLLM } from "./api.js"; +import { PassthroughLLMSession } from "./llm-session.js"; // ============================================================================= // Embedding Formatting Functions @@ -298,6 +300,11 @@ export interface LLM { */ embed(text: string, options?: EmbedOptions): Promise; + /** + * Get embeddings for multiple texts + */ + embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>; + /** * Generate text completion */ @@ -320,6 +327,18 @@ export interface LLM { */ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise; + /** + * Whether this backend supports tokenizer access. + * API backends may return false and omit tokenize(). + */ + canTokenize?(): boolean; + + /** + * Tokenize text when tokenizer access is available. + * API backend doesn't currently expose tokenization. + */ + tokenize?(text: string): Promise; + /** * Dispose of resources */ @@ -765,6 +784,10 @@ export class LlamaCpp implements LLM { // Tokenization // ========================================================================== + canTokenize(): boolean { + return true; + } + /** * Tokenize text using the embedding model's tokenizer * Returns tokenizer tokens (opaque type from node-llama-cpp) @@ -1324,8 +1347,7 @@ let defaultSessionManager: LLMSessionManager | null = null; /** * Get the session manager for the default LlamaCpp instance. */ -function getSessionManager(): LLMSessionManager { - const llm = getDefaultLlamaCpp(); +function getSessionManager(llm: LlamaCpp = getDefaultLlamaCpp()): LLMSessionManager { if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) { defaultSessionManager = new LLMSessionManager(llm); } @@ -1350,13 +1372,27 @@ export async function withLLMSession( fn: (session: ILLMSession) => Promise, options?: LLMSessionOptions ): Promise { - const manager = getSessionManager(); - const session = new LLMSession(manager, options); + const llm = getDefaultLLM(); - try { - return await fn(session); - } finally { - session.release(); + if (llm instanceof LlamaCpp) { + const manager = getSessionManager(llm); + const session = new LLMSession(manager, options); + try { + return await fn(session); + } finally { + session.release(); + } + } else { + const session = new PassthroughLLMSession( + llm, + options, + (message?: string) => new SessionReleasedError(message) + ); + try { + return await fn(session); + } finally { + session.release(); + } } } @@ -1374,6 +1410,7 @@ export function canUnloadLLM(): boolean { // ============================================================================= let defaultLlamaCpp: LlamaCpp | null = null; +let defaultApiLLM: ApiLLM | null = null; /** * Get the default LlamaCpp instance (creates one if needed) @@ -1385,11 +1422,35 @@ export function getDefaultLlamaCpp(): LlamaCpp { return defaultLlamaCpp; } +/** + * Get the default LLM backend instance. + * Selects local or API backend based on QMD_LLM_BACKEND. + */ +export function getDefaultLLM(): LLM { + const backend = process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; + if (backend === "local") { + return getDefaultLlamaCpp(); + } + + if (backend === "api") { + if (!defaultApiLLM) { + defaultApiLLM = new ApiLLM(); + } + return defaultApiLLM; + } + + throw new Error( + `Invalid QMD_LLM_BACKEND="${process.env.QMD_LLM_BACKEND}". 
Expected "local" or "api".` + ); +} + /** * Set a custom default LlamaCpp instance (useful for testing) */ export function setDefaultLlamaCpp(llm: LlamaCpp | null): void { defaultLlamaCpp = llm; + // Function appears unused - clearing defaultApiLLM probably right thing to do anyway? + defaultApiLLM = null; } /** @@ -1402,3 +1463,15 @@ export async function disposeDefaultLlamaCpp(): Promise { defaultLlamaCpp = null; } } + +/** + * Dispose the default LLM backend instance. + * Currently aliases LlamaCpp disposal. + */ +export async function disposeDefaultLLM(): Promise { + if (defaultApiLLM) { + await defaultApiLLM.dispose(); + defaultApiLLM = null; + } + await disposeDefaultLlamaCpp(); +} diff --git a/src/mcp.ts b/src/mcp.ts index 323f469..9cedcb7 100644 --- a/src/mcp.ts +++ b/src/mcp.ts @@ -24,7 +24,7 @@ import { } from "./store.js"; import type { Store, StructuredSubSearch } from "./store.js"; import { getCollection, getGlobalContext, getDefaultCollectionNames } from "./collections.js"; -import { disposeDefaultLlamaCpp } from "./llm.js"; +import { disposeDefaultLLM } from "./llm.js"; // ============================================================================= // Types for structured content @@ -717,7 +717,7 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole await transport.close(); httpServer.close(); store.close(); - await disposeDefaultLlamaCpp(); + await disposeDefaultLLM(); }; process.on("SIGTERM", async () => { diff --git a/src/qmd.ts b/src/qmd.ts index d57b7e8..ecfe492 100755 --- a/src/qmd.ts +++ b/src/qmd.ts @@ -70,7 +70,20 @@ import { createStore, getDefaultDbPath, } from "./store.js"; -import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; +import { + clearApiEmbeddingScope, + getVectorScopeGuardMessage, + setApiEmbeddingScopeFromCurrentEnv, +} from "./vector-scope-guard.js"; +import { disposeDefaultLLM, getDefaultLLM, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; +import { + DEFAULT_API_CHAT_BASE_URL, + DEFAULT_API_CHAT_MODEL, + DEFAULT_API_EMBED_BASE_URL, + DEFAULT_API_EMBED_MODEL, + DEFAULT_API_RERANK_BASE_URL, + DEFAULT_API_RERANK_MODEL, +} from "./api-defaults.js"; import { formatSearchResults, formatDocuments, @@ -384,43 +397,62 @@ async function showStatus(): Promise { // Models { - // hf:org/repo/file.gguf → https://huggingface.co/org/repo - const hfLink = (uri: string) => { - const match = uri.match(/^hf:([^/]+\/[^/]+)\//); - return match ? 
`https://huggingface.co/${match[1]}` : uri; - }; console.log(`\n${c.bold}Models${c.reset}`); - console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); - console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); - console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); + const backend = process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; + if (backend === "api") { + const embedBaseUrl = (process.env.QMD_EMBED_BASE_URL || DEFAULT_API_EMBED_BASE_URL).replace(/\/+$/, ""); + const embedModel = process.env.QMD_EMBED_MODEL || DEFAULT_API_EMBED_MODEL; + const chatBaseUrl = (process.env.QMD_CHAT_BASE_URL || DEFAULT_API_CHAT_BASE_URL).replace(/\/+$/, ""); + const chatModel = process.env.QMD_CHAT_MODEL || DEFAULT_API_CHAT_MODEL; + const rerankBaseUrl = (process.env.QMD_RERANK_BASE_URL || DEFAULT_API_RERANK_BASE_URL).replace(/\/+$/, ""); + const rerankModel = process.env.QMD_RERANK_MODEL || DEFAULT_API_RERANK_MODEL; + + console.log(` Embedding: ${embedModel} ${c.dim}(${embedBaseUrl})${c.reset}`); + console.log(` Chat: ${chatModel} ${c.dim}(${chatBaseUrl})${c.reset}`); + console.log(` Reranking: ${rerankModel} ${c.dim}(${rerankBaseUrl})${c.reset}`); + } else { + // hf:org/repo/file.gguf → https://huggingface.co/org/repo + const hfLink = (uri: string) => { + const match = uri.match(/^hf:([^/]+\/[^/]+)\//); + return match ? `https://huggingface.co/${match[1]}` : uri; + }; + console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); + console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); + console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); + } } // Device / GPU info try { - const llm = getDefaultLlamaCpp(); - const device = await llm.getDeviceInfo(); - console.log(`\n${c.bold}Device${c.reset}`); - if (device.gpu) { - console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`); - if (device.gpuDevices.length > 0) { - // Deduplicate and count GPUs - const counts = new Map(); - for (const name of device.gpuDevices) { - counts.set(name, (counts.get(name) || 0) + 1); + const llm = getDefaultLLM(); + if (llm instanceof LlamaCpp) { + const device = await llm.getDeviceInfo(); + console.log(`\n${c.bold}Device${c.reset}`); + if (device.gpu) { + console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`); + if (device.gpuDevices.length > 0) { + // Deduplicate and count GPUs + const counts = new Map(); + for (const name of device.gpuDevices) { + counts.set(name, (counts.get(name) || 0) + 1); + } + const deviceStr = Array.from(counts.entries()) + .map(([name, count]) => count > 1 ? `${count}× ${name}` : name) + .join(', '); + console.log(` Devices: ${deviceStr}`); } - const deviceStr = Array.from(counts.entries()) - .map(([name, count]) => count > 1 ? 
`${count}× ${name}` : name) - .join(', '); - console.log(` Devices: ${deviceStr}`); - } - if (device.vram) { - console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`); + if (device.vram) { + console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`); + } + } else { + console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`); + console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`); } + console.log(` CPU: ${device.cpuCores} math cores`); } else { - console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`); - console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`); + console.log(`\n${c.bold}Device${c.reset}`); + console.log(` ${c.dim}Backend is API mode; local device probe skipped.${c.reset}`); } - console.log(` CPU: ${device.cpuCores} math cores`); } catch { // Don't fail status if LLM init fails } @@ -1533,11 +1565,23 @@ function renderProgressBar(percent: number, width: number = 30): string { async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise { const db = getDb(); const now = new Date().toISOString(); + const backend = process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; + const isApiBackend = backend === "api"; + + if (!force) { + const guardMessage = getVectorScopeGuardMessage(db); + if (guardMessage) { + throw new Error(guardMessage); + } + } // If force, clear all vectors if (force) { console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`); clearAllEmbeddings(db); + if (!isApiBackend) { + clearApiEmbeddingScope(db); + } } // Find unique hashes that need embedding (from active documents) @@ -1615,6 +1659,9 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = throw new Error("Failed to get embedding dimensions from first chunk"); } ensureVecTable(db, firstResult.embedding.length); + if (isApiBackend) { + setApiEmbeddingScopeFromCurrentEnv(db); + } let chunksEmbedded = 0, errors = 0, bytesProcessed = 0; const startTime = Date.now(); @@ -2828,7 +2875,7 @@ if (fileURLToPath(import.meta.url) === process.argv[1] || process.argv[1]?.endsW } if (cli.command !== "mcp") { - await disposeDefaultLlamaCpp(); + await disposeDefaultLLM(); process.exit(0); } diff --git a/src/store.ts b/src/store.ts index ff08c2a..bee35ef 100644 --- a/src/store.ts +++ b/src/store.ts @@ -17,8 +17,7 @@ import picomatch from "picomatch"; import { createHash } from "crypto"; import { realpathSync, statSync, mkdirSync } from "node:fs"; import { - LlamaCpp, - getDefaultLlamaCpp, + getDefaultLLM, formatQueryForEmbedding, formatDocForEmbedding, type RerankDocument, @@ -38,6 +37,7 @@ import { loadConfig as collectionsLoadConfig, type NamedCollection, } from "./collections.js"; +import { getVectorScopeGuardMessage } from "./vector-scope-guard.js"; // ============================================================================= // Configuration @@ -675,6 +675,14 @@ function initializeDatabase(db: Database): void { ) `); + // API embedding scope metadata (used to guard mixed local/API vector usage). 
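+  // Keys currently stored: embed_base_url and embed_model (written by setApiEmbeddingScopeFromCurrentEnv in vector-scope-guard.ts).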
+ db.exec(` + CREATE TABLE IF NOT EXISTS api_meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL + ) + `); + // Content vectors const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[]; const hasSeqColumn = cvInfo.some(col => col.name === 'seq'); @@ -1427,7 +1435,7 @@ export async function chunkDocumentByTokens( overlapTokens: number = CHUNK_OVERLAP_TOKENS, windowTokens: number = CHUNK_WINDOW_TOKENS ): Promise<{ text: string; pos: number; tokens: number }[]> { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3) // If chunks exceed limit, they'll be re-split with actual ratio @@ -1437,13 +1445,23 @@ export async function chunkDocumentByTokens( const windowChars = windowTokens * avgCharsPerToken; // Chunk in character space with conservative estimate - let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars); + const charChunks = chunkDocument(content, maxChars, overlapChars, windowChars); + + // API backend doesn't expose tokenizer APIs; keep chunking approximate and avoid local model init. + if (!llm.canTokenize?.() || !llm.tokenize) { + return charChunks.map((chunk) => ({ + text: chunk.text, + pos: chunk.pos, + tokens: Math.max(1, Math.ceil(chunk.text.length / avgCharsPerToken)), + })); + } + const tokenize = llm.tokenize.bind(llm); // Tokenize and split any chunks that still exceed limit const results: { text: string; pos: number; tokens: number }[] = []; for (const chunk of charChunks) { - const tokens = await llm.tokenize(chunk.text); + const tokens = await tokenize(chunk.text); if (tokens.length <= maxTokens) { results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length }); @@ -1456,7 +1474,7 @@ export async function chunkDocumentByTokens( const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2)); for (const subChunk of subChunks) { - const subTokens = await llm.tokenize(subChunk.text); + const subTokens = await tokenize(subChunk.text); results.push({ text: subChunk.text, pos: chunk.pos + subChunk.pos, @@ -2140,6 +2158,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle // ============================================================================= export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise { + const guardMessage = getVectorScopeGuardMessage(db); + if (guardMessage) { + throw new Error(guardMessage); + } + const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); if (!tableExists) return []; @@ -2234,7 +2257,7 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi const formattedText = isQuery ? formatQueryForEmbedding(text) : formatDocForEmbedding(text); const result = session ? 
await session.embed(formattedText, { model, isQuery }) - : await getDefaultLlamaCpp().embed(formattedText, { model, isQuery }); + : await getDefaultLLM().embed(formattedText, { model, isQuery }); return result?.embedding || null; } @@ -2299,8 +2322,8 @@ export async function expandQuery(query: string, model: string = DEFAULT_QUERY_M } } - const llm = getDefaultLlamaCpp(); - // Note: LlamaCpp uses hardcoded model, model parameter is ignored + const llm = getDefaultLLM(); + // Note: current local backend uses a configured default model; `model` may be ignored. const results = await llm.expandQuery(query); // Map Queryable[] → ExpandedQuery[] (same shape, decoupled from llm.ts internals). @@ -2337,9 +2360,9 @@ export async function rerank(query: string, documents: { file: string; text: str } } - // Rerank uncached documents using LlamaCpp + // Rerank uncached documents using the configured LLM backend if (uncachedDocs.length > 0) { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); const rerankResult = await llm.rerank(query, uncachedDocs, { model }); // Cache results — use original doc.text for cache key (result.file lacks chunk text) @@ -2900,6 +2923,11 @@ export async function hybridQuery( query: string, options?: HybridQueryOptions ): Promise { + const guardMessage = getVectorScopeGuardMessage(store.db); + if (guardMessage) { + throw new Error(guardMessage); + } + const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0; const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT; @@ -2973,7 +3001,7 @@ export async function hybridQuery( } // Batch embed all vector queries in a single call - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); @@ -3113,6 +3141,11 @@ export async function vectorSearchQuery( query: string, options?: VectorSearchOptions ): Promise { + const guardMessage = getVectorScopeGuardMessage(store.db); + if (guardMessage) { + throw new Error(guardMessage); + } + const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0.3; const collection = options?.collection; @@ -3203,6 +3236,11 @@ export async function structuredSearch( searches: StructuredSubSearch[], options?: StructuredSearchOptions ): Promise { + const guardMessage = getVectorScopeGuardMessage(store.db); + if (guardMessage) { + throw new Error(guardMessage); + } + const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0; const candidateLimit = options?.candidateLimit ?? 
RERANK_CANDIDATE_LIMIT; @@ -3271,7 +3309,7 @@ export async function structuredSearch( if (hasVectors) { const vecSearches = searches.filter(s => s.type === 'vec' || s.type === 'hyde'); if (vecSearches.length > 0) { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); diff --git a/src/vector-scope-guard.ts b/src/vector-scope-guard.ts new file mode 100644 index 0000000..42ef5df --- /dev/null +++ b/src/vector-scope-guard.ts @@ -0,0 +1,124 @@ +import type { Database } from "./db.js"; +import { + DEFAULT_API_EMBED_BASE_URL, + DEFAULT_API_EMBED_MODEL, +} from "./api-defaults.js"; + +export type ApiEmbeddingScope = { + embedBaseUrl: string; + embedModel: string; +}; + +function getConfiguredBackend(): string { + return process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; +} + +function resolveCurrentApiEmbeddingScopeFromEnv(): ApiEmbeddingScope { + const embedBaseUrl = ( + process.env.QMD_EMBED_BASE_URL?.trim() + || DEFAULT_API_EMBED_BASE_URL + ).replace(/\/+$/, ""); + const embedModel = process.env.QMD_EMBED_MODEL?.trim() || DEFAULT_API_EMBED_MODEL; + return { embedBaseUrl, embedModel }; +} + +function getApiMetaValue(db: Database, key: string): string | null { + try { + const row = db.prepare(`SELECT value FROM api_meta WHERE key = ?`).get(key) as { value: string } | null; + return row?.value || null; + } catch { + // Older DBs or test fixtures may not include api_meta. + return null; + } +} + +function setApiMetaValue(db: Database, key: string, value: string): void { + db.prepare(`INSERT OR REPLACE INTO api_meta (key, value) VALUES (?, ?)`).run(key, value); +} + +function hasAnyVectors(db: Database): boolean { + const cvCount = db.prepare(`SELECT COUNT(*) as c FROM content_vectors`).get() as { c: number }; + if (cvCount.c > 0) return true; + + const tableExists = db.prepare(` + SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec' + `).get(); + if (!tableExists) return false; + + try { + const vvCount = db.prepare(`SELECT COUNT(*) as c FROM vectors_vec`).get() as { c: number }; + return vvCount.c > 0; + } catch { + // If vec table exists but count fails, treat as non-empty/unknown for safety. 
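+    // Returning true keeps the scope guard conservative instead of silently bypassing it.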
+ return true; + } +} + +function formatApiScope(scope: ApiEmbeddingScope): string { + return `${scope.embedBaseUrl} | ${scope.embedModel}`; +} + +export function getStoredApiEmbeddingScope(db: Database): ApiEmbeddingScope | null { + const embedBaseUrl = getApiMetaValue(db, "embed_base_url"); + const embedModel = getApiMetaValue(db, "embed_model"); + if (!embedBaseUrl || !embedModel) return null; + return { embedBaseUrl, embedModel }; +} + +export function setApiEmbeddingScopeFromCurrentEnv(db: Database): void { + const scope = resolveCurrentApiEmbeddingScopeFromEnv(); + setApiMetaValue(db, "embed_base_url", scope.embedBaseUrl); + setApiMetaValue(db, "embed_model", scope.embedModel); +} + +export function clearApiEmbeddingScope(db: Database): void { + db.exec(`DELETE FROM api_meta`); +} + +export function getVectorScopeGuardMessage(db: Database): string | null { + const backend = getConfiguredBackend(); + const storedScope = getStoredApiEmbeddingScope(db); + + if (backend === "local") { + if (!storedScope) return null; + return [ + "Index is marked for API embeddings, but current backend is local.", + `Stored API embedding scope: ${formatApiScope(storedScope)}`, + "Choose one:", + " 1) Set QMD_LLM_BACKEND=api with matching embedding settings", + " 2) Use a different index via --index", + " 3) Run 'qmd embed -f' to clear vectors and remove API scope metadata", + ].join("\n"); + } + + if (backend === "api") { + const currentScope = resolveCurrentApiEmbeddingScopeFromEnv(); + + if (!storedScope) { + if (!hasAnyVectors(db)) return null; + return [ + "This index has vectors but no API scope metadata (legacy/ambiguous state).", + "Choose one:", + " 1) Use a different index via --index", + " 2) Run 'qmd embed -f' to reset vectors for the current API embedding scope", + ].join("\n"); + } + + const isMatch = storedScope.embedBaseUrl === currentScope.embedBaseUrl + && storedScope.embedModel === currentScope.embedModel; + if (isMatch) return null; + + return [ + "API embedding scope mismatch for this index.", + `Stored scope (in index db): ${formatApiScope(storedScope)}`, + `Current scope (from environment): ${formatApiScope(currentScope)}`, + "Choose one:", + " 1) Revert API embedding settings to match the stored scope", + " 2) Use a different index via --index", + " 3) Run 'qmd embed -f' to reset vectors for the current API embedding scope", + ].join("\n"); + } + + // Unknown backend values are validated elsewhere; don't block here. 
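+  // getDefaultLLM() rejects unrecognized QMD_LLM_BACKEND values with an explicit error, so this path stays permissive.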
+ return null; +} diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts new file mode 100644 index 0000000..63f696d --- /dev/null +++ b/test/api.contract.test.ts @@ -0,0 +1,344 @@ +import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; +import { ApiLLM } from "../src/api.js"; +import { canUnloadLLM, withLLMSession } from "../src/llm.js"; + +describe("ApiLLM (contract)", () => { + const fetchMock = vi.fn(); + const originalFetch = globalThis.fetch; + const originalQmdEmbedApiKey = process.env.QMD_EMBED_API_KEY; + const originalQmdChatApiKey = process.env.QMD_CHAT_API_KEY; + const originalQmdChatModel = process.env.QMD_CHAT_MODEL; + const originalQmdRerankApiKey = process.env.QMD_RERANK_API_KEY; + const originalQmdLlmBackend = process.env.QMD_LLM_BACKEND; + + beforeEach(() => { + fetchMock.mockReset(); + (globalThis as { fetch: typeof fetch }).fetch = fetchMock as unknown as typeof fetch; + }); + + afterEach(() => { + (globalThis as { fetch: typeof fetch }).fetch = originalFetch; + process.env.QMD_EMBED_API_KEY = originalQmdEmbedApiKey; + process.env.QMD_CHAT_API_KEY = originalQmdChatApiKey; + process.env.QMD_CHAT_MODEL = originalQmdChatModel; + process.env.QMD_RERANK_API_KEY = originalQmdRerankApiKey; + process.env.QMD_LLM_BACKEND = originalQmdLlmBackend; + }); + + test("embed sends OpenAI-compatible /embeddings request, ignores per-call model override, and parses response", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + data: [{ embedding: [0.1, 0.2, 0.3] }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "test-key", + embedModel: "test-embed-model", + }); + + const result = await llm.embed("hello", { model: "override-embed-model" }); + + expect(result).not.toBeNull(); + expect(result?.embedding).toEqual([0.1, 0.2, 0.3]); + expect(result?.model).toBe("test-embed-model"); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://example.test/v1/embeddings"); + expect(init?.method).toBe("POST"); + expect(init?.headers).toEqual({ + "Content-Type": "application/json", + "Authorization": "Bearer test-key", + }); + expect(JSON.parse(String(init?.body))).toEqual({ + model: "test-embed-model", + input: ["hello"], + }); + }); + + test("embedBatch returns one result per input and null for missing vectors", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + data: [ + { embedding: [1, 2] }, + {}, + { embedding: [3, 4] }, + ], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "test-key", + embedModel: "test-embed-model", + }); + + const results = await llm.embedBatch(["a", "b", "c"]); + expect(results).toHaveLength(3); + expect(results[0]?.embedding).toEqual([1, 2]); + expect(results[1]).toBeNull(); + expect(results[2]?.embedding).toEqual([3, 4]); + }); + + test("embed throws and avoids fetch when API key is missing", async () => { + process.env.QMD_EMBED_API_KEY = ""; + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "", + embedModel: "test-embed-model", + }); + + await expect( + llm.embed("hello") + ).rejects.toThrow("missing API key"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + test("generate fails explicitly for API backend", async () => { + const llm 
= new ApiLLM({}); + + await expect( + llm.generate("hello") + ).rejects.toThrow("not implemented for API backend"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + test("rerank sends Cohere-compatible /rerank request, ignores per-call model override, and maps response by index", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + results: [ + { index: 1, relevance_score: 0.91 }, + { index: 0, relevance_score: 0.24 }, + ], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "embed-key", + rerankBaseUrl: "https://rerank.test/v1", + rerankApiKey: "rerank-key", + rerankModel: "rerank-v3.5", + }); + + const result = await llm.rerank( + "capital of france", + [ + { file: "a.md", text: "Berlin is the capital of Germany." }, + { file: "b.md", text: "Paris is the capital of France." }, + ], + { model: "override-rerank-model" } + ); + + expect(result.model).toBe("rerank-v3.5"); + expect(result.results).toEqual([ + { file: "b.md", score: 0.91, index: 1 }, + { file: "a.md", score: 0.24, index: 0 }, + ]); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://rerank.test/v1/rerank"); + expect(init?.method).toBe("POST"); + expect(init?.headers).toEqual({ + "Content-Type": "application/json", + "Authorization": "Bearer rerank-key", + }); + expect(JSON.parse(String(init?.body))).toEqual({ + model: "rerank-v3.5", + query: "capital of france", + documents: [ + "Berlin is the capital of Germany.", + "Paris is the capital of France.", + ], + top_n: 2, + }); + }); + + test("rerank sends Voyage-compatible top_k and accepts data response shape", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + data: [ + { index: 0, relevance_score: 0.12 }, + { index: 1, relevance_score: 0.95 }, + ], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + rerankBaseUrl: "https://api.voyageai.com/v1", + rerankApiKey: "voyage-key", + rerankModel: "rerank-2.5-lite", + }); + + const result = await llm.rerank( + "capital of france", + [ + { file: "a.md", text: "Berlin is the capital of Germany." }, + { file: "b.md", text: "Paris is the capital of France." 
}, + ] + ); + + expect(result.model).toBe("rerank-2.5-lite"); + expect(result.results).toEqual([ + { file: "b.md", score: 0.95, index: 1 }, + { file: "a.md", score: 0.12, index: 0 }, + ]); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://api.voyageai.com/v1/rerank"); + expect(JSON.parse(String(init?.body))).toEqual({ + model: "rerank-2.5-lite", + query: "capital of france", + documents: [ + "Berlin is the capital of Germany.", + "Paris is the capital of France.", + ], + top_k: 2, + }); + }); + + test("rerank throws and avoids fetch when rerank API key is missing", async () => { + process.env.QMD_EMBED_API_KEY = ""; + process.env.QMD_RERANK_API_KEY = ""; + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "", + rerankApiKey: "", + rerankModel: "rerank-v3.5", + }); + + await expect( + llm.rerank("q", [{ file: "doc.md", text: "t" }]) + ).rejects.toThrow("missing API key"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + test("expandQuery accepts line format output", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + choices: [{ + message: { + content: "lex: api auth docs\nvec: api authentication guide\nhyde: A guide to API authentication setup", + }, + }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + const result = await llm.expandQuery("api auth docs"); + expect(result).toEqual([ + { type: "lex", text: "api auth docs" }, + { type: "vec", text: "api authentication guide" }, + { type: "hyde", text: "A guide to API authentication setup" }, + ]); + + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://chat.example.test/v1/chat/completions"); + expect(init?.method).toBe("POST"); + expect(init?.headers).toEqual({ + "Content-Type": "application/json", + "Authorization": "Bearer chat-key", + }); + }); + + test("expandQuery throws and avoids fetch when chat API key is missing", async () => { + process.env.QMD_CHAT_API_KEY = ""; + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "", + chatModel: "gpt-4o-mini", + }); + + await expect( + llm.expandQuery("api auth docs") + ).rejects.toThrow("missing API key"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + test("expandQuery throws on chat request failure", async () => { + fetchMock.mockResolvedValue( + new Response("upstream error", { status: 503, statusText: "Service Unavailable" }) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + await expect( + llm.expandQuery("api auth docs") + ).rejects.toThrow("chat error: 503"); + }); + + test("expandQuery returns empty expansion set when output is not parseable line format", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + choices: [{ + message: { + content: JSON.stringify([ + { type: "lex", text: "api auth docs" }, + { type: "vec", text: "api authentication guide" }, + ]), + }, + }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {}); + const result = await 
llm.expandQuery("api auth docs"); + expect(result).toEqual([]); + expect(warnSpy).toHaveBeenCalledTimes(1); + warnSpy.mockRestore(); + }); + + test("withLLMSession does not acquire local unload lock when backend is api", async () => { + process.env.QMD_LLM_BACKEND = "api"; + + const unloadBefore = canUnloadLLM(); + expect(unloadBefore).toBe(true); + + await withLLMSession(async (session) => { + expect(session.isValid).toBe(true); + expect(canUnloadLLM()).toBe(true); + }, { maxDuration: 1000, name: "api-contract-session" }); + + expect(canUnloadLLM()).toBe(true); + }); +}); diff --git a/test/api.live.test.ts b/test/api.live.test.ts new file mode 100644 index 0000000..14786c5 --- /dev/null +++ b/test/api.live.test.ts @@ -0,0 +1,131 @@ +import { describe, expect, test } from "vitest"; +import { ApiLLM } from "../src/api.js"; + +/** + * Live API tests (provider-gated by env vars). + * Required keys: OPENAI_API_KEY, OPENROUTER_API_KEY, COHERE_API_KEY, VOYAGE_API_KEY. + * Tests for a provider are skipped when that provider key is not set. + */ +const embeddingProviders = [ + { + name: "OpenAI", + key: process.env.OPENAI_API_KEY || "", + baseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + embedModel: process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", + }, + { + name: "OpenRouter", + key: process.env.OPENROUTER_API_KEY || "", + baseUrl: process.env.OPENROUTER_BASE_URL || "https://openrouter.ai/api/v1", + embedModel: process.env.OPENROUTER_EMBED_MODEL || "openai/text-embedding-3-small", + }, + { + name: "Cohere", + key: process.env.COHERE_API_KEY || "", + baseUrl: process.env.COHERE_COMPAT_BASE_URL || "https://api.cohere.ai/compatibility/v1", + embedModel: process.env.COHERE_EMBED_MODEL || "embed-v4.0", + }, + { + name: "Voyage", + key: process.env.VOYAGE_API_KEY || "", + baseUrl: process.env.VOYAGE_BASE_URL || "https://api.voyageai.com/v1", + embedModel: process.env.VOYAGE_EMBED_MODEL || "voyage-3.5-lite", + }, +]; + +const chatProviders = [ + { + name: "OpenAI", + key: process.env.OPENAI_API_KEY || "", + baseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + chatModel: process.env.OPENAI_CHAT_MODEL || "gpt-4o-mini", + }, + { + name: "OpenRouter", + key: process.env.OPENROUTER_API_KEY || "", + baseUrl: process.env.OPENROUTER_BASE_URL || "https://openrouter.ai/api/v1", + chatModel: process.env.OPENROUTER_CHAT_MODEL || "openai/gpt-4o-mini", + }, + { + name: "Cohere", + key: process.env.COHERE_API_KEY || "", + baseUrl: process.env.COHERE_COMPAT_BASE_URL || "https://api.cohere.ai/compatibility/v1", + chatModel: process.env.COHERE_CHAT_MODEL || "command-a-03-2025", + }, +]; + +describe("ApiLLM Embeddings (live)", () => { + for (const provider of embeddingProviders) { + test.skipIf(!provider.key)(`${provider.name} /v1/embeddings returns a non-empty vector`, async () => { + const llm = new ApiLLM({ + embedBaseUrl: provider.baseUrl, + embedApiKey: provider.key, + embedModel: provider.embedModel, + }); + + const result = await llm.embed(`QMD embedding live test (${provider.name})`); + expect(result).not.toBeNull(); + expect(Array.isArray(result?.embedding)).toBe(true); + expect(result!.embedding.length).toBeGreaterThan(10); + expect(Number.isFinite(result!.embedding[0])).toBe(true); + }, 30000); + } +}); + +describe("ApiLLM Query Expansion (live)", () => { + for (const provider of chatProviders) { + test.skipIf(!provider.key)(`${provider.name} chat completions expands query with line output mode`, async () => { + const llm = new ApiLLM({ + chatBaseUrl: 
provider.baseUrl, + chatApiKey: provider.key, + chatModel: provider.chatModel, + }); + + const result = await llm.expandQuery("how to authenticate API requests"); + expect(result.length).toBeGreaterThanOrEqual(1); + for (const item of result) { + expect(["lex", "vec", "hyde"]).toContain(item.type); + expect(item.text.length).toBeGreaterThan(0); + } + }, 30000); + } +}); + +const rerankProviders = [ + { + name: "Cohere", + key: process.env.COHERE_API_KEY || "", + baseUrl: process.env.COHERE_BASE_URL || "https://api.cohere.com/v1", + rerankModel: process.env.COHERE_RERANK_MODEL || "rerank-v3.5", + }, + { + name: "Voyage", + key: process.env.VOYAGE_API_KEY || "", + baseUrl: process.env.VOYAGE_BASE_URL || "https://api.voyageai.com/v1", + rerankModel: process.env.VOYAGE_RERANK_MODEL || "rerank-2.5-lite", + }, +]; + +describe("ApiLLM Rerank (live)", () => { + for (const provider of rerankProviders) { + test.skipIf(!provider.key)(`${provider.name} /v1/rerank returns ranked documents with finite scores`, async () => { + const llm = new ApiLLM({ + rerankBaseUrl: provider.baseUrl, + rerankApiKey: provider.key, + rerankModel: provider.rerankModel, + }); + + const docs = [ + { file: "france.md", text: "Paris is the capital city of France." }, + { file: "pets.md", text: "Cats and dogs are common household pets." }, + { file: "germany.md", text: "Berlin is the capital city of Germany." }, + ]; + + const result = await llm.rerank("What is the capital of France?", docs); + expect(result.results.length).toBe(3); + expect(result.results[0]!.file).toBe("france.md"); + expect(Number.isFinite(result.results[0]!.score)).toBe(true); + expect(result.results[0]!.score).toBeGreaterThanOrEqual(result.results[1]!.score); + }, 30000); + } +}); diff --git a/test/store.helpers.unit.test.ts b/test/store.helpers.unit.test.ts index 3303187..868237d 100644 --- a/test/store.helpers.unit.test.ts +++ b/test/store.helpers.unit.test.ts @@ -15,6 +15,7 @@ import { normalizeDocid, isDocid, handelize, + chunkDocumentByTokens, } from "../src/store"; // ============================================================================= @@ -203,3 +204,25 @@ describe("handelize", () => { expect(isDocid("12345")).toBe(false); }); }); + +describe("Token Chunking Fallback", () => { + test("chunkDocumentByTokens uses char-based fallback when backend cannot tokenize", async () => { + const originalBackend = process.env.QMD_LLM_BACKEND; + process.env.QMD_LLM_BACKEND = "api"; + + try { + const content = "This is a document sentence. 
".repeat(400); + const chunks = await chunkDocumentByTokens(content, 120, 18, 40); + expect(chunks.length).toBeGreaterThan(1); + for (const chunk of chunks) { + expect(chunk.tokens).toBeGreaterThan(0); + } + } finally { + if (originalBackend === undefined) { + delete process.env.QMD_LLM_BACKEND; + } else { + process.env.QMD_LLM_BACKEND = originalBackend; + } + } + }); +}); diff --git a/test/store.scope-guard.unit.test.ts b/test/store.scope-guard.unit.test.ts new file mode 100644 index 0000000..56030de --- /dev/null +++ b/test/store.scope-guard.unit.test.ts @@ -0,0 +1,102 @@ +import { afterEach, beforeEach, describe, expect, test } from "vitest"; +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { createStore, type Store } from "../src/store.js"; +import { + clearApiEmbeddingScope, + getVectorScopeGuardMessage, + setApiEmbeddingScopeFromCurrentEnv, +} from "../src/vector-scope-guard.js"; + +describe("Vector scope guard (API metadata)", () => { + let testDir: string; + let store: Store; + + const originalBackend = process.env.QMD_LLM_BACKEND; + const originalEmbedBaseUrl = process.env.QMD_EMBED_BASE_URL; + const originalEmbedModel = process.env.QMD_EMBED_MODEL; + + beforeEach(async () => { + testDir = await mkdtemp(join(tmpdir(), "qmd-scope-guard-")); + store = createStore(join(testDir, "index.sqlite")); + + delete process.env.QMD_LLM_BACKEND; + delete process.env.QMD_EMBED_BASE_URL; + delete process.env.QMD_EMBED_MODEL; + }); + + afterEach(async () => { + store.close(); + await rm(testDir, { recursive: true, force: true }); + + if (originalBackend === undefined) delete process.env.QMD_LLM_BACKEND; + else process.env.QMD_LLM_BACKEND = originalBackend; + + if (originalEmbedBaseUrl === undefined) delete process.env.QMD_EMBED_BASE_URL; + else process.env.QMD_EMBED_BASE_URL = originalEmbedBaseUrl; + + if (originalEmbedModel === undefined) delete process.env.QMD_EMBED_MODEL; + else process.env.QMD_EMBED_MODEL = originalEmbedModel; + }); + + test("local backend with no api metadata does not block vector paths", () => { + process.env.QMD_LLM_BACKEND = "local"; + const message = getVectorScopeGuardMessage(store.db); + expect(message).toBeNull(); + }); + + test("local backend blocks when api metadata exists", () => { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + process.env.QMD_LLM_BACKEND = "local"; + const message = getVectorScopeGuardMessage(store.db); + expect(message).toContain("current backend is local"); + expect(message).toContain("qmd embed -f"); + }); + + test("api backend blocks legacy vectors when api metadata is missing", () => { + process.env.QMD_LLM_BACKEND = "api"; + clearApiEmbeddingScope(store.db); + + store.ensureVecTable(3); + store.insertEmbedding( + "hash-1", + 0, + 0, + new Float32Array([0.1, 0.2, 0.3]), + "legacy-model", + new Date().toISOString() + ); + + const message = getVectorScopeGuardMessage(store.db); + expect(message).toContain("legacy/ambiguous"); + expect(message).toContain("qmd embed -f"); + }); + + test("api backend allows matching stored scope", () => { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + const message = getVectorScopeGuardMessage(store.db); + 
expect(message).toBeNull(); + }); + + test("api backend blocks mismatched stored scope", () => { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + process.env.QMD_EMBED_MODEL = "text-embedding-3-large"; + const message = getVectorScopeGuardMessage(store.db); + expect(message).toContain("scope mismatch"); + expect(message).toContain("Stored scope"); + expect(message).toContain("Current scope"); + }); +}); diff --git a/test/structured-search.test.ts b/test/structured-search.test.ts index 0a4c8c6..e60dcd3 100644 --- a/test/structured-search.test.ts +++ b/test/structured-search.test.ts @@ -20,6 +20,10 @@ import { type Store, } from "../src/store.js"; import { disposeDefaultLlamaCpp } from "../src/llm.js"; +import { + clearApiEmbeddingScope, + setApiEmbeddingScopeFromCurrentEnv, +} from "../src/vector-scope-guard.js"; // ============================================================================= // parseStructuredQuery Tests (CLI Parser) @@ -317,6 +321,34 @@ describe("structuredSearch", () => { expect(r.score).toBeGreaterThanOrEqual(0.5); } }); + + test("applies API scope guard on structured query path", async () => { + const originalBackend = process.env.QMD_LLM_BACKEND; + const originalEmbedBaseUrl = process.env.QMD_EMBED_BASE_URL; + const originalEmbedModel = process.env.QMD_EMBED_MODEL; + + try { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + process.env.QMD_LLM_BACKEND = "local"; + await expect(structuredSearch(store, [{ type: "lex", query: "test" }])) + .rejects.toThrow("current backend is local"); + } finally { + clearApiEmbeddingScope(store.db); + + if (originalBackend === undefined) delete process.env.QMD_LLM_BACKEND; + else process.env.QMD_LLM_BACKEND = originalBackend; + + if (originalEmbedBaseUrl === undefined) delete process.env.QMD_EMBED_BASE_URL; + else process.env.QMD_EMBED_BASE_URL = originalEmbedBaseUrl; + + if (originalEmbedModel === undefined) delete process.env.QMD_EMBED_MODEL; + else process.env.QMD_EMBED_MODEL = originalEmbedModel; + } + }); }); // =============================================================================