diff --git a/src/api-defaults.ts b/src/api-defaults.ts new file mode 100644 index 0000000..31925db --- /dev/null +++ b/src/api-defaults.ts @@ -0,0 +1,6 @@ +export const DEFAULT_API_EMBED_BASE_URL = "https://api.openai.com/v1"; +export const DEFAULT_API_EMBED_MODEL = "text-embedding-3-small"; +export const DEFAULT_API_CHAT_BASE_URL = "https://api.openai.com/v1"; +export const DEFAULT_API_CHAT_MODEL = "gpt-4o-mini"; +export const DEFAULT_API_RERANK_BASE_URL = "https://api.cohere.com/v1"; +export const DEFAULT_API_RERANK_MODEL = "rerank-v3.5"; diff --git a/src/api.ts b/src/api.ts new file mode 100644 index 0000000..4603be1 --- /dev/null +++ b/src/api.ts @@ -0,0 +1,389 @@ +/** + * api.ts - API-backed LLM implementation (incremental rollout) + * + * Current phase: embeddings (/v1/embeddings), query expansion (/v1/chat/completions), + * and rerank (/v1/rerank). + * Query expansion currently prompts model for line-format output ("lex|vec|hyde: ..."), + * but does not use constrained output. Possibly upgrade to structured output. + * This path works in current provider-gated tests but is not extensively battle-tested yet. + * Text generation is intentionally unsupported in this backend for now. + */ + +import type { + LLM, + EmbedOptions, + EmbeddingResult, + GenerateOptions, + GenerateResult, + ModelInfo, + QueryType, + Queryable, + RerankDocument, + RerankOptions, + RerankResult, +} from "./llm.js"; +import { + DEFAULT_API_CHAT_BASE_URL, + DEFAULT_API_CHAT_MODEL, + DEFAULT_API_EMBED_BASE_URL, + DEFAULT_API_EMBED_MODEL, + DEFAULT_API_RERANK_BASE_URL, + DEFAULT_API_RERANK_MODEL, +} from "./api-defaults.js"; + +type OpenAIEmbeddingResponse = { + data?: Array<{ embedding?: number[] }>; +}; + +type RerankResponse = { + results?: Array<{ index?: number; relevance_score?: number }>; + data?: Array<{ index?: number; relevance_score?: number }>; +}; + +type OpenAIChatResponse = { + choices?: Array<{ + message?: { + content?: string | Array<{ type?: string; text?: string }>; + }; + }>; +}; + +export type ApiLLMConfig = { + embedBaseUrl?: string; + embedApiKey?: string; + embedModel?: string; + chatBaseUrl?: string; + chatApiKey?: string; + chatModel?: string; + rerankBaseUrl?: string; + rerankApiKey?: string; + rerankModel?: string; +}; + +/** + * API-backed LLM implementation. + * Embeddings/query-expansion/reranking are remote; text generation is unsupported. 
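+ *
+ * Minimal usage sketch (the key values are illustrative placeholders, not real credentials):
+ *   const llm = new ApiLLM({ embedApiKey: "sk-example", rerankApiKey: "rk-example" });
+ *   const emb = await llm.embed("hello world");                  // EmbeddingResult | null
+ *   const ranked = await llm.rerank("capital of france", [{ file: "a.md", text: "Paris ..." }]);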
+ */ +export class ApiLLM implements LLM { + private readonly embedBaseUrl: string; + private readonly embedApiKey: string; + private readonly embedModel: string; + private readonly chatBaseUrl: string; + private readonly chatApiKey: string; + private readonly chatModel: string; + private readonly rerankBaseUrl: string; + private readonly rerankApiKey: string; + private readonly rerankModel: string; + + constructor(config: ApiLLMConfig = {}) { + // Embedding API config + this.embedBaseUrl = ( + config.embedBaseUrl + || process.env.QMD_EMBED_BASE_URL + || DEFAULT_API_EMBED_BASE_URL + ).replace(/\/+$/, ""); + this.embedApiKey = + config.embedApiKey + || process.env.QMD_EMBED_API_KEY + || ""; + this.embedModel = + config.embedModel + || process.env.QMD_EMBED_MODEL + || DEFAULT_API_EMBED_MODEL; + // Chat API config + this.chatBaseUrl = ( + config.chatBaseUrl + || process.env.QMD_CHAT_BASE_URL + || DEFAULT_API_CHAT_BASE_URL + ).replace(/\/+$/, ""); + this.chatApiKey = + config.chatApiKey + || process.env.QMD_CHAT_API_KEY + || ""; + this.chatModel = + config.chatModel + || process.env.QMD_CHAT_MODEL + || DEFAULT_API_CHAT_MODEL; + // Rerank API config + this.rerankBaseUrl = ( + config.rerankBaseUrl + || process.env.QMD_RERANK_BASE_URL + || DEFAULT_API_RERANK_BASE_URL + ).replace(/\/+$/, ""); + this.rerankApiKey = + config.rerankApiKey + || process.env.QMD_RERANK_API_KEY + || ""; + this.rerankModel = + config.rerankModel + || process.env.QMD_RERANK_MODEL + || DEFAULT_API_RERANK_MODEL; + } + + private getHeaders(apiKey: string): Record { + return { + "Content-Type": "application/json", + "Authorization": `Bearer ${apiKey}`, + }; + } + + private usesVoyageRerankApi(): boolean { + // Voyage uses different result shape, if we support more providers maybe add env var selector + try { + const hostname = new URL(this.rerankBaseUrl).hostname.toLowerCase(); + return hostname === "api.voyageai.com" || hostname.endsWith(".voyageai.com"); + } catch { + return this.rerankBaseUrl.toLowerCase().includes("voyageai.com"); + } + } + + private extractChatContent(response: OpenAIChatResponse): string { + const content = response.choices?.[0]?.message?.content; + if (typeof content === "string") return content; + if (Array.isArray(content)) { + return content + .filter(part => part.type === "text" && typeof part.text === "string") + .map(part => part.text as string) + .join("\n"); + } + return ""; + } + + private parseExpandedQueries(content: string): Queryable[] { + const trimmed = content.trim(); + if (!trimmed) return []; + + // Line format: "lex: ...", "vec: ...", "hyde: ..." 
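+    // Example: "- vec: api authentication guide" → { type: "vec", text: "api authentication guide" }
+    // Leading bullets, numbering, and stray whitespace are tolerated by the regex below.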
+ const fromLines = trimmed + .split("\n") + .map(line => line.trim()) + .filter(Boolean) + .map(line => { + const match = line.match(/^(?:[-*•\d\.\)\s]*)?(lex|vec|hyde)\s*:\s*(.+)$/i); + if (!match) return null; + const type = match[1]!.toLowerCase() as QueryType; + const text = match[2]!.trim(); + if (!text) return null; + return { type, text }; + }) + .filter((q): q is Queryable => q !== null); + + return fromLines; + } + + canTokenize(): boolean { + return false; + } + + private async requestChatCompletions( + messages: Array<{ role: "system" | "user"; content: string }> + ): Promise { + if (!this.chatApiKey) { + throw new Error("ApiLLM chat error: missing API key (set QMD_CHAT_API_KEY)"); + } + const payload: Record = { + model: this.chatModel, + messages, + temperature: 0.2, + }; + + const resp = await fetch(`${this.chatBaseUrl}/chat/completions`, { + method: "POST", + headers: this.getHeaders(this.chatApiKey), + body: JSON.stringify(payload), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + throw new Error(`ApiLLM chat error: ${resp.status} ${resp.statusText} ${body}`.trim()); + } + + const response = await resp.json() as OpenAIChatResponse; + const content = this.extractChatContent(response); + return content; + } + + private async requestEmbeddings(texts: string[]): Promise { + if (!this.embedApiKey) { + throw new Error("ApiLLM embedding error: missing API key (set QMD_EMBED_API_KEY)"); + } + + try { + const resp = await fetch(`${this.embedBaseUrl}/embeddings`, { + method: "POST", + headers: this.getHeaders(this.embedApiKey), + body: JSON.stringify({ + model: this.embedModel, + input: texts, + }), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + console.error(`ApiLLM embedding error: ${resp.status} ${resp.statusText} ${body}`.trim()); + return null; + } + return await resp.json() as OpenAIEmbeddingResponse; + } catch (error) { + // Local backend seems to return null, so we do as well to keep consistent + console.error("ApiLLM embedding error:", error); + return null; + } + } + + async embed(text: string, options: EmbedOptions = {}): Promise { + void options; // Seems to be used for model override in local backend, ignoring here + const response = await this.requestEmbeddings([text]); + const vector = response?.data?.[0]?.embedding; + if (!vector || !Array.isArray(vector)) return null; + + return { + embedding: vector, + model: this.embedModel, + }; + } + + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + if (texts.length === 0) return []; + + const response = await this.requestEmbeddings(texts); + if (!response?.data || !Array.isArray(response.data)) { + return texts.map(() => null); + } + + // Keep output index-aligned with inputs; missing/invalid embeddings become null. + const results: (EmbeddingResult | null)[] = []; + for (let i = 0; i < texts.length; i++) { + const vector = response.data[i]?.embedding; + if (!vector || !Array.isArray(vector)) { + results.push(null); + } else { + results.push({ + embedding: vector, + model: this.embedModel, + }); + } + } + return results; + } + + async generate(prompt: string, options: GenerateOptions = {}): Promise { + void prompt; + void options; + // generate() doesn't seem to be called from anywhere in the codebase, so we just throw for now + throw new Error("ApiLLM generate is not implemented for API backend (use QMD_LLM_BACKEND=local)"); + } + + async modelExists(model: string): Promise { + // Used only in local backend tests? 
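+    // The API backend has no model registry to query, so any requested model is reported as existing.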
+ return { name: model, exists: true }; + } + + async expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise { + const includeLexical = options?.includeLexical ?? true; + const searchScope = includeLexical ? "lexical and semantic" : "semantic"; + const allowedTypes = includeLexical ? "lex, vec, or hyde" : "vec or hyde"; + const allowedTypesList = includeLexical ? "lex, vec, hyde" : "vec, hyde"; + const lexicalInstruction = includeLexical + ? "Include at least one lex query." + : "Do not include any lex queries."; + + const systemPrompt = [ + "You expand search queries for hybrid retrieval.", + `Produce useful variations for ${searchScope} search.`, + `Return one query per line in format: type: text, where type is ${allowedTypes}.`, + ].join(" "); + + const userPrompt = [ + `Original query: ${query}`, + options?.context ? `Context: ${options.context}` : "", + lexicalInstruction, + "Return 2-4 total items. Keep each text concise and relevant.", + `Allowed types: ${allowedTypesList}.`, + ].filter(Boolean).join("\n"); + + const content = await this.requestChatCompletions([ + { role: "system", content: systemPrompt }, + { role: "user", content: userPrompt }, + ]); + + if (!content.trim()) { + return []; + } + + const parsed = this.parseExpandedQueries(content); + const filteredByLex = includeLexical ? parsed : parsed.filter(q => q.type !== "lex"); + const deduped = Array.from(new Map( + filteredByLex + .map(q => ({ ...q, text: q.text.trim() })) + .filter(q => q.text.length > 0) + .map(q => [`${q.type}|${q.text.toLowerCase()}`, q] as const) + ).values()); + + if (deduped.length > 0) { + return deduped; + } + console.warn("ApiLLM expandQuery warning: no valid expansions produced; returning empty expansion set"); + return []; + } + + async rerank(query: string, documents: RerankDocument[], options: RerankOptions = {}): Promise { + void options; // Seems used for model override in local backend, ignoring here + if (!this.rerankApiKey) { + throw new Error("ApiLLM rerank error: missing API key (set QMD_RERANK_API_KEY)"); + } + if (documents.length === 0) { + return { results: [], model: this.rerankModel }; + } + + const model = this.rerankModel; + + const topCountField = this.usesVoyageRerankApi() ? "top_k" : "top_n"; + const resp = await fetch(`${this.rerankBaseUrl}/rerank`, { + method: "POST", + headers: this.getHeaders(this.rerankApiKey), + body: JSON.stringify({ + model, + query, + documents: documents.map((doc) => doc.text), + [topCountField]: documents.length, + }), + }); + if (!resp.ok) { + const body = await resp.text().catch(() => ""); + throw new Error(`ApiLLM rerank error: ${resp.status} ${resp.statusText} ${body}`.trim()); + } + const response = await resp.json() as RerankResponse; + + const responseResults = Array.isArray(response.results) + ? response.results + : Array.isArray(response.data) + ? response.data + : null; + + if (!Array.isArray(responseResults)) { + throw new Error("ApiLLM rerank error: invalid response (missing results/data array)"); + } + + const scoreByIndex = new Map(); + for (const item of responseResults) { + if (typeof item.index !== "number" || typeof item.relevance_score !== "number") continue; + scoreByIndex.set(item.index, item.relevance_score); + } + + const results = documents + .map((doc, index) => ({ + file: doc.file, + score: scoreByIndex.get(index) ?? 
0, + index, + })) + .sort((a, b) => b.score - a.score); + + return { + results, + model, + }; + } + + async dispose(): Promise { + // No API client resources to dispose in this implementation. + } +} diff --git a/src/llm-session.ts b/src/llm-session.ts new file mode 100644 index 0000000..bb8b3ea --- /dev/null +++ b/src/llm-session.ts @@ -0,0 +1,114 @@ +import type { + LLM, + EmbedOptions, + EmbeddingResult, + ILLMSession, + LLMSessionOptions, + Queryable, + RerankDocument, + RerankOptions, + RerankResult, +} from "./llm.js"; + +/** + * Scoped session wrapper for non-local backends. + * Enforces release/abort semantics but delegates operations directly to the backend. + */ +export class PassthroughLLMSession implements ILLMSession { + private llm: LLM; + private released = false; + private abortController: AbortController; + private maxDurationTimer: ReturnType | null = null; + private name: string; + private createReleasedError: (message?: string) => Error; + + constructor( + llm: LLM, + options: LLMSessionOptions = {}, + createReleasedError: (message?: string) => Error = (message) => + new Error(message || "LLM session has been released or aborted") + ) { + this.llm = llm; + this.name = options.name || "unnamed"; + this.abortController = new AbortController(); + this.createReleasedError = createReleasedError; + + // Link external abort signal if provided + if (options.signal) { + if (options.signal.aborted) { + this.abortController.abort(options.signal.reason); + } else { + options.signal.addEventListener("abort", () => { + this.abortController.abort(options.signal!.reason); + }, { once: true }); + } + } + + // Set up max duration timer + const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes + if (maxDuration > 0) { + this.maxDurationTimer = setTimeout(() => { + this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`)); + }, maxDuration); + this.maxDurationTimer.unref(); // Don't keep process alive + } + } + + get isValid(): boolean { + return !this.released && !this.abortController.signal.aborted; + } + + get signal(): AbortSignal { + return this.abortController.signal; + } + + release(): void { + if (this.released) return; + this.released = true; + + if (this.maxDurationTimer) { + clearTimeout(this.maxDurationTimer); + this.maxDurationTimer = null; + } + + this.abortController.abort(new Error("Session released")); + } + + private async withOperation(fn: () => Promise): Promise { + if (!this.isValid) { + throw this.createReleasedError(); + } + + if (this.abortController.signal.aborted) { + throw this.createReleasedError( + this.abortController.signal.reason?.message || "Session aborted" + ); + } + + return await fn(); + } + + async embed(text: string, options?: EmbedOptions): Promise { + return this.withOperation(() => this.llm.embed(text, options)); + } + + async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> { + return this.withOperation(() => this.llm.embedBatch(texts)); + } + + async expandQuery( + query: string, + options?: { context?: string; includeLexical?: boolean } + ): Promise { + return this.withOperation(() => this.llm.expandQuery(query, options)); + } + + async rerank( + query: string, + documents: RerankDocument[], + options?: RerankOptions + ): Promise { + return this.withOperation(() => this.llm.rerank(query, documents, options)); + } +} + diff --git a/src/llm.ts b/src/llm.ts index 46c6295..233d565 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -18,6 +18,8 @@ import { import { 
homedir } from "os"; import { join } from "path"; import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs"; +import { ApiLLM } from "./api.js"; +import { PassthroughLLMSession } from "./llm-session.js"; // ============================================================================= // Embedding Formatting Functions @@ -298,6 +300,11 @@ export interface LLM { */ embed(text: string, options?: EmbedOptions): Promise; + /** + * Get embeddings for multiple texts + */ + embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>; + /** * Generate text completion */ @@ -320,6 +327,18 @@ export interface LLM { */ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise; + /** + * Whether this backend supports tokenizer access. + * API backends may return false and omit tokenize(). + */ + canTokenize?(): boolean; + + /** + * Tokenize text when tokenizer access is available. + * API backend doesn't currently expose tokenization. + */ + tokenize?(text: string): Promise; + /** * Dispose of resources */ @@ -765,6 +784,10 @@ export class LlamaCpp implements LLM { // Tokenization // ========================================================================== + canTokenize(): boolean { + return true; + } + /** * Tokenize text using the embedding model's tokenizer * Returns tokenizer tokens (opaque type from node-llama-cpp) @@ -1324,8 +1347,7 @@ let defaultSessionManager: LLMSessionManager | null = null; /** * Get the session manager for the default LlamaCpp instance. */ -function getSessionManager(): LLMSessionManager { - const llm = getDefaultLlamaCpp(); +function getSessionManager(llm: LlamaCpp = getDefaultLlamaCpp()): LLMSessionManager { if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) { defaultSessionManager = new LLMSessionManager(llm); } @@ -1350,13 +1372,27 @@ export async function withLLMSession( fn: (session: ILLMSession) => Promise, options?: LLMSessionOptions ): Promise { - const manager = getSessionManager(); - const session = new LLMSession(manager, options); + const llm = getDefaultLLM(); - try { - return await fn(session); - } finally { - session.release(); + if (llm instanceof LlamaCpp) { + const manager = getSessionManager(llm); + const session = new LLMSession(manager, options); + try { + return await fn(session); + } finally { + session.release(); + } + } else { + const session = new PassthroughLLMSession( + llm, + options, + (message?: string) => new SessionReleasedError(message) + ); + try { + return await fn(session); + } finally { + session.release(); + } } } @@ -1374,6 +1410,7 @@ export function canUnloadLLM(): boolean { // ============================================================================= let defaultLlamaCpp: LlamaCpp | null = null; +let defaultApiLLM: ApiLLM | null = null; /** * Get the default LlamaCpp instance (creates one if needed) @@ -1385,11 +1422,35 @@ export function getDefaultLlamaCpp(): LlamaCpp { return defaultLlamaCpp; } +/** + * Get the default LLM backend instance. + * Selects local or API backend based on QMD_LLM_BACKEND. + */ +export function getDefaultLLM(): LLM { + const backend = process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; + if (backend === "local") { + return getDefaultLlamaCpp(); + } + + if (backend === "api") { + if (!defaultApiLLM) { + defaultApiLLM = new ApiLLM(); + } + return defaultApiLLM; + } + + throw new Error( + `Invalid QMD_LLM_BACKEND="${process.env.QMD_LLM_BACKEND}". 
Expected "local" or "api".` + ); +} + /** * Set a custom default LlamaCpp instance (useful for testing) */ export function setDefaultLlamaCpp(llm: LlamaCpp | null): void { defaultLlamaCpp = llm; + // Function appears unused - clearing defaultApiLLM probably right thing to do anyway? + defaultApiLLM = null; } /** @@ -1402,3 +1463,15 @@ export async function disposeDefaultLlamaCpp(): Promise { defaultLlamaCpp = null; } } + +/** + * Dispose the default LLM backend instance. + * Currently aliases LlamaCpp disposal. + */ +export async function disposeDefaultLLM(): Promise { + if (defaultApiLLM) { + await defaultApiLLM.dispose(); + defaultApiLLM = null; + } + await disposeDefaultLlamaCpp(); +} diff --git a/src/mcp.ts b/src/mcp.ts index 323f469..9cedcb7 100644 --- a/src/mcp.ts +++ b/src/mcp.ts @@ -24,7 +24,7 @@ import { } from "./store.js"; import type { Store, StructuredSubSearch } from "./store.js"; import { getCollection, getGlobalContext, getDefaultCollectionNames } from "./collections.js"; -import { disposeDefaultLlamaCpp } from "./llm.js"; +import { disposeDefaultLLM } from "./llm.js"; // ============================================================================= // Types for structured content @@ -717,7 +717,7 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole await transport.close(); httpServer.close(); store.close(); - await disposeDefaultLlamaCpp(); + await disposeDefaultLLM(); }; process.on("SIGTERM", async () => { diff --git a/src/qmd.ts b/src/qmd.ts index d57b7e8..ecfe492 100755 --- a/src/qmd.ts +++ b/src/qmd.ts @@ -70,7 +70,20 @@ import { createStore, getDefaultDbPath, } from "./store.js"; -import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; +import { + clearApiEmbeddingScope, + getVectorScopeGuardMessage, + setApiEmbeddingScopeFromCurrentEnv, +} from "./vector-scope-guard.js"; +import { disposeDefaultLLM, getDefaultLLM, LlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; +import { + DEFAULT_API_CHAT_BASE_URL, + DEFAULT_API_CHAT_MODEL, + DEFAULT_API_EMBED_BASE_URL, + DEFAULT_API_EMBED_MODEL, + DEFAULT_API_RERANK_BASE_URL, + DEFAULT_API_RERANK_MODEL, +} from "./api-defaults.js"; import { formatSearchResults, formatDocuments, @@ -384,43 +397,62 @@ async function showStatus(): Promise { // Models { - // hf:org/repo/file.gguf → https://huggingface.co/org/repo - const hfLink = (uri: string) => { - const match = uri.match(/^hf:([^/]+\/[^/]+)\//); - return match ? 
`https://huggingface.co/${match[1]}` : uri; - }; console.log(`\n${c.bold}Models${c.reset}`); - console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); - console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); - console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); + const backend = process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; + if (backend === "api") { + const embedBaseUrl = (process.env.QMD_EMBED_BASE_URL || DEFAULT_API_EMBED_BASE_URL).replace(/\/+$/, ""); + const embedModel = process.env.QMD_EMBED_MODEL || DEFAULT_API_EMBED_MODEL; + const chatBaseUrl = (process.env.QMD_CHAT_BASE_URL || DEFAULT_API_CHAT_BASE_URL).replace(/\/+$/, ""); + const chatModel = process.env.QMD_CHAT_MODEL || DEFAULT_API_CHAT_MODEL; + const rerankBaseUrl = (process.env.QMD_RERANK_BASE_URL || DEFAULT_API_RERANK_BASE_URL).replace(/\/+$/, ""); + const rerankModel = process.env.QMD_RERANK_MODEL || DEFAULT_API_RERANK_MODEL; + + console.log(` Embedding: ${embedModel} ${c.dim}(${embedBaseUrl})${c.reset}`); + console.log(` Chat: ${chatModel} ${c.dim}(${chatBaseUrl})${c.reset}`); + console.log(` Reranking: ${rerankModel} ${c.dim}(${rerankBaseUrl})${c.reset}`); + } else { + // hf:org/repo/file.gguf → https://huggingface.co/org/repo + const hfLink = (uri: string) => { + const match = uri.match(/^hf:([^/]+\/[^/]+)\//); + return match ? `https://huggingface.co/${match[1]}` : uri; + }; + console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); + console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); + console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); + } } // Device / GPU info try { - const llm = getDefaultLlamaCpp(); - const device = await llm.getDeviceInfo(); - console.log(`\n${c.bold}Device${c.reset}`); - if (device.gpu) { - console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`); - if (device.gpuDevices.length > 0) { - // Deduplicate and count GPUs - const counts = new Map(); - for (const name of device.gpuDevices) { - counts.set(name, (counts.get(name) || 0) + 1); + const llm = getDefaultLLM(); + if (llm instanceof LlamaCpp) { + const device = await llm.getDeviceInfo(); + console.log(`\n${c.bold}Device${c.reset}`); + if (device.gpu) { + console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`); + if (device.gpuDevices.length > 0) { + // Deduplicate and count GPUs + const counts = new Map(); + for (const name of device.gpuDevices) { + counts.set(name, (counts.get(name) || 0) + 1); + } + const deviceStr = Array.from(counts.entries()) + .map(([name, count]) => count > 1 ? `${count}× ${name}` : name) + .join(', '); + console.log(` Devices: ${deviceStr}`); } - const deviceStr = Array.from(counts.entries()) - .map(([name, count]) => count > 1 ? 
`${count}× ${name}` : name) - .join(', '); - console.log(` Devices: ${deviceStr}`); - } - if (device.vram) { - console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`); + if (device.vram) { + console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`); + } + } else { + console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`); + console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`); } + console.log(` CPU: ${device.cpuCores} math cores`); } else { - console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`); - console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`); + console.log(`\n${c.bold}Device${c.reset}`); + console.log(` ${c.dim}Backend is API mode; local device probe skipped.${c.reset}`); } - console.log(` CPU: ${device.cpuCores} math cores`); } catch { // Don't fail status if LLM init fails } @@ -1533,11 +1565,23 @@ function renderProgressBar(percent: number, width: number = 30): string { async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise { const db = getDb(); const now = new Date().toISOString(); + const backend = process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; + const isApiBackend = backend === "api"; + + if (!force) { + const guardMessage = getVectorScopeGuardMessage(db); + if (guardMessage) { + throw new Error(guardMessage); + } + } // If force, clear all vectors if (force) { console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`); clearAllEmbeddings(db); + if (!isApiBackend) { + clearApiEmbeddingScope(db); + } } // Find unique hashes that need embedding (from active documents) @@ -1615,6 +1659,9 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = throw new Error("Failed to get embedding dimensions from first chunk"); } ensureVecTable(db, firstResult.embedding.length); + if (isApiBackend) { + setApiEmbeddingScopeFromCurrentEnv(db); + } let chunksEmbedded = 0, errors = 0, bytesProcessed = 0; const startTime = Date.now(); @@ -2828,7 +2875,7 @@ if (fileURLToPath(import.meta.url) === process.argv[1] || process.argv[1]?.endsW } if (cli.command !== "mcp") { - await disposeDefaultLlamaCpp(); + await disposeDefaultLLM(); process.exit(0); } diff --git a/src/store.ts b/src/store.ts index ff08c2a..bee35ef 100644 --- a/src/store.ts +++ b/src/store.ts @@ -17,8 +17,7 @@ import picomatch from "picomatch"; import { createHash } from "crypto"; import { realpathSync, statSync, mkdirSync } from "node:fs"; import { - LlamaCpp, - getDefaultLlamaCpp, + getDefaultLLM, formatQueryForEmbedding, formatDocForEmbedding, type RerankDocument, @@ -38,6 +37,7 @@ import { loadConfig as collectionsLoadConfig, type NamedCollection, } from "./collections.js"; +import { getVectorScopeGuardMessage } from "./vector-scope-guard.js"; // ============================================================================= // Configuration @@ -675,6 +675,14 @@ function initializeDatabase(db: Database): void { ) `); + // API embedding scope metadata (used to guard mixed local/API vector usage). 
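+  // Keys currently stored: embed_base_url and embed_model (written by setApiEmbeddingScopeFromCurrentEnv in vector-scope-guard.ts).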
+ db.exec(` + CREATE TABLE IF NOT EXISTS api_meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL + ) + `); + // Content vectors const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[]; const hasSeqColumn = cvInfo.some(col => col.name === 'seq'); @@ -1427,7 +1435,7 @@ export async function chunkDocumentByTokens( overlapTokens: number = CHUNK_OVERLAP_TOKENS, windowTokens: number = CHUNK_WINDOW_TOKENS ): Promise<{ text: string; pos: number; tokens: number }[]> { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3) // If chunks exceed limit, they'll be re-split with actual ratio @@ -1437,13 +1445,23 @@ export async function chunkDocumentByTokens( const windowChars = windowTokens * avgCharsPerToken; // Chunk in character space with conservative estimate - let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars); + const charChunks = chunkDocument(content, maxChars, overlapChars, windowChars); + + // API backend doesn't expose tokenizer APIs; keep chunking approximate and avoid local model init. + if (!llm.canTokenize?.() || !llm.tokenize) { + return charChunks.map((chunk) => ({ + text: chunk.text, + pos: chunk.pos, + tokens: Math.max(1, Math.ceil(chunk.text.length / avgCharsPerToken)), + })); + } + const tokenize = llm.tokenize.bind(llm); // Tokenize and split any chunks that still exceed limit const results: { text: string; pos: number; tokens: number }[] = []; for (const chunk of charChunks) { - const tokens = await llm.tokenize(chunk.text); + const tokens = await tokenize(chunk.text); if (tokens.length <= maxTokens) { results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length }); @@ -1456,7 +1474,7 @@ export async function chunkDocumentByTokens( const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2)); for (const subChunk of subChunks) { - const subTokens = await llm.tokenize(subChunk.text); + const subTokens = await tokenize(subChunk.text); results.push({ text: subChunk.text, pos: chunk.pos + subChunk.pos, @@ -2140,6 +2158,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle // ============================================================================= export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise { + const guardMessage = getVectorScopeGuardMessage(db); + if (guardMessage) { + throw new Error(guardMessage); + } + const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); if (!tableExists) return []; @@ -2234,7 +2257,7 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi const formattedText = isQuery ? formatQueryForEmbedding(text) : formatDocForEmbedding(text); const result = session ? 
await session.embed(formattedText, { model, isQuery }) - : await getDefaultLlamaCpp().embed(formattedText, { model, isQuery }); + : await getDefaultLLM().embed(formattedText, { model, isQuery }); return result?.embedding || null; } @@ -2299,8 +2322,8 @@ export async function expandQuery(query: string, model: string = DEFAULT_QUERY_M } } - const llm = getDefaultLlamaCpp(); - // Note: LlamaCpp uses hardcoded model, model parameter is ignored + const llm = getDefaultLLM(); + // Note: current local backend uses a configured default model; `model` may be ignored. const results = await llm.expandQuery(query); // Map Queryable[] → ExpandedQuery[] (same shape, decoupled from llm.ts internals). @@ -2337,9 +2360,9 @@ export async function rerank(query: string, documents: { file: string; text: str } } - // Rerank uncached documents using LlamaCpp + // Rerank uncached documents using the configured LLM backend if (uncachedDocs.length > 0) { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); const rerankResult = await llm.rerank(query, uncachedDocs, { model }); // Cache results — use original doc.text for cache key (result.file lacks chunk text) @@ -2900,6 +2923,11 @@ export async function hybridQuery( query: string, options?: HybridQueryOptions ): Promise { + const guardMessage = getVectorScopeGuardMessage(store.db); + if (guardMessage) { + throw new Error(guardMessage); + } + const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0; const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT; @@ -2973,7 +3001,7 @@ export async function hybridQuery( } // Batch embed all vector queries in a single call - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); @@ -3113,6 +3141,11 @@ export async function vectorSearchQuery( query: string, options?: VectorSearchOptions ): Promise { + const guardMessage = getVectorScopeGuardMessage(store.db); + if (guardMessage) { + throw new Error(guardMessage); + } + const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0.3; const collection = options?.collection; @@ -3203,6 +3236,11 @@ export async function structuredSearch( searches: StructuredSubSearch[], options?: StructuredSearchOptions ): Promise { + const guardMessage = getVectorScopeGuardMessage(store.db); + if (guardMessage) { + throw new Error(guardMessage); + } + const limit = options?.limit ?? 10; const minScore = options?.minScore ?? 0; const candidateLimit = options?.candidateLimit ?? 
RERANK_CANDIDATE_LIMIT; @@ -3271,7 +3309,7 @@ export async function structuredSearch( if (hasVectors) { const vecSearches = searches.filter(s => s.type === 'vec' || s.type === 'hyde'); if (vecSearches.length > 0) { - const llm = getDefaultLlamaCpp(); + const llm = getDefaultLLM(); const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query)); hooks?.onEmbedStart?.(textsToEmbed.length); const embedStart = Date.now(); diff --git a/src/vector-scope-guard.ts b/src/vector-scope-guard.ts new file mode 100644 index 0000000..42ef5df --- /dev/null +++ b/src/vector-scope-guard.ts @@ -0,0 +1,124 @@ +import type { Database } from "./db.js"; +import { + DEFAULT_API_EMBED_BASE_URL, + DEFAULT_API_EMBED_MODEL, +} from "./api-defaults.js"; + +export type ApiEmbeddingScope = { + embedBaseUrl: string; + embedModel: string; +}; + +function getConfiguredBackend(): string { + return process.env.QMD_LLM_BACKEND?.trim().toLowerCase() || "local"; +} + +function resolveCurrentApiEmbeddingScopeFromEnv(): ApiEmbeddingScope { + const embedBaseUrl = ( + process.env.QMD_EMBED_BASE_URL?.trim() + || DEFAULT_API_EMBED_BASE_URL + ).replace(/\/+$/, ""); + const embedModel = process.env.QMD_EMBED_MODEL?.trim() || DEFAULT_API_EMBED_MODEL; + return { embedBaseUrl, embedModel }; +} + +function getApiMetaValue(db: Database, key: string): string | null { + try { + const row = db.prepare(`SELECT value FROM api_meta WHERE key = ?`).get(key) as { value: string } | null; + return row?.value || null; + } catch { + // Older DBs or test fixtures may not include api_meta. + return null; + } +} + +function setApiMetaValue(db: Database, key: string, value: string): void { + db.prepare(`INSERT OR REPLACE INTO api_meta (key, value) VALUES (?, ?)`).run(key, value); +} + +function hasAnyVectors(db: Database): boolean { + const cvCount = db.prepare(`SELECT COUNT(*) as c FROM content_vectors`).get() as { c: number }; + if (cvCount.c > 0) return true; + + const tableExists = db.prepare(` + SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec' + `).get(); + if (!tableExists) return false; + + try { + const vvCount = db.prepare(`SELECT COUNT(*) as c FROM vectors_vec`).get() as { c: number }; + return vvCount.c > 0; + } catch { + // If vec table exists but count fails, treat as non-empty/unknown for safety. 
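+    // Returning true keeps the scope guard conservative instead of silently bypassing it.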
+ return true; + } +} + +function formatApiScope(scope: ApiEmbeddingScope): string { + return `${scope.embedBaseUrl} | ${scope.embedModel}`; +} + +export function getStoredApiEmbeddingScope(db: Database): ApiEmbeddingScope | null { + const embedBaseUrl = getApiMetaValue(db, "embed_base_url"); + const embedModel = getApiMetaValue(db, "embed_model"); + if (!embedBaseUrl || !embedModel) return null; + return { embedBaseUrl, embedModel }; +} + +export function setApiEmbeddingScopeFromCurrentEnv(db: Database): void { + const scope = resolveCurrentApiEmbeddingScopeFromEnv(); + setApiMetaValue(db, "embed_base_url", scope.embedBaseUrl); + setApiMetaValue(db, "embed_model", scope.embedModel); +} + +export function clearApiEmbeddingScope(db: Database): void { + db.exec(`DELETE FROM api_meta`); +} + +export function getVectorScopeGuardMessage(db: Database): string | null { + const backend = getConfiguredBackend(); + const storedScope = getStoredApiEmbeddingScope(db); + + if (backend === "local") { + if (!storedScope) return null; + return [ + "Index is marked for API embeddings, but current backend is local.", + `Stored API embedding scope: ${formatApiScope(storedScope)}`, + "Choose one:", + " 1) Set QMD_LLM_BACKEND=api with matching embedding settings", + " 2) Use a different index via --index", + " 3) Run 'qmd embed -f' to clear vectors and remove API scope metadata", + ].join("\n"); + } + + if (backend === "api") { + const currentScope = resolveCurrentApiEmbeddingScopeFromEnv(); + + if (!storedScope) { + if (!hasAnyVectors(db)) return null; + return [ + "This index has vectors but no API scope metadata (legacy/ambiguous state).", + "Choose one:", + " 1) Use a different index via --index", + " 2) Run 'qmd embed -f' to reset vectors for the current API embedding scope", + ].join("\n"); + } + + const isMatch = storedScope.embedBaseUrl === currentScope.embedBaseUrl + && storedScope.embedModel === currentScope.embedModel; + if (isMatch) return null; + + return [ + "API embedding scope mismatch for this index.", + `Stored scope (in index db): ${formatApiScope(storedScope)}`, + `Current scope (from environment): ${formatApiScope(currentScope)}`, + "Choose one:", + " 1) Revert API embedding settings to match the stored scope", + " 2) Use a different index via --index", + " 3) Run 'qmd embed -f' to reset vectors for the current API embedding scope", + ].join("\n"); + } + + // Unknown backend values are validated elsewhere; don't block here. 
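+  // getDefaultLLM() rejects unrecognized QMD_LLM_BACKEND values with an explicit error, so this path stays permissive.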
+ return null; +} diff --git a/test/api.contract.test.ts b/test/api.contract.test.ts new file mode 100644 index 0000000..63f696d --- /dev/null +++ b/test/api.contract.test.ts @@ -0,0 +1,344 @@ +import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; +import { ApiLLM } from "../src/api.js"; +import { canUnloadLLM, withLLMSession } from "../src/llm.js"; + +describe("ApiLLM (contract)", () => { + const fetchMock = vi.fn(); + const originalFetch = globalThis.fetch; + const originalQmdEmbedApiKey = process.env.QMD_EMBED_API_KEY; + const originalQmdChatApiKey = process.env.QMD_CHAT_API_KEY; + const originalQmdChatModel = process.env.QMD_CHAT_MODEL; + const originalQmdRerankApiKey = process.env.QMD_RERANK_API_KEY; + const originalQmdLlmBackend = process.env.QMD_LLM_BACKEND; + + beforeEach(() => { + fetchMock.mockReset(); + (globalThis as { fetch: typeof fetch }).fetch = fetchMock as unknown as typeof fetch; + }); + + afterEach(() => { + (globalThis as { fetch: typeof fetch }).fetch = originalFetch; + process.env.QMD_EMBED_API_KEY = originalQmdEmbedApiKey; + process.env.QMD_CHAT_API_KEY = originalQmdChatApiKey; + process.env.QMD_CHAT_MODEL = originalQmdChatModel; + process.env.QMD_RERANK_API_KEY = originalQmdRerankApiKey; + process.env.QMD_LLM_BACKEND = originalQmdLlmBackend; + }); + + test("embed sends OpenAI-compatible /embeddings request, ignores per-call model override, and parses response", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + data: [{ embedding: [0.1, 0.2, 0.3] }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "test-key", + embedModel: "test-embed-model", + }); + + const result = await llm.embed("hello", { model: "override-embed-model" }); + + expect(result).not.toBeNull(); + expect(result?.embedding).toEqual([0.1, 0.2, 0.3]); + expect(result?.model).toBe("test-embed-model"); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://example.test/v1/embeddings"); + expect(init?.method).toBe("POST"); + expect(init?.headers).toEqual({ + "Content-Type": "application/json", + "Authorization": "Bearer test-key", + }); + expect(JSON.parse(String(init?.body))).toEqual({ + model: "test-embed-model", + input: ["hello"], + }); + }); + + test("embedBatch returns one result per input and null for missing vectors", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + data: [ + { embedding: [1, 2] }, + {}, + { embedding: [3, 4] }, + ], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "test-key", + embedModel: "test-embed-model", + }); + + const results = await llm.embedBatch(["a", "b", "c"]); + expect(results).toHaveLength(3); + expect(results[0]?.embedding).toEqual([1, 2]); + expect(results[1]).toBeNull(); + expect(results[2]?.embedding).toEqual([3, 4]); + }); + + test("embed throws and avoids fetch when API key is missing", async () => { + process.env.QMD_EMBED_API_KEY = ""; + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "", + embedModel: "test-embed-model", + }); + + await expect( + llm.embed("hello") + ).rejects.toThrow("missing API key"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + test("generate fails explicitly for API backend", async () => { + const llm 
= new ApiLLM({}); + + await expect( + llm.generate("hello") + ).rejects.toThrow("not implemented for API backend"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + test("rerank sends Cohere-compatible /rerank request, ignores per-call model override, and maps response by index", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + results: [ + { index: 1, relevance_score: 0.91 }, + { index: 0, relevance_score: 0.24 }, + ], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "embed-key", + rerankBaseUrl: "https://rerank.test/v1", + rerankApiKey: "rerank-key", + rerankModel: "rerank-v3.5", + }); + + const result = await llm.rerank( + "capital of france", + [ + { file: "a.md", text: "Berlin is the capital of Germany." }, + { file: "b.md", text: "Paris is the capital of France." }, + ], + { model: "override-rerank-model" } + ); + + expect(result.model).toBe("rerank-v3.5"); + expect(result.results).toEqual([ + { file: "b.md", score: 0.91, index: 1 }, + { file: "a.md", score: 0.24, index: 0 }, + ]); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://rerank.test/v1/rerank"); + expect(init?.method).toBe("POST"); + expect(init?.headers).toEqual({ + "Content-Type": "application/json", + "Authorization": "Bearer rerank-key", + }); + expect(JSON.parse(String(init?.body))).toEqual({ + model: "rerank-v3.5", + query: "capital of france", + documents: [ + "Berlin is the capital of Germany.", + "Paris is the capital of France.", + ], + top_n: 2, + }); + }); + + test("rerank sends Voyage-compatible top_k and accepts data response shape", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + data: [ + { index: 0, relevance_score: 0.12 }, + { index: 1, relevance_score: 0.95 }, + ], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + rerankBaseUrl: "https://api.voyageai.com/v1", + rerankApiKey: "voyage-key", + rerankModel: "rerank-2.5-lite", + }); + + const result = await llm.rerank( + "capital of france", + [ + { file: "a.md", text: "Berlin is the capital of Germany." }, + { file: "b.md", text: "Paris is the capital of France." 
}, + ] + ); + + expect(result.model).toBe("rerank-2.5-lite"); + expect(result.results).toEqual([ + { file: "b.md", score: 0.95, index: 1 }, + { file: "a.md", score: 0.12, index: 0 }, + ]); + + expect(fetchMock).toHaveBeenCalledTimes(1); + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://api.voyageai.com/v1/rerank"); + expect(JSON.parse(String(init?.body))).toEqual({ + model: "rerank-2.5-lite", + query: "capital of france", + documents: [ + "Berlin is the capital of Germany.", + "Paris is the capital of France.", + ], + top_k: 2, + }); + }); + + test("rerank throws and avoids fetch when rerank API key is missing", async () => { + process.env.QMD_EMBED_API_KEY = ""; + process.env.QMD_RERANK_API_KEY = ""; + + const llm = new ApiLLM({ + embedBaseUrl: "https://example.test/v1", + embedApiKey: "", + rerankApiKey: "", + rerankModel: "rerank-v3.5", + }); + + await expect( + llm.rerank("q", [{ file: "doc.md", text: "t" }]) + ).rejects.toThrow("missing API key"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + test("expandQuery accepts line format output", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + choices: [{ + message: { + content: "lex: api auth docs\nvec: api authentication guide\nhyde: A guide to API authentication setup", + }, + }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + const result = await llm.expandQuery("api auth docs"); + expect(result).toEqual([ + { type: "lex", text: "api auth docs" }, + { type: "vec", text: "api authentication guide" }, + { type: "hyde", text: "A guide to API authentication setup" }, + ]); + + const [url, init] = fetchMock.mock.calls[0]!; + expect(url).toBe("https://chat.example.test/v1/chat/completions"); + expect(init?.method).toBe("POST"); + expect(init?.headers).toEqual({ + "Content-Type": "application/json", + "Authorization": "Bearer chat-key", + }); + }); + + test("expandQuery throws and avoids fetch when chat API key is missing", async () => { + process.env.QMD_CHAT_API_KEY = ""; + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "", + chatModel: "gpt-4o-mini", + }); + + await expect( + llm.expandQuery("api auth docs") + ).rejects.toThrow("missing API key"); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + test("expandQuery throws on chat request failure", async () => { + fetchMock.mockResolvedValue( + new Response("upstream error", { status: 503, statusText: "Service Unavailable" }) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + await expect( + llm.expandQuery("api auth docs") + ).rejects.toThrow("chat error: 503"); + }); + + test("expandQuery returns empty expansion set when output is not parseable line format", async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + choices: [{ + message: { + content: JSON.stringify([ + { type: "lex", text: "api auth docs" }, + { type: "vec", text: "api authentication guide" }, + ]), + }, + }], + }), + { status: 200, headers: { "Content-Type": "application/json" } } + ) + ); + + const llm = new ApiLLM({ + chatBaseUrl: "https://chat.example.test/v1", + chatApiKey: "chat-key", + chatModel: "gpt-4o-mini", + }); + + const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {}); + const result = await 
llm.expandQuery("api auth docs"); + expect(result).toEqual([]); + expect(warnSpy).toHaveBeenCalledTimes(1); + warnSpy.mockRestore(); + }); + + test("withLLMSession does not acquire local unload lock when backend is api", async () => { + process.env.QMD_LLM_BACKEND = "api"; + + const unloadBefore = canUnloadLLM(); + expect(unloadBefore).toBe(true); + + await withLLMSession(async (session) => { + expect(session.isValid).toBe(true); + expect(canUnloadLLM()).toBe(true); + }, { maxDuration: 1000, name: "api-contract-session" }); + + expect(canUnloadLLM()).toBe(true); + }); +}); diff --git a/test/api.live.test.ts b/test/api.live.test.ts new file mode 100644 index 0000000..14786c5 --- /dev/null +++ b/test/api.live.test.ts @@ -0,0 +1,131 @@ +import { describe, expect, test } from "vitest"; +import { ApiLLM } from "../src/api.js"; + +/** + * Live API tests (provider-gated by env vars). + * Required keys: OPENAI_API_KEY, OPENROUTER_API_KEY, COHERE_API_KEY, VOYAGE_API_KEY. + * Tests for a provider are skipped when that provider key is not set. + */ +const embeddingProviders = [ + { + name: "OpenAI", + key: process.env.OPENAI_API_KEY || "", + baseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + embedModel: process.env.OPENAI_EMBED_MODEL || "text-embedding-3-small", + }, + { + name: "OpenRouter", + key: process.env.OPENROUTER_API_KEY || "", + baseUrl: process.env.OPENROUTER_BASE_URL || "https://openrouter.ai/api/v1", + embedModel: process.env.OPENROUTER_EMBED_MODEL || "openai/text-embedding-3-small", + }, + { + name: "Cohere", + key: process.env.COHERE_API_KEY || "", + baseUrl: process.env.COHERE_COMPAT_BASE_URL || "https://api.cohere.ai/compatibility/v1", + embedModel: process.env.COHERE_EMBED_MODEL || "embed-v4.0", + }, + { + name: "Voyage", + key: process.env.VOYAGE_API_KEY || "", + baseUrl: process.env.VOYAGE_BASE_URL || "https://api.voyageai.com/v1", + embedModel: process.env.VOYAGE_EMBED_MODEL || "voyage-3.5-lite", + }, +]; + +const chatProviders = [ + { + name: "OpenAI", + key: process.env.OPENAI_API_KEY || "", + baseUrl: process.env.OPENAI_BASE_URL || "https://api.openai.com/v1", + chatModel: process.env.OPENAI_CHAT_MODEL || "gpt-4o-mini", + }, + { + name: "OpenRouter", + key: process.env.OPENROUTER_API_KEY || "", + baseUrl: process.env.OPENROUTER_BASE_URL || "https://openrouter.ai/api/v1", + chatModel: process.env.OPENROUTER_CHAT_MODEL || "openai/gpt-4o-mini", + }, + { + name: "Cohere", + key: process.env.COHERE_API_KEY || "", + baseUrl: process.env.COHERE_COMPAT_BASE_URL || "https://api.cohere.ai/compatibility/v1", + chatModel: process.env.COHERE_CHAT_MODEL || "command-a-03-2025", + }, +]; + +describe("ApiLLM Embeddings (live)", () => { + for (const provider of embeddingProviders) { + test.skipIf(!provider.key)(`${provider.name} /v1/embeddings returns a non-empty vector`, async () => { + const llm = new ApiLLM({ + embedBaseUrl: provider.baseUrl, + embedApiKey: provider.key, + embedModel: provider.embedModel, + }); + + const result = await llm.embed(`QMD embedding live test (${provider.name})`); + expect(result).not.toBeNull(); + expect(Array.isArray(result?.embedding)).toBe(true); + expect(result!.embedding.length).toBeGreaterThan(10); + expect(Number.isFinite(result!.embedding[0])).toBe(true); + }, 30000); + } +}); + +describe("ApiLLM Query Expansion (live)", () => { + for (const provider of chatProviders) { + test.skipIf(!provider.key)(`${provider.name} chat completions expands query with line output mode`, async () => { + const llm = new ApiLLM({ + chatBaseUrl: 
provider.baseUrl, + chatApiKey: provider.key, + chatModel: provider.chatModel, + }); + + const result = await llm.expandQuery("how to authenticate API requests"); + expect(result.length).toBeGreaterThanOrEqual(1); + for (const item of result) { + expect(["lex", "vec", "hyde"]).toContain(item.type); + expect(item.text.length).toBeGreaterThan(0); + } + }, 30000); + } +}); + +const rerankProviders = [ + { + name: "Cohere", + key: process.env.COHERE_API_KEY || "", + baseUrl: process.env.COHERE_BASE_URL || "https://api.cohere.com/v1", + rerankModel: process.env.COHERE_RERANK_MODEL || "rerank-v3.5", + }, + { + name: "Voyage", + key: process.env.VOYAGE_API_KEY || "", + baseUrl: process.env.VOYAGE_BASE_URL || "https://api.voyageai.com/v1", + rerankModel: process.env.VOYAGE_RERANK_MODEL || "rerank-2.5-lite", + }, +]; + +describe("ApiLLM Rerank (live)", () => { + for (const provider of rerankProviders) { + test.skipIf(!provider.key)(`${provider.name} /v1/rerank returns ranked documents with finite scores`, async () => { + const llm = new ApiLLM({ + rerankBaseUrl: provider.baseUrl, + rerankApiKey: provider.key, + rerankModel: provider.rerankModel, + }); + + const docs = [ + { file: "france.md", text: "Paris is the capital city of France." }, + { file: "pets.md", text: "Cats and dogs are common household pets." }, + { file: "germany.md", text: "Berlin is the capital city of Germany." }, + ]; + + const result = await llm.rerank("What is the capital of France?", docs); + expect(result.results.length).toBe(3); + expect(result.results[0]!.file).toBe("france.md"); + expect(Number.isFinite(result.results[0]!.score)).toBe(true); + expect(result.results[0]!.score).toBeGreaterThanOrEqual(result.results[1]!.score); + }, 30000); + } +}); diff --git a/test/store.helpers.unit.test.ts b/test/store.helpers.unit.test.ts index 3303187..868237d 100644 --- a/test/store.helpers.unit.test.ts +++ b/test/store.helpers.unit.test.ts @@ -15,6 +15,7 @@ import { normalizeDocid, isDocid, handelize, + chunkDocumentByTokens, } from "../src/store"; // ============================================================================= @@ -203,3 +204,25 @@ describe("handelize", () => { expect(isDocid("12345")).toBe(false); }); }); + +describe("Token Chunking Fallback", () => { + test("chunkDocumentByTokens uses char-based fallback when backend cannot tokenize", async () => { + const originalBackend = process.env.QMD_LLM_BACKEND; + process.env.QMD_LLM_BACKEND = "api"; + + try { + const content = "This is a document sentence. 
".repeat(400); + const chunks = await chunkDocumentByTokens(content, 120, 18, 40); + expect(chunks.length).toBeGreaterThan(1); + for (const chunk of chunks) { + expect(chunk.tokens).toBeGreaterThan(0); + } + } finally { + if (originalBackend === undefined) { + delete process.env.QMD_LLM_BACKEND; + } else { + process.env.QMD_LLM_BACKEND = originalBackend; + } + } + }); +}); diff --git a/test/store.scope-guard.unit.test.ts b/test/store.scope-guard.unit.test.ts new file mode 100644 index 0000000..56030de --- /dev/null +++ b/test/store.scope-guard.unit.test.ts @@ -0,0 +1,102 @@ +import { afterEach, beforeEach, describe, expect, test } from "vitest"; +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { createStore, type Store } from "../src/store.js"; +import { + clearApiEmbeddingScope, + getVectorScopeGuardMessage, + setApiEmbeddingScopeFromCurrentEnv, +} from "../src/vector-scope-guard.js"; + +describe("Vector scope guard (API metadata)", () => { + let testDir: string; + let store: Store; + + const originalBackend = process.env.QMD_LLM_BACKEND; + const originalEmbedBaseUrl = process.env.QMD_EMBED_BASE_URL; + const originalEmbedModel = process.env.QMD_EMBED_MODEL; + + beforeEach(async () => { + testDir = await mkdtemp(join(tmpdir(), "qmd-scope-guard-")); + store = createStore(join(testDir, "index.sqlite")); + + delete process.env.QMD_LLM_BACKEND; + delete process.env.QMD_EMBED_BASE_URL; + delete process.env.QMD_EMBED_MODEL; + }); + + afterEach(async () => { + store.close(); + await rm(testDir, { recursive: true, force: true }); + + if (originalBackend === undefined) delete process.env.QMD_LLM_BACKEND; + else process.env.QMD_LLM_BACKEND = originalBackend; + + if (originalEmbedBaseUrl === undefined) delete process.env.QMD_EMBED_BASE_URL; + else process.env.QMD_EMBED_BASE_URL = originalEmbedBaseUrl; + + if (originalEmbedModel === undefined) delete process.env.QMD_EMBED_MODEL; + else process.env.QMD_EMBED_MODEL = originalEmbedModel; + }); + + test("local backend with no api metadata does not block vector paths", () => { + process.env.QMD_LLM_BACKEND = "local"; + const message = getVectorScopeGuardMessage(store.db); + expect(message).toBeNull(); + }); + + test("local backend blocks when api metadata exists", () => { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + process.env.QMD_LLM_BACKEND = "local"; + const message = getVectorScopeGuardMessage(store.db); + expect(message).toContain("current backend is local"); + expect(message).toContain("qmd embed -f"); + }); + + test("api backend blocks legacy vectors when api metadata is missing", () => { + process.env.QMD_LLM_BACKEND = "api"; + clearApiEmbeddingScope(store.db); + + store.ensureVecTable(3); + store.insertEmbedding( + "hash-1", + 0, + 0, + new Float32Array([0.1, 0.2, 0.3]), + "legacy-model", + new Date().toISOString() + ); + + const message = getVectorScopeGuardMessage(store.db); + expect(message).toContain("legacy/ambiguous"); + expect(message).toContain("qmd embed -f"); + }); + + test("api backend allows matching stored scope", () => { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + const message = getVectorScopeGuardMessage(store.db); + 
expect(message).toBeNull(); + }); + + test("api backend blocks mismatched stored scope", () => { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + process.env.QMD_EMBED_MODEL = "text-embedding-3-large"; + const message = getVectorScopeGuardMessage(store.db); + expect(message).toContain("scope mismatch"); + expect(message).toContain("Stored scope"); + expect(message).toContain("Current scope"); + }); +}); diff --git a/test/structured-search.test.ts b/test/structured-search.test.ts index 0a4c8c6..e60dcd3 100644 --- a/test/structured-search.test.ts +++ b/test/structured-search.test.ts @@ -20,6 +20,10 @@ import { type Store, } from "../src/store.js"; import { disposeDefaultLlamaCpp } from "../src/llm.js"; +import { + clearApiEmbeddingScope, + setApiEmbeddingScopeFromCurrentEnv, +} from "../src/vector-scope-guard.js"; // ============================================================================= // parseStructuredQuery Tests (CLI Parser) @@ -317,6 +321,34 @@ describe("structuredSearch", () => { expect(r.score).toBeGreaterThanOrEqual(0.5); } }); + + test("applies API scope guard on structured query path", async () => { + const originalBackend = process.env.QMD_LLM_BACKEND; + const originalEmbedBaseUrl = process.env.QMD_EMBED_BASE_URL; + const originalEmbedModel = process.env.QMD_EMBED_MODEL; + + try { + process.env.QMD_LLM_BACKEND = "api"; + process.env.QMD_EMBED_BASE_URL = "https://api.openai.com/v1"; + process.env.QMD_EMBED_MODEL = "text-embedding-3-small"; + setApiEmbeddingScopeFromCurrentEnv(store.db); + + process.env.QMD_LLM_BACKEND = "local"; + await expect(structuredSearch(store, [{ type: "lex", query: "test" }])) + .rejects.toThrow("current backend is local"); + } finally { + clearApiEmbeddingScope(store.db); + + if (originalBackend === undefined) delete process.env.QMD_LLM_BACKEND; + else process.env.QMD_LLM_BACKEND = originalBackend; + + if (originalEmbedBaseUrl === undefined) delete process.env.QMD_EMBED_BASE_URL; + else process.env.QMD_EMBED_BASE_URL = originalEmbedBaseUrl; + + if (originalEmbedModel === undefined) delete process.env.QMD_EMBED_MODEL; + else process.env.QMD_EMBED_MODEL = originalEmbedModel; + } + }); }); // =============================================================================