From fd9e5362b176c65a163b83ce132eed054ebb6d84 Mon Sep 17 00:00:00 2001
From: Rami Al-Haddad
Date: Sat, 21 Feb 2026 17:33:59 +0400
Subject: [PATCH] fix(gpu): try all available backends before falling back to
 CPU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, when the first preferred GPU backend (e.g. CUDA) was detected
as available but failed to initialize, QMD fell straight to CPU, skipping
other viable backends like Vulkan entirely. This is a common scenario on
machines where CUDA is installed but the prebuilt binary is incompatible
(e.g. older Pascal GPUs, mismatched toolkit versions), while Vulkan is
fully functional.

Changes:

- Try all detected GPU backends in priority order before falling back to
  CPU: cuda -> metal -> vulkan. This ensures CUDA/Metal users always get
  the best available backend, while Vulkan serves as a universal fallback.
- Only fall back to CPU after every GPU backend has been attempted.
- Emit clear, concise warnings listing which backends were tried and why
  each one failed; this is much easier to diagnose than a silent CPU
  fallback.
- Respect the QMD_FORCE_CPU / FORCE_CPU env vars to skip GPU selection
  entirely.
- Remove the @ts-expect-error hack; use proper LlamaGpuType casting
  throughout.

Tested on: GTX 1050 Ti (compute_61) + CUDA 13.1 + Vulkan, Node 25, Windows 11
Before: CPU fallback (GPU: none)
After:  GPU: vulkan (offloading: yes)
---
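Note for reviewers (placed after the "---", so not part of the commit
message): the new probe loop can be exercised outside QMD. Below is a
minimal standalone sketch, not shipped in this patch. It assumes the same
node-llama-cpp exports the diff uses (getLlama, getLlamaGpuTypes, and the
llama.gpu property); depending on the installed version, getLlamaGpuTypes()
may need the same "as any" workaround the patch applies. The file name
probe-gpu.mts is hypothetical.

    // probe-gpu.mts (hypothetical name): try each reported backend in
    // turn, mirroring the new ensureLlama() loop, and print the reason
    // whenever a backend fails instead of silently falling back.
    import { getLlama, getLlamaGpuTypes } from "node-llama-cpp";

    const gpuTypes = await getLlamaGpuTypes();
    for (const gpu of gpuTypes) {
      try {
        // Request one specific backend, same call shape as the patch.
        const llama = await getLlama({ gpu });
        console.log(`${gpu}: ok (llama.gpu = ${llama.gpu})`);
      } catch (error) {
        console.log(`${gpu}: failed (${error instanceof Error ? error.message : String(error)})`);
      }
    }

To bypass the probe entirely (e.g. in CI), set QMD_FORCE_CPU=1 or
FORCE_CPU=1; isTruthyEnv() accepts "1", "true", "yes" and "on",
case-insensitive.
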
 src/llm.ts | 80 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 17 deletions(-)

diff --git a/src/llm.ts b/src/llm.ts
index 46c62957..7049ffd4 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -14,6 +14,7 @@ import {
   type LlamaModel,
   type LlamaEmbeddingContext,
   type Token as LlamaToken,
+  type LlamaGpuType,
 } from "node-llama-cpp";
 import { homedir } from "os";
 import { join } from "path";
@@ -497,28 +498,73 @@
    */
   private async ensureLlama(): Promise<Llama> {
     if (!this.llama) {
-      // Detect available GPU types and use the best one.
-      // We can't rely on gpu:"auto" — it returns false even when CUDA is available
-      // (likely a binary/build config issue in node-llama-cpp).
-      // @ts-expect-error node-llama-cpp API compat
-      const gpuTypes = await getLlamaGpuTypes();
-      // Prefer CUDA > Metal > Vulkan > CPU
-      const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
-
-      let llama: Llama;
-      if (preferred) {
+      const isTruthyEnv = (value?: string): boolean => {
+        if (!value) return false;
+        const normalized = value.trim().toLowerCase();
+        return normalized === "1" || normalized === "true" || normalized === "yes" || normalized === "on";
+      };
+
+      const summarizeError = (error: unknown): string => {
+        const message = error instanceof Error ? error.message : String(error);
+        const firstLine = message.split(/\r?\n/)[0]?.trim() || "unknown error";
+        return firstLine.length > 140 ? `${firstLine.slice(0, 137)}...` : firstLine;
+      };
+
+      const forceCpu = isTruthyEnv(process.env.QMD_FORCE_CPU) || isTruthyEnv(process.env.FORCE_CPU);
+      // Try backends in capability order: CUDA (fastest on NVIDIA), Metal (Apple Silicon),
+      // then Vulkan as a universal fallback. This ensures users with working CUDA/Metal
+      // always get the best backend, while older cards or broken CUDA installs gracefully
+      // fall through to Vulkan rather than dropping straight to CPU.
+      const preferredGpuOrder = ["cuda", "metal", "vulkan"] as const;
+
+      let orderedGpuTypes: LlamaGpuType[] = [];
+      if (!forceCpu) {
+        // We can't rely on gpu:"auto" — it returns false even when CUDA is available
+        // (likely a binary/build config issue in node-llama-cpp).
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const rawGpuTypes: any[] = await (getLlamaGpuTypes as any)();
+        const availableGpuTypes: LlamaGpuType[] = Array.isArray(rawGpuTypes)
+          ? [...new Set(rawGpuTypes.filter((g): g is LlamaGpuType => typeof g === "string" && g.length > 0))]
+          : [];
+        const orderedByPref: LlamaGpuType[] = preferredGpuOrder
+          .filter(g => availableGpuTypes.includes(g as LlamaGpuType))
+          .map(g => g as LlamaGpuType);
+        const remaining: LlamaGpuType[] = availableGpuTypes.filter(
+          g => !preferredGpuOrder.includes(g as typeof preferredGpuOrder[number])
+        );
+        orderedGpuTypes = [...orderedByPref, ...remaining];
+      }
+
+      const attemptedGpuBackends: LlamaGpuType[] = [];
+      const gpuErrors: string[] = [];
+      let llama: Llama | null = null;
+
+      for (const gpuType of orderedGpuTypes) {
+        attemptedGpuBackends.push(gpuType);
         try {
-          llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
-        } catch {
-          llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
-          process.stderr.write(
-            `QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`
-          );
+          const candidate = await getLlama({ gpu: gpuType, logLevel: LlamaLogLevel.error });
+          if (candidate.gpu) {
+            llama = candidate;
+            break;
+          }
+          gpuErrors.push(`${gpuType}: initialized without GPU`);
+        } catch (error) {
+          gpuErrors.push(`${gpuType}: ${summarizeError(error)}`);
         }
-      } else {
+      }
+
+      if (!llama) {
         llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
       }
+      if (forceCpu) {
+        process.stderr.write("QMD Warning: GPU disabled via QMD_FORCE_CPU/FORCE_CPU. Running on CPU.\n");
+      } else if (!llama.gpu && attemptedGpuBackends.length > 0 && gpuErrors.length > 0) {
+        process.stderr.write(
+          `QMD Warning: GPU backends failed (${gpuErrors.join("; ")}). Tried: ${attemptedGpuBackends.join(", ")}.\n`
+        );
+      }
       if (!llama.gpu) {
         process.stderr.write(
           "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"