From fd9e5362b176c65a163b83ce132eed054ebb6d84 Mon Sep 17 00:00:00 2001
From: Rami Al-Haddad
Date: Sat, 21 Feb 2026 17:33:59 +0400
Subject: [PATCH] fix(gpu): try all available backends before falling back to
 CPU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, when the first preferred GPU backend (e.g. CUDA) was detected
as available but failed to initialize, QMD fell straight to CPU, skipping
other viable backends like Vulkan entirely. This is a common scenario on
machines where CUDA is installed but the prebuilt binary is incompatible
(e.g. older Pascal GPUs, mismatched toolkit versions), while Vulkan is
fully functional.

Changes:

- Try all detected GPU backends in priority order before falling back to
  CPU: cuda -> metal -> vulkan. This ensures CUDA/Metal users always get
  the best available backend, while Vulkan serves as a universal fallback.
- Only fall back to CPU after every GPU backend has been attempted.
- Emit clear, concise warnings listing which backends were tried and why
  each one failed; this is much easier to diagnose than a silent CPU
  fallback.
- Respect the QMD_FORCE_CPU / FORCE_CPU env vars to skip GPU selection
  entirely.
- Remove the @ts-expect-error hack; use proper LlamaGpuType casting
  throughout.

Tested on: GTX 1050 Ti (compute_61) + CUDA 13.1 + Vulkan, Node 25, Windows 11
Before: CPU fallback (GPU: none)
After:  GPU: vulkan (offloading: yes)
---
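Note for reviewers (placed after the "---", so not part of the commit
message): the new probe loop can be exercised outside QMD. Below is a
minimal standalone sketch, not shipped in this patch. It assumes the same
node-llama-cpp exports the diff uses (getLlama, getLlamaGpuTypes, and the
llama.gpu property); depending on the installed version, getLlamaGpuTypes()
may need the same "as any" workaround the patch applies. The file name
probe-gpu.mts is hypothetical.

    // probe-gpu.mts (hypothetical name): try each reported backend in
    // turn, mirroring the new ensureLlama() loop, and print the reason
    // whenever a backend fails instead of silently falling back.
    import { getLlama, getLlamaGpuTypes } from "node-llama-cpp";

    const gpuTypes = await getLlamaGpuTypes();
    for (const gpu of gpuTypes) {
      try {
        // Request one specific backend, same call shape as the patch.
        const llama = await getLlama({ gpu });
        console.log(`${gpu}: ok (llama.gpu = ${llama.gpu})`);
      } catch (error) {
        console.log(`${gpu}: failed (${error instanceof Error ? error.message : String(error)})`);
      }
    }

To bypass the probe entirely (e.g. in CI), set QMD_FORCE_CPU=1 or
FORCE_CPU=1; isTruthyEnv() accepts "1", "true", "yes" and "on",
case-insensitive.
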
 src/llm.ts | 80 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 17 deletions(-)

diff --git a/src/llm.ts b/src/llm.ts
index 46c62957..7049ffd4 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -14,6 +14,7 @@ import {
   type LlamaModel,
   type LlamaEmbeddingContext,
   type Token as LlamaToken,
+  type LlamaGpuType,
 } from "node-llama-cpp";
 import { homedir } from "os";
 import { join } from "path";
@@ -497,28 +498,73 @@
    */
   private async ensureLlama(): Promise<Llama> {
     if (!this.llama) {
-      // Detect available GPU types and use the best one.
-      // We can't rely on gpu:"auto" — it returns false even when CUDA is available
-      // (likely a binary/build config issue in node-llama-cpp).
-      // @ts-expect-error node-llama-cpp API compat
-      const gpuTypes = await getLlamaGpuTypes();
-      // Prefer CUDA > Metal > Vulkan > CPU
-      const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
-
-      let llama: Llama;
-      if (preferred) {
+      const isTruthyEnv = (value?: string): boolean => {
+        if (!value) return false;
+        const normalized = value.trim().toLowerCase();
+        return normalized === "1" || normalized === "true" || normalized === "yes" || normalized === "on";
+      };
+
+      const summarizeError = (error: unknown): string => {
+        const message = error instanceof Error ? error.message : String(error);
+        const firstLine = message.split(/\r?\n/)[0]?.trim() || "unknown error";
+        return firstLine.length > 140 ? `${firstLine.slice(0, 137)}...` : firstLine;
+      };
+
+      const forceCpu = isTruthyEnv(process.env.QMD_FORCE_CPU) || isTruthyEnv(process.env.FORCE_CPU);
+      // Try backends in capability order: CUDA (fastest on NVIDIA), Metal (Apple Silicon),
+      // then Vulkan as a universal fallback. This ensures users with working CUDA/Metal
+      // always get the best backend, while older cards or broken CUDA installs gracefully
+      // fall through to Vulkan rather than dropping straight to CPU.
+      const preferredGpuOrder = ["cuda", "metal", "vulkan"] as const;
+
+      let orderedGpuTypes: LlamaGpuType[] = [];
+      if (!forceCpu) {
+        // We can't rely on gpu:"auto" — it returns false even when CUDA is available
+        // (likely a binary/build config issue in node-llama-cpp).
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        const rawGpuTypes: any[] = await (getLlamaGpuTypes as any)();
+        const availableGpuTypes: LlamaGpuType[] = Array.isArray(rawGpuTypes)
+          ? [...new Set(rawGpuTypes.filter((g): g is LlamaGpuType => typeof g === "string" && g.length > 0))]
+          : [];
+        const orderedByPref: LlamaGpuType[] = preferredGpuOrder
+          .filter(g => availableGpuTypes.includes(g as LlamaGpuType))
+          .map(g => g as LlamaGpuType);
+        const remaining: LlamaGpuType[] = availableGpuTypes.filter(
+          g => !preferredGpuOrder.includes(g as typeof preferredGpuOrder[number])
+        );
+        orderedGpuTypes = [...orderedByPref, ...remaining];
+      }
+
+      const attemptedGpuBackends: LlamaGpuType[] = [];
+      const gpuErrors: string[] = [];
+      let llama: Llama | null = null;
+
+      for (const gpuType of orderedGpuTypes) {
+        attemptedGpuBackends.push(gpuType);
         try {
-          llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
-        } catch {
-          llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
-          process.stderr.write(
-            `QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`
-          );
+          const candidate = await getLlama({ gpu: gpuType, logLevel: LlamaLogLevel.error });
+          if (candidate.gpu) {
+            llama = candidate;
+            break;
+          }
+          gpuErrors.push(`${gpuType}: initialized without GPU`);
+        } catch (error) {
+          gpuErrors.push(`${gpuType}: ${summarizeError(error)}`);
         }
-      } else {
+      }
+
+      if (!llama) {
         llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
       }
+      if (forceCpu) {
+        process.stderr.write("QMD Warning: GPU disabled via QMD_FORCE_CPU/FORCE_CPU. Running on CPU.\n");
+      } else if (!llama.gpu && attemptedGpuBackends.length > 0 && gpuErrors.length > 0) {
+        process.stderr.write(
+          `QMD Warning: GPU backends failed (${gpuErrors.join("; ")}). Tried: ${attemptedGpuBackends.join(", ")}.\n`
+        );
+      }
       if (!llama.gpu) {
         process.stderr.write(
           "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"