From 7beb80912dc2162ddf2030c01d409e0c28324e70 Mon Sep 17 00:00:00 2001
From: JonBasse
Date: Sat, 21 Feb 2026 08:02:22 +0100
Subject: [PATCH] fix(llm): try all available GPU types before falling back to
 CPU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The GPU initialization used `.find()` to pick only the first available
GPU type (typically CUDA), and on failure fell directly to CPU —
skipping Vulkan and Metal entirely. On Linux systems with Vulkan but no
CUDA toolkit, this meant queries always ran on CPU despite a working
GPU.

Changed to `.filter()` + loop to try each available GPU in priority
order (CUDA > Metal > Vulkan) before falling back to CPU.

Fixes #213

Co-Authored-By: Claude Opus 4.6
---
 src/llm.ts | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/llm.ts b/src/llm.ts
index 46c62957..6c05590f 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -502,20 +502,21 @@ export class LlamaCpp implements LLM {
     // (likely a binary/build config issue in node-llama-cpp).
     // @ts-expect-error node-llama-cpp API compat
     const gpuTypes = await getLlamaGpuTypes();
 
-    // Prefer CUDA > Metal > Vulkan > CPU
-    const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
+    // Prefer CUDA > Metal > Vulkan > CPU — try each in order
+    const gpuOrder = (["cuda", "metal", "vulkan"] as const).filter(g => gpuTypes.includes(g));
 
-    let llama: Llama;
-    if (preferred) {
+    let llama: Llama | undefined;
+    for (const gpu of gpuOrder) {
       try {
-        llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
+        llama = await getLlama({ gpu, logLevel: LlamaLogLevel.error });
+        break;
       } catch {
-        llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
         process.stderr.write(
-          `QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`
+          `QMD Warning: ${gpu} reported available but failed to initialize. Trying next...\n`
         );
       }
-    } else {
+    }
+    if (!llama) {
       llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
     }