From e547a403510c84139760d4d588cccc331885f09a Mon Sep 17 00:00:00 2001 From: Trevor Mells Date: Fri, 27 Feb 2026 07:40:38 -0800 Subject: [PATCH 1/8] Introduce LiteLLM dependency and initial Gemini configuration --- .env.example | 9 ++-- .gitignore | 3 ++ README.md | 10 ++--- pyproject.toml | 1 + src/applypilot/llm.py | 61 +++++++++++++++++++++++--- src/applypilot/scoring/cover_letter.py | 2 +- src/applypilot/wizard/init.py | 58 +++++++++++++++--------- 7 files changed, 109 insertions(+), 35 deletions(-) diff --git a/.env.example b/.env.example index df7cc38..fbf9543 100644 --- a/.env.example +++ b/.env.example @@ -2,10 +2,11 @@ # Copy to ~/.applypilot/.env and fill in your values. # LLM Provider (pick one) -GEMINI_API_KEY= # Gemini 2.0 Flash (recommended, cheapest) -# OPENAI_API_KEY= # OpenAI (GPT-4o-mini) -# LLM_URL=http://127.0.0.1:8080/v1 # Local LLM (llama.cpp, Ollama) -# LLM_MODEL= # Override model name +GEMINI_API_KEY= # Gemini (recommended, cheapest) +# OPENAI_API_KEY= # OpenAI +# ANTHROPIC_API_KEY= # Anthropic Claude +# LLM_URL=http://127.0.0.1:8080/v1 # Local LLM (OpenAI-compatible: llama.cpp, Ollama, vLLM) +# LLM_MODEL= # Override model name (provider-specific) # Auto-Apply (optional) CAPSOLVER_API_KEY= # For CAPTCHA solving during auto-apply diff --git a/.gitignore b/.gitignore index 835589f..35c6a55 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ resume.pdf *.env .env.* !.env.example +.venv/* + # Runtime artifacts *.db @@ -39,3 +41,4 @@ Thumbs.db # Claude Code .claude/ +tm_dev/nb.ipynb diff --git a/README.md b/README.md index e7fe08e..140e43c 100644 --- a/README.md +++ b/README.md @@ -43,12 +43,12 @@ applypilot apply --dry-run # fill forms without submitting ## Two Paths ### Full Pipeline (recommended) -**Requires:** Python 3.11+, Node.js (for npx), Gemini API key (free), Claude Code CLI, Chrome +**Requires:** Python 3.11+, Node.js (for npx), an LLM key (Gemini/OpenAI/Claude) or `LLM_URL`, Claude Code CLI, Chrome Runs all 6 stages, from job 
discovery to autonomous application submission. This is the full power of ApplyPilot. ### Discovery + Tailoring Only -**Requires:** Python 3.11+, Gemini API key (free) +**Requires:** Python 3.11+, an LLM key (Gemini/OpenAI/Claude) or `LLM_URL` Runs stages 1-5: discovers jobs, scores them, tailors your resume, generates cover letters. You submit applications manually with the AI-prepared materials. @@ -88,11 +88,11 @@ Each stage is independent. Run them all or pick what you need. |-----------|-------------|---------| | Python 3.11+ | Everything | Core runtime | | Node.js 18+ | Auto-apply | Needed for `npx` to run Playwright MCP server | -| Gemini API key | Scoring, tailoring, cover letters | Free tier (15 RPM / 1M tokens/day) is enough | +| LLM API key or local endpoint | Scoring, tailoring, cover letters | Set one of `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, or `LLM_URL` | | Chrome/Chromium | Auto-apply | Auto-detected on most systems | | Claude Code CLI | Auto-apply | Install from [claude.ai/code](https://claude.ai/code) | -**Gemini API key is free.** Get one at [aistudio.google.com](https://aistudio.google.com). OpenAI and local models (Ollama/llama.cpp) are also supported. +**Gemini API key is free.** Get one at [aistudio.google.com](https://aistudio.google.com). OpenAI, Claude, and local models (Ollama/llama.cpp/vLLM) are also supported. ### Optional @@ -115,7 +115,7 @@ Your personal data in one structured file: contact info, work authorization, com Job search queries, target titles, locations, boards. Run multiple searches with different parameters. ### `.env` -API keys and runtime config: `GEMINI_API_KEY`, `LLM_MODEL`, `CAPSOLVER_API_KEY` (optional). +API keys and runtime config: `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `LLM_URL`, `LLM_MODEL`, `CAPSOLVER_API_KEY` (optional). 
### Package configs (shipped with ApplyPilot) - `config/employers.yaml` - Workday employer registry (48 preconfigured) diff --git a/pyproject.toml b/pyproject.toml index f5116d8..51268f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ classifiers = [ dependencies = [ "typer>=0.9.0", "rich>=13.0", + "litellm", "httpx>=0.24", "beautifulsoup4>=4.12", "playwright>=1.40", diff --git a/src/applypilot/llm.py b/src/applypilot/llm.py index 1fb7be6..f2fab0b 100644 --- a/src/applypilot/llm.py +++ b/src/applypilot/llm.py @@ -9,6 +9,7 @@ LLM_MODEL env var overrides the model name for any provider. """ +import json import logging import os import time @@ -73,6 +74,28 @@ def _detect_provider() -> tuple[str, str, str]: _GEMINI_COMPAT_BASE = "https://generativelanguage.googleapis.com/v1beta/openai" _GEMINI_NATIVE_BASE = "https://generativelanguage.googleapis.com/v1beta" +_GEMINI_THINKING_LEVELS = {"none", "minimal", "low", "medium", "high"} +_GEMINI_COMPAT_REASONING_EFFORT = { + "none": "none", + "minimal": "low", + "low": "low", + "medium": "high", + "high": "high", +} +_GEMINI_25_THINKING_BUDGET = { + "none": 0, + "minimal": 1024, + "low": 1024, + "medium": 8192, + "high": 24576, +} +_GEMINI_NATIVE_THINKING_LEVEL = { + "none": "low", + "minimal": "low", + "low": "low", + "medium": "high", + "high": "high", +} class LLMClient: @@ -93,6 +116,20 @@ def __init__(self, base_url: str, model: str, api_key: str) -> None: self._use_native_gemini: bool = False self._is_gemini: bool = base_url.startswith(_GEMINI_COMPAT_BASE) + @staticmethod + def _normalize_thinking_level(thinking_level: str) -> str: + level = (thinking_level or "low").strip().lower() + if level not in _GEMINI_THINKING_LEVELS: + log.warning("Invalid thinking_level '%s', defaulting to 'low'.", thinking_level) + return "low" + return level + + def _gemini_native_thinking_config(self, thinking_level: str) -> dict: + level = self._normalize_thinking_level(thinking_level) + if "2.5" in self.model: + return 
{"thinkingBudget": _GEMINI_25_THINKING_BUDGET[level]} + return {"thinkingLevel": _GEMINI_NATIVE_THINKING_LEVEL[level]} + # -- Native Gemini API -------------------------------------------------- def _chat_native_gemini( @@ -100,6 +137,7 @@ def _chat_native_gemini( messages: list[dict], temperature: float, max_tokens: int, + thinking_level: str, ) -> str: """Call the native Gemini generateContent API. @@ -128,6 +166,7 @@ def _chat_native_gemini( "generationConfig": { "temperature": temperature, "maxOutputTokens": max_tokens, + "thinkingConfig": self._gemini_native_thinking_config(thinking_level), }, } if system_parts: @@ -151,6 +190,7 @@ def _chat_compat( messages: list[dict], temperature: float, max_tokens: int, + thinking_level: str, ) -> str: """Call the OpenAI-compatible endpoint.""" headers: dict[str, str] = {"Content-Type": "application/json"} @@ -163,6 +203,9 @@ def _chat_compat( "temperature": temperature, "max_tokens": max_tokens, } + if self._is_gemini: + level = self._normalize_thinking_level(thinking_level) + payload["reasoning_effort"] = _GEMINI_COMPAT_REASONING_EFFORT[level] resp = self._client.post( f"{self.base_url}/chat/completions", @@ -181,6 +224,10 @@ def _chat_compat( def _handle_compat_response(resp: httpx.Response) -> str: resp.raise_for_status() data = resp.json() + if resp.status_code == 200: + # Intentionally log the full JSON payload for every successful + # chat/completions response to aid truncation/debug analysis. 
+ log.info("LLM compat full response JSON:\n%s", json.dumps(data, indent=2, ensure_ascii=False)) return data["choices"][0]["message"]["content"] # -- public API --------------------------------------------------------- @@ -189,9 +236,13 @@ def chat( self, messages: list[dict], temperature: float = 0.0, - max_tokens: int = 4096, + max_tokens: int = 10000, + thinking_level: str = "low", ) -> str: - """Send a chat completion request and return the assistant message text.""" + """Send a chat completion request and return the assistant message text. + + thinking_level applies to Gemini requests and defaults to "low". + """ # Qwen3 optimization: prepend /no_think to skip chain-of-thought # reasoning, saving tokens on structured extraction tasks. if "qwen" in self.model.lower() and messages: @@ -203,9 +254,9 @@ def chat( try: # Route to native Gemini if we've already confirmed it's needed if self._use_native_gemini: - return self._chat_native_gemini(messages, temperature, max_tokens) + return self._chat_native_gemini(messages, temperature, max_tokens, thinking_level) - return self._chat_compat(messages, temperature, max_tokens) + return self._chat_compat(messages, temperature, max_tokens, thinking_level) except _GeminiCompatForbidden as exc: # Model not available on OpenAI-compat layer — switch to native. @@ -218,7 +269,7 @@ def chat( self._use_native_gemini = True # Retry immediately with native — don't count as a rate-limit wait try: - return self._chat_native_gemini(messages, temperature, max_tokens) + return self._chat_native_gemini(messages, temperature, max_tokens, thinking_level) except httpx.HTTPStatusError as native_exc: raise RuntimeError( f"Both Gemini endpoints failed. Compat: 403 Forbidden. 
" diff --git a/src/applypilot/scoring/cover_letter.py b/src/applypilot/scoring/cover_letter.py index c16cdd5..bbbb053 100644 --- a/src/applypilot/scoring/cover_letter.py +++ b/src/applypilot/scoring/cover_letter.py @@ -165,7 +165,7 @@ def generate_cover_letter( )}, ] - letter = client.chat(messages, max_tokens=1024, temperature=0.7) + letter = client.chat(messages, max_tokens=10000, temperature=0.7) letter = sanitize_text(letter) # auto-fix em dashes, smart quotes letter = _strip_preamble(letter) # remove any "Here is the letter:" prefix diff --git a/src/applypilot/wizard/init.py b/src/applypilot/wizard/init.py index 0f893c3..20bea2a 100644 --- a/src/applypilot/wizard/init.py +++ b/src/applypilot/wizard/init.py @@ -4,7 +4,7 @@ - resume.txt (and optionally resume.pdf) - profile.json - searches.yaml - - .env (LLM API key) + - .env (LLM API keys and runtime settings) """ from __future__ import annotations @@ -245,33 +245,51 @@ def _setup_ai_features() -> None: console.print("[dim]Discovery-only mode. You can configure AI later with [bold]applypilot init[/bold].[/dim]") return - console.print("Supported providers: [bold]Gemini[/bold] (recommended, free tier), OpenAI, local (Ollama/llama.cpp)") - provider = Prompt.ask( - "Provider", - choices=["gemini", "openai", "local"], - default="gemini", + console.print( + "Supported providers: [bold]Gemini[/bold] (recommended, free tier), " + "OpenAI, Claude, local (Ollama/llama.cpp)." ) + console.print("[dim]Enter any credentials you want to save now. 
Leave blank to skip each field.[/dim]") env_lines = ["# ApplyPilot configuration", ""] + configured_sources: list[str] = [] + + gemini_key = Prompt.ask("Gemini API key (optional, from aistudio.google.com)", default="").strip() + if gemini_key: + env_lines.append(f"GEMINI_API_KEY={gemini_key}") + configured_sources.append("gemini") + + openai_key = Prompt.ask("OpenAI API key (optional)", default="").strip() + if openai_key: + env_lines.append(f"OPENAI_API_KEY={openai_key}") + configured_sources.append("openai") + + anthropic_key = Prompt.ask("Anthropic API key (optional)", default="").strip() + if anthropic_key: + env_lines.append(f"ANTHROPIC_API_KEY={anthropic_key}") + configured_sources.append("anthropic") + + local_url = Prompt.ask("Local LLM endpoint URL (optional)", default="").strip() + if local_url: + env_lines.append(f"LLM_URL={local_url}") + configured_sources.append("local") + + if not configured_sources: + console.print("[dim]No AI provider configured. You can add one later with [bold]applypilot init[/bold].[/dim]") + return - if provider == "gemini": - api_key = Prompt.ask("Gemini API key (from aistudio.google.com)") - model = Prompt.ask("Model", default="gemini-2.0-flash") - env_lines.append(f"GEMINI_API_KEY={api_key}") - env_lines.append(f"LLM_MODEL={model}") - elif provider == "openai": - api_key = Prompt.ask("OpenAI API key") - model = Prompt.ask("Model", default="gpt-4o-mini") - env_lines.append(f"OPENAI_API_KEY={api_key}") - env_lines.append(f"LLM_MODEL={model}") - elif provider == "local": - url = Prompt.ask("Local LLM endpoint URL", default="http://localhost:8080/v1") - model = Prompt.ask("Model name", default="local-model") - env_lines.append(f"LLM_URL={url}") + model = Prompt.ask("LLM model override (optional, leave blank to use provider defaults)", default="").strip() + if model: env_lines.append(f"LLM_MODEL={model}") env_lines.append("") ENV_PATH.write_text("\n".join(env_lines), encoding="utf-8") + if len(configured_sources) > 1: + configured 
= ", ".join(configured_sources) + console.print( + f"[yellow]Multiple LLM providers saved ({configured}). " + "Deterministic provider resolution is added in the next phase.[/yellow]" + ) console.print(f"[green]AI configuration saved to {ENV_PATH}[/green]") From ce1007195b70908733d0cd74d22a3173055eddfc Mon Sep 17 00:00:00 2001 From: Trevor Mells Date: Fri, 27 Feb 2026 07:40:42 -0800 Subject: [PATCH 2/8] Centralize provider resolution and refactor llm.py into a thin adapter --- src/applypilot/cli.py | 33 +- src/applypilot/config.py | 15 +- src/applypilot/llm.py | 551 +++++++++++++++++----------------- src/applypilot/wizard/init.py | 2 +- 4 files changed, 314 insertions(+), 287 deletions(-) diff --git a/src/applypilot/cli.py b/src/applypilot/cli.py index 6c8be91..d524174 100644 --- a/src/applypilot/cli.py +++ b/src/applypilot/cli.py @@ -379,21 +379,24 @@ def doctor() -> None: "pip install --no-deps python-jobspy && pip install pydantic tls-client requests markdownify regex")) # --- Tier 2 checks --- - import os - has_gemini = bool(os.environ.get("GEMINI_API_KEY")) - has_openai = bool(os.environ.get("OPENAI_API_KEY")) - has_local = bool(os.environ.get("LLM_URL")) - if has_gemini: - model = os.environ.get("LLM_MODEL", "gemini-2.0-flash") - results.append(("LLM API key", ok_mark, f"Gemini ({model})")) - elif has_openai: - model = os.environ.get("LLM_MODEL", "gpt-4o-mini") - results.append(("LLM API key", ok_mark, f"OpenAI ({model})")) - elif has_local: - results.append(("LLM API key", ok_mark, f"Local: {os.environ.get('LLM_URL')}")) - else: - results.append(("LLM API key", fail_mark, - "Set GEMINI_API_KEY in ~/.applypilot/.env (run 'applypilot init')")) + from applypilot.llm import resolve_llm_config + + try: + llm_cfg = resolve_llm_config() + if llm_cfg.provider == "local": + results.append(("LLM API key", ok_mark, f"Local: {llm_cfg.base_url} ({llm_cfg.model})")) + else: + label = { + "gemini": "Gemini", + "openai": "OpenAI", + "anthropic": "Anthropic", + 
}.get(llm_cfg.provider, llm_cfg.provider) + results.append(("LLM API key", ok_mark, f"{label} ({llm_cfg.model})")) + except RuntimeError: + results.append( + ("LLM API key", fail_mark, + "Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, or LLM_URL in ~/.applypilot/.env") + ) # --- Tier 3 checks --- # Claude Code CLI diff --git a/src/applypilot/config.py b/src/applypilot/config.py index 8c39780..9067245 100644 --- a/src/applypilot/config.py +++ b/src/applypilot/config.py @@ -206,7 +206,10 @@ def get_tier() -> int: """ load_env() - has_llm = any(os.environ.get(k) for k in ("GEMINI_API_KEY", "OPENAI_API_KEY", "LLM_URL")) + has_llm = any( + os.environ.get(k) + for k in ("GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "LLM_URL") + ) if not has_llm: return 1 @@ -238,8 +241,14 @@ def check_tier(required: int, feature: str) -> None: _console = Console(stderr=True) missing: list[str] = [] - if required >= 2 and not any(os.environ.get(k) for k in ("GEMINI_API_KEY", "OPENAI_API_KEY", "LLM_URL")): - missing.append("LLM API key — run [bold]applypilot init[/bold] or set GEMINI_API_KEY") + if required >= 2 and not any( + os.environ.get(k) + for k in ("GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "LLM_URL") + ): + missing.append( + "LLM API key — run [bold]applypilot init[/bold] or set " + "GEMINI_API_KEY / OPENAI_API_KEY / ANTHROPIC_API_KEY / LLM_URL" + ) if required >= 3: if not shutil.which("claude"): missing.append("Claude Code CLI — install from [bold]https://claude.ai/code[/bold]") diff --git a/src/applypilot/llm.py b/src/applypilot/llm.py index f2fab0b..c60aed8 100644 --- a/src/applypilot/llm.py +++ b/src/applypilot/llm.py @@ -1,79 +1,43 @@ -""" -Unified LLM client for ApplyPilot. +"""Unified LLM client for ApplyPilot using LiteLLM. 
Auto-detects provider from environment: - GEMINI_API_KEY -> Google Gemini (default: gemini-2.0-flash) - OPENAI_API_KEY -> OpenAI (default: gpt-4o-mini) - LLM_URL -> Local llama.cpp / Ollama compatible endpoint + GEMINI_API_KEY -> Google Gemini (default: gemini-2.0-flash) + OPENAI_API_KEY -> OpenAI (default: gpt-4o-mini) + ANTHROPIC_API_KEY -> Anthropic Claude (default: claude-3-5-haiku-latest) + LLM_URL -> Local OpenAI-compatible endpoint LLM_MODEL env var overrides the model name for any provider. """ -import json +from __future__ import annotations + +from collections.abc import Mapping +from dataclasses import dataclass import logging import os import time -import httpx - log = logging.getLogger(__name__) -# --------------------------------------------------------------------------- -# Provider detection -# --------------------------------------------------------------------------- - -def _detect_provider() -> tuple[str, str, str]: - """Return (base_url, model, api_key) based on environment variables. - - Reads env at call time (not module import time) so that load_env() called - in _bootstrap() is always visible here. - """ - gemini_key = os.environ.get("GEMINI_API_KEY", "") - openai_key = os.environ.get("OPENAI_API_KEY", "") - local_url = os.environ.get("LLM_URL", "") - model_override = os.environ.get("LLM_MODEL", "") - - if gemini_key and not local_url: - return ( - "https://generativelanguage.googleapis.com/v1beta/openai", - model_override or "gemini-2.0-flash", - gemini_key, - ) - - if openai_key and not local_url: - return ( - "https://api.openai.com/v1", - model_override or "gpt-4o-mini", - openai_key, - ) - - if local_url: - return ( - local_url.rstrip("/"), - model_override or "local-model", - os.environ.get("LLM_API_KEY", ""), - ) - - raise RuntimeError( - "No LLM provider configured. " - "Set GEMINI_API_KEY, OPENAI_API_KEY, or LLM_URL in your environment." 
- ) - - -# --------------------------------------------------------------------------- -# Client -# --------------------------------------------------------------------------- +_OPENAI_BASE = "https://api.openai.com/v1" +_ANTHROPIC_BASE = "https://api.anthropic.com/v1" +_GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta/openai" +_PROVIDER_API_ENV_KEY = { + "gemini": "GEMINI_API_KEY", + "openai": "OPENAI_API_KEY", + "anthropic": "ANTHROPIC_API_KEY", +} +_DEFAULT_MODEL_BY_PROVIDER = { + "local": "local-model", + "gemini": "gemini-2.0-flash", + "openai": "gpt-4o-mini", + "anthropic": "claude-3-5-haiku-latest", +} _MAX_RETRIES = 5 _TIMEOUT = 120 # seconds - -# Base wait on first 429/503 (doubles each retry, caps at 60s). -# Gemini free tier is 15 RPM = 4s minimum between requests; 10s gives headroom. _RATE_LIMIT_BASE_WAIT = 10 - -_GEMINI_COMPAT_BASE = "https://generativelanguage.googleapis.com/v1beta/openai" -_GEMINI_NATIVE_BASE = "https://generativelanguage.googleapis.com/v1beta" _GEMINI_THINKING_LEVELS = {"none", "minimal", "low", "medium", "high"} _GEMINI_COMPAT_REASONING_EFFORT = { "none": "none", @@ -82,229 +46,284 @@ def _detect_provider() -> tuple[str, str, str]: "medium": "high", "high": "high", } -_GEMINI_25_THINKING_BUDGET = { - "none": 0, - "minimal": 1024, - "low": 1024, - "medium": 8192, - "high": 24576, -} -_GEMINI_NATIVE_THINKING_LEVEL = { - "none": "low", - "minimal": "low", - "low": "low", - "medium": "high", - "high": "high", -} -class LLMClient: - """Thin LLM client supporting OpenAI-compatible and native Gemini endpoints. - - For Gemini keys, starts on the OpenAI-compat layer. On a 403 (which - happens with preview/experimental models not exposed via compat), it - automatically switches to the native generateContent API and stays there - for the lifetime of the process. 
- """ - - def __init__(self, base_url: str, model: str, api_key: str) -> None: - self.base_url = base_url - self.model = model - self.api_key = api_key - self._client = httpx.Client(timeout=_TIMEOUT) - # True once we've confirmed the native Gemini API works for this model - self._use_native_gemini: bool = False - self._is_gemini: bool = base_url.startswith(_GEMINI_COMPAT_BASE) - - @staticmethod - def _normalize_thinking_level(thinking_level: str) -> str: - level = (thinking_level or "low").strip().lower() - if level not in _GEMINI_THINKING_LEVELS: - log.warning("Invalid thinking_level '%s', defaulting to 'low'.", thinking_level) - return "low" - return level - - def _gemini_native_thinking_config(self, thinking_level: str) -> dict: - level = self._normalize_thinking_level(thinking_level) - if "2.5" in self.model: - return {"thinkingBudget": _GEMINI_25_THINKING_BUDGET[level]} - return {"thinkingLevel": _GEMINI_NATIVE_THINKING_LEVEL[level]} - - # -- Native Gemini API -------------------------------------------------- - - def _chat_native_gemini( - self, - messages: list[dict], - temperature: float, - max_tokens: int, - thinking_level: str, - ) -> str: - """Call the native Gemini generateContent API. - - Used automatically when the OpenAI-compat endpoint returns 403, - which happens for preview/experimental models not exposed via compat. - - Converts OpenAI-style messages to Gemini's contents/systemInstruction - format transparently. 
- """ - contents: list[dict] = [] - system_parts: list[dict] = [] - - for msg in messages: - role = msg["role"] - text = msg.get("content", "") - if role == "system": - system_parts.append({"text": text}) - elif role == "user": - contents.append({"role": "user", "parts": [{"text": text}]}) - elif role == "assistant": - # Gemini uses "model" instead of "assistant" - contents.append({"role": "model", "parts": [{"text": text}]}) - - payload: dict = { - "contents": contents, - "generationConfig": { - "temperature": temperature, - "maxOutputTokens": max_tokens, - "thinkingConfig": self._gemini_native_thinking_config(thinking_level), - }, - } - if system_parts: - payload["systemInstruction"] = {"parts": system_parts} - - url = f"{_GEMINI_NATIVE_BASE}/models/{self.model}:generateContent" - resp = self._client.post( - url, - json=payload, - headers={"Content-Type": "application/json"}, - params={"key": self.api_key}, +@dataclass(frozen=True) +class LLMConfig: + """Normalized LLM configuration consumed by LLMClient.""" + + provider: str + base_url: str + model: str + api_key: str + + +def _env_get(env: Mapping[str, str], key: str) -> str: + value = env.get(key, "") + if value is None: + return "" + return str(value).strip() + + +def _normalize_thinking_level(thinking_level: str) -> str: + level = (thinking_level or "low").strip().lower() + if level not in _GEMINI_THINKING_LEVELS: + log.warning("Invalid thinking_level '%s', defaulting to 'low'.", thinking_level) + return "low" + return level + + +def _provider_model(provider: str, model: str) -> str: + if provider == "local": + return model + if model.startswith(f"{provider}/"): + return model + return f"{provider}/{model}" + + +def _default_model(provider: str) -> str: + return _DEFAULT_MODEL_BY_PROVIDER[provider] + + +def resolve_llm_config(env: Mapping[str, str] | None = None) -> LLMConfig: + """Resolve provider configuration from environment with deterministic precedence.""" + env_map = env if env is not None else 
os.environ + + model_override = _env_get(env_map, "LLM_MODEL") + local_url = _env_get(env_map, "LLM_URL") + gemini_key = _env_get(env_map, "GEMINI_API_KEY") + openai_key = _env_get(env_map, "OPENAI_API_KEY") + anthropic_key = _env_get(env_map, "ANTHROPIC_API_KEY") + llm_provider = _env_get(env_map, "LLM_PROVIDER").lower() + + providers_present = { + "local": bool(local_url), + "gemini": bool(gemini_key), + "openai": bool(openai_key), + "anthropic": bool(anthropic_key), + } + precedence = ["local", "gemini", "openai", "anthropic"] + configured = [provider for provider in precedence if providers_present[provider]] + + if not configured: + raise RuntimeError( + "No LLM provider configured. " + "Set one of LLM_URL, GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY." ) - resp.raise_for_status() - data = resp.json() - return data["candidates"][0]["content"]["parts"][0]["text"] - # -- OpenAI-compat API -------------------------------------------------- + chosen = "" + override_aliases = { + "local": "local", + "gemini": "gemini", + "openai": "openai", + "anthropic": "anthropic", + } + + # Optional override only when multiple providers are configured. + if len(configured) > 1 and llm_provider: + overridden = override_aliases.get(llm_provider) + if overridden and overridden in configured: + chosen = overridden + log.warning( + "Multiple LLM providers configured (%s). Using '%s' via LLM_PROVIDER override.", + ", ".join(configured), + chosen, + ) + else: + log.warning( + "Ignoring LLM_PROVIDER='%s' because it is not configured. " + "Using precedence: LLM_URL > GEMINI_API_KEY > OPENAI_API_KEY > ANTHROPIC_API_KEY.", + llm_provider, + ) + + if not chosen: + chosen = configured[0] + if len(configured) > 1: + log.warning( + "Multiple LLM providers configured (%s). 
Using '%s' based on precedence: " + "LLM_URL > GEMINI_API_KEY > OPENAI_API_KEY > ANTHROPIC_API_KEY.", + ", ".join(configured), + chosen, + ) + model = model_override or _default_model(chosen) + + if chosen == "local": + return LLMConfig( + provider="local", + base_url=local_url.rstrip("/"), + model=model, + api_key=_env_get(env_map, "LLM_API_KEY"), + ) + if chosen == "gemini": + return LLMConfig( + provider="gemini", + base_url=_GEMINI_BASE, + model=model, + api_key=gemini_key, + ) + if chosen == "openai": + return LLMConfig( + provider="openai", + base_url=_OPENAI_BASE, + model=model, + api_key=openai_key, + ) + return LLMConfig( + provider="anthropic", + base_url=_ANTHROPIC_BASE, + model=model, + api_key=anthropic_key, + ) - def _chat_compat( + +def _extract_status_code(exc: Exception) -> int | None: + status_code = getattr(exc, "status_code", None) + if isinstance(status_code, int): + return status_code + response = getattr(exc, "response", None) + if response is not None: + status_code = getattr(response, "status_code", None) + if isinstance(status_code, int): + return status_code + return None + + +def _extract_retry_after(exc: Exception) -> float | None: + response = getattr(exc, "response", None) + if response is None: + return None + headers = getattr(response, "headers", {}) or {} + retry_after = headers.get("Retry-After") or headers.get("X-RateLimit-Reset-Requests") + if not retry_after: + return None + try: + return float(retry_after) + except (TypeError, ValueError): + return None + + +def _is_timeout_error(exc: Exception) -> bool: + if isinstance(exc, TimeoutError): + return True + text = str(exc).lower() + return "timed out" in text or "timeout" in text + + +def _extract_text_content(resp: object) -> str: + choices = getattr(resp, "choices", None) + if choices is None and isinstance(resp, dict): + choices = resp.get("choices", []) + if not choices: + raise RuntimeError("LLM response contained no choices.") + + first = choices[0] + if isinstance(first, 
dict): + message = first.get("message", {}) + else: + message = getattr(first, "message", {}) + + content = message.get("content") if isinstance(message, dict) else getattr(message, "content", None) + if isinstance(content, str): + return content + if isinstance(content, list): + chunks: list[str] = [] + for part in content: + if isinstance(part, str): + chunks.append(part) + elif isinstance(part, dict): + text = part.get("text") + if isinstance(text, str): + chunks.append(text) + text = "".join(chunks).strip() + if text: + return text + raise RuntimeError("LLM response contained no text content.") + + +class LLMClient: + """Thin wrapper around LiteLLM completion().""" + + def __init__(self, config: LLMConfig) -> None: + self.config = config + self.provider = config.provider + self.model = config.model + self._apply_provider_env() + + def _apply_provider_env(self) -> None: + env_key = _PROVIDER_API_ENV_KEY.get(self.provider) + if env_key and self.config.api_key: + os.environ[env_key] = self.config.api_key + + def _build_completion_args( self, messages: list[dict], temperature: float, max_tokens: int, - thinking_level: str, - ) -> str: - """Call the OpenAI-compatible endpoint.""" - headers: dict[str, str] = {"Content-Type": "application/json"} - if self.api_key: - headers["Authorization"] = f"Bearer {self.api_key}" - - payload = { - "model": self.model, + thinking_level: str | None, + completion_kwargs: Mapping[str, object] | None, + ) -> dict: + args: dict = { + "model": _provider_model(self.provider, self.model), "messages": messages, "temperature": temperature, "max_tokens": max_tokens, + "timeout": _TIMEOUT, + "num_retries": 0, # ApplyPilot handles retries centrally below. 
} - if self._is_gemini: - level = self._normalize_thinking_level(thinking_level) - payload["reasoning_effort"] = _GEMINI_COMPAT_REASONING_EFFORT[level] - - resp = self._client.post( - f"{self.base_url}/chat/completions", - json=payload, - headers=headers, - ) - - # 403 on Gemini compat = model not available on compat layer. - # Raise a specific sentinel so chat() can switch to native API. - if resp.status_code == 403 and self._is_gemini: - raise _GeminiCompatForbidden(resp) - - return self._handle_compat_response(resp) - @staticmethod - def _handle_compat_response(resp: httpx.Response) -> str: - resp.raise_for_status() - data = resp.json() - if resp.status_code == 200: - # Intentionally log the full JSON payload for every successful - # chat/completions response to aid truncation/debug analysis. - log.info("LLM compat full response JSON:\n%s", json.dumps(data, indent=2, ensure_ascii=False)) - return data["choices"][0]["message"]["content"] + if self.provider == "local": + args["model"] = self.model + args["api_base"] = self.config.base_url + if self.config.api_key: + args["api_key"] = self.config.api_key + elif self.provider == "gemini" and thinking_level is not None: + level = _normalize_thinking_level(thinking_level) + args["reasoning_effort"] = _GEMINI_COMPAT_REASONING_EFFORT[level] - # -- public API --------------------------------------------------------- + if completion_kwargs: + args.update(completion_kwargs) + return args def chat( self, messages: list[dict], temperature: float = 0.0, max_tokens: int = 10000, - thinking_level: str = "low", + thinking_level: str | None = None, + completion_kwargs: Mapping[str, object] | None = None, ) -> str: - """Send a chat completion request and return the assistant message text. - - thinking_level applies to Gemini requests and defaults to "low". - """ - # Qwen3 optimization: prepend /no_think to skip chain-of-thought - # reasoning, saving tokens on structured extraction tasks. 
- if "qwen" in self.model.lower() and messages: - first = messages[0] - if first.get("role") == "user" and not first["content"].startswith("/no_think"): - messages = [{"role": first["role"], "content": f"/no_think\n{first['content']}"}] + messages[1:] + """Send a completion request and return plain text content.""" + try: + from litellm import completion as litellm_completion + except ModuleNotFoundError as exc: + raise RuntimeError( + "LiteLLM is required for AI stages but is not installed. " + "Install dependencies and re-run." + ) from exc for attempt in range(_MAX_RETRIES): try: - # Route to native Gemini if we've already confirmed it's needed - if self._use_native_gemini: - return self._chat_native_gemini(messages, temperature, max_tokens, thinking_level) - - return self._chat_compat(messages, temperature, max_tokens, thinking_level) - - except _GeminiCompatForbidden as exc: - # Model not available on OpenAI-compat layer — switch to native. - log.warning( - "Gemini compat endpoint returned 403 for model '%s'. " - "Switching to native generateContent API. " - "(Preview/experimental models are often compat-only on native.)", - self.model, - ) - self._use_native_gemini = True - # Retry immediately with native — don't count as a rate-limit wait - try: - return self._chat_native_gemini(messages, temperature, max_tokens, thinking_level) - except httpx.HTTPStatusError as native_exc: - raise RuntimeError( - f"Both Gemini endpoints failed. Compat: 403 Forbidden. " - f"Native: {native_exc.response.status_code} — " - f"{native_exc.response.text[:200]}" - ) from native_exc - - except httpx.HTTPStatusError as exc: - resp = exc.response - if resp.status_code in (429, 503) and attempt < _MAX_RETRIES - 1: - # Respect Retry-After header if provided (Gemini sends this). 
- retry_after = ( - resp.headers.get("Retry-After") - or resp.headers.get("X-RateLimit-Reset-Requests") + response = litellm_completion( + **self._build_completion_args( + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + thinking_level=thinking_level, + completion_kwargs=completion_kwargs, ) - if retry_after: - try: - wait = float(retry_after) - except (ValueError, TypeError): - wait = _RATE_LIMIT_BASE_WAIT * (2 ** attempt) - else: - wait = min(_RATE_LIMIT_BASE_WAIT * (2 ** attempt), 60) - + ) + return _extract_text_content(response) + except Exception as exc: # pragma: no cover - provider SDK exception types vary by backend/version. + status_code = _extract_status_code(exc) + if status_code in (429, 503, 529) and attempt < _MAX_RETRIES - 1: + wait = _extract_retry_after(exc) or min(_RATE_LIMIT_BASE_WAIT * (2 ** attempt), 60) log.warning( - "LLM rate limited (HTTP %s). Waiting %ds before retry %d/%d. " - "Tip: Gemini free tier = 15 RPM. Consider a paid account " - "or switching to a local model.", - resp.status_code, wait, attempt + 1, _MAX_RETRIES, + "LLM rate limited (HTTP %s). Waiting %ds before retry %d/%d.", + status_code, wait, attempt + 1, _MAX_RETRIES, ) time.sleep(wait) continue - raise - - except httpx.TimeoutException: - if attempt < _MAX_RETRIES - 1: + if _is_timeout_error(exc) and attempt < _MAX_RETRIES - 1: wait = min(_RATE_LIMIT_BASE_WAIT * (2 ** attempt), 60) log.warning( "LLM request timed out, retrying in %ds (attempt %d/%d)", @@ -312,7 +331,7 @@ def chat( ) time.sleep(wait) continue - raise + raise RuntimeError(f"LLM request failed ({self.provider}/{self.model}): {exc}") from exc raise RuntimeError("LLM request failed after all retries") @@ -321,19 +340,9 @@ def ask(self, prompt: str, **kwargs) -> str: return self.chat([{"role": "user", "content": prompt}], **kwargs) def close(self) -> None: - self._client.close() - - -class _GeminiCompatForbidden(Exception): - """Sentinel: Gemini OpenAI-compat returned 403. 
Switch to native API.""" - def __init__(self, response: httpx.Response) -> None: - self.response = response - super().__init__(f"Gemini compat 403: {response.text[:200]}") - + """No-op. LiteLLM completion() is stateless per call.""" + return None -# --------------------------------------------------------------------------- -# Singleton -# --------------------------------------------------------------------------- _instance: LLMClient | None = None @@ -342,7 +351,13 @@ def get_client() -> LLMClient: """Return (or create) the module-level LLMClient singleton.""" global _instance if _instance is None: - base_url, model, api_key = _detect_provider() - log.info("LLM provider: %s model: %s", base_url, model) - _instance = LLMClient(base_url, model, api_key) + try: + from applypilot.config import load_env + + load_env() + except ModuleNotFoundError: + log.debug("python-dotenv not installed; skipping .env auto-load in llm.get_client().") + config = resolve_llm_config() + log.info("LLM provider: %s model: %s", config.provider, config.model) + _instance = LLMClient(config) return _instance diff --git a/src/applypilot/wizard/init.py b/src/applypilot/wizard/init.py index 20bea2a..aad9783 100644 --- a/src/applypilot/wizard/init.py +++ b/src/applypilot/wizard/init.py @@ -288,7 +288,7 @@ def _setup_ai_features() -> None: configured = ", ".join(configured_sources) console.print( f"[yellow]Multiple LLM providers saved ({configured}). 
" - "Deterministic provider resolution is added in the next phase.[/yellow]" + "Runtime selects one deterministically by precedence.[/yellow]" ) console.print(f"[green]AI configuration saved to {ENV_PATH}[/green]") From 6dba6dcac56bafa79a34bea56929b2184a15d421 Mon Sep 17 00:00:00 2001 From: Trevor Mells Date: Fri, 27 Feb 2026 07:40:46 -0800 Subject: [PATCH 3/8] Stabilize CLI and LiteLLM runtime behavior with tests and tooling updates --- pyproject.toml | 4 ++ src/applypilot/apply/dashboard.py | 2 +- src/applypilot/apply/launcher.py | 4 +- src/applypilot/cli.py | 30 +++++++++++---- src/applypilot/discovery/jobspy.py | 2 +- src/applypilot/discovery/smartextract.py | 10 ++--- src/applypilot/enrichment/detail.py | 6 +-- src/applypilot/llm.py | 15 +++++--- src/applypilot/pipeline.py | 6 +-- src/applypilot/scoring/cover_letter.py | 5 +-- src/applypilot/scoring/scorer.py | 5 +-- src/applypilot/scoring/tailor.py | 7 +--- src/applypilot/view.py | 3 +- src/applypilot/wizard/init.py | 1 - tests/test_llm_resolution.py | 47 ++++++++++++++++++++++++ 15 files changed, 103 insertions(+), 44 deletions(-) create mode 100644 tests/test_llm_resolution.py diff --git a/pyproject.toml b/pyproject.toml index 51268f7..489ada9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,3 +53,7 @@ artifacts = ["src/applypilot/config/*.yaml"] [tool.ruff] target-version = "py311" line-length = 120 + +[tool.pytest.ini_options] +pythonpath = ["src"] +testpaths = ["tests"] diff --git a/src/applypilot/apply/dashboard.py b/src/applypilot/apply/dashboard.py index c286009..ea85373 100644 --- a/src/applypilot/apply/dashboard.py +++ b/src/applypilot/apply/dashboard.py @@ -7,7 +7,7 @@ import logging import threading import time -from dataclasses import dataclass, field +from dataclasses import dataclass from datetime import datetime from pathlib import Path diff --git a/src/applypilot/apply/launcher.py b/src/applypilot/apply/launcher.py index 341a11a..e726ae1 100644 --- a/src/applypilot/apply/launcher.py 
+++ b/src/applypilot/apply/launcher.py @@ -25,7 +25,7 @@ from applypilot import config from applypilot.database import get_connection -from applypilot.apply import chrome, dashboard, prompt as prompt_mod +from applypilot.apply import prompt as prompt_mod from applypilot.apply.chrome import ( launch_chrome, cleanup_worker, kill_all_chrome, reset_worker_dir, cleanup_on_exit, _kill_process_tree, @@ -125,7 +125,7 @@ def acquire_job(target_url: str | None = None, min_score: int = 7, params.extend(blocked_sites) url_clauses = "" if blocked_patterns: - url_clauses = " ".join(f"AND url NOT LIKE ?" for _ in blocked_patterns) + url_clauses = " ".join("AND url NOT LIKE ?" for _ in blocked_patterns) params.extend(blocked_patterns) row = conn.execute(f""" SELECT url, title, site, application_url, tailored_resume_path, diff --git a/src/applypilot/cli.py b/src/applypilot/cli.py index d524174..eaeed7f 100644 --- a/src/applypilot/cli.py +++ b/src/applypilot/cli.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import os from typing import Optional import typer @@ -11,11 +12,24 @@ from applypilot import __version__ -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(levelname)s - %(message)s", - datefmt="%H:%M:%S", -) + +def _configure_logging() -> None: + """Set consistent logging output for CLI runs.""" + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%H:%M:%S", + ) + + # Keep LiteLLM internals quiet by default; warnings/errors still surface. 
+ for name in ("LiteLLM", "litellm"): + noisy = logging.getLogger(name) + noisy.handlers.clear() + noisy.setLevel(logging.WARNING) + noisy.propagate = True + + +_configure_logging() app = typer.Typer( name="applypilot", @@ -211,7 +225,7 @@ def apply( raise typer.Exit(code=1) if gen: - from applypilot.apply.launcher import gen_prompt, BASE_CDP_PORT + from applypilot.apply.launcher import gen_prompt target = url or "" if not target: console.print("[red]--gen requires --url to specify which job.[/red]") @@ -222,7 +236,7 @@ def apply( raise typer.Exit(code=1) mcp_path = _profile_path.parent / ".mcp-apply-0.json" console.print(f"[green]Wrote prompt to:[/green] {prompt_file}") - console.print(f"\n[bold]Run manually:[/bold]") + console.print("\n[bold]Run manually:[/bold]") console.print( f" claude --model {model} -p " f"--mcp-config {mcp_path} " @@ -338,7 +352,7 @@ def doctor() -> None: import shutil from applypilot.config import ( load_env, PROFILE_PATH, RESUME_PATH, RESUME_PDF_PATH, - SEARCH_CONFIG_PATH, ENV_PATH, get_chrome_path, + SEARCH_CONFIG_PATH, get_chrome_path, ) load_env() diff --git a/src/applypilot/discovery/jobspy.py b/src/applypilot/discovery/jobspy.py index b5e54ff..ce0c4c8 100644 --- a/src/applypilot/discovery/jobspy.py +++ b/src/applypilot/discovery/jobspy.py @@ -15,7 +15,7 @@ from jobspy import scrape_jobs from applypilot import config -from applypilot.database import get_connection, init_db, store_jobs +from applypilot.database import get_connection, init_db log = logging.getLogger(__name__) diff --git a/src/applypilot/discovery/smartextract.py b/src/applypilot/discovery/smartextract.py index cf49a9a..43c50c7 100644 --- a/src/applypilot/discovery/smartextract.py +++ b/src/applypilot/discovery/smartextract.py @@ -20,17 +20,15 @@ import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone -from pathlib import Path from urllib.parse import quote_plus -import httpx import yaml from bs4 import 
BeautifulSoup from playwright.sync_api import sync_playwright from applypilot import config from applypilot.config import CONFIG_DIR -from applypilot.database import get_connection, init_db, store_jobs, get_stats +from applypilot.database import init_db, get_stats from applypilot.llm import get_client log = logging.getLogger(__name__) @@ -393,7 +391,7 @@ def judge_api_responses(api_responses: list[dict]) -> list[dict]: ) try: - raw = client.ask(prompt, temperature=0.0, max_tokens=1024) + raw = client.ask(prompt, max_tokens=1024) verdict = extract_json(raw) is_relevant = verdict.get("relevant", False) reason = verdict.get("reason", "?") @@ -424,7 +422,7 @@ def format_strategy_briefing(intel: dict) -> str: sections.append(f"\nJSON-LD: {len(job_postings)} JobPosting entries found (usable!)") sections.append(f"First JobPosting:\n{json.dumps(job_postings[0], indent=2)[:3000]}") else: - sections.append(f"\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)") + sections.append("\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)") if other: types = [j.get("@type", "?") if isinstance(j, dict) else "?" for j in other] sections.append(f"Other JSON-LD types (NOT job data): {types}") @@ -642,7 +640,7 @@ def ask_llm(prompt: str) -> tuple[str, float, dict]: """Send prompt to LLM. 
Returns (response_text, seconds_taken, metadata).""" client = get_client() t0 = time.time() - text = client.ask(prompt, temperature=0.0, max_tokens=4096) + text = client.ask(prompt, max_tokens=4096) elapsed = time.time() - t0 meta = { "finish_reason": "stop", diff --git a/src/applypilot/enrichment/detail.py b/src/applypilot/enrichment/detail.py index 11b7926..8a79579 100644 --- a/src/applypilot/enrichment/detail.py +++ b/src/applypilot/enrichment/detail.py @@ -22,9 +22,7 @@ from bs4 import BeautifulSoup from playwright.sync_api import sync_playwright -from applypilot import config -from applypilot.config import DB_PATH -from applypilot.database import get_connection, init_db, ensure_columns +from applypilot.database import init_db from applypilot.llm import get_client log = logging.getLogger(__name__) @@ -465,7 +463,7 @@ def extract_with_llm(page, url: str) -> dict: try: client = get_client() t0 = time.time() - raw = client.ask(prompt, temperature=0.0, max_tokens=4096) + raw = client.ask(prompt, max_tokens=4096) elapsed = time.time() - t0 log.info("LLM: %d chars in, %.1fs", len(prompt), elapsed) diff --git a/src/applypilot/llm.py b/src/applypilot/llm.py index c60aed8..bc758fc 100644 --- a/src/applypilot/llm.py +++ b/src/applypilot/llm.py @@ -257,7 +257,7 @@ def _apply_provider_env(self) -> None: def _build_completion_args( self, messages: list[dict], - temperature: float, + temperature: float | None, max_tokens: int, thinking_level: str | None, completion_kwargs: Mapping[str, object] | None, @@ -265,11 +265,12 @@ def _build_completion_args( args: dict = { "model": _provider_model(self.provider, self.model), "messages": messages, - "temperature": temperature, "max_tokens": max_tokens, "timeout": _TIMEOUT, "num_retries": 0, # ApplyPilot handles retries centrally below. 
} + if temperature is not None: + args["temperature"] = temperature if self.provider == "local": args["model"] = self.model @@ -287,23 +288,27 @@ def _build_completion_args( def chat( self, messages: list[dict], - temperature: float = 0.0, + temperature: float | None = None, max_tokens: int = 10000, thinking_level: str | None = None, completion_kwargs: Mapping[str, object] | None = None, ) -> str: """Send a completion request and return plain text content.""" try: - from litellm import completion as litellm_completion + import litellm except ModuleNotFoundError as exc: raise RuntimeError( "LiteLLM is required for AI stages but is not installed. " "Install dependencies and re-run." ) from exc + # Suppress LiteLLM's verbose multiline info logs (e.g. completion() traces). + litellm.set_verbose = False + litellm.suppress_debug_info = True + for attempt in range(_MAX_RETRIES): try: - response = litellm_completion( + response = litellm.completion( **self._build_completion_args( messages=messages, temperature=temperature, diff --git a/src/applypilot/pipeline.py b/src/applypilot/pipeline.py index 29881c5..8ae30ab 100644 --- a/src/applypilot/pipeline.py +++ b/src/applypilot/pipeline.py @@ -384,7 +384,7 @@ def _run_streaming(ordered: list[str], min_score: int, workers: int = 1, stop_event = threading.Event() pipeline_start = time.time() - console.print(f"\n [bold cyan]STREAMING MODE[/bold cyan] — stages run concurrently") + console.print("\n [bold cyan]STREAMING MODE[/bold cyan] — stages run concurrently") console.print(f" Poll interval: {_STREAM_POLL_INTERVAL}s\n") # Mark stages NOT in `ordered` as done so downstream doesn't wait for them @@ -492,7 +492,7 @@ def run_pipeline( for name in ordered: meta = STAGE_META[name] console.print(f" {name:<12s} {meta['desc']}") - console.print(f"\n No changes made.") + console.print("\n No changes made.") return {"stages": [], "errors": {}, "elapsed": 0.0} # Execute @@ -527,7 +527,7 @@ def run_pipeline( # Final DB stats final = 
get_stats() - console.print(f"\n [bold]DB Final State:[/bold]") + console.print("\n [bold]DB Final State:[/bold]") console.print(f" Total jobs: {final['total']}") console.print(f" With desc: {final['with_description']}") console.print(f" Scored: {final['scored']}") diff --git a/src/applypilot/scoring/cover_letter.py b/src/applypilot/scoring/cover_letter.py index bbbb053..77045b5 100644 --- a/src/applypilot/scoring/cover_letter.py +++ b/src/applypilot/scoring/cover_letter.py @@ -5,14 +5,13 @@ profile at runtime. No hardcoded personal information. """ -import json import logging import re import time from datetime import datetime, timezone from applypilot.config import COVER_LETTER_DIR, RESUME_PATH, load_profile -from applypilot.database import get_connection, get_jobs_by_stage +from applypilot.database import get_connection from applypilot.llm import get_client from applypilot.scoring.validator import ( BANNED_WORDS, @@ -165,7 +164,7 @@ def generate_cover_letter( )}, ] - letter = client.chat(messages, max_tokens=10000, temperature=0.7) + letter = client.chat(messages, max_tokens=10000) letter = sanitize_text(letter) # auto-fix em dashes, smart quotes letter = _strip_preamble(letter) # remove any "Here is the letter:" prefix diff --git a/src/applypilot/scoring/scorer.py b/src/applypilot/scoring/scorer.py index 97692d5..42efda3 100644 --- a/src/applypilot/scoring/scorer.py +++ b/src/applypilot/scoring/scorer.py @@ -5,13 +5,12 @@ profile and resume file. 
""" -import json import logging import re import time from datetime import datetime, timezone -from applypilot.config import RESUME_PATH, load_profile +from applypilot.config import RESUME_PATH from applypilot.database import get_connection, get_jobs_by_stage from applypilot.llm import get_client @@ -94,7 +93,7 @@ def score_job(resume_text: str, job: dict) -> dict: try: client = get_client() - response = client.chat(messages, max_tokens=512, temperature=0.2) + response = client.chat(messages, max_tokens=512) return _parse_score_response(response) except Exception as e: log.error("LLM error scoring job '%s': %s", job.get("title", "?"), e) diff --git a/src/applypilot/scoring/tailor.py b/src/applypilot/scoring/tailor.py index 352fb5f..28f2a37 100644 --- a/src/applypilot/scoring/tailor.py +++ b/src/applypilot/scoring/tailor.py @@ -14,17 +14,14 @@ import re import time from datetime import datetime, timezone -from pathlib import Path from applypilot.config import RESUME_PATH, TAILORED_DIR, load_profile from applypilot.database import get_connection, get_jobs_by_stage from applypilot.llm import get_client from applypilot.scoring.validator import ( BANNED_WORDS, - FABRICATION_WATCHLIST, sanitize_text, validate_json_fields, - validate_tailored_resume, ) log = logging.getLogger(__name__) @@ -326,7 +323,7 @@ def judge_tailored_resume( ] client = get_client() - response = client.chat(messages, max_tokens=512, temperature=0.1) + response = client.chat(messages, max_tokens=512) passed = "VERDICT: PASS" in response.upper() issues = "none" @@ -400,7 +397,7 @@ def tailor_resume( {"role": "user", "content": f"ORIGINAL RESUME:\n{resume_text}\n\n---\n\nTARGET JOB:\n{job_text}\n\nReturn the JSON:"}, ] - raw = client.chat(messages, max_tokens=2048, temperature=0.4) + raw = client.chat(messages, max_tokens=2048) # Parse JSON from response try: diff --git a/src/applypilot/view.py b/src/applypilot/view.py index ff42fec..82be192 100644 --- a/src/applypilot/view.py +++ 
b/src/applypilot/view.py @@ -10,14 +10,13 @@ from __future__ import annotations -import os import webbrowser from html import escape from pathlib import Path from rich.console import Console -from applypilot.config import APP_DIR, DB_PATH +from applypilot.config import APP_DIR from applypilot.database import get_connection console = Console() diff --git a/src/applypilot/wizard/init.py b/src/applypilot/wizard/init.py index aad9783..9367f91 100644 --- a/src/applypilot/wizard/init.py +++ b/src/applypilot/wizard/init.py @@ -13,7 +13,6 @@ import shutil from pathlib import Path -import typer from rich.console import Console from rich.panel import Panel from rich.prompt import Confirm, Prompt diff --git a/tests/test_llm_resolution.py b/tests/test_llm_resolution.py new file mode 100644 index 0000000..96c8c2a --- /dev/null +++ b/tests/test_llm_resolution.py @@ -0,0 +1,47 @@ +import logging + +import pytest + +from applypilot.llm import resolve_llm_config + + +def test_only_gemini_api_key_selects_gemini() -> None: + cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key"}) + assert cfg.provider == "gemini" + + +def test_only_openai_api_key_selects_openai() -> None: + cfg = resolve_llm_config({"OPENAI_API_KEY": "o-key"}) + assert cfg.provider == "openai" + +def test_llm_url_with_keys_selects_local() -> None: + cfg = resolve_llm_config( + { + "LLM_URL": "http://127.0.0.1:8080/v1", + "GEMINI_API_KEY": "g-key", + "OPENAI_API_KEY": "o-key", + "ANTHROPIC_API_KEY": "a-key", + } + ) + assert cfg.provider == "local" + + +def test_multiple_keys_selects_deterministically_and_warns(caplog: pytest.LogCaptureFixture) -> None: + with caplog.at_level(logging.WARNING): + cfg = resolve_llm_config( + { + "GEMINI_API_KEY": "g-key", + "OPENAI_API_KEY": "o-key", + "ANTHROPIC_API_KEY": "a-key", + } + ) + assert cfg.provider == "gemini" + assert any( + "Multiple LLM providers configured" in rec.message and "Using 'gemini' based on precedence" in rec.message + for rec in caplog.records + ) + + +def 
test_missing_everything_raises_clear_error() -> None: + with pytest.raises(RuntimeError, match="No LLM provider configured"): + resolve_llm_config({}) From 2aafd3d5aaf9743d5de24197ff483eaca3269644 Mon Sep 17 00:00:00 2001 From: Trevor Mells Date: Fri, 27 Feb 2026 07:40:49 -0800 Subject: [PATCH 4/8] Migrate to Responses API and standardize auth/model handling --- README.md | 1 + pyproject.toml | 2 +- src/applypilot/discovery/smartextract.py | 4 +- src/applypilot/enrichment/detail.py | 2 +- src/applypilot/llm.py | 208 +++++++++-------------- src/applypilot/scoring/cover_letter.py | 2 +- src/applypilot/scoring/scorer.py | 2 +- src/applypilot/scoring/tailor.py | 4 +- src/applypilot/wizard/init.py | 5 +- tests/test_llm_client.py | 36 ++++ tests/test_llm_resolution.py | 18 ++ 11 files changed, 144 insertions(+), 140 deletions(-) create mode 100644 tests/test_llm_client.py diff --git a/README.md b/README.md index 140e43c..0f18a22 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ Each stage is independent. Run them all or pick what you need. | Claude Code CLI | Auto-apply | Install from [claude.ai/code](https://claude.ai/code) | **Gemini API key is free.** Get one at [aistudio.google.com](https://aistudio.google.com). OpenAI, Claude, and local models (Ollama/llama.cpp/vLLM) are also supported. +ApplyPilot uses Gemini through LiteLLM's native Gemini provider path, and Gemini API version routing is owned by LiteLLM. 
### Optional diff --git a/pyproject.toml b/pyproject.toml index 489ada9..622aa10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ dependencies = [ "typer>=0.9.0", "rich>=13.0", - "litellm", + "litellm~=1.63.0", "httpx>=0.24", "beautifulsoup4>=4.12", "playwright>=1.40", diff --git a/src/applypilot/discovery/smartextract.py b/src/applypilot/discovery/smartextract.py index 43c50c7..c8b5300 100644 --- a/src/applypilot/discovery/smartextract.py +++ b/src/applypilot/discovery/smartextract.py @@ -391,7 +391,7 @@ def judge_api_responses(api_responses: list[dict]) -> list[dict]: ) try: - raw = client.ask(prompt, max_tokens=1024) + raw = client.ask(prompt, max_output_tokens=1024) verdict = extract_json(raw) is_relevant = verdict.get("relevant", False) reason = verdict.get("reason", "?") @@ -640,7 +640,7 @@ def ask_llm(prompt: str) -> tuple[str, float, dict]: """Send prompt to LLM. Returns (response_text, seconds_taken, metadata).""" client = get_client() t0 = time.time() - text = client.ask(prompt, max_tokens=4096) + text = client.ask(prompt, max_output_tokens=4096) elapsed = time.time() - t0 meta = { "finish_reason": "stop", diff --git a/src/applypilot/enrichment/detail.py b/src/applypilot/enrichment/detail.py index 8a79579..c76081d 100644 --- a/src/applypilot/enrichment/detail.py +++ b/src/applypilot/enrichment/detail.py @@ -463,7 +463,7 @@ def extract_with_llm(page, url: str) -> dict: try: client = get_client() t0 = time.time() - raw = client.ask(prompt, max_tokens=4096) + raw = client.ask(prompt, max_output_tokens=4096) elapsed = time.time() - t0 log.info("LLM: %d chars in, %.1fs", len(prompt), elapsed) diff --git a/src/applypilot/llm.py b/src/applypilot/llm.py index bc758fc..b71396a 100644 --- a/src/applypilot/llm.py +++ b/src/applypilot/llm.py @@ -7,6 +7,11 @@ LLM_URL -> Local OpenAI-compatible endpoint LLM_MODEL env var overrides the model name for any provider. 
+ +Gemini provider behavior: + - Uses LiteLLM's native Gemini provider path (no OpenAI-compat base URL). + - Google v1 is considered stable while v1beta can change; endpoint version choice is delegated to LiteLLM. + - Provider is inferred from configured credentials; model prefixes are handled internally. """ from __future__ import annotations @@ -15,13 +20,13 @@ from dataclasses import dataclass import logging import os -import time + +import litellm log = logging.getLogger(__name__) _OPENAI_BASE = "https://api.openai.com/v1" _ANTHROPIC_BASE = "https://api.anthropic.com/v1" -_GEMINI_BASE = "https://generativelanguage.googleapis.com/v1beta/openai" _PROVIDER_API_ENV_KEY = { "gemini": "GEMINI_API_KEY", "openai": "OPENAI_API_KEY", @@ -36,16 +41,8 @@ _MAX_RETRIES = 5 _TIMEOUT = 120 # seconds -_RATE_LIMIT_BASE_WAIT = 10 - -_GEMINI_THINKING_LEVELS = {"none", "minimal", "low", "medium", "high"} -_GEMINI_COMPAT_REASONING_EFFORT = { - "none": "none", - "minimal": "low", - "low": "low", - "medium": "high", - "high": "high", -} + +_THINKING_LEVELS = {"none", "low", "medium", "high"} @dataclass(frozen=True) @@ -67,7 +64,7 @@ def _env_get(env: Mapping[str, str], key: str) -> str: def _normalize_thinking_level(thinking_level: str) -> str: level = (thinking_level or "low").strip().lower() - if level not in _GEMINI_THINKING_LEVELS: + if level not in _THINKING_LEVELS: log.warning("Invalid thinking_level '%s', defaulting to 'low'.", thinking_level) return "low" return level @@ -85,6 +82,25 @@ def _default_model(provider: str) -> str: return _DEFAULT_MODEL_BY_PROVIDER[provider] +def _normalize_model_for_provider(provider: str, model: str) -> str: + normalized = model.strip() + if provider == "local": + return normalized + if normalized.startswith("models/"): + normalized = normalized.split("/", 1)[1] + + provider_prefix = f"{provider}/" + if normalized.startswith(provider_prefix): + return normalized[len(provider_prefix):] + + for other in ("gemini", "openai", "anthropic", 
"vertex_ai"): + other_prefix = f"{other}/" + if normalized.startswith(other_prefix): + return normalized.split("/", 1)[1] + + return normalized + + def resolve_llm_config(env: Mapping[str, str] | None = None) -> LLMConfig: """Resolve provider configuration from environment with deterministic precedence.""" env_map = env if env is not None else os.environ @@ -146,6 +162,7 @@ def resolve_llm_config(env: Mapping[str, str] | None = None) -> LLMConfig: chosen, ) model = model_override or _default_model(chosen) + model = _normalize_model_for_provider(chosen, model) if chosen == "local": return LLMConfig( @@ -157,7 +174,7 @@ def resolve_llm_config(env: Mapping[str, str] | None = None) -> LLMConfig: if chosen == "gemini": return LLMConfig( provider="gemini", - base_url=_GEMINI_BASE, + base_url="", model=model, api_key=gemini_key, ) @@ -176,70 +193,6 @@ def resolve_llm_config(env: Mapping[str, str] | None = None) -> LLMConfig: ) -def _extract_status_code(exc: Exception) -> int | None: - status_code = getattr(exc, "status_code", None) - if isinstance(status_code, int): - return status_code - response = getattr(exc, "response", None) - if response is not None: - status_code = getattr(response, "status_code", None) - if isinstance(status_code, int): - return status_code - return None - - -def _extract_retry_after(exc: Exception) -> float | None: - response = getattr(exc, "response", None) - if response is None: - return None - headers = getattr(response, "headers", {}) or {} - retry_after = headers.get("Retry-After") or headers.get("X-RateLimit-Reset-Requests") - if not retry_after: - return None - try: - return float(retry_after) - except (TypeError, ValueError): - return None - - -def _is_timeout_error(exc: Exception) -> bool: - if isinstance(exc, TimeoutError): - return True - text = str(exc).lower() - return "timed out" in text or "timeout" in text - - -def _extract_text_content(resp: object) -> str: - choices = getattr(resp, "choices", None) - if choices is None and 
isinstance(resp, dict): - choices = resp.get("choices", []) - if not choices: - raise RuntimeError("LLM response contained no choices.") - - first = choices[0] - if isinstance(first, dict): - message = first.get("message", {}) - else: - message = getattr(first, "message", {}) - - content = message.get("content") if isinstance(message, dict) else getattr(message, "content", None) - if isinstance(content, str): - return content - if isinstance(content, list): - chunks: list[str] = [] - for part in content: - if isinstance(part, str): - chunks.append(part) - elif isinstance(part, dict): - text = part.get("text") - if isinstance(text, str): - chunks.append(text) - text = "".join(chunks).strip() - if text: - return text - raise RuntimeError("LLM response contained no text content.") - - class LLMClient: """Thin wrapper around LiteLLM completion().""" @@ -258,16 +211,16 @@ def _build_completion_args( self, messages: list[dict], temperature: float | None, - max_tokens: int, + max_output_tokens: int, thinking_level: str | None, - completion_kwargs: Mapping[str, object] | None, + response_kwargs: Mapping[str, object] | None, ) -> dict: args: dict = { "model": _provider_model(self.provider, self.model), "messages": messages, - "max_tokens": max_tokens, + "max_tokens": max_output_tokens, "timeout": _TIMEOUT, - "num_retries": 0, # ApplyPilot handles retries centrally below. + "num_retries": _MAX_RETRIES, # Delegate retry handling to LiteLLM. 
         }
         if temperature is not None:
             args["temperature"] = temperature
@@ -277,68 +230,61 @@ def _build_completion_args(
             args["api_base"] = self.config.base_url
             if self.config.api_key:
                 args["api_key"] = self.config.api_key
-        elif self.provider == "gemini" and thinking_level is not None:
+        if thinking_level is not None:
             level = _normalize_thinking_level(thinking_level)
-            args["reasoning_effort"] = _GEMINI_COMPAT_REASONING_EFFORT[level]
+            args["reasoning_effort"] = level
 
-        if completion_kwargs:
-            args.update(completion_kwargs)
+        if response_kwargs:
+            args.update(response_kwargs)
         return args
 
     def chat(
         self,
         messages: list[dict],
         temperature: float | None = None,
-        max_tokens: int = 10000,
+        max_output_tokens: int = 10000,
         thinking_level: str | None = None,
-        completion_kwargs: Mapping[str, object] | None = None,
+        response_kwargs: Mapping[str, object] | None = None,
     ) -> str:
         """Send a completion request and return plain text content."""
+        # Suppress LiteLLM's verbose multiline info logs (e.g. request traces).
+        if hasattr(litellm, 'set_verbose'):
+            litellm.set_verbose = False
+        if hasattr(litellm, 'suppress_debug_info'):
+            litellm.suppress_debug_info = True
+
         try:
-            import litellm
-        except ModuleNotFoundError as exc:
-            raise RuntimeError(
-                "LiteLLM is required for AI stages but is not installed. "
-                "Install dependencies and re-run."
-            ) from exc
-
-        # Suppress LiteLLM's verbose multiline info logs (e.g. completion() traces).
- litellm.set_verbose = False - litellm.suppress_debug_info = True - - for attempt in range(_MAX_RETRIES): - try: - response = litellm.completion( - **self._build_completion_args( - messages=messages, - temperature=temperature, - max_tokens=max_tokens, - thinking_level=thinking_level, - completion_kwargs=completion_kwargs, - ) + response = litellm.completion( + **self._build_completion_args( + messages=messages, + temperature=temperature, + max_output_tokens=max_output_tokens, + thinking_level=thinking_level, + response_kwargs=response_kwargs, ) - return _extract_text_content(response) - except Exception as exc: # pragma: no cover - provider SDK exception types vary by backend/version. - status_code = _extract_status_code(exc) - if status_code in (429, 503, 529) and attempt < _MAX_RETRIES - 1: - wait = _extract_retry_after(exc) or min(_RATE_LIMIT_BASE_WAIT * (2 ** attempt), 60) - log.warning( - "LLM rate limited (HTTP %s). Waiting %ds before retry %d/%d.", - status_code, wait, attempt + 1, _MAX_RETRIES, - ) - time.sleep(wait) - continue - if _is_timeout_error(exc) and attempt < _MAX_RETRIES - 1: - wait = min(_RATE_LIMIT_BASE_WAIT * (2 ** attempt), 60) - log.warning( - "LLM request timed out, retrying in %ds (attempt %d/%d)", - wait, attempt + 1, _MAX_RETRIES, - ) - time.sleep(wait) - continue - raise RuntimeError(f"LLM request failed ({self.provider}/{self.model}): {exc}") from exc - - raise RuntimeError("LLM request failed after all retries") + ) + + choices = getattr(response, "choices", None) + if not choices: + raise RuntimeError("LLM response contained no choices.") + content = choices[0].message.content + + if isinstance(content, str): + text = content.strip() + elif isinstance(content, list): + text = "".join( + part if isinstance(part, str) else part.get("text", "") + for part in content + if isinstance(part, (str, dict)) + ).strip() + else: + text = "" + + if not text: + raise RuntimeError("LLM response contained no text content.") + return text + except 
Exception as exc: # pragma: no cover - provider SDK exception types vary by backend/version. + raise RuntimeError(f"LLM request failed ({self.provider}/{self.model}): {exc}") from exc def ask(self, prompt: str, **kwargs) -> str: """Convenience: single user prompt -> assistant response.""" diff --git a/src/applypilot/scoring/cover_letter.py b/src/applypilot/scoring/cover_letter.py index 77045b5..06e9333 100644 --- a/src/applypilot/scoring/cover_letter.py +++ b/src/applypilot/scoring/cover_letter.py @@ -164,7 +164,7 @@ def generate_cover_letter( )}, ] - letter = client.chat(messages, max_tokens=10000) + letter = client.chat(messages, max_output_tokens=10000) letter = sanitize_text(letter) # auto-fix em dashes, smart quotes letter = _strip_preamble(letter) # remove any "Here is the letter:" prefix diff --git a/src/applypilot/scoring/scorer.py b/src/applypilot/scoring/scorer.py index 42efda3..61d6e5e 100644 --- a/src/applypilot/scoring/scorer.py +++ b/src/applypilot/scoring/scorer.py @@ -93,7 +93,7 @@ def score_job(resume_text: str, job: dict) -> dict: try: client = get_client() - response = client.chat(messages, max_tokens=512) + response = client.chat(messages, max_output_tokens=512) return _parse_score_response(response) except Exception as e: log.error("LLM error scoring job '%s': %s", job.get("title", "?"), e) diff --git a/src/applypilot/scoring/tailor.py b/src/applypilot/scoring/tailor.py index 28f2a37..aaf4021 100644 --- a/src/applypilot/scoring/tailor.py +++ b/src/applypilot/scoring/tailor.py @@ -323,7 +323,7 @@ def judge_tailored_resume( ] client = get_client() - response = client.chat(messages, max_tokens=512) + response = client.chat(messages, max_output_tokens=512) passed = "VERDICT: PASS" in response.upper() issues = "none" @@ -397,7 +397,7 @@ def tailor_resume( {"role": "user", "content": f"ORIGINAL RESUME:\n{resume_text}\n\n---\n\nTARGET JOB:\n{job_text}\n\nReturn the JSON:"}, ] - raw = client.chat(messages, max_tokens=2048) + raw = client.chat(messages, 
max_output_tokens=2048) # Parse JSON from response try: diff --git a/src/applypilot/wizard/init.py b/src/applypilot/wizard/init.py index 9367f91..88cbea1 100644 --- a/src/applypilot/wizard/init.py +++ b/src/applypilot/wizard/init.py @@ -277,7 +277,10 @@ def _setup_ai_features() -> None: console.print("[dim]No AI provider configured. You can add one later with [bold]applypilot init[/bold].[/dim]") return - model = Prompt.ask("LLM model override (optional, leave blank to use provider defaults)", default="").strip() + model = Prompt.ask( + "LLM model override (optional, leave blank to use provider defaults)", + default="", + ).strip() if model: env_lines.append(f"LLM_MODEL={model}") diff --git a/tests/test_llm_client.py b/tests/test_llm_client.py new file mode 100644 index 0000000..0e6fe6f --- /dev/null +++ b/tests/test_llm_client.py @@ -0,0 +1,36 @@ +from applypilot.llm import LLMClient, LLMConfig, _normalize_thinking_level + + +def test_normalize_thinking_level_accepts_supported_levels() -> None: + assert _normalize_thinking_level("none") == "none" + assert _normalize_thinking_level("low") == "low" + assert _normalize_thinking_level("medium") == "medium" + assert _normalize_thinking_level("high") == "high" + + +def test_normalize_thinking_level_defaults_minimal_to_low() -> None: + assert _normalize_thinking_level("minimal") == "low" + + +def test_normalize_thinking_level_defaults_invalid_value_to_low() -> None: + assert _normalize_thinking_level("max") == "low" + + +def test_build_completion_args_applies_reasoning_effort_for_openai() -> None: + client = LLMClient( + LLMConfig( + provider="openai", + base_url="https://api.openai.com/v1", + model="gpt-4o-mini", + api_key="test-key", + ) + ) + args = client._build_completion_args( + messages=[{"role": "user", "content": "hello"}], + temperature=None, + max_output_tokens=128, + thinking_level="medium", + response_kwargs=None, + ) + assert args["reasoning_effort"] == "medium" + assert args["max_tokens"] == 128 diff --git 
a/tests/test_llm_resolution.py b/tests/test_llm_resolution.py index 96c8c2a..d6db6a1 100644 --- a/tests/test_llm_resolution.py +++ b/tests/test_llm_resolution.py @@ -8,12 +8,30 @@ def test_only_gemini_api_key_selects_gemini() -> None: cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key"}) assert cfg.provider == "gemini" + assert cfg.base_url == "" + assert cfg.model == "gemini-2.0-flash" def test_only_openai_api_key_selects_openai() -> None: cfg = resolve_llm_config({"OPENAI_API_KEY": "o-key"}) assert cfg.provider == "openai" + +def test_gemini_model_override_without_prefix_is_normalized() -> None: + cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key", "LLM_MODEL": "gemini-2.5-flash"}) + assert cfg.model == "gemini-2.5-flash" + + +def test_gemini_model_override_google_models_prefix_is_normalized() -> None: + cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key", "LLM_MODEL": "models/gemini-2.5-flash"}) + assert cfg.model == "gemini-2.5-flash" + + +def test_gemini_model_override_gemini_prefix_is_stripped() -> None: + cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key", "LLM_MODEL": "gemini/gemini-2.5-flash"}) + assert cfg.model == "gemini-2.5-flash" + + def test_llm_url_with_keys_selects_local() -> None: cfg = resolve_llm_config( { From e1c06eda5f94d31e8c48880389ff2cec26c70b48 Mon Sep 17 00:00:00 2001 From: Trevor Mells Date: Fri, 27 Feb 2026 07:40:53 -0800 Subject: [PATCH 5/8] Remove provider side effects and tighten runtime verification defaults --- README.md | 6 +++ pyproject.toml | 3 ++ src/applypilot/cli.py | 2 +- src/applypilot/llm.py | 66 +++++---------------------- tests/test_gemini_smoke.py | 49 ++++++++++++++++++++ tests/test_llm_client.py | 88 +++++++++++++++++++++++++++++------- tests/test_llm_resolution.py | 2 +- 7 files changed, 143 insertions(+), 73 deletions(-) create mode 100644 tests/test_gemini_smoke.py diff --git a/README.md b/README.md index 0f18a22..5bc20b2 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,12 @@ ApplyPilot uses Gemini through 
LiteLLM's native Gemini provider path, and Gemini |-----------|-------------| | CapSolver API key | Solves CAPTCHAs during auto-apply (hCaptcha, reCAPTCHA, Turnstile, FunCaptcha). Without it, CAPTCHA-blocked applications just fail gracefully | +### Gemini Smoke Check (optional) + +```bash +GEMINI_API_KEY=your_key_here pytest -m smoke -q tests/test_gemini_smoke.py +``` + > **Note:** python-jobspy is installed separately with `--no-deps` because it pins an exact numpy version in its metadata that conflicts with pip's resolver. It works fine with modern numpy at runtime. --- diff --git a/pyproject.toml b/pyproject.toml index 622aa10..21a3afd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,3 +57,6 @@ line-length = 120 [tool.pytest.ini_options] pythonpath = ["src"] testpaths = ["tests"] +markers = [ + "smoke: live-provider smoke tests that require external API keys", +] diff --git a/src/applypilot/cli.py b/src/applypilot/cli.py index eaeed7f..85a859c 100644 --- a/src/applypilot/cli.py +++ b/src/applypilot/cli.py @@ -398,7 +398,7 @@ def doctor() -> None: try: llm_cfg = resolve_llm_config() if llm_cfg.provider == "local": - results.append(("LLM API key", ok_mark, f"Local: {llm_cfg.base_url} ({llm_cfg.model})")) + results.append(("LLM API key", ok_mark, f"Local: {llm_cfg.api_base} ({llm_cfg.model})")) else: label = { "gemini": "Gemini", diff --git a/src/applypilot/llm.py b/src/applypilot/llm.py index b71396a..80f8d21 100644 --- a/src/applypilot/llm.py +++ b/src/applypilot/llm.py @@ -25,13 +25,6 @@ log = logging.getLogger(__name__) -_OPENAI_BASE = "https://api.openai.com/v1" -_ANTHROPIC_BASE = "https://api.anthropic.com/v1" -_PROVIDER_API_ENV_KEY = { - "gemini": "GEMINI_API_KEY", - "openai": "OPENAI_API_KEY", - "anthropic": "ANTHROPIC_API_KEY", -} _DEFAULT_MODEL_BY_PROVIDER = { "local": "local-model", "gemini": "gemini-2.0-flash", @@ -42,15 +35,13 @@ _MAX_RETRIES = 5 _TIMEOUT = 120 # seconds -_THINKING_LEVELS = {"none", "low", "medium", "high"} - 
@dataclass(frozen=True) class LLMConfig: """Normalized LLM configuration consumed by LLMClient.""" provider: str - base_url: str + api_base: str | None model: str api_key: str @@ -62,14 +53,6 @@ def _env_get(env: Mapping[str, str], key: str) -> str: return str(value).strip() -def _normalize_thinking_level(thinking_level: str) -> str: - level = (thinking_level or "low").strip().lower() - if level not in _THINKING_LEVELS: - log.warning("Invalid thinking_level '%s', defaulting to 'low'.", thinking_level) - return "low" - return level - - def _provider_model(provider: str, model: str) -> str: if provider == "local": return model @@ -167,27 +150,27 @@ def resolve_llm_config(env: Mapping[str, str] | None = None) -> LLMConfig: if chosen == "local": return LLMConfig( provider="local", - base_url=local_url.rstrip("/"), + api_base=local_url.rstrip("/"), model=model, api_key=_env_get(env_map, "LLM_API_KEY"), ) if chosen == "gemini": return LLMConfig( provider="gemini", - base_url="", + api_base=None, model=model, api_key=gemini_key, ) if chosen == "openai": return LLMConfig( provider="openai", - base_url=_OPENAI_BASE, + api_base=None, model=model, api_key=openai_key, ) return LLMConfig( provider="anthropic", - base_url=_ANTHROPIC_BASE, + api_base=None, model=model, api_key=anthropic_key, ) @@ -200,19 +183,12 @@ def __init__(self, config: LLMConfig) -> None: self.config = config self.provider = config.provider self.model = config.model - self._apply_provider_env() - - def _apply_provider_env(self) -> None: - env_key = _PROVIDER_API_ENV_KEY.get(self.provider) - if env_key and self.config.api_key: - os.environ[env_key] = self.config.api_key def _build_completion_args( self, messages: list[dict], temperature: float | None, max_output_tokens: int, - thinking_level: str | None, response_kwargs: Mapping[str, object] | None, ) -> dict: args: dict = { @@ -225,14 +201,13 @@ def _build_completion_args( if temperature is not None: args["temperature"] = temperature + if 
self.config.api_key: + args["api_key"] = self.config.api_key + if self.provider == "local": args["model"] = self.model - args["api_base"] = self.config.base_url - if self.config.api_key: - args["api_key"] = self.config.api_key - if thinking_level is not None: - level = _normalize_thinking_level(thinking_level) - args["reasoning_effort"] = level + if self.config.api_base: + args["api_base"] = self.config.api_base if response_kwargs: args.update(response_kwargs) @@ -243,15 +218,10 @@ def chat( messages: list[dict], temperature: float | None = None, max_output_tokens: int = 10000, - thinking_level: str | None = None, response_kwargs: Mapping[str, object] | None = None, ) -> str: """Send a completion request and return plain text content.""" - # Suppress LiteLLM's verbose multiline info logs (e.g. request traces). - if hasattr(litellm, 'set_verbose'): - litellm.set_verbose(False) - if hasattr(litellm, 'suppress_debug_info'): - litellm.suppress_debug_info = True + litellm.suppress_debug_info = True try: response = litellm.completion( @@ -259,7 +229,6 @@ def chat( messages=messages, temperature=temperature, max_output_tokens=max_output_tokens, - thinking_level=thinking_level, response_kwargs=response_kwargs, ) ) @@ -267,18 +236,7 @@ def chat( choices = getattr(response, "choices", None) if not choices: raise RuntimeError("LLM response contained no choices.") - content = choices[0].message.content - - if isinstance(content, str): - text = content.strip() - elif isinstance(content, list): - text = "".join( - part if isinstance(part, str) else part.get("text", "") - for part in content - if isinstance(part, (str, dict)) - ).strip() - else: - text = "" + text = response.choices[0].message.content.strip() if not text: raise RuntimeError("LLM response contained no text content.") diff --git a/tests/test_gemini_smoke.py b/tests/test_gemini_smoke.py new file mode 100644 index 0000000..8b732af --- /dev/null +++ b/tests/test_gemini_smoke.py @@ -0,0 +1,49 @@ +import os + +import 
pytest + +litellm = pytest.importorskip("litellm") + + +def _gemini_smoke_model() -> str: + raw = os.getenv("GEMINI_SMOKE_MODEL", "gemini-2.0-flash").strip() + if raw.startswith("gemini/"): + return raw + if raw.startswith("models/"): + raw = raw.split("/", 1)[1] + return f"gemini/{raw}" + + +def _content_text(content: object) -> str: + if isinstance(content, str): + return content.strip() + if isinstance(content, list): + return "".join( + part if isinstance(part, str) else str(part.get("text", "")) + for part in content + if isinstance(part, (str, dict)) + ).strip() + return "" + + +@pytest.mark.smoke +def test_gemini_smoke_completion_returns_non_empty_content() -> None: + api_key = os.getenv("GEMINI_API_KEY", "").strip() + if not api_key: + pytest.skip("Set GEMINI_API_KEY to run Gemini smoke tests.") + + prompt = os.getenv("GEMINI_SMOKE_PROMPT", "Reply with a single word: ready.") + response = litellm.completion( + model=_gemini_smoke_model(), + api_key=api_key, + messages=[{"role": "user", "content": prompt}], + max_tokens=32, + timeout=60, + num_retries=1, + ) + + choices = getattr(response, "choices", None) + assert choices, "Gemini smoke call returned no choices." + + content = choices[0].message.content + assert _content_text(content), "Gemini smoke call returned empty choices[0].message.content." 
diff --git a/tests/test_llm_client.py b/tests/test_llm_client.py index 0e6fe6f..e0a6203 100644 --- a/tests/test_llm_client.py +++ b/tests/test_llm_client.py @@ -1,26 +1,26 @@ -from applypilot.llm import LLMClient, LLMConfig, _normalize_thinking_level +import os +from applypilot.llm import LLMClient, LLMConfig -def test_normalize_thinking_level_accepts_supported_levels() -> None: - assert _normalize_thinking_level("none") == "none" - assert _normalize_thinking_level("low") == "low" - assert _normalize_thinking_level("medium") == "medium" - assert _normalize_thinking_level("high") == "high" - -def test_normalize_thinking_level_defaults_minimal_to_low() -> None: - assert _normalize_thinking_level("minimal") == "low" - - -def test_normalize_thinking_level_defaults_invalid_value_to_low() -> None: - assert _normalize_thinking_level("max") == "low" +def test_client_init_does_not_mutate_provider_env(monkeypatch) -> None: + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + LLMClient( + LLMConfig( + provider="openai", + api_base=None, + model="gpt-4o-mini", + api_key="test-key", + ) + ) + assert "OPENAI_API_KEY" not in os.environ -def test_build_completion_args_applies_reasoning_effort_for_openai() -> None: +def test_build_completion_args_does_not_include_reasoning_effort_by_default() -> None: client = LLMClient( LLMConfig( provider="openai", - base_url="https://api.openai.com/v1", + api_base=None, model="gpt-4o-mini", api_key="test-key", ) @@ -29,8 +29,62 @@ def test_build_completion_args_applies_reasoning_effort_for_openai() -> None: messages=[{"role": "user", "content": "hello"}], temperature=None, max_output_tokens=128, - thinking_level="medium", response_kwargs=None, ) - assert args["reasoning_effort"] == "medium" + assert "reasoning_effort" not in args assert args["max_tokens"] == 128 + + +def test_build_completion_args_uses_litellm_native_gemini_model_prefix() -> None: + client = LLMClient( + LLMConfig( + provider="gemini", + api_base=None, + 
model="gemini-2.0-flash", + api_key="g-key", + ) + ) + args = client._build_completion_args( + messages=[{"role": "user", "content": "hello"}], + temperature=None, + max_output_tokens=64, + response_kwargs=None, + ) + assert args["model"] == "gemini/gemini-2.0-flash" + + +def test_build_completion_args_includes_api_key_for_remote_provider() -> None: + client = LLMClient( + LLMConfig( + provider="gemini", + api_base=None, + model="gemini-2.0-flash", + api_key="g-key", + ) + ) + args = client._build_completion_args( + messages=[{"role": "user", "content": "hello"}], + temperature=None, + max_output_tokens=64, + response_kwargs=None, + ) + assert args["api_key"] == "g-key" + + +def test_build_completion_args_sets_local_api_base_and_api_key() -> None: + client = LLMClient( + LLMConfig( + provider="local", + api_base="http://127.0.0.1:8080/v1", + model="local-model", + api_key="local-key", + ) + ) + args = client._build_completion_args( + messages=[{"role": "user", "content": "hello"}], + temperature=None, + max_output_tokens=64, + response_kwargs=None, + ) + assert args["api_base"] == "http://127.0.0.1:8080/v1" + assert args["api_key"] == "local-key" diff --git a/tests/test_llm_resolution.py b/tests/test_llm_resolution.py index d6db6a1..c1c4fb5 100644 --- a/tests/test_llm_resolution.py +++ b/tests/test_llm_resolution.py @@ -8,7 +8,7 @@ def test_only_gemini_api_key_selects_gemini() -> None: cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key"}) assert cfg.provider == "gemini" - assert cfg.base_url == "" + assert cfg.api_base is None assert cfg.model == "gemini-2.0-flash" From a524b8aff963fd8aa8ad4400033f66f827c32e98 Mon Sep 17 00:00:00 2001 From: Trevor Mells Date: Fri, 27 Feb 2026 07:41:08 -0800 Subject: [PATCH 6/8] Simplify model selection and remove legacy input-shape compatibility paths --- README.md | 4 +- src/applypilot/cli.py | 7 +- src/applypilot/config.py | 19 +- src/applypilot/discovery/smartextract.py | 4 +- src/applypilot/enrichment/detail.py | 2 +- 
src/applypilot/llm.py | 294 ++++++++++------------- src/applypilot/wizard/init.py | 16 +- tests/test_gemini_smoke.py | 2 +- tests/test_llm_client.py | 115 +++++---- tests/test_llm_resolution.py | 72 +++--- 10 files changed, 263 insertions(+), 272 deletions(-) diff --git a/README.md b/README.md index 5bc20b2..59df888 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ Each stage is independent. Run them all or pick what you need. |-----------|-------------|---------| | Python 3.11+ | Everything | Core runtime | | Node.js 18+ | Auto-apply | Needed for `npx` to run Playwright MCP server | -| LLM API key or local endpoint | Scoring, tailoring, cover letters | Set one of `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, or `LLM_URL` | +| LLM credentials or local endpoint | Scoring, tailoring, cover letters | Set one of `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, or `LLM_URL`. Optional: set `LLM_MODEL` (for example `gemini/gemini-3.0-flash`) to override the default model. | | Chrome/Chromium | Auto-apply | Auto-detected on most systems | | Claude Code CLI | Auto-apply | Install from [claude.ai/code](https://claude.ai/code) | @@ -122,7 +122,7 @@ Your personal data in one structured file: contact info, work authorization, com Job search queries, target titles, locations, boards. Run multiple searches with different parameters. ### `.env` -API keys and runtime config: `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `LLM_URL`, `LLM_MODEL`, `CAPSOLVER_API_KEY` (optional). +API keys and runtime config: `GEMINI_API_KEY`, `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `LLM_URL`, optional `LLM_MODEL`, optional `LLM_API_KEY`, and `CAPSOLVER_API_KEY`. 
### Package configs (shipped with ApplyPilot) - `config/employers.yaml` - Workday employer registry (48 preconfigured) diff --git a/src/applypilot/cli.py b/src/applypilot/cli.py index 85a859c..7c770ac 100644 --- a/src/applypilot/cli.py +++ b/src/applypilot/cli.py @@ -397,8 +397,8 @@ def doctor() -> None: try: llm_cfg = resolve_llm_config() - if llm_cfg.provider == "local": - results.append(("LLM API key", ok_mark, f"Local: {llm_cfg.api_base} ({llm_cfg.model})")) + if llm_cfg.api_base: + results.append(("LLM API key", ok_mark, f"Custom endpoint: {llm_cfg.api_base} ({llm_cfg.model})")) else: label = { "gemini": "Gemini", @@ -409,7 +409,8 @@ def doctor() -> None: except RuntimeError: results.append( ("LLM API key", fail_mark, - "Set GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, or LLM_URL in ~/.applypilot/.env") + "Set one of GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, LLM_URL, " + "or set LLM_MODEL with LLM_API_KEY in ~/.applypilot/.env") ) # --- Tier 3 checks --- diff --git a/src/applypilot/config.py b/src/applypilot/config.py index 9067245..090dec6 100644 --- a/src/applypilot/config.py +++ b/src/applypilot/config.py @@ -206,10 +206,14 @@ def get_tier() -> int: """ load_env() - has_llm = any( + has_provider_source = any( os.environ.get(k) for k in ("GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "LLM_URL") ) + has_model_and_generic_key = bool((os.environ.get("LLM_MODEL") or "").strip()) and bool( + (os.environ.get("LLM_API_KEY") or "").strip() + ) + has_llm = has_provider_source or has_model_and_generic_key if not has_llm: return 1 @@ -241,13 +245,18 @@ def check_tier(required: int, feature: str) -> None: _console = Console(stderr=True) missing: list[str] = [] - if required >= 2 and not any( + has_provider_source = any( os.environ.get(k) for k in ("GEMINI_API_KEY", "OPENAI_API_KEY", "ANTHROPIC_API_KEY", "LLM_URL") - ): + ) + has_model_and_generic_key = bool((os.environ.get("LLM_MODEL") or "").strip()) and bool( + (os.environ.get("LLM_API_KEY") 
or "").strip() + ) + if required >= 2 and not (has_provider_source or has_model_and_generic_key): missing.append( - "LLM API key — run [bold]applypilot init[/bold] or set " - "GEMINI_API_KEY / OPENAI_API_KEY / ANTHROPIC_API_KEY / LLM_URL" + "LLM config — run [bold]applypilot init[/bold] or set one of " + "GEMINI_API_KEY / OPENAI_API_KEY / ANTHROPIC_API_KEY / LLM_URL " + "(or set LLM_MODEL with LLM_API_KEY)" ) if required >= 3: if not shutil.which("claude"): diff --git a/src/applypilot/discovery/smartextract.py b/src/applypilot/discovery/smartextract.py index c8b5300..c9bb18c 100644 --- a/src/applypilot/discovery/smartextract.py +++ b/src/applypilot/discovery/smartextract.py @@ -391,7 +391,7 @@ def judge_api_responses(api_responses: list[dict]) -> list[dict]: ) try: - raw = client.ask(prompt, max_output_tokens=1024) + raw = client.chat([{"role": "user", "content": prompt}], max_output_tokens=1024) verdict = extract_json(raw) is_relevant = verdict.get("relevant", False) reason = verdict.get("reason", "?") @@ -640,7 +640,7 @@ def ask_llm(prompt: str) -> tuple[str, float, dict]: """Send prompt to LLM. 
Returns (response_text, seconds_taken, metadata).""" client = get_client() t0 = time.time() - text = client.ask(prompt, max_output_tokens=4096) + text = client.chat([{"role": "user", "content": prompt}], max_output_tokens=4096) elapsed = time.time() - t0 meta = { "finish_reason": "stop", diff --git a/src/applypilot/enrichment/detail.py b/src/applypilot/enrichment/detail.py index c76081d..f415cc9 100644 --- a/src/applypilot/enrichment/detail.py +++ b/src/applypilot/enrichment/detail.py @@ -463,7 +463,7 @@ def extract_with_llm(page, url: str) -> dict: try: client = get_client() t0 = time.time() - raw = client.ask(prompt, max_output_tokens=4096) + raw = client.chat([{"role": "user", "content": prompt}], max_output_tokens=4096) elapsed = time.time() - t0 log.info("LLM: %d chars in, %.1fs", len(prompt), elapsed) diff --git a/src/applypilot/llm.py b/src/applypilot/llm.py index 80f8d21..9888c9f 100644 --- a/src/applypilot/llm.py +++ b/src/applypilot/llm.py @@ -1,17 +1,13 @@ """Unified LLM client for ApplyPilot using LiteLLM. -Auto-detects provider from environment: - GEMINI_API_KEY -> Google Gemini (default: gemini-2.0-flash) - OPENAI_API_KEY -> OpenAI (default: gpt-4o-mini) - ANTHROPIC_API_KEY -> Anthropic Claude (default: claude-3-5-haiku-latest) - LLM_URL -> Local OpenAI-compatible endpoint - -LLM_MODEL env var overrides the model name for any provider. - -Gemini provider behavior: - - Uses LiteLLM's native Gemini provider path (no OpenAI-compat base URL). - - Google v1 is considered stable while v1beta can change; endpoint version choice is delegated to LiteLLM. - - Provider is inferred from configured credentials; model prefixes are handled internally. +Runtime contract: + - If set, LLM_MODEL must be a fully-qualified LiteLLM model string + (for example: openai/gpt-4o-mini, anthropic/claude-3-5-haiku-latest, + gemini/gemini-3.0-flash). 
+ - If LLM_MODEL is unset, provider is inferred by first configured source: + GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, then LLM_URL. + - Credentials come from provider env vars or generic LLM_API_KEY. + - LLM_URL is optional for custom OpenAI-compatible endpoints. """ from __future__ import annotations @@ -20,25 +16,32 @@ from dataclasses import dataclass import logging import os +from typing import Any, Literal, TypedDict, Unpack +import warnings import litellm log = logging.getLogger(__name__) -_DEFAULT_MODEL_BY_PROVIDER = { - "local": "local-model", - "gemini": "gemini-2.0-flash", - "openai": "gpt-4o-mini", - "anthropic": "claude-3-5-haiku-latest", -} - _MAX_RETRIES = 5 _TIMEOUT = 120 # seconds +_INFERRED_SOURCE_ORDER: tuple[tuple[str, str], ...] = ( + ("gemini", "GEMINI_API_KEY"), + ("openai", "OPENAI_API_KEY"), + ("anthropic", "ANTHROPIC_API_KEY"), + ("openai", "LLM_URL"), +) +_DEFAULT_MODEL_BY_PROVIDER = { + "gemini": "gemini/gemini-3.0-flash", + "openai": "openai/gpt-5-mini", + "anthropic": "anthropic/claude-haiku-4-5", +} +_DEFAULT_LOCAL_MODEL = "openai/local-model" @dataclass(frozen=True) class LLMConfig: - """Normalized LLM configuration consumed by LLMClient.""" + """LLM configuration consumed by LLMClient.""" provider: str api_base: str | None @@ -46,6 +49,22 @@ class LLMConfig: api_key: str +class ChatMessage(TypedDict): + role: Literal["system", "user", "assistant", "tool"] + content: str + + +class LiteLLMExtra(TypedDict, total=False): + stop: str | list[str] + top_p: float + seed: int + stream: bool + response_format: dict[str, Any] + tools: list[dict[str, Any]] + tool_choice: str | dict[str, Any] + fallbacks: list[str] + + def _env_get(env: Mapping[str, str], key: str) -> str: value = env.get(key, "") if value is None: @@ -53,126 +72,75 @@ def _env_get(env: Mapping[str, str], key: str) -> str: return str(value).strip() -def _provider_model(provider: str, model: str) -> str: - if provider == "local": - return model - if 
model.startswith(f"{provider}/"): - return model - return f"{provider}/{model}" - - -def _default_model(provider: str) -> str: - return _DEFAULT_MODEL_BY_PROVIDER[provider] - - -def _normalize_model_for_provider(provider: str, model: str) -> str: - normalized = model.strip() - if provider == "local": - return normalized - if normalized.startswith("models/"): - normalized = normalized.split("/", 1)[1] - - provider_prefix = f"{provider}/" - if normalized.startswith(provider_prefix): - return normalized[len(provider_prefix):] +def _provider_from_model(model: str) -> str: + provider, _, model_name = model.partition("/") + if not provider or not model_name: + raise RuntimeError( + "LLM_MODEL must include a provider prefix (for example 'openai/gpt-4o-mini')." + ) + return provider - for other in ("gemini", "openai", "anthropic", "vertex_ai"): - other_prefix = f"{other}/" - if normalized.startswith(other_prefix): - return normalized.split("/", 1)[1] - return normalized +def _infer_provider_and_source(env: Mapping[str, str]) -> tuple[str, str] | None: + for provider, env_key in _INFERRED_SOURCE_ORDER: + if _env_get(env, env_key): + return provider, env_key + return None def resolve_llm_config(env: Mapping[str, str] | None = None) -> LLMConfig: - """Resolve provider configuration from environment with deterministic precedence.""" + """Resolve LLM configuration from environment.""" env_map = env if env is not None else os.environ - model_override = _env_get(env_map, "LLM_MODEL") + model = _env_get(env_map, "LLM_MODEL") local_url = _env_get(env_map, "LLM_URL") - gemini_key = _env_get(env_map, "GEMINI_API_KEY") - openai_key = _env_get(env_map, "OPENAI_API_KEY") - anthropic_key = _env_get(env_map, "ANTHROPIC_API_KEY") - llm_provider = _env_get(env_map, "LLM_PROVIDER").lower() - - providers_present = { - "local": bool(local_url), - "gemini": bool(gemini_key), - "openai": bool(openai_key), - "anthropic": bool(anthropic_key), - } - precedence = ["local", "gemini", "openai", 
"anthropic"] - configured = [provider for provider in precedence if providers_present[provider]] - - if not configured: - raise RuntimeError( - "No LLM provider configured. " - "Set one of LLM_URL, GEMINI_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY." - ) - - chosen = "" - override_aliases = { - "local": "local", - "gemini": "gemini", - "openai": "openai", - "anthropic": "anthropic", - } - - # Optional override only when multiple providers are configured. - if len(configured) > 1 and llm_provider: - overridden = override_aliases.get(llm_provider) - if overridden and overridden in configured: - chosen = overridden - log.warning( - "Multiple LLM providers configured (%s). Using '%s' via LLM_PROVIDER override.", - ", ".join(configured), - chosen, - ) + inferred = _infer_provider_and_source(env_map) + if model: + if "/" in model: + provider = _provider_from_model(model) + elif inferred: + provider, _ = inferred + model = f"{provider}/{model}" else: - log.warning( - "Ignoring LLM_PROVIDER='%s' because it is not configured. " - "Using precedence: LLM_URL > GEMINI_API_KEY > OPENAI_API_KEY > ANTHROPIC_API_KEY.", - llm_provider, + raise RuntimeError( + "LLM_MODEL must include a provider prefix (for example 'openai/gpt-4o-mini')." ) - - if not chosen: - chosen = configured[0] - if len(configured) > 1: - log.warning( - "Multiple LLM providers configured (%s). Using '%s' based on precedence: " - "LLM_URL > GEMINI_API_KEY > OPENAI_API_KEY > ANTHROPIC_API_KEY.", - ", ".join(configured), - chosen, + else: + if not inferred: + raise RuntimeError( + "No LLM provider configured. Set one of GEMINI_API_KEY, OPENAI_API_KEY, " + "ANTHROPIC_API_KEY, LLM_URL, or LLM_MODEL." 
) - model = model_override or _default_model(chosen) - model = _normalize_model_for_provider(chosen, model) - - if chosen == "local": - return LLMConfig( - provider="local", - api_base=local_url.rstrip("/"), - model=model, - api_key=_env_get(env_map, "LLM_API_KEY"), - ) - if chosen == "gemini": - return LLMConfig( - provider="gemini", - api_base=None, - model=model, - api_key=gemini_key, + provider, source = inferred + if source == "LLM_URL": + model = _DEFAULT_LOCAL_MODEL + else: + model = _DEFAULT_MODEL_BY_PROVIDER[provider] + + provider_api_key_env = { + "gemini": "GEMINI_API_KEY", + "openai": "OPENAI_API_KEY", + "anthropic": "ANTHROPIC_API_KEY", + } + api_key_env = provider_api_key_env.get(provider, "LLM_API_KEY") + api_key = _env_get(env_map, api_key_env) or _env_get(env_map, "LLM_API_KEY") + + if not api_key and not local_url: + key_help = ( + f"{api_key_env} or LLM_API_KEY" + if provider in provider_api_key_env + else "LLM_API_KEY" ) - if chosen == "openai": - return LLMConfig( - provider="openai", - api_base=None, - model=model, - api_key=openai_key, + raise RuntimeError( + f"Missing credentials for LLM_MODEL '{model}'. Set {key_help}, or set LLM_URL for " + "a local OpenAI-compatible endpoint." ) + return LLMConfig( - provider="anthropic", - api_base=None, + provider=provider, + api_base=local_url.rstrip("/") if local_url else None, model=model, - api_key=anthropic_key, + api_key=api_key, ) @@ -183,60 +151,52 @@ def __init__(self, config: LLMConfig) -> None: self.config = config self.provider = config.provider self.model = config.model - - def _build_completion_args( - self, - messages: list[dict], - temperature: float | None, - max_output_tokens: int, - response_kwargs: Mapping[str, object] | None, - ) -> dict: - args: dict = { - "model": _provider_model(self.provider, self.model), - "messages": messages, - "max_tokens": max_output_tokens, - "timeout": _TIMEOUT, - "num_retries": _MAX_RETRIES, # Delegate retry handling to LiteLLM. 
- } - if temperature is not None: - args["temperature"] = temperature - - if self.config.api_key: - args["api_key"] = self.config.api_key - - if self.provider == "local": - args["model"] = self.model - if self.config.api_base: - args["api_base"] = self.config.api_base - - if response_kwargs: - args.update(response_kwargs) - return args + litellm.suppress_debug_info = True def chat( self, - messages: list[dict], - temperature: float | None = None, + messages: list[ChatMessage], + *, max_output_tokens: int = 10000, - response_kwargs: Mapping[str, object] | None = None, + temperature: float | None = None, + timeout: int = _TIMEOUT, + num_retries: int = _MAX_RETRIES, + drop_params: bool = True, + **extra: Unpack[LiteLLMExtra], ) -> str: """Send a completion request and return plain text content.""" - litellm.suppress_debug_info = True - try: - response = litellm.completion( - **self._build_completion_args( + if temperature is None: + response = litellm.completion( + model=self.model, + messages=messages, + max_tokens=max_output_tokens, + timeout=timeout, + num_retries=num_retries, + drop_params=drop_params, + api_key=self.config.api_key or None, + api_base=self.config.api_base or None, + **extra, + ) + else: + response = litellm.completion( + model=self.model, messages=messages, + max_tokens=max_output_tokens, temperature=temperature, - max_output_tokens=max_output_tokens, - response_kwargs=response_kwargs, + timeout=timeout, + num_retries=num_retries, + drop_params=drop_params, + api_key=self.config.api_key or None, + api_base=self.config.api_base or None, + **extra, ) - ) choices = getattr(response, "choices", None) if not choices: raise RuntimeError("LLM response contained no choices.") - text = response.choices[0].message.content.strip() + content = response.choices[0].message.content + text = content.strip() if isinstance(content, str) else str(content).strip() if not text: raise RuntimeError("LLM response contained no text content.") @@ -244,10 +204,6 @@ def 
chat( except Exception as exc: # pragma: no cover - provider SDK exception types vary by backend/version. raise RuntimeError(f"LLM request failed ({self.provider}/{self.model}): {exc}") from exc - def ask(self, prompt: str, **kwargs) -> str: - """Convenience: single user prompt -> assistant response.""" - return self.chat([{"role": "user", "content": prompt}], **kwargs) - def close(self) -> None: """No-op. LiteLLM completion() is stateless per call.""" return None diff --git a/src/applypilot/wizard/init.py b/src/applypilot/wizard/init.py index 88cbea1..06826bd 100644 --- a/src/applypilot/wizard/init.py +++ b/src/applypilot/wizard/init.py @@ -277,12 +277,18 @@ def _setup_ai_features() -> None: console.print("[dim]No AI provider configured. You can add one later with [bold]applypilot init[/bold].[/dim]") return + default_model_by_source = { + "gemini": "gemini/gemini-3.0-flash", + "openai": "openai/gpt-4o-mini", + "anthropic": "anthropic/claude-3-5-haiku-latest", + "local": "openai/local-model", + } + default_model = default_model_by_source.get(configured_sources[0], "openai/gpt-4o-mini") model = Prompt.ask( - "LLM model override (optional, leave blank to use provider defaults)", - default="", + "LLM model (required, include provider prefix)", + default=default_model, ).strip() - if model: - env_lines.append(f"LLM_MODEL={model}") + env_lines.append(f"LLM_MODEL={model}") env_lines.append("") ENV_PATH.write_text("\n".join(env_lines), encoding="utf-8") @@ -290,7 +296,7 @@ def _setup_ai_features() -> None: configured = ", ".join(configured_sources) console.print( f"[yellow]Multiple LLM providers saved ({configured}). 
" - "Runtime selects one deterministically by precedence.[/yellow]" + "Runtime routing follows LLM_MODEL's provider prefix.[/yellow]" ) console.print(f"[green]AI configuration saved to {ENV_PATH}[/green]") diff --git a/tests/test_gemini_smoke.py b/tests/test_gemini_smoke.py index 8b732af..fecc332 100644 --- a/tests/test_gemini_smoke.py +++ b/tests/test_gemini_smoke.py @@ -6,7 +6,7 @@ def _gemini_smoke_model() -> str: - raw = os.getenv("GEMINI_SMOKE_MODEL", "gemini-2.0-flash").strip() + raw = os.getenv("GEMINI_SMOKE_MODEL", "gemini-3.0-flash").strip() if raw.startswith("gemini/"): return raw if raw.startswith("models/"): diff --git a/tests/test_llm_client.py b/tests/test_llm_client.py index e0a6203..6470ea9 100644 --- a/tests/test_llm_client.py +++ b/tests/test_llm_client.py @@ -1,5 +1,7 @@ import os +from types import SimpleNamespace +import applypilot.llm as llm_module from applypilot.llm import LLMClient, LLMConfig @@ -9,82 +11,105 @@ def test_client_init_does_not_mutate_provider_env(monkeypatch) -> None: LLMConfig( provider="openai", api_base=None, - model="gpt-4o-mini", + model="openai/gpt-4o-mini", api_key="test-key", ) ) assert "OPENAI_API_KEY" not in os.environ + assert llm_module.litellm.suppress_debug_info is True -def test_build_completion_args_does_not_include_reasoning_effort_by_default() -> None: +def _mock_response(content: str = "hello") -> SimpleNamespace: + return SimpleNamespace( + choices=[ + SimpleNamespace( + message=SimpleNamespace(content=content), + ) + ] + ) + + +def test_chat_passes_defaults_without_temperature(monkeypatch) -> None: client = LLMClient( LLMConfig( provider="openai", api_base=None, - model="gpt-4o-mini", + model="openai/gpt-4o-mini", api_key="test-key", ) ) - args = client._build_completion_args( - messages=[{"role": "user", "content": "hello"}], - temperature=None, - max_output_tokens=128, - response_kwargs=None, - ) - assert "reasoning_effort" not in args - assert args["max_tokens"] == 128 + captured: dict[str, object] = 
{} + def _fake_completion(**kwargs: object) -> SimpleNamespace: + captured.update(kwargs) + return _mock_response() -def test_build_completion_args_uses_litellm_native_gemini_model_prefix() -> None: - client = LLMClient( - LLMConfig( - provider="gemini", - api_base=None, - model="gemini-2.0-flash", - api_key="g-key", - ) - ) - args = client._build_completion_args( - messages=[{"role": "user", "content": "hello"}], - temperature=None, - max_output_tokens=64, - response_kwargs=None, - ) - assert args["model"] == "gemini/gemini-2.0-flash" + monkeypatch.setattr(llm_module.litellm, "completion", _fake_completion) + response = client.chat([{"role": "user", "content": "hello"}], max_output_tokens=128) + + assert response == "hello" + assert captured["model"] == "openai/gpt-4o-mini" + assert captured["max_tokens"] == 128 + assert captured["timeout"] == 120 + assert captured["num_retries"] == 5 + assert captured["drop_params"] is True + assert captured["api_key"] == "test-key" + assert captured["api_base"] is None + assert "temperature" not in captured + assert "reasoning_effort" not in captured -def test_build_completion_args_includes_api_key_for_remote_provider() -> None: +def test_chat_supports_temperature_and_typed_extra(monkeypatch) -> None: client = LLMClient( LLMConfig( provider="gemini", api_base=None, - model="gemini-2.0-flash", + model="gemini/gemini-3.0-flash", api_key="g-key", ) ) - args = client._build_completion_args( - messages=[{"role": "user", "content": "hello"}], - temperature=None, + captured: dict[str, object] = {} + + def _fake_completion(**kwargs: object) -> SimpleNamespace: + captured.update(kwargs) + return _mock_response("ok") + + monkeypatch.setattr(llm_module.litellm, "completion", _fake_completion) + response = client.chat( + [{"role": "user", "content": "hello"}], max_output_tokens=64, - response_kwargs=None, + temperature=0.2, + top_p=0.9, + stop=["\n\n"], + response_format={"type": "json_object"}, ) - assert args["api_key"] == "g-key" + + 
assert response == "ok" + assert captured["model"] == "gemini/gemini-3.0-flash" + assert captured["api_key"] == "g-key" + assert captured["temperature"] == 0.2 + assert captured["top_p"] == 0.9 + assert captured["stop"] == ["\n\n"] + assert captured["response_format"] == {"type": "json_object"} -def test_build_completion_args_sets_local_api_base_and_api_key() -> None: +def test_chat_sets_local_api_base_and_api_key(monkeypatch) -> None: client = LLMClient( LLMConfig( - provider="local", + provider="openai", api_base="http://127.0.0.1:8080/v1", - model="local-model", + model="openai/local-model", api_key="local-key", ) ) - args = client._build_completion_args( - messages=[{"role": "user", "content": "hello"}], - temperature=None, - max_output_tokens=64, - response_kwargs=None, - ) - assert args["api_base"] == "http://127.0.0.1:8080/v1" - assert args["api_key"] == "local-key" + captured: dict[str, object] = {} + + def _fake_completion(**kwargs: object) -> SimpleNamespace: + captured.update(kwargs) + return _mock_response() + + monkeypatch.setattr(llm_module.litellm, "completion", _fake_completion) + _ = client.chat([{"role": "user", "content": "hello"}], max_output_tokens=64) + + assert captured["api_base"] == "http://127.0.0.1:8080/v1" + assert captured["api_key"] == "local-key" diff --git a/tests/test_llm_resolution.py b/tests/test_llm_resolution.py index c1c4fb5..47022d6 100644 --- a/tests/test_llm_resolution.py +++ b/tests/test_llm_resolution.py @@ -1,63 +1,57 @@ -import logging - import pytest from applypilot.llm import resolve_llm_config -def test_only_gemini_api_key_selects_gemini() -> None: - cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key"}) +def test_infers_provider_from_first_configured_source() -> None: + cfg = resolve_llm_config( + { + "GEMINI_API_KEY": "g-key", + "OPENAI_API_KEY": "o-key", + "ANTHROPIC_API_KEY": "a-key", + "LLM_URL": "http://127.0.0.1:8080/v1", + } + ) assert cfg.provider == "gemini" - assert cfg.api_base is None - assert cfg.model == 
"gemini-2.0-flash" + assert cfg.model == "gemini/gemini-3.0-flash" + assert cfg.api_key == "g-key" -def test_only_openai_api_key_selects_openai() -> None: - cfg = resolve_llm_config({"OPENAI_API_KEY": "o-key"}) +def test_unprefixed_model_uses_inferred_provider() -> None: + cfg = resolve_llm_config({"LLM_MODEL": "gpt-4o-mini", "OPENAI_API_KEY": "o-key"}) assert cfg.provider == "openai" + assert cfg.model == "openai/gpt-4o-mini" -def test_gemini_model_override_without_prefix_is_normalized() -> None: - cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key", "LLM_MODEL": "gemini-2.5-flash"}) - assert cfg.model == "gemini-2.5-flash" +def test_requires_model_provider_prefix_without_inferable_provider() -> None: + with pytest.raises(RuntimeError, match="must include a provider prefix"): + resolve_llm_config({"LLM_MODEL": "gpt-4o-mini", "LLM_API_KEY": "generic"}) -def test_gemini_model_override_google_models_prefix_is_normalized() -> None: - cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key", "LLM_MODEL": "models/gemini-2.5-flash"}) - assert cfg.model == "gemini-2.5-flash" +def test_provider_and_api_key_come_from_model_contract() -> None: + cfg = resolve_llm_config({"LLM_MODEL": "gemini/gemini-3.0-flash", "GEMINI_API_KEY": "g-key"}) + assert cfg.provider == "gemini" + assert cfg.api_base is None + assert cfg.model == "gemini/gemini-3.0-flash" + assert cfg.api_key == "g-key" -def test_gemini_model_override_gemini_prefix_is_stripped() -> None: - cfg = resolve_llm_config({"GEMINI_API_KEY": "g-key", "LLM_MODEL": "gemini/gemini-2.5-flash"}) - assert cfg.model == "gemini-2.5-flash" +def test_uses_generic_api_key_for_unmapped_provider() -> None: + cfg = resolve_llm_config({"LLM_MODEL": "vertex_ai/gemini-3.0-flash", "LLM_API_KEY": "v-key"}) + assert cfg.provider == "vertex_ai" + assert cfg.api_key == "v-key" -def test_llm_url_with_keys_selects_local() -> None: +def test_llm_url_infers_local_default_model_and_allows_missing_api_key() -> None: cfg = resolve_llm_config( { - "LLM_URL": 
"http://127.0.0.1:8080/v1", - "GEMINI_API_KEY": "g-key", - "OPENAI_API_KEY": "o-key", - "ANTHROPIC_API_KEY": "a-key", + "LLM_URL": "http://127.0.0.1:8080/v1/", } ) - assert cfg.provider == "local" - - -def test_multiple_keys_selects_deterministically_and_warns(caplog: pytest.LogCaptureFixture) -> None: - with caplog.at_level(logging.WARNING): - cfg = resolve_llm_config( - { - "GEMINI_API_KEY": "g-key", - "OPENAI_API_KEY": "o-key", - "ANTHROPIC_API_KEY": "a-key", - } - ) - assert cfg.provider == "gemini" - assert any( - "Multiple LLM providers configured" in rec.message and "Using 'gemini' based on precedence" in rec.message - for rec in caplog.records - ) + assert cfg.provider == "openai" + assert cfg.model == "openai/local-model" + assert cfg.api_base == "http://127.0.0.1:8080/v1" + assert cfg.api_key == "" def test_missing_everything_raises_clear_error() -> None: From efaee0809a76b3d1ad9c7082f53a6abb806d9d4c Mon Sep 17 00:00:00 2001 From: Trevor Mells Date: Fri, 27 Feb 2026 08:21:24 -0800 Subject: [PATCH 7/8] Cleanup gitignore and pyproject.toml --- .gitignore | 3 --- pyproject.toml | 7 ------- 2 files changed, 10 deletions(-) diff --git a/.gitignore b/.gitignore index 35c6a55..835589f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,6 @@ resume.pdf *.env .env.* !.env.example -.venv/* - # Runtime artifacts *.db @@ -41,4 +39,3 @@ Thumbs.db # Claude Code .claude/ -tm_dev/nb.ipynb diff --git a/pyproject.toml b/pyproject.toml index 21a3afd..2b0e264 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,10 +53,3 @@ artifacts = ["src/applypilot/config/*.yaml"] [tool.ruff] target-version = "py311" line-length = 120 - -[tool.pytest.ini_options] -pythonpath = ["src"] -testpaths = ["tests"] -markers = [ - "smoke: live-provider smoke tests that require external API keys", -] From 8e29d13c3b19994fead7f2e192544128bd885cc5 Mon Sep 17 00:00:00 2001 From: Trevor Mells Date: Fri, 27 Feb 2026 15:10:25 -0800 Subject: [PATCH 8/8] increase tokens for tailor graph. 
Improve logging --- src/applypilot/cli.py | 13 +++++++++++++ src/applypilot/llm.py | 4 ++++ src/applypilot/scoring/tailor.py | 7 +++++-- src/applypilot/scoring/validator.py | 5 ++++- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/applypilot/cli.py b/src/applypilot/cli.py index 7c770ac..6344ce9 100644 --- a/src/applypilot/cli.py +++ b/src/applypilot/cli.py @@ -28,6 +28,19 @@ def _configure_logging() -> None: noisy.setLevel(logging.WARNING) noisy.propagate = True + # Route verbose tailor/cover loggers to a file instead of the terminal. + # Per-attempt warnings and validation details are useful for debugging + # but too noisy for normal CLI output. + from applypilot.config import LOG_DIR + LOG_DIR.mkdir(parents=True, exist_ok=True) + _file_fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s", datefmt="%H:%M:%S") + for logger_name in ("applypilot.scoring.tailor", "applypilot.scoring.cover_letter"): + file_log = logging.getLogger(logger_name) + file_log.propagate = False # suppress terminal output + fh = logging.FileHandler(LOG_DIR / f"{logger_name.split('.')[-1]}.log", encoding="utf-8") + fh.setFormatter(_file_fmt) + file_log.addHandler(fh) + _configure_logging() diff --git a/src/applypilot/llm.py b/src/applypilot/llm.py index 9888c9f..030f2ce 100644 --- a/src/applypilot/llm.py +++ b/src/applypilot/llm.py @@ -21,6 +21,10 @@ import litellm +# Suppress pydantic serialization warnings from litellm internals when provider +# responses have fewer fields than the full ModelResponse schema. 
+warnings.filterwarnings("ignore", category=UserWarning, module="pydantic.*") + log = logging.getLogger(__name__) _MAX_RETRIES = 5 diff --git a/src/applypilot/scoring/tailor.py b/src/applypilot/scoring/tailor.py index aaf4021..0fb71d9 100644 --- a/src/applypilot/scoring/tailor.py +++ b/src/applypilot/scoring/tailor.py @@ -397,12 +397,14 @@ def tailor_resume( {"role": "user", "content": f"ORIGINAL RESUME:\n{resume_text}\n\n---\n\nTARGET JOB:\n{job_text}\n\nReturn the JSON:"}, ] - raw = client.chat(messages, max_output_tokens=2048) + raw = client.chat(messages, max_output_tokens=16000) # Parse JSON from response try: data = extract_json(raw) - except ValueError: + except ValueError as exc: + log.warning("Attempt %d JSON parse failed (%s). Raw response (first 500 chars):\n%s", + attempt + 1, exc, raw[:1000]) avoid_notes.append("Output was not valid JSON. Return ONLY a JSON object, nothing else.") continue @@ -412,6 +414,7 @@ def tailor_resume( if not validation["passed"]: # Only retry if there are hard errors (warnings never block) + log.warning("Attempt %d validation failed: %s", attempt + 1, validation["errors"]) avoid_notes.extend(validation["errors"]) if attempt < max_retries: continue diff --git a/src/applypilot/scoring/validator.py b/src/applypilot/scoring/validator.py index abb8f89..3d3ce17 100644 --- a/src/applypilot/scoring/validator.py +++ b/src/applypilot/scoring/validator.py @@ -114,9 +114,12 @@ def validate_json_fields(data: dict, profile: dict, mode: str = "normal") -> dic warnings: list[str] = [] # Required keys — always checked regardless of mode - for key in ("title", "summary", "skills", "experience", "projects", "education"): + # "projects" may be an empty list (model may drop all projects for some jobs) + for key in ("title", "summary", "skills", "experience", "education"): if key not in data or not data[key]: errors.append(f"Missing required field: {key}") + if "projects" not in data: + errors.append("Missing required field: projects") if errors: 
return {"passed": False, "errors": errors, "warnings": warnings}