From 0dd2717317df89a35003b944c002f205ef268f99 Mon Sep 17 00:00:00 2001
From: Sebastian Geraldes <199673787+sebastiangeraldes@users.noreply.github.com>
Date: Tue, 20 Jan 2026 14:03:52 -0300
Subject: [PATCH] fix: Replace hardcoded token limits with model-specific configuration

Problem:
- Spec creation failing with "max_tokens: 65537 > 64000" error for Opus 4.5
- Magic numbers scattered across codebase (62000, 63999, 64000)
- No validation against model-specific limits
- SDK bug #8756 causes intermittent validation errors when max_tokens is reduced without adjusting thinking budget

Solution:
- Created model_limits.json configuration file with all Claude 4.5 model limits
- All models have 64K max_output_tokens (Opus, Sonnet, Haiku 4.5)
- Set ultrathink budget to 60K (leaves 4K buffer for SDK overhead)
- Added validation functions to cap thinking budgets to model limits
- Updated frontend constants to match backend configuration
- Added comprehensive tests for model-specific validation

Changes:
- apps/backend/model_limits.json: New configuration file with model limits
- apps/backend/phase_config.py: Load limits from config, add validation
- apps/frontend/src/shared/constants/models.ts: Add model limit constants
- tests/test_model_limits.py: New tests for model-specific validation (10 tests)
- tests/test_thinking_level_validation.py: Update ultrathink budget to 60K

Technical Details:
- API constraint: max_tokens > thinking.budget_tokens (strictly greater)
- All Claude 4.5 models: 64K max output, 200K context window
- Safe thinking budget: 60K tokens (4K buffer for SDK overhead)
- Graceful degradation: Warns and caps excessive budgets instead of failing

Testing:
- All 19 tests pass (9 existing + 10 new)
- Validates budget capping, API constraints, and buffer requirements

Fixes SDK bug workaround for issue anthropics/claude-code#8756

Co-Authored-By: Claude Opus 4.5
Signed-off-by: Sebastian Geraldes <199673787+sebastiangeraldes@users.noreply.github.com>
---
 apps/backend/model_limits.json               |  56 +++++++
 apps/backend/phase_config.py                 | 150 +++++++++++++++---
 apps/frontend/src/shared/constants/models.ts |  23 ++-
 tests/test_model_limits.py                   | 152 +++++++++++++++++++
 tests/test_thinking_level_validation.py      |   6 +-
 5 files changed, 363 insertions(+), 24 deletions(-)
 create mode 100644 apps/backend/model_limits.json
 create mode 100644 tests/test_model_limits.py

diff --git a/apps/backend/model_limits.json b/apps/backend/model_limits.json
new file mode 100644
index 0000000000..2d18904381
--- /dev/null
+++ b/apps/backend/model_limits.json
@@ -0,0 +1,56 @@
+{
+  "$schema": "model_limits_schema.json",
+  "description": "Model-specific token limits for Claude API. All values are in tokens.",
+  "models": {
+    "claude-opus-4-5-20251101": {
+      "display_name": "Claude Opus 4.5",
+      "max_output_tokens": 64000,
+      "context_window": 200000,
+      "max_thinking_tokens": 60000,
+      "notes": "Maximum output increased from 32K (Opus 4.1) to 64K"
+    },
+    "claude-sonnet-4-5-20250929": {
+      "display_name": "Claude Sonnet 4.5",
+      "max_output_tokens": 64000,
+      "context_window": 200000,
+      "max_thinking_tokens": 60000,
+      "notes": "Standard output limit, 1M context available in beta"
+    },
+    "claude-haiku-4-5-20251001": {
+      "display_name": "Claude Haiku 4.5",
+      "max_output_tokens": 64000,
+      "context_window": 200000,
+      "max_thinking_tokens": 60000,
+      "notes": "Output increased from 8K (Haiku 3.5) to 64K"
+    }
+  },
+  "thinking_levels": {
+    "none": {
+      "budget": null,
+      "description": "No extended thinking"
+    },
+    "low": {
+      "budget": 1024,
+      "description": "Brief consideration (1K tokens)"
+    },
+    "medium": {
+      "budget": 4096,
+      "description": "Moderate analysis (4K tokens)"
+    },
+    "high": {
+      "budget": 16384,
+      "description": "Deep thinking for complex tasks (16K tokens)"
+    },
+    "ultrathink": {
+      "budget": 60000,
+      "description": "Maximum reasoning depth (60K tokens, leaves 4K buffer for SDK overhead)",
+      "safety_note": "SDK may reduce max_tokens, so we keep 4K buffer below the 64K limit"
+    }
+  },
+  "validation_rules": {
+    "max_thinking_tokens_must_be_less_than_max_output": true,
+    "min_thinking_budget": 1024,
+    "safety_buffer_tokens": 4000,
+    "notes": "API constraint: max_tokens > thinking.budget_tokens. SDK bug #8756 causes intermittent validation errors when max_tokens is reduced without adjusting thinking budget."
+  }
+}
diff --git a/apps/backend/phase_config.py b/apps/backend/phase_config.py
index 41af2d81eb..ffcbb6e7a1 100644
--- a/apps/backend/phase_config.py
+++ b/apps/backend/phase_config.py
@@ -3,14 +3,18 @@
 ===========================
 
 Handles model and thinking level configuration for different execution phases.
-Reads configuration from task_metadata.json and provides resolved model IDs.
+Reads configuration from task_metadata.json and model_limits.json for model-specific constraints.
 """
 
 import json
+import logging
 import os
 from pathlib import Path
 from typing import Literal, TypedDict
 
+# Get logger for this module
+logger = logging.getLogger(__name__)
+
 # Model shorthand to full model ID mapping
 MODEL_ID_MAP: dict[str, str] = {
     "opus": "claude-opus-4-5-20251101",
@@ -18,14 +22,39 @@
     "haiku": "claude-haiku-4-5-20251001",
 }
 
-# Thinking level to budget tokens mapping (None = no extended thinking)
+# Load model limits from configuration file
+def _load_model_limits() -> dict:
+    """Load model limits from model_limits.json."""
+    limits_file = Path(__file__).parent / "model_limits.json"
+    try:
+        with open(limits_file, encoding="utf-8") as f:
+            return json.load(f)
+    except (FileNotFoundError, json.JSONDecodeError) as e:
+        logger.warning(f"Failed to load model_limits.json: {e}. Using fallback defaults.")
+        # Fallback to hardcoded defaults if file is missing
+        return {
+            "models": {
+                "claude-opus-4-5-20251101": {"max_output_tokens": 64000, "max_thinking_tokens": 60000},
+                "claude-sonnet-4-5-20250929": {"max_output_tokens": 64000, "max_thinking_tokens": 60000},
+                "claude-haiku-4-5-20251001": {"max_output_tokens": 64000, "max_thinking_tokens": 60000},
+            },
+            "thinking_levels": {
+                "none": {"budget": None},
+                "low": {"budget": 1024},
+                "medium": {"budget": 4096},
+                "high": {"budget": 16384},
+                "ultrathink": {"budget": 60000},
+            },
+        }
+
+# Load model limits at module initialization
+_MODEL_LIMITS = _load_model_limits()
+
+# Thinking level to budget tokens mapping (loaded from model_limits.json)
 # Values must match auto-claude-ui/src/shared/constants/models.ts THINKING_BUDGET_MAP
 THINKING_BUDGET_MAP: dict[str, int | None] = {
-    "none": None,
-    "low": 1024,
-    "medium": 4096,  # Moderate analysis
-    "high": 16384,  # Deep thinking for QA review
-    "ultrathink": 63999,  # Maximum reasoning depth (API requires max_tokens >= budget + 1, so 63999 + 1 = 64000 limit)
+    level: config.get("budget")
+    for level, config in _MODEL_LIMITS.get("thinking_levels", {}).items()
 }
 
 # Spec runner phase-specific thinking levels
@@ -126,27 +155,100 @@ def resolve_model_id(model: str) -> str:
     return model
 
 
-def get_thinking_budget(thinking_level: str) -> int | None:
+def get_model_max_output_tokens(model_id: str) -> int:
     """
-    Get the thinking budget for a thinking level.
+    Get the maximum output tokens for a specific model.
 
     Args:
-        thinking_level: Thinking level (none, low, medium, high, ultrathink)
+        model_id: Full model ID (e.g., 'claude-opus-4-5-20251101')
+
+    Returns:
+        Maximum output tokens for the model (defaults to 64000 if model not found)
+    """
+    models = _MODEL_LIMITS.get("models", {})
+    model_config = models.get(model_id, {})
+    return model_config.get("max_output_tokens", 64000)
+
+
+def get_model_max_thinking_tokens(model_id: str) -> int:
+    """
+    Get the maximum thinking tokens for a specific model.
+
+    This represents the safe maximum thinking budget that leaves enough buffer
+    for SDK overhead and ensures thinking_budget < max_tokens constraint.
+
+    Args:
+        model_id: Full model ID (e.g., 'claude-opus-4-5-20251101')
 
     Returns:
-        Token budget or None for no extended thinking
+        Maximum thinking tokens for the model (defaults to 60000 if model not found)
     """
-    import logging
+    models = _MODEL_LIMITS.get("models", {})
+    model_config = models.get(model_id, {})
+    return model_config.get("max_thinking_tokens", 60000)
 
+
+def validate_thinking_budget(
+    thinking_budget: int | None, model_id: str
+) -> tuple[int | None, bool]:
+    """
+    Validate and cap thinking budget to ensure it doesn't exceed model limits.
+
+    API constraint: max_tokens > thinking.budget_tokens (must be strictly greater)
+    SDK bug #8756: SDK sometimes reduces max_tokens without adjusting thinking budget
+
+    Args:
+        thinking_budget: Requested thinking budget (or None for no extended thinking)
+        model_id: Full model ID to validate against
+
+    Returns:
+        Tuple of (capped_budget, was_capped)
+        - capped_budget: Valid thinking budget that respects model limits
+        - was_capped: True if the budget was reduced, False otherwise
+    """
+    if thinking_budget is None:
+        return None, False
+
+    max_thinking = get_model_max_thinking_tokens(model_id)
+
+    if thinking_budget > max_thinking:
+        logger.warning(
+            f"Thinking budget {thinking_budget} exceeds model limit {max_thinking} for {model_id}. "
+            f"Capping to {max_thinking} tokens."
+        )
+        return max_thinking, True
+
+    return thinking_budget, False
+
+
+def get_thinking_budget(thinking_level: str, model_id: str | None = None) -> int | None:
+    """
+    Get the thinking budget for a thinking level, optionally validated against model limits.
+
+    Args:
+        thinking_level: Thinking level (none, low, medium, high, ultrathink)
+        model_id: Optional model ID to validate against (if provided, budget is capped to model limits)
+
+    Returns:
+        Token budget or None for no extended thinking (capped to model limits if model_id provided)
+    """
     if thinking_level not in THINKING_BUDGET_MAP:
         valid_levels = ", ".join(THINKING_BUDGET_MAP.keys())
-        logging.warning(
+        logger.warning(
             f"Invalid thinking_level '{thinking_level}'. Valid values: {valid_levels}. "
             f"Defaulting to 'medium'."
         )
-        return THINKING_BUDGET_MAP["medium"]
+        thinking_level = "medium"
+
+    budget = THINKING_BUDGET_MAP[thinking_level]
 
-    return THINKING_BUDGET_MAP[thinking_level]
+    # Validate against model limits if model_id provided
+    if model_id and budget is not None:
+        budget, was_capped = validate_thinking_budget(budget, model_id)
+        if was_capped:
+            logger.info(f"Thinking budget capped for model {model_id}")
+
+    return budget
 
 
 def load_task_metadata(spec_dir: Path) -> TaskMetadataConfig | None:
@@ -261,20 +363,25 @@ def get_phase_thinking_budget(
     spec_dir: Path,
     phase: Phase,
     cli_thinking: str | None = None,
+    cli_model: str | None = None,
 ) -> int | None:
     """
     Get the thinking budget tokens for a specific execution phase.
 
+    The budget is validated against model-specific limits to prevent API errors.
+
     Args:
         spec_dir: Path to the spec directory
         phase: Execution phase (spec, planning, coding, qa)
        cli_thinking: Thinking level from CLI argument (optional)
+        cli_model: Model from CLI argument (optional, used for validation)
 
     Returns:
-        Token budget or None for no extended thinking
+        Token budget or None for no extended thinking (capped to model limits)
     """
     thinking_level = get_phase_thinking(spec_dir, phase, cli_thinking)
-    return get_thinking_budget(thinking_level)
+    model_id = get_phase_model(spec_dir, phase, cli_model)
+    return get_thinking_budget(thinking_level, model_id=model_id)
 
 
 def get_phase_config(
@@ -286,6 +393,8 @@ def get_phase_config(
     """
     Get the full configuration for a specific execution phase.
 
+    Thinking budget is validated against model-specific limits.
+
     Args:
         spec_dir: Path to the spec directory
         phase: Execution phase (spec, planning, coding, qa)
@@ -297,12 +406,12 @@ def get_phase_config(
     """
     model_id = get_phase_model(spec_dir, phase, cli_model)
     thinking_level = get_phase_thinking(spec_dir, phase, cli_thinking)
-    thinking_budget = get_thinking_budget(thinking_level)
+    thinking_budget = get_thinking_budget(thinking_level, model_id=model_id)
 
     return model_id, thinking_level, thinking_budget
 
 
-def get_spec_phase_thinking_budget(phase_name: str) -> int | None:
+def get_spec_phase_thinking_budget(phase_name: str, model_id: str | None = None) -> int | None:
     """
     Get the thinking budget for a specific spec runner phase.
 
@@ -311,9 +420,10 @@ def get_spec_phase_thinking_budget(phase_name: str) -> int | None:
 
     Args:
         phase_name: Name of the spec phase (e.g., 'discovery', 'spec_writing')
+        model_id: Optional model ID to validate budget against model limits
 
     Returns:
         Token budget for extended thinking, or None for no extended thinking
     """
     thinking_level = SPEC_PHASE_THINKING_LEVELS.get(phase_name, "medium")
-    return get_thinking_budget(thinking_level)
+    return get_thinking_budget(thinking_level, model_id=model_id)
diff --git a/apps/frontend/src/shared/constants/models.ts b/apps/frontend/src/shared/constants/models.ts
index 72e76a0c0a..303150efa8 100644
--- a/apps/frontend/src/shared/constants/models.ts
+++ b/apps/frontend/src/shared/constants/models.ts
@@ -23,12 +23,33 @@ export const MODEL_ID_MAP: Record<string, string> = {
 } as const;
 
 // Maps thinking levels to budget tokens (null = no extended thinking)
+// Must match apps/backend/model_limits.json thinking_levels
 export const THINKING_BUDGET_MAP: Record<string, number | null> = {
   none: null,
   low: 1024,
   medium: 4096,
   high: 16384,
-  ultrathink: 63999 // Maximum reasoning depth (API requires max_tokens >= budget + 1, so 63999 + 1 = 64000 limit)
+  ultrathink: 60000 // Maximum reasoning depth (leaves 4K buffer for SDK overhead, keeps max_tokens under 64K for all Claude 4.5 models)
+} as const;
+
+// Model-specific output token limits (all Claude 4.5 models have 64K max_tokens)
+export const MODEL_OUTPUT_LIMITS: Record<string, number> = {
+  'claude-opus-4-5-20251101': 64000,
+  'claude-sonnet-4-5-20250929': 64000,
+  'claude-haiku-4-5-20251001': 64000,
+  opus: 64000,
+  sonnet: 64000,
+  haiku: 64000
+} as const;
+
+// Maximum safe thinking budget for each model (leaves buffer for SDK overhead)
+export const MODEL_MAX_THINKING: Record<string, number> = {
+  'claude-opus-4-5-20251101': 60000,
+  'claude-sonnet-4-5-20250929': 60000,
+  'claude-haiku-4-5-20251001': 60000,
+  opus: 60000,
+  sonnet: 60000,
+  haiku: 60000
 } as const;
 
 // ============================================
diff --git a/tests/test_model_limits.py b/tests/test_model_limits.py
new file mode 100644
index 0000000000..e3a20bf4c3
--- /dev/null
+++ b/tests/test_model_limits.py
@@ -0,0 +1,152 @@
+"""
+Tests for model-specific limits and validation in phase_config module.
+
+Ensures that thinking budgets are properly validated against model limits
+and that the configuration system correctly handles model-specific constraints.
+"""
+
+import logging
+import sys
+from pathlib import Path
+
+# Add auto-claude to path
+sys.path.insert(0, str(Path(__file__).parent.parent / "apps" / "backend"))
+
+from phase_config import (
+    get_model_max_output_tokens,
+    get_model_max_thinking_tokens,
+    get_thinking_budget,
+    validate_thinking_budget,
+)
+
+
+class TestModelLimits:
+    """Test model-specific token limits and validation."""
+
+    def test_all_models_have_64k_output_limit(self):
+        """Test that all Claude 4.5 models have 64,000 max_tokens limit."""
+        models = [
+            "claude-opus-4-5-20251101",
+            "claude-sonnet-4-5-20250929",
+            "claude-haiku-4-5-20251001",
+        ]
+
+        for model_id in models:
+            max_output = get_model_max_output_tokens(model_id)
+            assert (
+                max_output == 64000
+            ), f"{model_id} should have 64000 max_tokens, got {max_output}"
+
+    def test_all_models_have_60k_thinking_limit(self):
+        """Test that all models have 60,000 max thinking tokens (4K buffer)."""
+        models = [
+            "claude-opus-4-5-20251101",
+            "claude-sonnet-4-5-20250929",
+            "claude-haiku-4-5-20251001",
+        ]
+
+        for model_id in models:
+            max_thinking = get_model_max_thinking_tokens(model_id)
+            assert (
+                max_thinking == 60000
+            ), f"{model_id} should have 60000 max thinking tokens, got {max_thinking}"
+
+    def test_unknown_model_uses_defaults(self):
+        """Test that unknown models default to 64K output and 60K thinking."""
+        unknown_model = "claude-unknown-model-v99"
+
+        max_output = get_model_max_output_tokens(unknown_model)
+        max_thinking = get_model_max_thinking_tokens(unknown_model)
+
+        assert max_output == 64000, "Unknown model should default to 64000 max_tokens"
+        assert (
+            max_thinking == 60000
+        ), "Unknown model should default to 60000 max thinking tokens"
+
+    def test_validate_thinking_budget_within_limits(self):
+        """Test that valid thinking budgets pass validation unchanged."""
+        model_id = "claude-opus-4-5-20251101"
+        budgets = [1024, 4096, 16384, 50000, 60000]
+
+        for budget in budgets:
+            result, was_capped = validate_thinking_budget(budget, model_id)
+            assert result == budget, f"Budget {budget} should not be modified"
+            assert not was_capped, f"Budget {budget} should not be capped"
+
+    def test_validate_thinking_budget_caps_excessive_budget(self, caplog):
+        """Test that thinking budgets exceeding model limits are capped."""
+        model_id = "claude-opus-4-5-20251101"
+        excessive_budgets = [60001, 63999, 64000, 100000]
+
+        with caplog.at_level(logging.WARNING):
+            for budget in excessive_budgets:
+                result, was_capped = validate_thinking_budget(budget, model_id)
+                assert result == 60000, f"Budget {budget} should be capped to 60000"
+                assert was_capped, f"Budget {budget} should be flagged as capped"
+
+        # Should have logged warnings for each excessive budget
+        assert len(caplog.records) == len(excessive_budgets)
+        for record in caplog.records:
+            assert "exceeds model limit" in record.message
+
+    def test_validate_thinking_budget_handles_none(self):
+        """Test that None thinking budget passes through validation."""
+        model_id = "claude-opus-4-5-20251101"
+        result, was_capped = validate_thinking_budget(None, model_id)
+
+        assert result is None, "None budget should pass through unchanged"
+        assert not was_capped, "None budget should not be flagged as capped"
+
+    def test_get_thinking_budget_with_model_validation(self, caplog):
+        """Test that get_thinking_budget validates against model limits when model_id provided."""
+        model_id = "claude-opus-4-5-20251101"
+
+        # Normal levels should work fine
+        budget = get_thinking_budget("low", model_id=model_id)
+        assert budget == 1024
+
+        budget = get_thinking_budget("medium", model_id=model_id)
+        assert budget == 4096
+
+        budget = get_thinking_budget("high", model_id=model_id)
+        assert budget == 16384
+
+        # Ultrathink should be capped at 60000 (not exceed it)
+        budget = get_thinking_budget("ultrathink", model_id=model_id)
+        assert budget == 60000
+
+    def test_get_thinking_budget_without_model_validation(self):
+        """Test that get_thinking_budget works without model_id (backward compatibility)."""
+        # Should work without model_id parameter
+        budget = get_thinking_budget("low")
+        assert budget == 1024
+
+        budget = get_thinking_budget("ultrathink")
+        assert budget == 60000
+
+    def test_thinking_budget_leaves_buffer_for_sdk(self):
+        """Test that max thinking budget leaves adequate buffer for SDK overhead."""
+        model_id = "claude-opus-4-5-20251101"
+        max_output = get_model_max_output_tokens(model_id)
+        max_thinking = get_model_max_thinking_tokens(model_id)
+
+        # Buffer should be at least 4000 tokens (mentioned in model_limits.json)
+        buffer = max_output - max_thinking
+        assert buffer >= 4000, f"Buffer should be at least 4K tokens, got {buffer}"
+
+    def test_api_constraint_satisfied(self):
+        """Test that thinking budget is strictly less than max_tokens (API constraint)."""
+        model_id = "claude-opus-4-5-20251101"
+        max_output = get_model_max_output_tokens(model_id)
+        max_thinking = get_model_max_thinking_tokens(model_id)
+
+        # API constraint: max_tokens > thinking.budget_tokens
+        assert (
+            max_thinking < max_output
+        ), f"thinking budget ({max_thinking}) must be < max_tokens ({max_output})"
+
+        # Also test with ultrathink budget
+        ultrathink_budget = get_thinking_budget("ultrathink", model_id=model_id)
+        assert (
+            ultrathink_budget < max_output
+        ), f"ultrathink ({ultrathink_budget}) must be < max_tokens ({max_output})"
diff --git a/tests/test_thinking_level_validation.py b/tests/test_thinking_level_validation.py
index 34da1a5198..9fe4914431 100644
--- a/tests/test_thinking_level_validation.py
+++ b/tests/test_thinking_level_validation.py
@@ -32,8 +32,8 @@ def test_none_level_returns_none(self):
         assert get_thinking_budget("none") is None
 
     def test_ultrathink_max_budget(self):
-        """Test that 'ultrathink' returns maximum budget (63999 so max_tokens = 63999 + 1 = 64000 limit)."""
-        assert get_thinking_budget("ultrathink") == 63999
+        """Test that 'ultrathink' returns maximum budget (60000 to keep max_tokens under 64000 with 4K SDK buffer)."""
+        assert get_thinking_budget("ultrathink") == 60000
 
     def test_invalid_level_logs_warning(self, caplog):
         """Test that invalid thinking level logs a warning."""
@@ -89,4 +89,4 @@ def test_budget_values_match_expected(self):
         assert get_thinking_budget("low") == 1024
         assert get_thinking_budget("medium") == 4096
         assert get_thinking_budget("high") == 16384
-        assert get_thinking_budget("ultrathink") == 63999
+        assert get_thinking_budget("ultrathink") == 60000
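
Usage sketch (illustrative only, not part of the patch): the snippet below shows how the helpers added in apps/backend/phase_config.py are intended to compose when assembling request parameters, and why the 60K ultrathink budget plus 64K max_tokens satisfies the max_tokens > thinking.budget_tokens constraint with a 4K margin for SDK-side reductions (#8756). The build_request_params helper is hypothetical; it assumes apps/backend is importable (as tests/test_model_limits.py arranges) and that the thinking parameter follows the public Messages API shape.

# sketch.py -- minimal sketch, assuming apps/backend is on sys.path
from phase_config import (
    get_model_max_output_tokens,
    get_thinking_budget,
    resolve_model_id,
)


def build_request_params(model: str, thinking_level: str) -> dict:
    """Hypothetical caller: assemble API parameters that respect the configured limits."""
    model_id = resolve_model_id(model)  # e.g. "opus" -> "claude-opus-4-5-20251101"
    max_tokens = get_model_max_output_tokens(model_id)  # 64000 for all Claude 4.5 models
    budget = get_thinking_budget(thinking_level, model_id=model_id)  # capped at 60000

    params: dict = {"model": model_id, "max_tokens": max_tokens}
    if budget is not None:
        # API constraint: max_tokens must be strictly greater than the thinking
        # budget; the 4K gap (64000 - 60000) also absorbs SDK-side max_tokens
        # reductions described in anthropics/claude-code#8756.
        assert budget < max_tokens
        params["thinking"] = {"type": "enabled", "budget_tokens": budget}
    return params


if __name__ == "__main__":
    print(build_request_params("opus", "ultrathink"))
    # {'model': 'claude-opus-4-5-20251101', 'max_tokens': 64000,
    #  'thinking': {'type': 'enabled', 'budget_tokens': 60000}}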