From 0dd2717317df89a35003b944c002f205ef268f99 Mon Sep 17 00:00:00 2001
From: Sebastian Geraldes <199673787+sebastiangeraldes@users.noreply.github.com>
Date: Tue, 20 Jan 2026 14:03:52 -0300
Subject: [PATCH] fix: Replace hardcoded token limits with model-specific configuration

Problem:
- Spec creation failing with "max_tokens: 65537 > 64000" error for Opus 4.5
- Magic numbers scattered across codebase (62000, 63999, 64000)
- No validation against model-specific limits
- SDK bug #8756 causes intermittent validation errors when max_tokens is reduced without adjusting thinking budget

Solution:
- Created model_limits.json configuration file with all Claude 4.5 model limits
- All models have 64K max_output_tokens (Opus, Sonnet, Haiku 4.5)
- Set ultrathink budget to 60K (leaves 4K buffer for SDK overhead)
- Added validation functions to cap thinking budgets to model limits
- Updated frontend constants to match backend configuration
- Added comprehensive tests for model-specific validation

Changes:
- apps/backend/model_limits.json: New configuration file with model limits
- apps/backend/phase_config.py: Load limits from config, add validation
- apps/frontend/src/shared/constants/models.ts: Add model limit constants
- tests/test_model_limits.py: New tests for model-specific validation (10 tests)
- tests/test_thinking_level_validation.py: Update ultrathink budget to 60K

Technical Details:
- API constraint: max_tokens > thinking.budget_tokens (strictly greater)
- All Claude 4.5 models: 64K max output, 200K context window
- Safe thinking budget: 60K tokens (4K buffer for SDK overhead)
- Graceful degradation: Warns and caps excessive budgets instead of failing

Testing:
- All 19 tests pass (9 existing + 10 new)
- Validates budget capping, API constraints, and buffer requirements

Fixes SDK bug workaround for issue anthropics/claude-code#8756

Co-Authored-By: Claude Opus 4.5
Signed-off-by: Sebastian Geraldes <199673787+sebastiangeraldes@users.noreply.github.com>
---
 apps/backend/model_limits.json               |  56 +++++++
 apps/backend/phase_config.py                 | 150 +++++++++++++++---
 apps/frontend/src/shared/constants/models.ts |  23 ++-
 tests/test_model_limits.py                   | 152 +++++++++++++++++++
 tests/test_thinking_level_validation.py      |   6 +-
 5 files changed, 363 insertions(+), 24 deletions(-)
 create mode 100644 apps/backend/model_limits.json
 create mode 100644 tests/test_model_limits.py

diff --git a/apps/backend/model_limits.json b/apps/backend/model_limits.json
new file mode 100644
index 0000000000..2d18904381
--- /dev/null
+++ b/apps/backend/model_limits.json
@@ -0,0 +1,56 @@
+{
+  "$schema": "model_limits_schema.json",
+  "description": "Model-specific token limits for Claude API. All values are in tokens.",
+  "models": {
+    "claude-opus-4-5-20251101": {
+      "display_name": "Claude Opus 4.5",
+      "max_output_tokens": 64000,
+      "context_window": 200000,
+      "max_thinking_tokens": 60000,
+      "notes": "Maximum output increased from 32K (Opus 4.1) to 64K"
+    },
+    "claude-sonnet-4-5-20250929": {
+      "display_name": "Claude Sonnet 4.5",
+      "max_output_tokens": 64000,
+      "context_window": 200000,
+      "max_thinking_tokens": 60000,
+      "notes": "Standard output limit, 1M context available in beta"
+    },
+    "claude-haiku-4-5-20251001": {
+      "display_name": "Claude Haiku 4.5",
+      "max_output_tokens": 64000,
+      "context_window": 200000,
+      "max_thinking_tokens": 60000,
+      "notes": "Output increased from 8K (Haiku 3.5) to 64K"
+    }
+  },
+  "thinking_levels": {
+    "none": {
+      "budget": null,
+      "description": "No extended thinking"
+    },
+    "low": {
+      "budget": 1024,
+      "description": "Brief consideration (1K tokens)"
+    },
+    "medium": {
+      "budget": 4096,
+      "description": "Moderate analysis (4K tokens)"
+    },
+    "high": {
+      "budget": 16384,
+      "description": "Deep thinking for complex tasks (16K tokens)"
+    },
+    "ultrathink": {
+      "budget": 60000,
+      "description": "Maximum reasoning depth (60K tokens, leaves 4K buffer for SDK overhead)",
+      "safety_note": "SDK may reduce max_tokens, so we keep 4K buffer below the 64K limit"
+    }
+  },
+  "validation_rules": {
+    "max_thinking_tokens_must_be_less_than_max_output": true,
+    "min_thinking_budget": 1024,
+    "safety_buffer_tokens": 4000,
+    "notes": "API constraint: max_tokens > thinking.budget_tokens. SDK bug #8756 causes intermittent validation errors when max_tokens is reduced without adjusting thinking budget."
+  }
+}
diff --git a/apps/backend/phase_config.py b/apps/backend/phase_config.py
index 41af2d81eb..ffcbb6e7a1 100644
--- a/apps/backend/phase_config.py
+++ b/apps/backend/phase_config.py
@@ -3,14 +3,18 @@
 ===========================
 
 Handles model and thinking level configuration for different execution phases.
-Reads configuration from task_metadata.json and provides resolved model IDs.
+Reads configuration from task_metadata.json and model_limits.json for model-specific constraints.
 """
 
 import json
+import logging
 import os
 from pathlib import Path
 from typing import Literal, TypedDict
 
+# Get logger for this module
+logger = logging.getLogger(__name__)
+
 # Model shorthand to full model ID mapping
 MODEL_ID_MAP: dict[str, str] = {
     "opus": "claude-opus-4-5-20251101",
@@ -18,14 +22,39 @@
     "haiku": "claude-haiku-4-5-20251001",
 }
 
-# Thinking level to budget tokens mapping (None = no extended thinking)
+# Load model limits from configuration file
+def _load_model_limits() -> dict:
+    """Load model limits from model_limits.json."""
+    limits_file = Path(__file__).parent / "model_limits.json"
+    try:
+        with open(limits_file, encoding="utf-8") as f:
+            return json.load(f)
+    except (FileNotFoundError, json.JSONDecodeError) as e:
+        logger.warning(f"Failed to load model_limits.json: {e}. Using fallback defaults.")
+        # Fallback to hardcoded defaults if file is missing
+        return {
+            "models": {
+                "claude-opus-4-5-20251101": {"max_output_tokens": 64000, "max_thinking_tokens": 60000},
+                "claude-sonnet-4-5-20250929": {"max_output_tokens": 64000, "max_thinking_tokens": 60000},
+                "claude-haiku-4-5-20251001": {"max_output_tokens": 64000, "max_thinking_tokens": 60000},
+            },
+            "thinking_levels": {
+                "none": {"budget": None},
+                "low": {"budget": 1024},
+                "medium": {"budget": 4096},
+                "high": {"budget": 16384},
+                "ultrathink": {"budget": 60000},
+            },
+        }
+
+# Load model limits at module initialization
+_MODEL_LIMITS = _load_model_limits()
+
+# Thinking level to budget tokens mapping (loaded from model_limits.json)
 # Values must match auto-claude-ui/src/shared/constants/models.ts THINKING_BUDGET_MAP
 THINKING_BUDGET_MAP: dict[str, int | None] = {
-    "none": None,
-    "low": 1024,
-    "medium": 4096,  # Moderate analysis
-    "high": 16384,  # Deep thinking for QA review
-    "ultrathink": 63999,  # Maximum reasoning depth (API requires max_tokens >= budget + 1, so 63999 + 1 = 64000 limit)
+    level: config.get("budget")
+    for level, config in _MODEL_LIMITS.get("thinking_levels", {}).items()
 }
 
 # Spec runner phase-specific thinking levels
@@ -126,27 +155,100 @@ def resolve_model_id(model: str) -> str:
     return model
 
 
-def get_thinking_budget(thinking_level: str) -> int | None:
+def get_model_max_output_tokens(model_id: str) -> int:
     """
-    Get the thinking budget for a thinking level.
+    Get the maximum output tokens for a specific model.
 
     Args:
-        thinking_level: Thinking level (none, low, medium, high, ultrathink)
+        model_id: Full model ID (e.g., 'claude-opus-4-5-20251101')
+
+    Returns:
+        Maximum output tokens for the model (defaults to 64000 if model not found)
+    """
+    models = _MODEL_LIMITS.get("models", {})
+    model_config = models.get(model_id, {})
+    return model_config.get("max_output_tokens", 64000)
+
+
+def get_model_max_thinking_tokens(model_id: str) -> int:
+    """
+    Get the maximum thinking tokens for a specific model.
+
+    This represents the safe maximum thinking budget that leaves enough buffer
+    for SDK overhead and ensures thinking_budget < max_tokens constraint.
+
+    Args:
+        model_id: Full model ID (e.g., 'claude-opus-4-5-20251101')
 
     Returns:
-        Token budget or None for no extended thinking
+        Maximum thinking tokens for the model (defaults to 60000 if model not found)
     """
-    import logging
+    models = _MODEL_LIMITS.get("models", {})
+    model_config = models.get(model_id, {})
+    return model_config.get("max_thinking_tokens", 60000)
 
+
+def validate_thinking_budget(
+    thinking_budget: int | None, model_id: str
+) -> tuple[int | None, bool]:
+    """
+    Validate and cap thinking budget to ensure it doesn't exceed model limits.
+
+    API constraint: max_tokens > thinking.budget_tokens (must be strictly greater)
+    SDK bug #8756: SDK sometimes reduces max_tokens without adjusting thinking budget
+
+    Args:
+        thinking_budget: Requested thinking budget (or None for no extended thinking)
+        model_id: Full model ID to validate against
+
+    Returns:
+        Tuple of (capped_budget, was_capped)
+        - capped_budget: Valid thinking budget that respects model limits
+        - was_capped: True if the budget was reduced, False otherwise
+    """
+    if thinking_budget is None:
+        return None, False
+
+    max_thinking = get_model_max_thinking_tokens(model_id)
+
+    if thinking_budget > max_thinking:
+        logger.warning(
+            f"Thinking budget {thinking_budget} exceeds model limit {max_thinking} for {model_id}. "
+            f"Capping to {max_thinking} tokens."
+        )
+        return max_thinking, True
+
+    return thinking_budget, False
+
+
+def get_thinking_budget(thinking_level: str, model_id: str | None = None) -> int | None:
+    """
+    Get the thinking budget for a thinking level, optionally validated against model limits.
+
+    Args:
+        thinking_level: Thinking level (none, low, medium, high, ultrathink)
+        model_id: Optional model ID to validate against (if provided, budget is capped to model limits)
+
+    Returns:
+        Token budget or None for no extended thinking (capped to model limits if model_id provided)
+    """
     if thinking_level not in THINKING_BUDGET_MAP:
         valid_levels = ", ".join(THINKING_BUDGET_MAP.keys())
-        logging.warning(
+        logger.warning(
             f"Invalid thinking_level '{thinking_level}'. Valid values: {valid_levels}. "
             f"Defaulting to 'medium'."
         )
-        return THINKING_BUDGET_MAP["medium"]
+        thinking_level = "medium"
+
+    budget = THINKING_BUDGET_MAP[thinking_level]
 
-    return THINKING_BUDGET_MAP[thinking_level]
+    # Validate against model limits if model_id provided
+    if model_id and budget is not None:
+        budget, was_capped = validate_thinking_budget(budget, model_id)
+        if was_capped:
+            logger.info(f"Thinking budget capped for model {model_id}")
+
+    return budget
 
 
 def load_task_metadata(spec_dir: Path) -> TaskMetadataConfig | None:
@@ -261,20 +363,25 @@ def get_phase_thinking_budget(
     spec_dir: Path,
     phase: Phase,
     cli_thinking: str | None = None,
+    cli_model: str | None = None,
 ) -> int | None:
     """
     Get the thinking budget tokens for a specific execution phase.
 
+    The budget is validated against model-specific limits to prevent API errors.
+
     Args:
         spec_dir: Path to the spec directory
         phase: Execution phase (spec, planning, coding, qa)
        cli_thinking: Thinking level from CLI argument (optional)
+        cli_model: Model from CLI argument (optional, used for validation)
 
     Returns:
-        Token budget or None for no extended thinking
+        Token budget or None for no extended thinking (capped to model limits)
     """
     thinking_level = get_phase_thinking(spec_dir, phase, cli_thinking)
-    return get_thinking_budget(thinking_level)
+    model_id = get_phase_model(spec_dir, phase, cli_model)
+    return get_thinking_budget(thinking_level, model_id=model_id)
 
 
 def get_phase_config(
@@ -286,6 +393,8 @@ def get_phase_config(
     """
     Get the full configuration for a specific execution phase.
 
+    Thinking budget is validated against model-specific limits.
+
     Args:
         spec_dir: Path to the spec directory
         phase: Execution phase (spec, planning, coding, qa)
@@ -297,12 +406,12 @@ def get_phase_config(
     """
     model_id = get_phase_model(spec_dir, phase, cli_model)
     thinking_level = get_phase_thinking(spec_dir, phase, cli_thinking)
-    thinking_budget = get_thinking_budget(thinking_level)
+    thinking_budget = get_thinking_budget(thinking_level, model_id=model_id)
 
     return model_id, thinking_level, thinking_budget
 
 
-def get_spec_phase_thinking_budget(phase_name: str) -> int | None:
+def get_spec_phase_thinking_budget(phase_name: str, model_id: str | None = None) -> int | None:
     """
     Get the thinking budget for a specific spec runner phase.
 
@@ -311,9 +420,10 @@ def get_spec_phase_thinking_budget(phase_name: str) -> int | None:
 
     Args:
         phase_name: Name of the spec phase (e.g., 'discovery', 'spec_writing')
+        model_id: Optional model ID to validate budget against model limits
 
     Returns:
         Token budget for extended thinking, or None for no extended thinking
     """
     thinking_level = SPEC_PHASE_THINKING_LEVELS.get(phase_name, "medium")
-    return get_thinking_budget(thinking_level)
+    return get_thinking_budget(thinking_level, model_id=model_id)
diff --git a/apps/frontend/src/shared/constants/models.ts b/apps/frontend/src/shared/constants/models.ts
index 72e76a0c0a..303150efa8 100644
--- a/apps/frontend/src/shared/constants/models.ts
+++ b/apps/frontend/src/shared/constants/models.ts
@@ -23,12 +23,33 @@ export const MODEL_ID_MAP: Record<string, string> = {
 } as const;
 
 // Maps thinking levels to budget tokens (null = no extended thinking)
+// Must match apps/backend/model_limits.json thinking_levels
 export const THINKING_BUDGET_MAP: Record<string, number | null> = {
   none: null,
   low: 1024,
   medium: 4096,
   high: 16384,
-  ultrathink: 63999 // Maximum reasoning depth (API requires max_tokens >= budget + 1, so 63999 + 1 = 64000 limit)
+  ultrathink: 60000 // Maximum reasoning depth (leaves 4K buffer for SDK overhead, keeps max_tokens under 64K for all Claude 4.5 models)
+} as const;
+
+// Model-specific output token limits (all Claude 4.5 models have 64K max_tokens)
+export const MODEL_OUTPUT_LIMITS: Record<string, number> = {
+  'claude-opus-4-5-20251101': 64000,
+  'claude-sonnet-4-5-20250929': 64000,
+  'claude-haiku-4-5-20251001': 64000,
+  opus: 64000,
+  sonnet: 64000,
+  haiku: 64000
+} as const;
+
+// Maximum safe thinking budget for each model (leaves buffer for SDK overhead)
+export const MODEL_MAX_THINKING: Record<string, number> = {
+  'claude-opus-4-5-20251101': 60000,
+  'claude-sonnet-4-5-20250929': 60000,
+  'claude-haiku-4-5-20251001': 60000,
+  opus: 60000,
+  sonnet: 60000,
+  haiku: 60000
 } as const;
 
 // ============================================
diff --git a/tests/test_model_limits.py b/tests/test_model_limits.py
new file mode 100644
index 0000000000..e3a20bf4c3
--- /dev/null
+++ b/tests/test_model_limits.py
@@ -0,0 +1,152 @@
+"""
+Tests for model-specific limits and validation in phase_config module.
+
+Ensures that thinking budgets are properly validated against model limits
+and that the configuration system correctly handles model-specific constraints.
+"""
+
+import logging
+import sys
+from pathlib import Path
+
+# Add auto-claude to path
+sys.path.insert(0, str(Path(__file__).parent.parent / "apps" / "backend"))
+
+from phase_config import (
+    get_model_max_output_tokens,
+    get_model_max_thinking_tokens,
+    get_thinking_budget,
+    validate_thinking_budget,
+)
+
+
+class TestModelLimits:
+    """Test model-specific token limits and validation."""
+
+    def test_all_models_have_64k_output_limit(self):
+        """Test that all Claude 4.5 models have 64,000 max_tokens limit."""
+        models = [
+            "claude-opus-4-5-20251101",
+            "claude-sonnet-4-5-20250929",
+            "claude-haiku-4-5-20251001",
+        ]
+
+        for model_id in models:
+            max_output = get_model_max_output_tokens(model_id)
+            assert (
+                max_output == 64000
+            ), f"{model_id} should have 64000 max_tokens, got {max_output}"
+
+    def test_all_models_have_60k_thinking_limit(self):
+        """Test that all models have 60,000 max thinking tokens (4K buffer)."""
+        models = [
+            "claude-opus-4-5-20251101",
+            "claude-sonnet-4-5-20250929",
+            "claude-haiku-4-5-20251001",
+        ]
+
+        for model_id in models:
+            max_thinking = get_model_max_thinking_tokens(model_id)
+            assert (
+                max_thinking == 60000
+            ), f"{model_id} should have 60000 max thinking tokens, got {max_thinking}"
+
+    def test_unknown_model_uses_defaults(self):
+        """Test that unknown models default to 64K output and 60K thinking."""
+        unknown_model = "claude-unknown-model-v99"
+
+        max_output = get_model_max_output_tokens(unknown_model)
+        max_thinking = get_model_max_thinking_tokens(unknown_model)
+
+        assert max_output == 64000, "Unknown model should default to 64000 max_tokens"
+        assert (
+            max_thinking == 60000
+        ), "Unknown model should default to 60000 max thinking tokens"
+
+    def test_validate_thinking_budget_within_limits(self):
+        """Test that valid thinking budgets pass validation unchanged."""
+        model_id = "claude-opus-4-5-20251101"
+        budgets = [1024, 4096, 16384, 50000, 60000]
+
+        for budget in budgets:
+            result, was_capped = validate_thinking_budget(budget, model_id)
+            assert result == budget, f"Budget {budget} should not be modified"
+            assert not was_capped, f"Budget {budget} should not be capped"
+
+    def test_validate_thinking_budget_caps_excessive_budget(self, caplog):
+        """Test that thinking budgets exceeding model limits are capped."""
+        model_id = "claude-opus-4-5-20251101"
+        excessive_budgets = [60001, 63999, 64000, 100000]
+
+        with caplog.at_level(logging.WARNING):
+            for budget in excessive_budgets:
+                result, was_capped = validate_thinking_budget(budget, model_id)
+                assert result == 60000, f"Budget {budget} should be capped to 60000"
+                assert was_capped, f"Budget {budget} should be flagged as capped"
+
+        # Should have logged warnings for each excessive budget
+        assert len(caplog.records) == len(excessive_budgets)
+        for record in caplog.records:
+            assert "exceeds model limit" in record.message
+
+    def test_validate_thinking_budget_handles_none(self):
+        """Test that None thinking budget passes through validation."""
+        model_id = "claude-opus-4-5-20251101"
+        result, was_capped = validate_thinking_budget(None, model_id)
+
+        assert result is None, "None budget should pass through unchanged"
+        assert not was_capped, "None budget should not be flagged as capped"
+
+    def test_get_thinking_budget_with_model_validation(self, caplog):
+        """Test that get_thinking_budget validates against model limits when model_id provided."""
+        model_id = "claude-opus-4-5-20251101"
+
+        # Normal levels should work fine
+        budget = get_thinking_budget("low", model_id=model_id)
+        assert budget == 1024
+
+        budget = get_thinking_budget("medium", model_id=model_id)
+        assert budget == 4096
+
+        budget = get_thinking_budget("high", model_id=model_id)
+        assert budget == 16384
+
+        # Ultrathink should be capped at 60000 (not exceed it)
+        budget = get_thinking_budget("ultrathink", model_id=model_id)
+        assert budget == 60000
+
+    def test_get_thinking_budget_without_model_validation(self):
+        """Test that get_thinking_budget works without model_id (backward compatibility)."""
+        # Should work without model_id parameter
+        budget = get_thinking_budget("low")
+        assert budget == 1024
+
+        budget = get_thinking_budget("ultrathink")
+        assert budget == 60000
+
+    def test_thinking_budget_leaves_buffer_for_sdk(self):
+        """Test that max thinking budget leaves adequate buffer for SDK overhead."""
+        model_id = "claude-opus-4-5-20251101"
+        max_output = get_model_max_output_tokens(model_id)
+        max_thinking = get_model_max_thinking_tokens(model_id)
+
+        # Buffer should be at least 4000 tokens (mentioned in model_limits.json)
+        buffer = max_output - max_thinking
+        assert buffer >= 4000, f"Buffer should be at least 4K tokens, got {buffer}"
+
+    def test_api_constraint_satisfied(self):
+        """Test that thinking budget is strictly less than max_tokens (API constraint)."""
+        model_id = "claude-opus-4-5-20251101"
+        max_output = get_model_max_output_tokens(model_id)
+        max_thinking = get_model_max_thinking_tokens(model_id)
+
+        # API constraint: max_tokens > thinking.budget_tokens
+        assert (
+            max_thinking < max_output
+        ), f"thinking budget ({max_thinking}) must be < max_tokens ({max_output})"
+
+        # Also test with ultrathink budget
+        ultrathink_budget = get_thinking_budget("ultrathink", model_id=model_id)
+        assert (
+            ultrathink_budget < max_output
+        ), f"ultrathink ({ultrathink_budget}) must be < max_tokens ({max_output})"
diff --git a/tests/test_thinking_level_validation.py b/tests/test_thinking_level_validation.py
index 34da1a5198..9fe4914431 100644
--- a/tests/test_thinking_level_validation.py
+++ b/tests/test_thinking_level_validation.py
@@ -32,8 +32,8 @@ def test_none_level_returns_none(self):
         assert get_thinking_budget("none") is None
 
     def test_ultrathink_max_budget(self):
-        """Test that 'ultrathink' returns maximum budget (63999 so max_tokens = 63999 + 1 = 64000 limit)."""
-        assert get_thinking_budget("ultrathink") == 63999
+        """Test that 'ultrathink' returns maximum budget (60000 to keep max_tokens under 64000 with 4K SDK buffer)."""
+        assert get_thinking_budget("ultrathink") == 60000
 
     def test_invalid_level_logs_warning(self, caplog):
         """Test that invalid thinking level logs a warning."""
@@ -89,4 +89,4 @@ def test_budget_values_match_expected(self):
         assert get_thinking_budget("low") == 1024
         assert get_thinking_budget("medium") == 4096
         assert get_thinking_budget("high") == 16384
-        assert get_thinking_budget("ultrathink") == 63999
+        assert get_thinking_budget("ultrathink") == 60000
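
Usage sketch (illustrative only, not part of the patch): the snippet below shows how the helpers added in apps/backend/phase_config.py are intended to compose when assembling request parameters, and why the 60K ultrathink budget plus 64K max_tokens satisfies the max_tokens > thinking.budget_tokens constraint with a 4K margin for SDK-side reductions (#8756). The build_request_params helper is hypothetical; it assumes apps/backend is importable (as tests/test_model_limits.py arranges) and that the thinking parameter follows the public Messages API shape.

# sketch.py -- minimal sketch, assuming apps/backend is on sys.path
from phase_config import (
    get_model_max_output_tokens,
    get_thinking_budget,
    resolve_model_id,
)


def build_request_params(model: str, thinking_level: str) -> dict:
    """Hypothetical caller: assemble API parameters that respect the configured limits."""
    model_id = resolve_model_id(model)  # e.g. "opus" -> "claude-opus-4-5-20251101"
    max_tokens = get_model_max_output_tokens(model_id)  # 64000 for all Claude 4.5 models
    budget = get_thinking_budget(thinking_level, model_id=model_id)  # capped at 60000

    params: dict = {"model": model_id, "max_tokens": max_tokens}
    if budget is not None:
        # API constraint: max_tokens must be strictly greater than the thinking
        # budget; the 4K gap (64000 - 60000) also absorbs SDK-side max_tokens
        # reductions described in anthropics/claude-code#8756.
        assert budget < max_tokens
        params["thinking"] = {"type": "enabled", "budget_tokens": budget}
    return params


if __name__ == "__main__":
    print(build_request_params("opus", "ultrathink"))
    # {'model': 'claude-opus-4-5-20251101', 'max_tokens': 64000,
    #  'thinking': {'type': 'enabled', 'budget_tokens': 60000}}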