Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions apps/backend/model_limits.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"$schema": "model_limits_schema.json",
"description": "Model-specific token limits for Claude API. All values are in tokens.",
"models": {
"claude-opus-4-5-20251101": {
"display_name": "Claude Opus 4.5",
"max_output_tokens": 64000,
"context_window": 200000,
"max_thinking_tokens": 60000,
"notes": "Maximum output increased from 32K (Opus 4.1) to 64K"
},
"claude-sonnet-4-5-20250929": {
"display_name": "Claude Sonnet 4.5",
"max_output_tokens": 64000,
"context_window": 200000,
"max_thinking_tokens": 60000,
"notes": "Standard output limit, 1M context available in beta"
},
"claude-haiku-4-5-20251001": {
"display_name": "Claude Haiku 4.5",
"max_output_tokens": 64000,
"context_window": 200000,
"max_thinking_tokens": 60000,
"notes": "Output increased from 8K (Haiku 3.5) to 64K"
}
},
"thinking_levels": {
"none": {
"budget": null,
"description": "No extended thinking"
},
"low": {
"budget": 1024,
"description": "Brief consideration (1K tokens)"
},
"medium": {
"budget": 4096,
"description": "Moderate analysis (4K tokens)"
},
"high": {
"budget": 16384,
"description": "Deep thinking for complex tasks (16K tokens)"
},
"ultrathink": {
"budget": 60000,
"description": "Maximum reasoning depth (60K tokens, leaves 4K buffer for SDK overhead)",
"safety_note": "SDK may reduce max_tokens, so we keep 4K buffer below the 64K limit"
}
},
"validation_rules": {
"max_thinking_tokens_must_be_less_than_max_output": true,
"min_thinking_budget": 1024,
"safety_buffer_tokens": 4000,
"notes": "API constraint: max_tokens > thinking.budget_tokens. SDK bug #8756 causes intermittent validation errors when max_tokens is reduced without adjusting thinking budget."
}
}
150 changes: 130 additions & 20 deletions apps/backend/phase_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,58 @@
===========================

Handles model and thinking level configuration for different execution phases.
Reads configuration from task_metadata.json and provides resolved model IDs.
Reads configuration from task_metadata.json and model_limits.json for model-specific constraints.
"""

import json
import logging
import os
from pathlib import Path
from typing import Literal, TypedDict

# Get logger for this module
logger = logging.getLogger(__name__)

# Model shorthand to full model ID mapping
MODEL_ID_MAP: dict[str, str] = {
"opus": "claude-opus-4-5-20251101",
"sonnet": "claude-sonnet-4-5-20250929",
"haiku": "claude-haiku-4-5-20251001",
}

# Thinking level to budget tokens mapping (None = no extended thinking)
# Load model limits from configuration file
def _load_model_limits() -> dict:
"""Load model limits from model_limits.json."""
limits_file = Path(__file__).parent / "model_limits.json"
try:
with open(limits_file, encoding="utf-8") as f:
return json.load(f)
except (FileNotFoundError, json.JSONDecodeError) as e:
logger.warning(f"Failed to load model_limits.json: {e}. Using fallback defaults.")
# Fallback to hardcoded defaults if file is missing
return {
"models": {
"claude-opus-4-5-20251101": {"max_output_tokens": 64000, "max_thinking_tokens": 60000},
"claude-sonnet-4-5-20250929": {"max_output_tokens": 64000, "max_thinking_tokens": 60000},
"claude-haiku-4-5-20251001": {"max_output_tokens": 64000, "max_thinking_tokens": 60000},
},
"thinking_levels": {
"none": {"budget": None},
"low": {"budget": 1024},
"medium": {"budget": 4096},
"high": {"budget": 16384},
"ultrathink": {"budget": 60000},
},
}

# Load model limits at module initialization
_MODEL_LIMITS = _load_model_limits()

# Thinking level to budget tokens mapping (loaded from model_limits.json)
# Values must match auto-claude-ui/src/shared/constants/models.ts THINKING_BUDGET_MAP
THINKING_BUDGET_MAP: dict[str, int | None] = {
"none": None,
"low": 1024,
"medium": 4096, # Moderate analysis
"high": 16384, # Deep thinking for QA review
"ultrathink": 63999, # Maximum reasoning depth (API requires max_tokens >= budget + 1, so 63999 + 1 = 64000 limit)
level: config.get("budget")
for level, config in _MODEL_LIMITS.get("thinking_levels", {}).items()
}

# Spec runner phase-specific thinking levels
Expand Down Expand Up @@ -126,27 +155,100 @@ def resolve_model_id(model: str) -> str:
return model


def get_thinking_budget(thinking_level: str) -> int | None:
def get_model_max_output_tokens(model_id: str) -> int:
"""
Get the thinking budget for a thinking level.
Get the maximum output tokens for a specific model.

Args:
thinking_level: Thinking level (none, low, medium, high, ultrathink)
model_id: Full model ID (e.g., 'claude-opus-4-5-20251101')

Returns:
Maximum output tokens for the model (defaults to 64000 if model not found)
"""
models = _MODEL_LIMITS.get("models", {})
model_config = models.get(model_id, {})
return model_config.get("max_output_tokens", 64000)


def get_model_max_thinking_tokens(model_id: str) -> int:
"""
Get the maximum thinking tokens for a specific model.

This represents the safe maximum thinking budget that leaves enough buffer
for SDK overhead and ensures thinking_budget < max_tokens constraint.

Args:
model_id: Full model ID (e.g., 'claude-opus-4-5-20251101')

Returns:
Token budget or None for no extended thinking
Maximum thinking tokens for the model (defaults to 60000 if model not found)
"""
import logging
models = _MODEL_LIMITS.get("models", {})
model_config = models.get(model_id, {})
return model_config.get("max_thinking_tokens", 60000)


def validate_thinking_budget(
thinking_budget: int | None, model_id: str
) -> tuple[int | None, bool]:
"""
Validate and cap thinking budget to ensure it doesn't exceed model limits.

API constraint: max_tokens > thinking.budget_tokens (must be strictly greater)
SDK bug #8756: SDK sometimes reduces max_tokens without adjusting thinking budget

Args:
thinking_budget: Requested thinking budget (or None for no extended thinking)
model_id: Full model ID to validate against

Returns:
Tuple of (capped_budget, was_capped)
- capped_budget: Valid thinking budget that respects model limits
- was_capped: True if the budget was reduced, False otherwise
"""
if thinking_budget is None:
return None, False

max_thinking = get_model_max_thinking_tokens(model_id)

if thinking_budget > max_thinking:
logger.warning(
f"Thinking budget {thinking_budget} exceeds model limit {max_thinking} for {model_id}. "
f"Capping to {max_thinking} tokens."
)
return max_thinking, True

return thinking_budget, False


def get_thinking_budget(thinking_level: str, model_id: str | None = None) -> int | None:
"""
Get the thinking budget for a thinking level, optionally validated against model limits.

Args:
thinking_level: Thinking level (none, low, medium, high, ultrathink)
model_id: Optional model ID to validate against (if provided, budget is capped to model limits)

Returns:
Token budget or None for no extended thinking (capped to model limits if model_id provided)
"""
if thinking_level not in THINKING_BUDGET_MAP:
valid_levels = ", ".join(THINKING_BUDGET_MAP.keys())
logging.warning(
logger.warning(
f"Invalid thinking_level '{thinking_level}'. Valid values: {valid_levels}. "
f"Defaulting to 'medium'."
)
return THINKING_BUDGET_MAP["medium"]
thinking_level = "medium"

budget = THINKING_BUDGET_MAP[thinking_level]

return THINKING_BUDGET_MAP[thinking_level]
# Validate against model limits if model_id provided
if model_id and budget is not None:
budget, was_capped = validate_thinking_budget(budget, model_id)
if was_capped:
logger.info(f"Thinking budget capped for model {model_id}")

return budget


def load_task_metadata(spec_dir: Path) -> TaskMetadataConfig | None:
Expand Down Expand Up @@ -261,20 +363,25 @@ def get_phase_thinking_budget(
spec_dir: Path,
phase: Phase,
cli_thinking: str | None = None,
cli_model: str | None = None,
) -> int | None:
"""
Get the thinking budget tokens for a specific execution phase.

The budget is validated against model-specific limits to prevent API errors.

Args:
spec_dir: Path to the spec directory
phase: Execution phase (spec, planning, coding, qa)
cli_thinking: Thinking level from CLI argument (optional)
cli_model: Model from CLI argument (optional, used for validation)

Returns:
Token budget or None for no extended thinking
Token budget or None for no extended thinking (capped to model limits)
"""
thinking_level = get_phase_thinking(spec_dir, phase, cli_thinking)
return get_thinking_budget(thinking_level)
model_id = get_phase_model(spec_dir, phase, cli_model)
return get_thinking_budget(thinking_level, model_id=model_id)


def get_phase_config(
Expand All @@ -286,6 +393,8 @@ def get_phase_config(
"""
Get the full configuration for a specific execution phase.

Thinking budget is validated against model-specific limits.

Args:
spec_dir: Path to the spec directory
phase: Execution phase (spec, planning, coding, qa)
Expand All @@ -297,12 +406,12 @@ def get_phase_config(
"""
model_id = get_phase_model(spec_dir, phase, cli_model)
thinking_level = get_phase_thinking(spec_dir, phase, cli_thinking)
thinking_budget = get_thinking_budget(thinking_level)
thinking_budget = get_thinking_budget(thinking_level, model_id=model_id)

return model_id, thinking_level, thinking_budget


def get_spec_phase_thinking_budget(phase_name: str) -> int | None:
def get_spec_phase_thinking_budget(phase_name: str, model_id: str | None = None) -> int | None:
"""
Get the thinking budget for a specific spec runner phase.

Expand All @@ -311,9 +420,10 @@ def get_spec_phase_thinking_budget(phase_name: str) -> int | None:

Args:
phase_name: Name of the spec phase (e.g., 'discovery', 'spec_writing')
model_id: Optional model ID to validate budget against model limits

Returns:
Token budget for extended thinking, or None for no extended thinking
"""
thinking_level = SPEC_PHASE_THINKING_LEVELS.get(phase_name, "medium")
return get_thinking_budget(thinking_level)
return get_thinking_budget(thinking_level, model_id=model_id)
23 changes: 22 additions & 1 deletion apps/frontend/src/shared/constants/models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,33 @@ export const MODEL_ID_MAP: Record<string, string> = {
} as const;

// Maps thinking levels to budget tokens (null = no extended thinking)
// Must match apps/backend/model_limits.json thinking_levels
export const THINKING_BUDGET_MAP: Record<string, number | null> = {
none: null,
low: 1024,
medium: 4096,
high: 16384,
ultrathink: 63999 // Maximum reasoning depth (API requires max_tokens >= budget + 1, so 63999 + 1 = 64000 limit)
ultrathink: 60000 // Maximum reasoning depth (leaves 4K buffer for SDK overhead, keeps max_tokens under 64K for all Claude 4.5 models)
} as const;

// Model-specific output token limits (all Claude 4.5 models have 64K max_tokens)
export const MODEL_OUTPUT_LIMITS: Record<string, number> = {
'claude-opus-4-5-20251101': 64000,
'claude-sonnet-4-5-20250929': 64000,
'claude-haiku-4-5-20251001': 64000,
opus: 64000,
sonnet: 64000,
haiku: 64000
} as const;

// Maximum safe thinking budget for each model (leaves buffer for SDK overhead)
export const MODEL_MAX_THINKING: Record<string, number> = {
'claude-opus-4-5-20251101': 60000,
'claude-sonnet-4-5-20250929': 60000,
'claude-haiku-4-5-20251001': 60000,
opus: 60000,
sonnet: 60000,
haiku: 60000
} as const;

// ============================================
Expand Down
Loading
Loading