1 change: 1 addition & 0 deletions .env.example
@@ -12,6 +12,7 @@ OPENROUTER_API_KEY=your-api-key-here # Get from https://openrouter.ai/keys
LLM_MODEL=anthropic/claude-sonnet-4 # Primary model
LLM_FALLBACK_MODEL=anthropic/claude-haiku # Fallback for resilience
LLM_TIMEOUT_SECONDS=30.0 # Request timeout
LLM_ENABLE_PROMPT_CACHING=true # Enable Anthropic prompt caching for cost reduction

# Rate limiting
RATE_LIMIT_REQUESTS=10 # Requests allowed per window
3 changes: 3 additions & 0 deletions src/config.py
@@ -38,6 +38,9 @@ class Settings(BaseSettings):
circuit_breaker_fail_max: int = 5
circuit_breaker_timeout: float = 60.0

# Prompt caching (Anthropic models via OpenRouter)
llm_enable_prompt_caching: bool = True

# Embeddings (via OpenRouter)
embedding_api_key: SecretStr | None = None
embedding_base_url: str = "https://openrouter.ai/api/v1"
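For context, a minimal sketch of how the new flag reaches the Settings field above, assuming the project's existing pydantic-settings behavior of matching field names to upper-cased environment variables (the DemoSettings class below is illustrative only; the real Settings has many other fields):

# Illustrative only: a stripped-down settings class showing how
# LLM_ENABLE_PROMPT_CACHING maps onto llm_enable_prompt_caching.
import os

from pydantic_settings import BaseSettings


class DemoSettings(BaseSettings):
    llm_enable_prompt_caching: bool = True


os.environ["LLM_ENABLE_PROMPT_CACHING"] = "false"
print(DemoSettings().llm_enable_prompt_caching)  # False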
79 changes: 71 additions & 8 deletions src/infrastructure/llm/openrouter.py
@@ -47,6 +47,7 @@ def __init__(
timeout_seconds: float = 30.0,
circuit_breaker_fail_max: int = 5,
circuit_breaker_timeout: float = 60.0,
enable_prompt_caching: bool = True,
) -> None:
"""Initialize the OpenRouter provider.

@@ -56,6 +57,7 @@ def __init__(
timeout_seconds: Request timeout in seconds.
circuit_breaker_fail_max: Open circuit after this many failures.
circuit_breaker_timeout: Time in seconds before attempting recovery.
enable_prompt_caching: Enable Anthropic prompt caching via OpenRouter.

Raises:
LLMConfigurationError: If API key is missing.
Expand All @@ -72,6 +74,7 @@ def __init__(
)
self._default_model = default_model
self._timeout = timeout_seconds
self._enable_prompt_caching = enable_prompt_caching

# Circuit breaker: fail fast after repeated failures
self._breaker = CircuitBreaker(
@@ -165,6 +168,60 @@ async def _complete_with_resilience(
self._do_complete, system_prompt, user_message, model
)

def _build_system_message(self, system_prompt: str) -> dict[str, object]:
"""Build system message with optional cache control.

The cache_control format follows Anthropic's prompt caching API,
which OpenRouter passes through to Anthropic models. For non-Anthropic
models, the cache_control field is typically ignored by the provider.

Args:
system_prompt: The system prompt text.

Returns:
Message dict with cache_control if caching is enabled.
"""
if self._enable_prompt_caching:
return {
"role": "system",
"content": [
{
"type": "text",
"text": system_prompt,
"cache_control": {"type": "ephemeral"},
}
],
}
return {"role": "system", "content": system_prompt}

def _log_cache_metrics(self, response: object, model: str) -> None:
"""Log cache performance metrics from response.

Args:
response: The API response object.
model: The model used for the request.
"""
usage = getattr(response, "usage", None)
if usage is None:
return

# Safely extract cache metrics with fallback to 0
try:
cache_read = int(getattr(usage, "cache_read_input_tokens", 0) or 0)
cache_creation = int(getattr(usage, "cache_creation_input_tokens", 0) or 0)
except (TypeError, ValueError):
# Handle cases where attributes aren't numeric
return

if cache_read > 0 or cache_creation > 0:
logger.info(
"llm_cache_metrics",
provider=self.PROVIDER_NAME,
model=model,
cache_read_tokens=cache_read,
cache_creation_tokens=cache_creation,
)

async def _do_complete(
self,
system_prompt: str,
@@ -184,16 +241,20 @@ async def _do_complete(
)

try:
messages: list[dict[str, object]] = [
self._build_system_message(system_prompt),
{"role": "user", "content": user_message},
]

response = await self._client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message},
],
messages=messages, # type: ignore[arg-type]
)

content = response.choices[0].message.content or ""

self._log_cache_metrics(response, model)

logger.debug(
"llm_request_success",
provider=self.PROVIDER_NAME,
@@ -333,11 +394,11 @@ async def _do_complete_with_history(
)

try:
# Build messages array with system prompt first
all_messages: list[dict[str, str]] = [
{"role": "system", "content": system_prompt}
# Build messages array with system prompt first (with cache control)
all_messages: list[dict[str, object]] = [
self._build_system_message(system_prompt)
]
all_messages.extend(messages)
all_messages.extend(messages) # type: ignore[arg-type]

response = await self._client.chat.completions.create(
model=model,
Expand All @@ -346,6 +407,8 @@ async def _do_complete_with_history(

content = response.choices[0].message.content or ""

self._log_cache_metrics(response, model)

logger.debug(
"llm_history_request_success",
provider=self.PROVIDER_NAME,
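For reference, a sketch of the system message shape this change produces when caching is enabled, together with what gets logged on cache activity (the prompt text is illustrative):

# When enable_prompt_caching is True, _build_system_message switches the
# system content from a plain string to a list of text blocks so that
# cache_control can be attached:
system_message = {
    "role": "system",
    "content": [
        {
            "type": "text",
            "text": "You are a helpful assistant...",  # illustrative prompt
            "cache_control": {"type": "ephemeral"},
        }
    ],
}

# After each completion, _log_cache_metrics reads response.usage and, when
# cache_read_input_tokens or cache_creation_input_tokens is non-zero, emits a
# structured "llm_cache_metrics" log event carrying those counts.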
1 change: 1 addition & 0 deletions src/web/routes.py
@@ -51,6 +51,7 @@ def get_llm_provider(
timeout_seconds=settings.llm_timeout_seconds,
circuit_breaker_fail_max=settings.circuit_breaker_fail_max,
circuit_breaker_timeout=settings.circuit_breaker_timeout,
enable_prompt_caching=settings.llm_enable_prompt_caching,
)

