1 change: 1 addition & 0 deletions .env.example
@@ -12,6 +12,7 @@ OPENROUTER_API_KEY=your-api-key-here # Get from https://openrouter.ai/keys
LLM_MODEL=anthropic/claude-sonnet-4 # Primary model
LLM_FALLBACK_MODEL=anthropic/claude-haiku # Fallback for resilience
LLM_TIMEOUT_SECONDS=30.0 # Request timeout
LLM_ENABLE_PROMPT_CACHING=true # Enable Anthropic prompt caching for cost reduction

# Rate limiting
RATE_LIMIT_REQUESTS=10 # Requests allowed per window
3 changes: 3 additions & 0 deletions src/config.py
@@ -38,6 +38,9 @@ class Settings(BaseSettings):
circuit_breaker_fail_max: int = 5
circuit_breaker_timeout: float = 60.0

# Prompt caching (Anthropic models via OpenRouter)
llm_enable_prompt_caching: bool = True

# Embeddings (via OpenRouter)
embedding_api_key: SecretStr | None = None
embedding_base_url: str = "https://openrouter.ai/api/v1"
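For context, a minimal sketch of how the new flag reaches the Settings field above, assuming the project's existing pydantic-settings behavior of matching field names to upper-cased environment variables (the DemoSettings class below is illustrative only; the real Settings has many other fields):

# Illustrative only: a stripped-down settings class showing how
# LLM_ENABLE_PROMPT_CACHING maps onto llm_enable_prompt_caching.
import os

from pydantic_settings import BaseSettings


class DemoSettings(BaseSettings):
    llm_enable_prompt_caching: bool = True


os.environ["LLM_ENABLE_PROMPT_CACHING"] = "false"
print(DemoSettings().llm_enable_prompt_caching)  # False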
79 changes: 71 additions & 8 deletions src/infrastructure/llm/openrouter.py
@@ -47,6 +47,7 @@ def __init__(
timeout_seconds: float = 30.0,
circuit_breaker_fail_max: int = 5,
circuit_breaker_timeout: float = 60.0,
enable_prompt_caching: bool = True,
) -> None:
"""Initialize the OpenRouter provider.

@@ -56,6 +57,7 @@ def __init__(
timeout_seconds: Request timeout in seconds.
circuit_breaker_fail_max: Open circuit after this many failures.
circuit_breaker_timeout: Time in seconds before attempting recovery.
enable_prompt_caching: Enable Anthropic prompt caching via OpenRouter.

Raises:
LLMConfigurationError: If API key is missing.
Expand All @@ -72,6 +74,7 @@ def __init__(
)
self._default_model = default_model
self._timeout = timeout_seconds
self._enable_prompt_caching = enable_prompt_caching

# Circuit breaker: fail fast after repeated failures
self._breaker = CircuitBreaker(
@@ -165,6 +168,60 @@ async def _complete_with_resilience(
self._do_complete, system_prompt, user_message, model
)

def _build_system_message(self, system_prompt: str) -> dict[str, object]:
"""Build system message with optional cache control.

The cache_control format follows Anthropic's prompt caching API,
which OpenRouter passes through to Anthropic models. For non-Anthropic
models, the cache_control field is typically ignored by the provider.

Args:
system_prompt: The system prompt text.

Returns:
Message dict with cache_control if caching is enabled.
"""
if self._enable_prompt_caching:
return {
"role": "system",
"content": [
{
"type": "text",
"text": system_prompt,
"cache_control": {"type": "ephemeral"},
}
],
}
return {"role": "system", "content": system_prompt}

def _log_cache_metrics(self, response: object, model: str) -> None:
"""Log cache performance metrics from response.

Args:
response: The API response object.
model: The model used for the request.
"""
usage = getattr(response, "usage", None)
if usage is None:
return

# Safely extract cache metrics with fallback to 0
try:
cache_read = int(getattr(usage, "cache_read_input_tokens", 0) or 0)
cache_creation = int(getattr(usage, "cache_creation_input_tokens", 0) or 0)
except (TypeError, ValueError):
# Handle cases where attributes aren't numeric
return

if cache_read > 0 or cache_creation > 0:
logger.info(
"llm_cache_metrics",
provider=self.PROVIDER_NAME,
model=model,
cache_read_tokens=cache_read,
cache_creation_tokens=cache_creation,
)

async def _do_complete(
self,
system_prompt: str,
@@ -184,16 +241,20 @@ async def _do_complete(
)

try:
messages: list[dict[str, object]] = [
self._build_system_message(system_prompt),
{"role": "user", "content": user_message},
]

response = await self._client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message},
],
messages=messages, # type: ignore[arg-type]
)

content = response.choices[0].message.content or ""

self._log_cache_metrics(response, model)

logger.debug(
"llm_request_success",
provider=self.PROVIDER_NAME,
@@ -333,11 +394,11 @@ async def _do_complete_with_history(
)

try:
# Build messages array with system prompt first
all_messages: list[dict[str, str]] = [
{"role": "system", "content": system_prompt}
# Build messages array with system prompt first (with cache control)
all_messages: list[dict[str, object]] = [
self._build_system_message(system_prompt)
]
all_messages.extend(messages)
all_messages.extend(messages) # type: ignore[arg-type]

response = await self._client.chat.completions.create(
model=model,
Expand All @@ -346,6 +407,8 @@ async def _do_complete_with_history(

content = response.choices[0].message.content or ""

self._log_cache_metrics(response, model)

logger.debug(
"llm_history_request_success",
provider=self.PROVIDER_NAME,
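For reference, a sketch of the system message shape this change produces when caching is enabled, together with what gets logged on cache activity (the prompt text is illustrative):

# When enable_prompt_caching is True, _build_system_message switches the
# system content from a plain string to a list of text blocks so that
# cache_control can be attached:
system_message = {
    "role": "system",
    "content": [
        {
            "type": "text",
            "text": "You are a helpful assistant...",  # illustrative prompt
            "cache_control": {"type": "ephemeral"},
        }
    ],
}

# After each completion, _log_cache_metrics reads response.usage and, when
# cache_read_input_tokens or cache_creation_input_tokens is non-zero, emits a
# structured "llm_cache_metrics" log event carrying those counts.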
1 change: 1 addition & 0 deletions src/web/routes.py
@@ -51,6 +51,7 @@ def get_llm_provider(
timeout_seconds=settings.llm_timeout_seconds,
circuit_breaker_fail_max=settings.circuit_breaker_fail_max,
circuit_breaker_timeout=settings.circuit_breaker_timeout,
enable_prompt_caching=settings.llm_enable_prompt_caching,
)

