From 2de635606576572091fcb87b9502bc912d16f7c9 Mon Sep 17 00:00:00 2001
From: will brown <williambrown97@gmail.com>
Date: Tue, 17 Feb 2026 16:10:43 -0800
Subject: [PATCH 1/5] Add token parsing for Anthropic router replay responses

---
 tests/test_client_multimodal_types.py         |  95 +++++++++++++++
 .../clients/anthropic_messages_client.py      | 109 +++++++++++++++++-
 2 files changed, 202 insertions(+), 2 deletions(-)

diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py
index d51c38262..828bb7af2 100644
--- a/tests/test_client_multimodal_types.py
+++ b/tests/test_client_multimodal_types.py
@@ -1,4 +1,7 @@
+import base64
+
 import pytest
+import numpy as np
 from types import SimpleNamespace
 
 from verifiers.clients.openai_chat_completions_client import OpenAIChatCompletionsClient
@@ -237,3 +240,95 @@ async def test_anthropic_tool_call_round_trips_thinking_blocks():
         {"type": "thinking", "thinking": "hidden chain", "signature": "sig_1"},
         {"type": "tool_use", "id": "call_1", "name": "lookup", "input": {"q": "x"}},
     ]
+
+
+class _CaptureAnthropicMessages:
+    def __init__(self) -> None:
+        self.last_kwargs: dict | None = None
+
+    async def create(self, **kwargs):
+        self.last_kwargs = kwargs
+        return SimpleNamespace()
+
+
+class _CaptureAnthropicClient:
+    def __init__(self) -> None:
+        self.messages = _CaptureAnthropicMessages()
+
+
+@pytest.mark.asyncio
+async def test_anthropic_get_native_response_forwards_router_replay_with_extra_body():
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    native_client = _CaptureAnthropicClient()
+    client = AnthropicMessagesClient(native_client)
+
+    await client.get_native_response(
+        prompt=[{"role": "user", "content": "hello"}],
+        model="claude-test",
+        sampling_args={
+            "max_tokens": 32,
+            "temperature": 0.2,
+            "extra_body": {"seed": 7},
+            "routed_experts": [[[1]]],
+        },
+    )
+
+    sent = native_client.messages.last_kwargs
+    assert sent is not None
+    assert sent["temperature"] == 0.2
+    assert sent["extra_body"] == {"seed": 7, "routed_experts": [[[1]]]}
+    assert "routed_experts" not in sent
+
+
+@pytest.mark.asyncio
+async def test_anthropic_from_native_response_extracts_tokens_and_router_replay():
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    client = AnthropicMessagesClient(object())
+    routed = np.array([[[11, 12]], [[21, 22]]], dtype=np.int32)
+    native_response = SimpleNamespace(
+        id="msg_tokens",
+        model="claude-haiku-4-5",
+        stop_reason="end_turn",
+        usage=SimpleNamespace(input_tokens=3, output_tokens=2),
+        content=[SimpleNamespace(type="text", text="ok")],
+        prompt_token_ids=[1, 2, 3],
+        token_ids=[4, 5],
+        logprobs={"content": [{"logprob": -0.1}, {"logprob": -0.2}]},
+        routed_experts={
+            "data": base64.b85encode(routed.tobytes()).decode("utf-8"),
+            "shape": list(routed.shape),
+        },
+    )
+
+    response = await client.from_native_response(native_response)
+
+    assert response.message.tokens is not None
+    assert response.message.tokens.prompt_ids == [1, 2, 3]
+    assert response.message.tokens.completion_ids == [4, 5]
+    assert response.message.tokens.completion_logprobs == [-0.1, -0.2]
+    assert response.message.tokens.routed_experts == routed.tolist()
+
+
+@pytest.mark.asyncio
+async def test_anthropic_from_native_response_requires_logprobs_for_tokens():
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    client = AnthropicMessagesClient(object())
+    native_response = SimpleNamespace(
+        id="msg_tokens_missing",
+        model="claude-haiku-4-5",
+        stop_reason="end_turn",
+        usage=SimpleNamespace(input_tokens=2, output_tokens=1),
+        content=[SimpleNamespace(type="text", text="ok")],
+        prompt_token_ids=[1, 2],
+        token_ids=[3],
+        logprobs=None,
+    )
+
+    response = await client.from_native_response(native_response)
+    assert response.message.tokens is None
diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
index 9e80b63b7..041c64178 100644
--- a/verifiers/clients/anthropic_messages_client.py
+++ b/verifiers/clients/anthropic_messages_client.py
@@ -1,9 +1,12 @@
+import base64
 import functools
 import json
 import time
-from collections.abc import Mapping
+from collections.abc import Iterable, Mapping
 from typing import Any, cast
 
+import numpy as np
+
 from anthropic import (
     AsyncAnthropic,
     AuthenticationError,
@@ -38,6 +41,7 @@
     Messages,
     Response,
     ResponseMessage,
+    ResponseTokens,
     SamplingArgs,
     SystemMessage,
     TextMessage,
@@ -345,6 +349,11 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
             max_tokens = sampling_args.pop("max_tokens", None)
             sampling_args.pop("n", None)
             sampling_args.pop("stop", None)
+            extra_body = sampling_args.pop("extra_body", {})
+            if not isinstance(extra_body, Mapping):
+                raise TypeError(
+                    "sampling_args['extra_body'] must be a mapping when provided"
+                )
             if max_tokens is None:
                 self.logger.warning(
                     "max_tokens is not set but Anthropic /v1/messages endpoint requires it, falling back to max_tokens=4096"
@@ -352,6 +361,27 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
                 max_tokens = 4096
             sampling_args["max_tokens"] = max_tokens
 
+            # Anthropic SDK validates top-level request fields. Mirror OpenAI chat
+            # completions usage by forwarding unknown model args through extra_body
+            # so router replay payloads (e.g. routed_experts) can be passed via
+            # sampling_args without custom provider branching.
+            known_anthropic_args = {
+                "max_tokens",
+                "metadata",
+                "service_tier",
+                "stop_sequences",
+                "temperature",
+                "thinking",
+                "top_k",
+                "top_p",
+            }
+            extra_body_dict: dict[str, Any] = dict(extra_body)
+            for key in list(sampling_args.keys()):
+                if key not in known_anthropic_args:
+                    extra_body_dict[key] = sampling_args.pop(key)
+            if extra_body_dict:
+                sampling_args["extra_body"] = extra_body_dict
+
             return {k: v for k, v in sampling_args.items() if v is not None}
 
         # Remove internal framework keys not recognized by the Anthropic SDK
@@ -440,6 +470,81 @@ def parse_finish_reason(response: AnthropicMessage) -> FinishReason:
                 case _:
                     return None
 
+        def parse_completion_logprobs(logprobs: Any) -> list[float] | None:
+            if isinstance(logprobs, Mapping):
+                content = logprobs.get("content")
+            else:
+                content = getattr(logprobs, "content", None)
+            if content is None:
+                return None
+            if isinstance(content, Mapping):
+                content_items: Iterable[Any] = [content]
+            elif isinstance(content, list):
+                content_items = content
+            elif isinstance(content, Iterable) and not isinstance(
+                content, (str, bytes)
+            ):
+                content_items = list(content)
+            else:
+                return None
+            values: list[float] = []
+            for token in content_items:
+                if isinstance(token, Mapping):
+                    value = token.get("logprob")
+                else:
+                    value = getattr(token, "logprob", None)
+                if not isinstance(value, (float, int)):
+                    return None
+                values.append(float(value))
+            return values
+
+        def parse_tokens(response: AnthropicMessage) -> ResponseTokens | None:
+            prompt_ids = getattr(response, "prompt_token_ids", None)
+            completion_ids = getattr(response, "token_ids", None)
+            if not isinstance(prompt_ids, list) or not isinstance(completion_ids, list):
+                return None
+            if not all(isinstance(token_id, int) for token_id in prompt_ids):
+                return None
+            if not all(isinstance(token_id, int) for token_id in completion_ids):
+                return None
+
+            completion_logprobs = parse_completion_logprobs(
+                getattr(response, "logprobs", None)
+            )
+            if completion_logprobs is None:
+                return None
+
+            has_routed_experts = (
+                isinstance(
+                    routed_experts := getattr(response, "routed_experts", None), dict
+                )
+                and "data" in routed_experts
+                and "shape" in routed_experts
+            )
+            if has_routed_experts:
+                routed_experts = cast(dict[str, Any], routed_experts)
+                routed_experts = cast(
+                    list[list[list[int]]],
+                    (
+                        np.frombuffer(
+                            base64.b85decode(routed_experts["data"]), dtype=np.int32
+                        )
+                        .reshape(routed_experts["shape"])
+                        .tolist()
+                    ),
+                )
+            else:
+                routed_experts = None
+
+            return ResponseTokens(
+                prompt_ids=prompt_ids,
+                prompt_mask=[0] * len(prompt_ids),
+                completion_ids=completion_ids,
+                completion_mask=[1] * len(completion_ids),
+                completion_logprobs=completion_logprobs,
+                routed_experts=routed_experts,
+            )
+
         content, reasoning_content, tool_calls, thinking_blocks = parse_content(
             response.content
         )
@@ -465,6 +570,6 @@ def parse_finish_reason(response: AnthropicMessage) -> FinishReason:
                 tool_calls=tool_calls or None,
                 finish_reason=parse_finish_reason(response),
                 is_truncated=is_truncated,
-                tokens=None,
+                tokens=parse_tokens(response),
             ),
         )

From 03d4de66fcebf8bdc5dab22c9a94026b7f58638c Mon Sep 17 00:00:00 2001
From: will brown <williambrown97@gmail.com>
Date: Tue, 17 Feb 2026 16:21:44 -0800
Subject: [PATCH 2/5] Require explicit max_tokens for Anthropic messages

---
 tests/test_client_multimodal_types.py          | 16 ++++++++++++++++
 verifiers/clients/anthropic_messages_client.py |  5 ++---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py
index 828bb7af2..48dd65cf7 100644
--- a/tests/test_client_multimodal_types.py
+++ b/tests/test_client_multimodal_types.py
@@ -282,6 +282,22 @@ async def test_anthropic_get_native_response_forwards_router_replay_with_extra_b
     assert "routed_experts" not in sent
 
 
+@pytest.mark.asyncio
+async def test_anthropic_get_native_response_requires_max_tokens():
+    pytest.importorskip("anthropic")
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    native_client = _CaptureAnthropicClient()
+    client = AnthropicMessagesClient(native_client)
+
+    with pytest.raises(ValueError, match=r"sampling_args\['max_tokens'\] is required"):
+        await client.get_native_response(
+            prompt=[{"role": "user", "content": "hello"}],
+            model="claude-test",
+            sampling_args={"temperature": 0.2},
+        )
+
+
 @pytest.mark.asyncio
 async def test_anthropic_from_native_response_extracts_tokens_and_router_replay():
     pytest.importorskip("anthropic")
diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
index 041c64178..ff572d88f 100644
--- a/verifiers/clients/anthropic_messages_client.py
+++ b/verifiers/clients/anthropic_messages_client.py
@@ -355,10 +355,9 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
                     "sampling_args['extra_body'] must be a mapping when provided"
                 )
             if max_tokens is None:
-                self.logger.warning(
-                    "max_tokens is not set but Anthropic /v1/messages endpoint requires it, falling back to max_tokens=4096"
+                raise ValueError(
+                    "sampling_args['max_tokens'] is required for Anthropic /v1/messages requests"
                 )
-                max_tokens = 4096
             sampling_args["max_tokens"] = max_tokens
 
             # Anthropic SDK validates top-level request fields. Mirror OpenAI chat

From 752ee4f0195b823db4d93f71a6f3d42be0df137e Mon Sep 17 00:00:00 2001
From: will brown <williambrown97@gmail.com>
Date: Tue, 17 Feb 2026 16:21:48 -0800
Subject: [PATCH 3/5] Set Anthropic max_tokens default to 32768 with rationale
 comment

---
 tests/test_client_multimodal_types.py          | 18 +++++++++++-------
 verifiers/clients/anthropic_messages_client.py |  7 ++++---
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py
index 48dd65cf7..c2b715a9a 100644
--- a/tests/test_client_multimodal_types.py
+++ b/tests/test_client_multimodal_types.py
@@ -283,19 +283,23 @@ async def test_anthropic_get_native_response_forwards_router_replay_with_extra_b
 
 
 @pytest.mark.asyncio
-async def test_anthropic_get_native_response_requires_max_tokens():
+async def test_anthropic_get_native_response_defaults_max_tokens_when_missing():
     pytest.importorskip("anthropic")
     from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
 
     native_client = _CaptureAnthropicClient()
     client = AnthropicMessagesClient(native_client)
 
-    with pytest.raises(ValueError, match=r"sampling_args\['max_tokens'\] is required"):
-        await client.get_native_response(
-            prompt=[{"role": "user", "content": "hello"}],
-            model="claude-test",
-            sampling_args={"temperature": 0.2},
-        )
+    await client.get_native_response(
+        prompt=[{"role": "user", "content": "hello"}],
+        model="claude-test",
+        sampling_args={"temperature": 0.2},
+    )
+
+    sent = native_client.messages.last_kwargs
+    assert sent is not None
+    assert sent["max_tokens"] == 32768
+    assert sent["temperature"] == 0.2
 
 
 @pytest.mark.asyncio
diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
index ff572d88f..7370db2da 100644
--- a/verifiers/clients/anthropic_messages_client.py
+++ b/verifiers/clients/anthropic_messages_client.py
@@ -355,9 +355,10 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
                     "sampling_args['extra_body'] must be a mapping when provided"
                 )
             if max_tokens is None:
-                raise ValueError(
-                    "sampling_args['max_tokens'] is required for Anthropic /v1/messages requests"
-                )
+                # Anthropic /v1/messages requires max_tokens in every request.
+                # Use an explicit large default to keep behavior deterministic
+                # across Anthropic-compatible backends (e.g. vLLM).
+                max_tokens = 32768
             sampling_args["max_tokens"] = max_tokens
 
             # Anthropic SDK validates top-level request fields. Mirror OpenAI chat

From 016232afd06837acbcf4cc1975e9f55de5870f80 Mon Sep 17 00:00:00 2001
From: will brown <williambrown97@gmail.com>
Date: Tue, 17 Feb 2026 17:41:53 -0800
Subject: [PATCH 4/5] Refine Anthropic max_tokens and extra_body comments

---
 verifiers/clients/anthropic_messages_client.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
index 7370db2da..633a04adb 100644
--- a/verifiers/clients/anthropic_messages_client.py
+++ b/verifiers/clients/anthropic_messages_client.py
@@ -355,15 +355,13 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
                     "sampling_args['extra_body'] must be a mapping when provided"
                 )
             if max_tokens is None:
-                # Anthropic /v1/messages requires max_tokens in every request.
-                # Use an explicit large default to keep behavior deterministic
-                # across Anthropic-compatible backends (e.g. vLLM).
+                # Anthropic /v1/messages requires max_tokens to be set in every request.
                 max_tokens = 32768
             sampling_args["max_tokens"] = max_tokens
 
-            # Anthropic SDK validates top-level request fields. Mirror OpenAI chat
-            # completions usage by forwarding unknown model args through extra_body
-            # so router replay payloads (e.g. routed_experts) can be passed via
+            # Anthropic SDK validates top-level request fields.
+            # Forward unknown model args through extra_body
+            # so backend-specific payloads (e.g. routed_experts) can be passed via
             # sampling_args without custom provider branching.
             known_anthropic_args = {
                 "max_tokens",

From 34bd2ca42560a41ee15f8d27b920667a490ccfa0 Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Wed, 18 Feb 2026 02:29:30 +0000
Subject: [PATCH 5/5] clients/anthropic: define ANTHROPIC_MAX_TOKENS constant
 and log default at init; use constant for fallback max_tokens

Co-authored-by: will brown <willccbb@users.noreply.github.com>
---
 verifiers/clients/anthropic_messages_client.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
index 633a04adb..392901318 100644
--- a/verifiers/clients/anthropic_messages_client.py
+++ b/verifiers/clients/anthropic_messages_client.py
@@ -53,6 +53,10 @@
 )
 from verifiers.utils.client_utils import setup_anthropic_client
 
+# Default output-token limit used when callers omit max_tokens.
+# Anthropic /v1/messages requires max_tokens on every request.
+ANTHROPIC_MAX_TOKENS: int = 32768
+
 
 def _handle_anthropic_overlong_prompt(func):
     """Decorator to handle overlong prompt errors from the Anthropic API."""
@@ -91,6 +95,12 @@ class AnthropicMessagesClient(
     """Wrapper for Messages API via AsyncAnthropic client."""
 
     def setup_client(self, config: ClientConfig) -> AsyncAnthropic:
+        # Log the default and remind that max_tokens is required for Anthropic.
+        self.logger.info(
+            "Anthropic client initialized. max_tokens is required on every request; "
+            "defaulting to ANTHROPIC_MAX_TOKENS=%d when not provided.",
+            ANTHROPIC_MAX_TOKENS,
+        )
         return setup_anthropic_client(config)
 
     async def close(self) -> None:
@@ -356,7 +366,7 @@ def normalize_sampling_args(sampling_args: SamplingArgs) -> dict:
                 )
             if max_tokens is None:
                 # Anthropic /v1/messages requires max_tokens to be set in every request.
-                max_tokens = 32768
+                max_tokens = ANTHROPIC_MAX_TOKENS
             sampling_args["max_tokens"] = max_tokens
 
             # Anthropic SDK validates top-level request fields.