Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ __pycache__/

# libraries
/prime-rl
/packages/tasksets
/packages/harnesses


# outputs
wandb/
Expand Down
2 changes: 1 addition & 1 deletion docs/environments.md
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ class MyGameEnv(vf.MultiTurnEnv):
return state.get("lives", 1) <= 0
```

`MultiTurnEnv` includes built-in stop conditions for errors, prompt length limits, and `max_turns` by default.
`MultiTurnEnv` includes built-in stop conditions for errors, prompt length limits, `max_turns`, and incomplete response detection by default.

Execution order can be controlled with `priority` (higher runs first). This is useful for checking cheap conditions before expensive ones:

Expand Down
4 changes: 2 additions & 2 deletions environments/wiki_search/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ name = "wiki-search"
description = "Agentic RAG over Wikipedia pages for trivia Q&A"
tags = ["wikipedia", "multi-turn", "agentic-search", "rag", "train", "eval", "llm-judge"]
requires-python = ">=3.11"
version = "0.1.23"
version = "0.1.24"
dependencies = [
"verifiers>=0.1.9",
"verifiers>=0.1.11.dev0",
"chromadb",
"datasets",
"openai",
Expand Down
5 changes: 4 additions & 1 deletion environments/wiki_search/wiki_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,10 @@ async def read_section(section_id: str) -> str:
)

async def judge_reward_func(judge, prompt, completion, answer, state) -> float:
    """Binary judge reward: 1.0 if the judge says "yes", else 0.0.

    Strips any chain-of-thought (everything up to the last ``</think>`` tag)
    from each message before passing the conversation to the judge, so the
    judge only sees the final visible content.

    Args:
        judge: Async callable ``(prompt, completion, answer, state) -> str``.
        prompt: The original prompt passed through to the judge.
        completion: List of chat messages, each a dict with "role" and
            "content" keys. ``content`` may be None for assistant messages
            that carry only tool calls.
        answer: Ground-truth answer passed through to the judge.
        state: Rollout state passed through to the judge.

    Returns:
        1.0 when the judge response contains "yes" (case-insensitive),
        otherwise 0.0.
    """
    # Build a proper list of message dicts (one per original message),
    # preserving the {"role": ..., "content": ...} structure the judge's
    # answer parser expects. ``x["content"] or ""`` guards against None
    # content on tool-call-only assistant messages, which would otherwise
    # raise AttributeError on .split().
    cleaned_completion = [
        {
            "role": x["role"],
            "content": (x["content"] or "").split("</think>")[-1],
        }
        for x in completion
    ]
    judge_response = await judge(prompt, cleaned_completion, answer, state)
    if "yes" in judge_response.lower():
        return 1.0
    else:
        return 0.0
Expand Down
8 changes: 6 additions & 2 deletions verifiers/envs/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,9 @@ def __init__(

if dataset is not None:
if callable(dataset):
self.dataset_source: DatasetBuilder | None = dataset
self.dataset_source: DatasetBuilder | None = cast(
DatasetBuilder, dataset
)
else:
self.dataset_source = lambda ds=dataset: ds
self.build_dataset() # Eagerly build for raw datasets (backwards compat)
Expand All @@ -165,7 +167,9 @@ def __init__(

if eval_dataset is not None:
if callable(eval_dataset):
self.eval_dataset_source: DatasetBuilder | None = eval_dataset
self.eval_dataset_source: DatasetBuilder | None = cast(
DatasetBuilder, eval_dataset
)
else:
self.eval_dataset_source = lambda ds=eval_dataset: ds
self.build_eval_dataset() # Eagerly build for raw datasets (backwards compat)
Expand Down
14 changes: 12 additions & 2 deletions verifiers/envs/multiturn_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ async def has_final_env_response(self, state: State) -> bool:
"""Check if env_response signaled termination via final_env_response."""
return state.get("final_env_response") is not None

@vf.stop
async def has_incomplete_response(self, state: State) -> bool:
    """Stop condition: fires once the rollout has flagged an incomplete model response."""
    flag = state.get("incomplete_response", False)
    return flag

async def setup_state(self, state: State) -> State:
    """Override to add environment-specific state fields.

    Base implementation is a no-op hook: it returns *state* unchanged.
    Subclasses may mutate or extend the dict before the rollout starts.
    """
    return state
Expand Down Expand Up @@ -121,9 +125,15 @@ async def add_model_response(
):
completion_messages = await parse_response_message(response)
tokens = await parse_response_tokens(response, self.max_seq_len)
has_content = bool(response.message.content)
has_tool_calls = bool(response.message.tool_calls)
if not has_content and not has_tool_calls:
state["incomplete_response"] = True
response_is_truncated = response.message.is_truncated or False
is_truncated = response_is_truncated or (
tokens is not None and bool(tokens.get("is_truncated"))
is_truncated = (
response_is_truncated
or (tokens is not None and bool(tokens.get("is_truncated")))
or state.get("incomplete_response", False)
)
trajectory_step = TrajectoryStep(
prompt=prompt_messages,
Expand Down
Loading