Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions tests/test_browser_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pytest
from unittest.mock import MagicMock, patch
from datasets import Dataset
import verifiers as vf

# Skip all tests in this module if browser dependencies are not installed
pytest.importorskip("stagehand", reason="verifiers[browser] extra not installed")
Expand Down Expand Up @@ -299,6 +300,75 @@ def test_filter_screenshots_fewer_than_limit(self):
assert filtered[1]["content"][0]["type"] == "image_url"




class TestCUAModeSetupErrors:
    """Checks how CUAMode.setup_state reports failures during setup."""

    def test_local_setup_errors_wrapped_as_vf_error(self):
        """A RuntimeError from local session creation surfaces as BrowserSessionSetupError."""
        import asyncio

        from verifiers.envs.integrations.browser_env.modes.cua_mode import (
            BrowserSessionSetupError,
            CUAMode,
        )

        class PassthroughAttempt:
            """Context manager that never suppresses what is raised inside it."""

            def __enter__(self):
                return None

            def __exit__(self, exc_type, exc, tb):
                # Returning False lets the exception propagate to the caller.
                return False

        class OneShotRetry:
            """Async iterator that hands out exactly one attempt, then stops."""

            def __init__(self):
                self._exhausted = False

            def __aiter__(self):
                return self

            async def __anext__(self):
                if not self._exhausted:
                    self._exhausted = True
                    return PassthroughAttempt()
                raise StopAsyncIteration

        async def raise_boom():
            raise RuntimeError("boom")

        cua = CUAMode(execution_mode="local")
        # Replace the retry policy and the HTTP call so the very first attempt
        # fails immediately and nothing is retried.
        cua.retrying = OneShotRetry()
        cua._create_session_http = raise_boom  # type: ignore[method-assign]

        with pytest.raises(BrowserSessionSetupError):
            asyncio.run(cua.setup_state({}))


class TestBrowserEnvStopErrors:
    """Checks the stop_errors configuration that BrowserEnv ends up with."""

    def test_cua_defaults_stop_errors_to_sandbox_error(self):
        """With no explicit stop_errors, CUA mode includes vf.SandboxError."""
        from verifiers.envs.integrations.browser_env.browser_env import BrowserEnv

        verify_target = "verifiers.envs.integrations.browser_env.modes.cua_mode.CUAMode.verify_server_connection"
        # clear=True replaces the whole environment for the duration of the test.
        fake_environ = patch.dict(os.environ, {"BROWSERBASE_API_KEY": "test"}, clear=True)
        rows = {"question": ["test"], "answer": ["test"]}

        with fake_environ, patch(verify_target):
            env = BrowserEnv(
                mode="cua",
                use_sandbox=False,
                env="LOCAL",
                dataset=Dataset.from_dict(rows),
            )
            # Identity check: the exact class must be present, not a subclass.
            assert any(candidate is vf.SandboxError for candidate in env.stop_errors)


class TestCUAModeResponseFormat:
"""Tests for response formatting in CUAMode."""

Expand Down
3 changes: 3 additions & 0 deletions verifiers/envs/integrations/browser_env/browser_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ def __init__(
prebuilt_image: Docker image to use (default: deepdream19/cua-server:latest)
**kwargs: Additional arguments passed to StatefulToolEnv
"""
if mode == "cua" and "stop_errors" not in kwargs:
kwargs["stop_errors"] = [vf.SandboxError]

super().__init__(**kwargs)
self.mode = mode
browserbase_api_key = os.getenv(browserbase_api_key_var)
Expand Down
109 changes: 62 additions & 47 deletions verifiers/envs/integrations/browser_env/modes/cua_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@
APIClient = None # type: ignore[misc, assignment]


class BrowserSessionSetupError(vf.Error):
    """Raised by CUAMode.setup_state when setup fails outside sandbox mode."""


class BrowserSandboxSetupError(vf.SandboxError):
    """Raised by CUAMode.setup_state when setup fails in sandbox mode."""


class CUAMode:
"""
CUA-based browser mode supporting both local HTTP and sandbox execution.
Expand Down Expand Up @@ -801,62 +807,71 @@ def filter_screenshots_in_messages(self, messages: list) -> list:

async def setup_state(self, state: vf.State, **kwargs: Any) -> vf.State:
    """Create a browser session (and sandbox if in sandbox mode).

    In local mode the session is created over HTTP. Otherwise a sandbox is
    created first, the server inside it is brought up, and the session is
    created through the sandbox.

    Args:
        state: Rollout state to populate. On success ``"session_id"`` and
            ``"browser_state"`` are set; in sandbox mode ``"cua_sandbox_id"``
            is set as soon as the sandbox reports ready.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The same ``state`` object, updated in place.

    Raises:
        vf.Error: Re-raised unchanged if raised anywhere during setup.
        BrowserSandboxSetupError: Any other exception while the execution
            mode is ``"sandbox"``.
        BrowserSessionSetupError: Any other exception in every other mode.
    """
    try:
        if self._execution_mode == "local":
            # Local mode: create session via HTTP
            async for attempt in self.retrying:  # type: ignore[union-attr]
                with attempt:
                    result = await self._create_session_http()
                    session_id = result.get("sessionId")
                    if not session_id:
                        raise RuntimeError("Failed to get session ID from server response")

            with self._sessions_lock:
                self.active_sessions.add(session_id)

            state["session_id"] = session_id
            state["browser_state"] = result.get("state", {})
        else:
            # Sandbox mode: create sandbox, set up server, create session
            if self.use_prebuilt_image:
                if self.logger:
                    self.logger.debug(
                        f"Using prebuilt image: {self.prebuilt_image}"
                    )

                # Prebuilt image: no upload or start step, only wait for the server.
                async for attempt in self.retrying:  # type: ignore[union-attr]
                    with attempt:
                        sandbox_id = await self._create_sandbox()
                        await self._wait_for_sandbox_ready(sandbox_id)
                        state["cua_sandbox_id"] = sandbox_id
                        await self._wait_for_server(sandbox_id)
            else:
                if self.use_binary:
                    await self._ensure_binary_exists()

                async for attempt in self.retrying:  # type: ignore[union-attr]
                    with attempt:
                        sandbox_id = await self._create_sandbox()
                        await self._wait_for_sandbox_ready(sandbox_id)
                        state["cua_sandbox_id"] = sandbox_id
                        await self._upload_server_files(sandbox_id)
                        await self._start_server(sandbox_id)
                        await self._wait_for_server(sandbox_id)

            async for attempt in self.retrying:  # type: ignore[union-attr]
                with attempt:
                    result = await self._create_session_curl(sandbox_id)
                    session_id = result.get("sessionId")
                    if not session_id:
                        raise RuntimeError(
                            f"Failed to get session ID from server response. "
                            f"Response keys: {list(result.keys())}, Response: {str(result)[:500]}"
                        )

            with self._sessions_lock:
                self.active_sessions.add(session_id)

            state["session_id"] = session_id
            state["browser_state"] = result.get("state", {})

        return state
    except vf.Error:
        # Already a framework error: pass it through without re-wrapping.
        raise
    except Exception as e:
        # Wrap everything else so callers only ever see vf.Error subclasses;
        # the original exception stays attached as __cause__.
        if self._execution_mode == "sandbox":
            raise BrowserSandboxSetupError(e) from e
        raise BrowserSessionSetupError(e) from e

def update_tool_args(
self,
Expand Down
Loading