diff --git a/pyproject.toml b/pyproject.toml index ee5d06a..9552bec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scrapybara" -version = "2.1.1" +version = "2.1.2" description = "" readme = "README.md" authors = [] diff --git a/src/scrapybara/anthropic/__init__.py b/src/scrapybara/anthropic/__init__.py index b99c7c2..a3cfda2 100644 --- a/src/scrapybara/anthropic/__init__.py +++ b/src/scrapybara/anthropic/__init__.py @@ -14,6 +14,21 @@ # New: universal act API class Anthropic(Model): + """Model adapter for Anthropic. + + Supported models: + - claude-3-5-sonnet-20241022 (with computer use beta) + + If an API key is not provided, each call will cost 1 agent credit. + + Args: + name: Anthropic model name, defaults to "claude-3-5-sonnet-20241022" + api_key: Optional Anthropic API key + + Returns: + A Model configuration object + """ + provider: Literal["anthropic"] = Field(default="anthropic", frozen=True) def __init__( diff --git a/src/scrapybara/client.py b/src/scrapybara/client.py index 7db66c6..a4d8f3d 100644 --- a/src/scrapybara/client.py +++ b/src/scrapybara/client.py @@ -5,6 +5,8 @@ Dict, List, Sequence, + Type, + TypeVar, Union, Literal, Generator, @@ -16,6 +18,7 @@ import asyncio import httpx +from pydantic import BaseModel, ConfigDict from scrapybara.core.http_client import AsyncHttpClient, HttpClient from scrapybara.environment import ScrapybaraEnvironment @@ -43,8 +46,8 @@ StopInstanceResponse, ) from .types.act import ( - ActRequest, - ActResponse, + SingleActRequest, + SingleActResponse, Message, Model, TextPart, @@ -55,11 +58,38 @@ UserMessage, AssistantMessage, Step, + ActResponse, + TokenUsage, ) from .base_client import BaseClient, AsyncBaseClient from .instance.types import Action, Command OMIT = typing.cast(typing.Any, ...) 
+SchemaT = TypeVar("SchemaT", bound=BaseModel) + + +class StructuredOutputTool(Tool): + """A tool that allows the agent to output structured data.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + _model: Type[BaseModel] + + def __init__(self, model: Type[BaseModel]): + schema = model.model_json_schema() + super().__init__( + name="structured_output", + description="Output structured data according to the provided schema parameters. Only use this tool at the end of your task. The output data is final and will be passed directly back to the user.", + parameters={ + "type": "object", + "properties": schema.get("properties", {}), + "required": schema.get("required", []), + }, + ) + self._model = model + + def __call__(self, **kwargs: Any) -> Dict[str, Any]: + validated = self._model.model_validate(kwargs) + return validated.model_dump() class Browser: @@ -853,46 +883,56 @@ def act( self, *, model: Model, + tools: Optional[List[Tool]] = None, system: Optional[str] = None, prompt: Optional[str] = None, messages: Optional[List[Message]] = None, - tools: Optional[List[Tool]] = None, + schema: Optional[Type[SchemaT]] = None, on_step: Optional[Callable[[Step], None]] = None, temperature: Optional[float] = None, max_tokens: Optional[int] = None, request_options: Optional[RequestOptions] = None, - ) -> List[Message]: + ) -> ActResponse[SchemaT]: """ Run an agent loop with the given tools and model, returning all messages at the end. 
Args: - tools: List of tools available to the agent model: The model to use for generating responses + tools: List of tools available to the agent system: System prompt for the agent prompt: Initial user prompt messages: List of messages to start with + schema: Optional Pydantic model class to structure the final output on_step: Callback for each step of the conversation temperature: Optional temperature parameter for the model max_tokens: Optional max tokens parameter for the model request_options: Optional request configuration Returns: - List of all messages from the conversation + ActResponse containing all messages, steps, text, output (if schema is provided), and token usage """ result_messages: List[Message] = [] + steps: List[Step] = [] + total_prompt_tokens = 0 + total_completion_tokens = 0 + total_tokens = 0 + if messages: result_messages.extend(messages) + for step in self.act_stream( - tools=tools, model=model, + tools=tools, system=system, prompt=prompt, messages=messages, + schema=schema, + on_step=on_step, temperature=temperature, max_tokens=max_tokens, - on_step=on_step, request_options=request_options, ): + steps.append(step) assistant_msg = AssistantMessage( content=[TextPart(text=step.text)] + (step.tool_calls or []) ) @@ -900,16 +940,40 @@ def act( if step.tool_results: tool_msg = ToolMessage(content=step.tool_results) result_messages.append(tool_msg) - return result_messages + + if step.usage: + total_prompt_tokens += step.usage.prompt_tokens + total_completion_tokens += step.usage.completion_tokens + total_tokens += step.usage.total_tokens + + text = steps[-1].text if steps else None + output = None + if schema and steps and steps[-1].tool_results: + output = schema.model_validate( + steps[-1].tool_results[-1].result + ) + + usage = None + if total_tokens > 0: + usage = TokenUsage( + prompt_tokens=total_prompt_tokens, + completion_tokens=total_completion_tokens, + total_tokens=total_tokens, + ) + + return ActResponse( 
messages=result_messages, steps=steps, text=text, output=output, usage=usage + ) def act_stream( self, *, model: Model, + tools: Optional[List[Tool]] = None, system: Optional[str] = None, prompt: Optional[str] = None, messages: Optional[List[Message]] = None, - tools: Optional[List[Tool]] = None, + schema: Optional[Type[BaseModel]] = None, on_step: Optional[Callable[[Step], None]] = None, temperature: Optional[float] = None, max_tokens: Optional[int] = None, @@ -919,11 +983,12 @@ def act_stream( Run an interactive agent loop with the given tools and model. Args: - tools: List of tools available to the agent model: The model to use for generating responses + tools: List of tools available to the agent system: System prompt for the agent prompt: Initial user prompt messages: List of messages to start with + schema: Optional Pydantic model class to structure the final output on_step: Callback for each step of the conversation temperature: Optional temperature parameter for the model max_tokens: Optional max tokens parameter for the model @@ -942,8 +1007,11 @@ def act_stream( current_tools = [] if tools is None else list(tools) + if schema: + current_tools.append(StructuredOutputTool(schema)) + while True: - request = ActRequest( + request = SingleActRequest( model=model, system=system, messages=current_messages, @@ -963,7 +1031,7 @@ def act_stream( if not 200 <= response.status_code < 300: raise ApiError(status_code=response.status_code, body=response.json()) - act_response = ActResponse.model_validate(response.json()) + act_response = SingleActResponse.model_validate(response.json()) current_messages.append(act_response.message) # Extract text from assistant message @@ -988,14 +1056,17 @@ def act_stream( usage=act_response.usage, ) - # Check if we should continue the loop + # Check if there are tool calls has_tool_calls = bool(tool_calls) + has_structured_output = False if has_tool_calls: tool_results: List[ToolResultPart] = [] for part in tool_calls: tool = next(t 
for t in current_tools if t.name == part.tool_name) try: + if tool.name == "structured_output" and schema: + has_structured_output = True result = tool(**part.args) tool_results.append( ToolResultPart( @@ -1021,7 +1092,7 @@ def act_stream( on_step(step) yield step - if not has_tool_calls: + if not has_tool_calls or has_structured_output: break @@ -1117,16 +1188,17 @@ async def get_auth_states( async def act( self, *, + tools: Optional[List[Tool]] = None, model: Model, system: Optional[str] = None, prompt: Optional[str] = None, messages: Optional[List[Message]] = None, - tools: Optional[List[Tool]] = None, + schema: Optional[Type[SchemaT]] = None, on_step: Optional[Callable[[Step], None]] = None, temperature: Optional[float] = None, max_tokens: Optional[int] = None, request_options: Optional[RequestOptions] = None, - ) -> List[Message]: + ) -> ActResponse[SchemaT]: """ Run an agent loop with the given tools and model, returning all messages at the end. @@ -1136,28 +1208,37 @@ async def act( system: System prompt for the agent prompt: Initial user prompt messages: List of messages to start with + schema: Optional Pydantic model class to structure the final output on_step: Callback for each step of the conversation temperature: Optional temperature parameter for the model max_tokens: Optional max tokens parameter for the model request_options: Optional request configuration Returns: - List of all messages from the conversation + ActResponse containing all messages, steps, text, output (if schema is provided), and token usage """ result_messages: List[Message] = [] + steps: List[Step] = [] + total_prompt_tokens = 0 + total_completion_tokens = 0 + total_tokens = 0 + if messages: result_messages.extend(messages) + async for step in self.act_stream( tools=tools, model=model, system=system, prompt=prompt, messages=messages, + schema=schema, temperature=temperature, max_tokens=max_tokens, on_step=on_step, request_options=request_options, ): + steps.append(step) 
assistant_msg = AssistantMessage( content=[TextPart(text=step.text)] + (step.tool_calls or []) ) @@ -1165,16 +1246,40 @@ async def act( if step.tool_results: tool_msg = ToolMessage(content=step.tool_results) result_messages.append(tool_msg) - return result_messages + + if step.usage: + total_prompt_tokens += step.usage.prompt_tokens + total_completion_tokens += step.usage.completion_tokens + total_tokens += step.usage.total_tokens + + text = steps[-1].text if steps else None + output = None + if schema and steps and steps[-1].tool_results: + output = schema.model_validate( + steps[-1].tool_results[-1].result + ) + + usage = None + if total_tokens > 0: + usage = TokenUsage( + prompt_tokens=total_prompt_tokens, + completion_tokens=total_completion_tokens, + total_tokens=total_tokens, + ) + + return ActResponse( + messages=result_messages, steps=steps, text=text, output=output, usage=usage + ) async def act_stream( self, *, model: Model, + tools: Optional[List[Tool]] = None, system: Optional[str] = None, prompt: Optional[str] = None, messages: Optional[List[Message]] = None, - tools: Optional[List[Tool]] = None, + schema: Optional[Type[SchemaT]] = None, on_step: Optional[Callable[[Step], None]] = None, temperature: Optional[float] = None, max_tokens: Optional[int] = None, @@ -1184,11 +1289,12 @@ async def act_stream( """ Run an interactive agent loop with the given tools and model. 
Args: - tools: List of tools available to the agent model: The model to use for generating responses + tools: List of tools available to the agent system: System prompt for the agent prompt: Initial user prompt messages: List of messages to start with + schema: Optional Pydantic model class to structure the final output on_step: Callback for each step of the conversation temperature: Optional temperature parameter for the model max_tokens: Optional max tokens parameter for the model @@ -1208,7 +1314,10 @@ async def act_stream( current_tools = [] if tools is None else list(tools) + if schema: + current_tools.append(StructuredOutputTool(schema)) + while True: - request = ActRequest( + request = SingleActRequest( model=model, system=system, messages=current_messages, @@ -1228,7 +1334,7 @@ async def act_stream( if not 200 <= response.status_code < 300: raise ApiError(status_code=response.status_code, body=response.json()) - act_response = ActResponse.model_validate(response.json()) + act_response = SingleActResponse.model_validate(response.json()) current_messages.append(act_response.message) # Extract text from assistant message @@ -1253,14 +1359,17 @@ async def act_stream( usage=act_response.usage, ) - # Check if we should continue the loop + # Check if there are tool calls has_tool_calls = bool(tool_calls) + has_structured_output = False if has_tool_calls: tool_results: List[ToolResultPart] = [] for part in tool_calls: tool = next(t for t in current_tools if t.name == part.tool_name) try: + if tool.name == "structured_output" and schema: + has_structured_output = True loop = asyncio.get_event_loop() result = await loop.run_in_executor( None, lambda: tool(**part.args) @@ -1289,5 +1398,5 @@ async def act_stream( on_step(step) yield step - if not has_tool_calls: + if not has_tool_calls or has_structured_output: break diff --git a/src/scrapybara/core/client_wrapper.py b/src/scrapybara/core/client_wrapper.py index a2eb2f4..e3442ae 100644 --- a/src/scrapybara/core/client_wrapper.py +++ b/src/scrapybara/core/client_wrapper.py @@ -7,7 +9
@@ class BaseClientWrapper: - def __init__(self, *, api_key: str, base_url: str, timeout: typing.Optional[float] = None): + def __init__( + self, *, api_key: str, base_url: str, timeout: typing.Optional[float] = None + ): self.api_key = api_key self._base_url = base_url self._timeout = timeout @@ -16,7 +18,7 @@ def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { "X-Fern-Language": "Python", "X-Fern-SDK-Name": "scrapybara", - "X-Fern-SDK-Version": "2.1.1", + "X-Fern-SDK-Version": "2.1.2", } headers["x-api-key"] = self.api_key return headers @@ -30,7 +32,12 @@ def get_timeout(self) -> typing.Optional[float]: class SyncClientWrapper(BaseClientWrapper): def __init__( - self, *, api_key: str, base_url: str, timeout: typing.Optional[float] = None, httpx_client: httpx.Client + self, + *, + api_key: str, + base_url: str, + timeout: typing.Optional[float] = None, + httpx_client: httpx.Client ): super().__init__(api_key=api_key, base_url=base_url, timeout=timeout) self.httpx_client = HttpClient( @@ -43,7 +50,12 @@ def __init__( class AsyncClientWrapper(BaseClientWrapper): def __init__( - self, *, api_key: str, base_url: str, timeout: typing.Optional[float] = None, httpx_client: httpx.AsyncClient + self, + *, + api_key: str, + base_url: str, + timeout: typing.Optional[float] = None, + httpx_client: httpx.AsyncClient ): super().__init__(api_key=api_key, base_url=base_url, timeout=timeout) self.httpx_client = AsyncHttpClient( diff --git a/src/scrapybara/tools/__init__.py b/src/scrapybara/tools/__init__.py index 6ceb10b..f31379b 100644 --- a/src/scrapybara/tools/__init__.py +++ b/src/scrapybara/tools/__init__.py @@ -7,6 +7,18 @@ from ..client import Instance +def image_result(base64: str) -> str: + """Return an image result that is interpretable by the model.""" + return json.dumps( + { + "output": "", + "error": "", + "base64_image": base64, + "system": None, + } + ) + + class ComputerTool(Tool): """A computer interaction tool that allows the 
agent to control mouse and keyboard.""" @@ -205,15 +217,3 @@ def __call__(self, **kwargs: Any) -> Any: finally: browser.close() - - - def image_result(base64: str) -> str: - """Return an image result that is interpretable by the model.""" - return json.dumps( - { - "output": "", - "error": "", - "base64_image": base64, - "system": None, - } - ) diff --git a/src/scrapybara/types/act.py b/src/scrapybara/types/act.py index c70899b..6d7c052 100644 --- a/src/scrapybara/types/act.py +++ b/src/scrapybara/types/act.py @@ -1,7 +1,9 @@ -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union, Generic, TypeVar from pydantic import BaseModel from .tool import Tool +OutputT = TypeVar("OutputT") + # Message part types class TextPart(BaseModel): @@ -55,7 +57,7 @@ class Model(BaseModel): api_key: Optional[str] = None -class ActRequest(BaseModel): +class SingleActRequest(BaseModel): model: Model system: Optional[str] = None messages: Optional[List[Message]] = None @@ -70,7 +72,7 @@ class TokenUsage(BaseModel): total_tokens: int -class ActResponse(BaseModel): +class SingleActResponse(BaseModel): message: AssistantMessage finish_reason: Literal[ "stop", "length", "content-filter", "tool-calls", "error", "other", "unknown" ] @@ -95,3 +97,12 @@ class Step(BaseModel): ] ] = None usage: Optional[TokenUsage] = None + + +# Act response +class ActResponse(BaseModel, Generic[OutputT]): + messages: List[Message] + steps: List[Step] + text: Optional[str] = None + output: Optional[OutputT] = None + usage: Optional[TokenUsage] = None diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py index 70f026a..4747ba0 100644 --- a/tests/custom/test_client.py +++ b/tests/custom/test_client.py @@ -1,3 +1,4 @@ +from pydantic import BaseModel from scrapybara import Scrapybara import os @@ -20,19 +21,27 @@ def test_client() -> None: cdp_url = instance.browser.get_cdp_url() assert cdp_url is not None - messages = client.act( + class
YCStats(BaseModel): + number_of_startups: int + combined_valuation: int + + response = client.act( model=Anthropic(), system=SYSTEM_PROMPT, - prompt="Go to the YC website and fetch the HTML", + prompt="Go to the YC website and get the number of funded startups and combined valuation", tools=[ ComputerTool(instance), BashTool(instance), EditTool(instance), BrowserTool(instance), ], - on_step=lambda step: print(f"{step}\n"), + schema=YCStats, ) - assert len(messages) > 0 + print(response) + + assert response.output is not None + assert response.output.number_of_startups is not None + assert response.output.combined_valuation is not None instance.browser.stop() instance.stop()