diff --git a/src/scrapybara/client.py b/src/scrapybara/client.py
index a4d8f3d..ca64fc2 100644
--- a/src/scrapybara/client.py
+++ b/src/scrapybara/client.py
@@ -52,6 +52,7 @@
     Model,
     TextPart,
     Tool,
+    ApiTool,
     ToolCallPart,
     ToolMessage,
     ToolResultPart,
@@ -75,15 +76,10 @@ class StructuredOutputTool(Tool):
     _model: Type[BaseModel]
 
     def __init__(self, model: Type[BaseModel]):
-        schema = model.model_json_schema()
         super().__init__(
             name="structured_output",
             description="Output structured data according to the provided schema parameters. Only use this tool at the end of your task. The output data is final and will be passed directly back to the user.",
-            parameters={
-                "type": "object",
-                "properties": schema.get("properties", {}),
-                "required": schema.get("required", []),
-            },
+            parameters=model,
         )
         self._model = model
 
@@ -1011,11 +1007,14 @@ def act_stream(
             current_tools.append(StructuredOutputTool(schema))
 
         while True:
+            # Convert tools to ApiTools
+            api_tools = [ApiTool.from_tool(tool) for tool in current_tools]
+
             request = SingleActRequest(
                 model=model,
                 system=system,
                 messages=current_messages,
-                tools=current_tools,
+                tools=api_tools,
                 temperature=temperature,
                 max_tokens=max_tokens,
             )
@@ -1313,12 +1312,18 @@ async def act_stream(
 
         current_tools = [] if tools is None else list(tools)
 
+        if schema:
+            current_tools.append(StructuredOutputTool(schema))
+
         while True:
+            # Convert tools to ApiTools
+            api_tools = [ApiTool.from_tool(tool) for tool in current_tools]
+
             request = SingleActRequest(
                 model=model,
                 system=system,
                 messages=current_messages,
-                tools=current_tools,
+                tools=api_tools,
                 temperature=temperature,
                 max_tokens=max_tokens,
             )
diff --git a/src/scrapybara/tools/__init__.py b/src/scrapybara/tools/__init__.py
index f31379b..967270b 100644
--- a/src/scrapybara/tools/__init__.py
+++ b/src/scrapybara/tools/__init__.py
@@ -1,10 +1,22 @@
 import base64
 import json
-from typing import Any
+from typing import Any, Literal, Optional, Sequence, Tuple
+from pydantic import BaseModel, Field
 from playwright.sync_api import sync_playwright
 
 from ..types.tool import Tool
 from ..client import Instance
+from ..instance.types import Action, Command
+
+
+class ComputerToolParameters(BaseModel):
+    """Parameters for computer interaction commands."""
+
+    action: Action = Field(description="The computer action to execute")
+    coordinate: Optional[Sequence[int]] = Field(
+        None, description="Coordinates for mouse actions"
+    )
+    text: Optional[str] = Field(None, description="Text for keyboard actions")
 
 
 def image_result(base64: str) -> str:
@@ -27,21 +39,40 @@ class ComputerTool(Tool):
     def __init__(self, instance: Instance) -> None:
         super().__init__(
             name="computer",
+            description="Control mouse and keyboard for computer interaction",
+            parameters=ComputerToolParameters,
         )
         self._instance = instance
 
     def __call__(self, **kwargs: Any) -> Any:
-        action = kwargs.pop("action")
-        coordinate = kwargs.pop("coordinate", None)
-        text = kwargs.pop("text", None)
-
+        params = ComputerToolParameters.model_validate(kwargs)
         return self._instance.computer(
-            action=action,
-            coordinate=tuple(coordinate) if coordinate else None,
-            text=text,
+            action=params.action,
+            coordinate=tuple(params.coordinate) if params.coordinate else None,
+            text=params.text,
         )
 
 
+class EditToolParameters(BaseModel):
+    """Parameters for file editing commands."""
+
+    command: Command = Field(description="The edit command to execute")
+    path: str = Field(description="Path to the file to edit")
+    file_text: Optional[str] = Field(
+        None, description="File content for create command"
+    )
+    view_range: Optional[Tuple[int, int]] = Field(
+        None, description="Line range for view command"
+    )
+    old_str: Optional[str] = Field(
+        None, description="String to replace for replace command"
+    )
+    new_str: Optional[str] = Field(None, description="New string for replace command")
+    insert_line: Optional[int] = Field(
+        None, description="Line number for insert command"
+    )
+
+
 class EditTool(Tool):
     """A filesystem editor tool that allows the agent to view, create, and edit files."""
 
@@ -50,29 +81,31 @@ class EditTool(Tool):
     def __init__(self, instance: Instance) -> None:
         super().__init__(
             name="str_replace_editor",
+            description="View, create, and edit files in the filesystem",
+            parameters=EditToolParameters,
         )
         self._instance = instance
 
     def __call__(self, **kwargs: Any) -> Any:
-        command = kwargs.pop("command")
-        path = kwargs.pop("path")
-        file_text = kwargs.pop("file_text", None)
-        view_range = kwargs.pop("view_range", None)
-        old_str = kwargs.pop("old_str", None)
-        new_str = kwargs.pop("new_str", None)
-        insert_line = kwargs.pop("insert_line", None)
-
+        params = EditToolParameters.model_validate(kwargs)
         return self._instance.edit(
-            command=command,
-            path=path,
-            file_text=file_text,
-            view_range=view_range,
-            old_str=old_str,
-            new_str=new_str,
-            insert_line=insert_line,
+            command=params.command,
+            path=params.path,
+            file_text=params.file_text,
+            view_range=params.view_range,
+            old_str=params.old_str,
+            new_str=params.new_str,
+            insert_line=params.insert_line,
         )
 
 
+class BashToolParameters(BaseModel):
+    """Parameters for bash command execution."""
+
+    command: str = Field(description="The bash command to execute")
+    restart: Optional[bool] = Field(False, description="Whether to restart the shell")
+
+
 class BashTool(Tool):
     """A shell execution tool that allows the agent to run bash commands."""
 
@@ -81,14 +114,59 @@ class BashTool(Tool):
     def __init__(self, instance: Instance) -> None:
         super().__init__(
             name="bash",
+            description="Execute bash commands in the shell",
+            parameters=BashToolParameters,
         )
         self._instance = instance
 
     def __call__(self, **kwargs: Any) -> Any:
-        command = kwargs.pop("command")
-        restart = kwargs.pop("restart", False)
-
-        return self._instance.bash(command=command, restart=restart)
+        params = BashToolParameters.model_validate(kwargs)
+        return self._instance.bash(command=params.command, restart=params.restart)
+
+
+class BrowserToolParameters(BaseModel):
+    """Parameters for browser interaction commands."""
+
+    command: Literal[
+        "go_to",  # Navigate to a URL
+        "get_html",  # Get current page HTML
+        "evaluate",  # Run JavaScript code
+        "click",  # Click on an element
+        "type",  # Type into an element
+        "screenshot",  # Take a screenshot
+        "get_text",  # Get text content of element
+        "get_attribute",  # Get attribute of element
+    ] = Field(
+        description="The browser command to execute. Required parameters per command:\n"
+        "- go_to: requires 'url'\n"
+        "- evaluate: requires 'code'\n"
+        "- click: requires 'selector'\n"
+        "- type: requires 'selector' and 'text'\n"
+        "- get_text: requires 'selector'\n"
+        "- get_attribute: requires 'selector' and 'attribute'\n"
+        "- get_html: no additional parameters\n"
+        "- screenshot: no additional parameters"
+    )
+    url: Optional[str] = Field(
+        None, description="URL for go_to command (required for go_to)"
+    )
+    selector: Optional[str] = Field(
+        None,
+        description="CSS selector for element operations (required for click, type, get_text, get_attribute)",
+    )
+    code: Optional[str] = Field(
+        None, description="JavaScript code for evaluate command (required for evaluate)"
+    )
+    text: Optional[str] = Field(
+        None, description="Text to type for type command (required for type)"
+    )
+    timeout: Optional[int] = Field(
+        30000, description="Timeout in milliseconds for operations"
+    )
+    attribute: Optional[str] = Field(
+        None,
+        description="Attribute name for get_attribute command (required for get_attribute)",
+    )
 
 
 class BrowserTool(Tool):
@@ -100,62 +178,20 @@ def __init__(self, instance: Instance) -> None:
         super().__init__(
             name="browser",
             description="Interact with a browser for web scraping and automation",
-            parameters={
-                "type": "object",
-                "properties": {
-                    "command": {
-                        "type": "string",
-                        "enum": [
-                            "go_to",  # Navigate to a URL
-                            "get_html",  # Get current page HTML
-                            "evaluate",  # Run JavaScript code
-                            "click",  # Click on an element
-                            "type",  # Type into an element
-                            "screenshot",  # Take a screenshot
-                            "get_text",  # Get text content of element
-                            "get_attribute",  # Get attribute of element
-                        ],
-                        "description": "The browser command to execute. Required parameters per command:\n- go_to: requires 'url'\n- evaluate: requires 'code'\n- click: requires 'selector'\n- type: requires 'selector' and 'text'\n- get_text: requires 'selector'\n- get_attribute: requires 'selector' and 'attribute'\n- get_html: no additional parameters\n- screenshot: no additional parameters",
-                    },
-                    "url": {
-                        "type": "string",
-                        "description": "URL for go_to command (required for go_to)",
-                    },
-                    "selector": {
-                        "type": "string",
-                        "description": "CSS selector for element operations (required for click, type, get_text, get_attribute)",
-                    },
-                    "code": {
-                        "type": "string",
-                        "description": "JavaScript code for evaluate command (required for evaluate)",
-                    },
-                    "text": {
-                        "type": "string",
-                        "description": "Text to type for type command (required for type)",
-                    },
-                    "timeout": {
-                        "type": "integer",
-                        "description": "Timeout in milliseconds for operations",
-                        "default": 30000,
-                    },
-                    "attribute": {
-                        "type": "string",
-                        "description": "Attribute name for get_attribute command (required for get_attribute)",
-                    },
-                },
-                "required": ["command"],
-            },
+            parameters=BrowserToolParameters,
         )
         self._instance = instance
 
     def __call__(self, **kwargs: Any) -> Any:
-        command = kwargs.pop("command")
-        url = kwargs.pop("url", None)
-        selector = kwargs.pop("selector", None)
-        code = kwargs.pop("code", None)
-        text = kwargs.pop("text", None)
-        timeout = kwargs.pop("timeout", 30000)
-        attribute = kwargs.pop("attribute", None)
+        params = BrowserToolParameters.model_validate(kwargs)
+        command = params.command
+        url = params.url
+        selector = params.selector
+        code = params.code
+        text = params.text
+        # Playwright treats 0 as "no timeout", so only fall back when unset.
+        timeout = params.timeout if params.timeout is not None else 30000
+        attribute = params.attribute
 
         cdp_url = self._instance.browser.get_cdp_url().cdp_url
         if cdp_url is None:
@@ -171,6 +207,8 @@ def __call__(self, **kwargs: Any) -> Any:
 
             try:
                 if command == "go_to":
+                    if not url:
+                        raise ValueError("URL is required for go_to command")
                     page.goto(url, timeout=timeout)
                     return True
 
@@ -182,13 +220,21 @@ def __call__(self, **kwargs: Any) -> Any:
                     return page.evaluate("() => document.documentElement.innerHTML")
 
                 elif command == "evaluate":
+                    if not code:
+                        raise ValueError("Code is required for evaluate command")
                     return page.evaluate(code)
 
                 elif command == "click":
+                    if not selector:
+                        raise ValueError("Selector is required for click command")
                     page.click(selector, timeout=timeout)
                     return True
 
                 elif command == "type":
+                    if not selector:
+                        raise ValueError("Selector is required for type command")
+                    if text is None:
+                        raise ValueError("Text is required for type command")
                     page.type(selector, text, timeout=timeout)
                     return True
 
@@ -198,12 +244,22 @@ def __call__(self, **kwargs: Any) -> Any:
                     )
 
                 elif command == "get_text":
+                    if not selector:
+                        raise ValueError("Selector is required for get_text command")
                     element = page.wait_for_selector(selector, timeout=timeout)
                     if element is None:
                         raise ValueError(f"Element not found: {selector}")
                     return element.text_content()
 
                 elif command == "get_attribute":
+                    if not selector:
+                        raise ValueError(
+                            "Selector is required for get_attribute command"
+                        )
+                    if not attribute:
+                        raise ValueError(
+                            "Attribute is required for get_attribute command"
+                        )
                     element = page.wait_for_selector(selector, timeout=timeout)
                     if element is None:
                         raise ValueError(f"Element not found: {selector}")
diff --git a/src/scrapybara/types/act.py b/src/scrapybara/types/act.py
index 6d7c052..56c44da 100644
--- a/src/scrapybara/types/act.py
+++ b/src/scrapybara/types/act.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, List, Literal, Optional, Union, Generic, TypeVar
 from pydantic import BaseModel
-from .tool import Tool
+from .tool import Tool, ApiTool  # noqa: F401
 
 OutputT = TypeVar("OutputT")
 
@@ -61,7 +61,7 @@ class SingleActRequest(BaseModel):
     model: Model
     system: Optional[str] = None
     messages: Optional[List[Message]] = None
-    tools: Optional[List[Tool]] = None
+    tools: Optional[List[ApiTool]] = None
     temperature: Optional[float] = None
     max_tokens: Optional[int] = None
 
diff --git a/src/scrapybara/types/tool.py b/src/scrapybara/types/tool.py
index 0a79b94..ad342f4 100644
--- a/src/scrapybara/types/tool.py
+++ b/src/scrapybara/types/tool.py
@@ -1,11 +1,32 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Type
 from pydantic import BaseModel
 
 
 class Tool(BaseModel):
     name: str
     description: Optional[str] = None
-    parameters: Optional[Dict[str, Any]] = None
+    parameters: Optional[Type[BaseModel]] = None
 
     def __call__(self, **kwargs: Any) -> Any:
+        """Execute the tool with the given arguments.
+
+        The kwargs type will be inferred from the parameters field's type hints.
+        """
         raise NotImplementedError("Tool.__call__ must be implemented by subclasses")
+
+
+class ApiTool(BaseModel):
+    """A tool that can be serialized to JSON for API calls."""
+
+    name: str
+    description: Optional[str] = None
+    parameters: Optional[Dict[str, Any]] = None
+
+    @classmethod
+    def from_tool(cls, tool: Tool) -> "ApiTool":
+        """Convert a Tool to an ApiTool for API serialization."""
+        return cls(
+            name=tool.name,
+            description=tool.description,
+            parameters=tool.parameters.model_json_schema() if tool.parameters else None,
+        )