Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions src/scrapybara/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
Model,
TextPart,
Tool,
ApiTool,
ToolCallPart,
ToolMessage,
ToolResultPart,
Expand All @@ -75,15 +76,10 @@ class StructuredOutputTool(Tool):
_model: Type[BaseModel]

def __init__(self, model: Type[BaseModel]):
    """Create the structured-output tool for ``model``.

    The model class is passed to the base ``Tool`` as ``parameters`` and is
    also stored on ``self._model``.

    Args:
        model: Pydantic model class describing the structured output.
    """
    # ``parameters`` must be given exactly once: the model class is handed
    # over as-is instead of a hand-built {"type": "object", ...} dict.
    super().__init__(
        name="structured_output",
        description="Output structured data according to the provided schema parameters. Only use this tool at the end of your task. The output data is final and will be passed directly back to the user.",
        parameters=model,
    )
    self._model = model

Expand Down Expand Up @@ -1011,11 +1007,14 @@ def act_stream(
current_tools.append(StructuredOutputTool(schema))

while True:
# Convert tools to ApiTools
api_tools = [ApiTool.from_tool(tool) for tool in current_tools]

request = SingleActRequest(
model=model,
system=system,
messages=current_messages,
tools=current_tools,
tools=api_tools,
temperature=temperature,
max_tokens=max_tokens,
)
Expand Down Expand Up @@ -1313,12 +1312,18 @@ async def act_stream(

current_tools = [] if tools is None else list(tools)

if schema:
current_tools.append(StructuredOutputTool(schema))

while True:
# Convert tools to ApiTools
api_tools = [ApiTool.from_tool(tool) for tool in current_tools]

request = SingleActRequest(
model=model,
system=system,
messages=current_messages,
tools=current_tools,
tools=api_tools,
temperature=temperature,
max_tokens=max_tokens,
)
Expand Down
225 changes: 146 additions & 79 deletions src/scrapybara/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,34 @@
import base64
import json
from typing import Any
from typing import Any, Literal, Optional, Sequence, Tuple
from pydantic import BaseModel, Field
from playwright.sync_api import sync_playwright

from ..types.tool import Tool
from ..client import Instance
from ..instance.types import Action, Command


def image_result(base64: str) -> str:
    """Wrap an image payload in the JSON result envelope read by the model.

    Args:
        base64: Value stored under the ``"base64_image"`` key (presumably a
            base64-encoded image). The parameter name shadows the ``base64``
            module inside this function only.

    Returns:
        A JSON string with the keys ``output``, ``error``, ``base64_image``
        and ``system``; every key except ``base64_image`` holds a fixed
        placeholder value.
    """
    envelope = {
        "output": "",
        "error": "",
        "base64_image": base64,
        "system": None,
    }
    return json.dumps(envelope)


class ComputerToolParameters(BaseModel):
    """Parameters for computer interaction commands."""

    # NOTE: the class docstring above is left untouched on purpose; pydantic
    # uses a model's docstring as the description of its JSON schema.

    # Required: there is no default, so validation fails when it is missing.
    action: Action = Field(description="The computer action to execute")

    # Optional inputs; both fall back to None when the caller omits them.
    coordinate: Optional[Sequence[int]] = Field(
        default=None,
        description="Coordinates for mouse actions",
    )
    text: Optional[str] = Field(
        default=None,
        description="Text for keyboard actions",
    )


def image_result(base64: str) -> str:
Expand All @@ -27,21 +51,40 @@ class ComputerTool(Tool):
def __init__(self, instance: Instance) -> None:
    """Register the tool under the name ``computer`` and bind ``instance``.

    Args:
        instance: Object whose ``computer(...)`` method ``__call__`` invokes.
    """
    super().__init__(
        parameters=ComputerToolParameters,
        description="Control mouse and keyboard for computer interaction",
        name="computer",
    )
    # Assigned after the base initializer has run, as in the original order.
    self._instance = instance

def __call__(self, **kwargs: Any) -> Any:
    """Validate the tool-call arguments and run the computer action.

    Args:
        **kwargs: Raw tool-call arguments, validated against
            ``ComputerToolParameters``.

    Returns:
        Whatever ``self._instance.computer`` returns.

    Raises:
        pydantic.ValidationError: If ``kwargs`` does not satisfy
            ``ComputerToolParameters`` (for example, ``action`` is missing).
    """
    # Validate the untouched kwargs. Popping keys beforehand would hand
    # model_validate an emptied dict that lacks the required "action".
    params = ComputerToolParameters.model_validate(kwargs)

    # Convert the validated sequence to a tuple; a missing or empty
    # coordinate is forwarded as None.
    coordinate = tuple(params.coordinate) if params.coordinate else None

    return self._instance.computer(
        action=params.action,
        coordinate=coordinate,
        text=params.text,
    )


class EditToolParameters(BaseModel):
    """Parameters for file editing commands."""

    # NOTE: the class docstring above is left untouched on purpose; pydantic
    # uses a model's docstring as the description of its JSON schema.

    # Required fields (no default).
    command: Command = Field(description="The edit command to execute")
    path: str = Field(description="Path to the file to edit")

    # Optional, command-specific fields; each defaults to None.
    file_text: Optional[str] = Field(
        default=None,
        description="File content for create command",
    )
    view_range: Optional[Tuple[int, int]] = Field(
        default=None,
        description="Line range for view command",
    )
    old_str: Optional[str] = Field(
        default=None,
        description="String to replace for replace command",
    )
    new_str: Optional[str] = Field(
        default=None,
        description="New string for replace command",
    )
    insert_line: Optional[int] = Field(
        default=None,
        description="Line number for insert command",
    )


class EditTool(Tool):
"""A filesystem editor tool that allows the agent to view, create, and edit files."""

Expand All @@ -50,29 +93,31 @@ class EditTool(Tool):
def __init__(self, instance: Instance) -> None:
    """Register the tool as ``str_replace_editor`` and bind ``instance``.

    Args:
        instance: Object whose ``edit(...)`` method ``__call__`` invokes.
    """
    super().__init__(
        parameters=EditToolParameters,
        description="View, create, and edit files in the filesystem",
        name="str_replace_editor",
    )
    # Assigned after the base initializer has run, as in the original order.
    self._instance = instance

def __call__(self, **kwargs: Any) -> Any:
    """Validate the tool-call arguments and run the edit command.

    Args:
        **kwargs: Raw tool-call arguments, validated against
            ``EditToolParameters``.

    Returns:
        Whatever ``self._instance.edit`` returns.

    Raises:
        pydantic.ValidationError: If ``kwargs`` does not satisfy
            ``EditToolParameters`` (for example, ``command`` or ``path``
            is missing).
    """
    # Validate the untouched kwargs. Popping keys beforehand would hand
    # model_validate an emptied dict that lacks the required fields.
    params = EditToolParameters.model_validate(kwargs)

    return self._instance.edit(
        command=params.command,
        path=params.path,
        file_text=params.file_text,
        view_range=params.view_range,
        old_str=params.old_str,
        new_str=params.new_str,
        insert_line=params.insert_line,
    )


class BashToolParameters(BaseModel):
    """Parameters for bash command execution."""

    # NOTE: the class docstring above is left untouched on purpose; pydantic
    # uses a model's docstring as the description of its JSON schema.

    # Required: the command line to run.
    command: str = Field(description="The bash command to execute")

    # Optional flag; False unless the caller supplies a value.
    restart: Optional[bool] = Field(
        default=False,
        description="Whether to restart the shell",
    )


class BashTool(Tool):
"""A shell execution tool that allows the agent to run bash commands."""

Expand All @@ -81,14 +126,59 @@ class BashTool(Tool):
def __init__(self, instance: Instance) -> None:
    """Register the tool under the name ``bash`` and bind ``instance``.

    Args:
        instance: Object whose ``bash(...)`` method ``__call__`` invokes.
    """
    super().__init__(
        parameters=BashToolParameters,
        description="Execute bash commands in the shell",
        name="bash",
    )
    # Assigned after the base initializer has run, as in the original order.
    self._instance = instance

def __call__(self, **kwargs: Any) -> Any:
    """Validate the tool-call arguments and run the bash command.

    Args:
        **kwargs: Raw tool-call arguments, validated against
            ``BashToolParameters``.

    Returns:
        Whatever ``self._instance.bash`` returns.

    Raises:
        pydantic.ValidationError: If ``kwargs`` does not satisfy
            ``BashToolParameters`` (for example, ``command`` is missing).
    """
    # Validate the untouched kwargs and return once, from the validated
    # values. An earlier return built from popped kwargs would bypass
    # validation and leave this path unreachable.
    params = BashToolParameters.model_validate(kwargs)
    return self._instance.bash(command=params.command, restart=params.restart)


class BrowserToolParameters(BaseModel):
    """Parameters for browser interaction commands."""

    # NOTE: the class docstring above is left untouched on purpose; pydantic
    # uses a model's docstring as the description of its JSON schema.

    # Required: one of the closed set of supported browser commands. The
    # description spells out which of the optional fields each command needs.
    command: Literal[
        "go_to",  # Navigate to a URL
        "get_html",  # Get current page HTML
        "evaluate",  # Run JavaScript code
        "click",  # Click on an element
        "type",  # Type into an element
        "screenshot",  # Take a screenshot
        "get_text",  # Get text content of element
        "get_attribute",  # Get attribute of element
    ] = Field(
        description=(
            "The browser command to execute. Required parameters per command:\n"
            "- go_to: requires 'url'\n"
            "- evaluate: requires 'code'\n"
            "- click: requires 'selector'\n"
            "- type: requires 'selector' and 'text'\n"
            "- get_text: requires 'selector'\n"
            "- get_attribute: requires 'selector' and 'attribute'\n"
            "- get_html: no additional parameters\n"
            "- screenshot: no additional parameters"
        )
    )

    # Command-specific inputs. All are optional at the schema level and
    # default to None, apart from ``timeout`` which defaults to 30000.
    url: Optional[str] = Field(
        default=None,
        description="URL for go_to command (required for go_to)",
    )
    selector: Optional[str] = Field(
        default=None,
        description="CSS selector for element operations (required for click, type, get_text, get_attribute)",
    )
    code: Optional[str] = Field(
        default=None,
        description="JavaScript code for evaluate command (required for evaluate)",
    )
    text: Optional[str] = Field(
        default=None,
        description="Text to type for type command (required for type)",
    )
    timeout: Optional[int] = Field(
        default=30000,
        description="Timeout in milliseconds for operations",
    )
    attribute: Optional[str] = Field(
        default=None,
        description="Attribute name for get_attribute command (required for get_attribute)",
    )


class BrowserTool(Tool):
Expand All @@ -100,62 +190,19 @@ def __init__(self, instance: Instance) -> None:
super().__init__(
name="browser",
description="Interact with a browser for web scraping and automation",
parameters={
"type": "object",
"properties": {
"command": {
"type": "string",
"enum": [
"go_to", # Navigate to a URL
"get_html", # Get current page HTML
"evaluate", # Run JavaScript code
"click", # Click on an element
"type", # Type into an element
"screenshot", # Take a screenshot
"get_text", # Get text content of element
"get_attribute", # Get attribute of element
],
"description": "The browser command to execute. Required parameters per command:\n- go_to: requires 'url'\n- evaluate: requires 'code'\n- click: requires 'selector'\n- type: requires 'selector' and 'text'\n- get_text: requires 'selector'\n- get_attribute: requires 'selector' and 'attribute'\n- get_html: no additional parameters\n- screenshot: no additional parameters",
},
"url": {
"type": "string",
"description": "URL for go_to command (required for go_to)",
},
"selector": {
"type": "string",
"description": "CSS selector for element operations (required for click, type, get_text, get_attribute)",
},
"code": {
"type": "string",
"description": "JavaScript code for evaluate command (required for evaluate)",
},
"text": {
"type": "string",
"description": "Text to type for type command (required for type)",
},
"timeout": {
"type": "integer",
"description": "Timeout in milliseconds for operations",
"default": 30000,
},
"attribute": {
"type": "string",
"description": "Attribute name for get_attribute command (required for get_attribute)",
},
},
"required": ["command"],
},
parameters=BrowserToolParameters,
)
self._instance = instance

def __call__(self, **kwargs: Any) -> Any:
command = kwargs.pop("command")
url = kwargs.pop("url", None)
selector = kwargs.pop("selector", None)
code = kwargs.pop("code", None)
text = kwargs.pop("text", None)
timeout = kwargs.pop("timeout", 30000)
attribute = kwargs.pop("attribute", None)
params = BrowserToolParameters.model_validate(kwargs)
command = params.command
url = params.url
selector = params.selector
code = params.code
text = params.text
timeout = params.timeout or 30000
attribute = params.attribute

cdp_url = self._instance.browser.get_cdp_url().cdp_url
if cdp_url is None:
Expand All @@ -171,6 +218,8 @@ def __call__(self, **kwargs: Any) -> Any:

try:
if command == "go_to":
if not url:
raise ValueError("URL is required for go_to command")
page.goto(url, timeout=timeout)
return True

Expand All @@ -182,13 +231,21 @@ def __call__(self, **kwargs: Any) -> Any:
return page.evaluate("() => document.documentElement.innerHTML")

elif command == "evaluate":
if not code:
raise ValueError("Code is required for evaluate command")
return page.evaluate(code)

elif command == "click":
if not selector:
raise ValueError("Selector is required for click command")
page.click(selector, timeout=timeout)
return True

elif command == "type":
if not selector:
raise ValueError("Selector is required for type command")
if not text:
raise ValueError("Text is required for type command")
page.type(selector, text, timeout=timeout)
return True

Expand All @@ -198,12 +255,22 @@ def __call__(self, **kwargs: Any) -> Any:
)

elif command == "get_text":
if not selector:
raise ValueError("Selector is required for get_text command")
element = page.wait_for_selector(selector, timeout=timeout)
if element is None:
raise ValueError(f"Element not found: {selector}")
return element.text_content()

elif command == "get_attribute":
if not selector:
raise ValueError(
"Selector is required for get_attribute command"
)
if not attribute:
raise ValueError(
"Attribute is required for get_attribute command"
)
element = page.wait_for_selector(selector, timeout=timeout)
if element is None:
raise ValueError(f"Element not found: {selector}")
Expand Down
Loading
Loading