Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .fernignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
src/scrapybara/client.py
src/scrapybara/anthropic/
src/scrapybara/herd/
src/scrapybara/openai/
src/scrapybara/prompts/
src/scrapybara/tools/
src/scrapybara/types/__init__.py
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "scrapybara"

[tool.poetry]
name = "scrapybara"
version = "2.3.7"
version = "2.4.0"
description = ""
readme = "README.md"
authors = []
Expand Down
2 changes: 1 addition & 1 deletion src/scrapybara/core/client_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def get_headers(self) -> typing.Dict[str, str]:
headers: typing.Dict[str, str] = {
"X-Fern-Language": "Python",
"X-Fern-SDK-Name": "scrapybara",
"X-Fern-SDK-Version": "2.3.7",
"X-Fern-SDK-Version": "2.4.0",
}
headers["x-api-key"] = self.api_key
return headers
Expand Down
140 changes: 140 additions & 0 deletions src/scrapybara/openai/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from typing import Literal, Optional

from pydantic import Field

from ..types.act import Model
from datetime import datetime


# Default model used when the caller does not choose one.
_DEFAULT_OPENAI_MODEL = "computer-use-preview-2025-02-04"


class OpenAI(Model):
    """Model adapter for OpenAI.

    Supported models:
        - computer-use-preview-2025-02-04

    Args:
        name: OpenAI model name. Defaults to
            "computer-use-preview-2025-02-04"; passing ``None`` explicitly
            selects the same default.
        api_key: Optional OpenAI API key.
    """

    provider: Literal["openai"] = Field(default="openai", frozen=True)

    def __init__(
        self,
        name: Optional[str] = _DEFAULT_OPENAI_MODEL,
        api_key: Optional[str] = None,
    ) -> None:
        # ``Model.name`` is declared as a plain ``str``, so forwarding ``None``
        # would fail validation even though this signature accepts it. Fall
        # back to the default model instead.
        if name is None:
            name = _DEFAULT_OPENAI_MODEL
        super().__init__(provider="openai", name=name, api_key=api_key)


# System prompt describing an Ubuntu VM environment (bash, computer and editor
# tool guidance).
# NOTE: this is an f-string, so the "Today's date" line is rendered once, when
# this assignment executes; the value is not refreshed afterwards.
# datetime.today() returns the naive local date/time.
UBUNTU_SYSTEM_PROMPT = f"""You have access to an Ubuntu VM with internet connectivity. You can install Ubuntu applications using the bash tool (prefer curl over wget).

### Running GUI Applications
- To run GUI applications with the bash tool, use a subshell: `(DISPLAY=:1 xterm &)`
- GUI apps may take time to load; confirm their appearance with an extra screenshot.
- Chromium is the default browser. Start it using `(DISPLAY=:1 chromium &)` via the bash tool, but interact with it visually via the computer tool.

### Handling HTML and Large Text Output
- To read an HTML file, open it in Chromium using the address bar.
- For commands with large text output:
- Redirect output to a temp file.
- Use `str_replace_editor` or `grep` with context flags (`-B` and `-A`) to extract relevant sections.

### Interacting with Web Pages and Forms
- Zoom out or scroll to ensure all content is visible.
- When interacting with input fields:
- Clear the field first using `Ctrl+A` and `Delete`.
- Take an extra screenshot after pressing "Enter" to confirm the input was submitted correctly.
- Move the mouse to the next field after submission.

### Efficiency and Authentication
- Computer function calls take time; optimize by stringing together related actions when possible.
- You are allowed to take actions on authenticated sites on behalf of the user.
- Assume the user has already authenticated if they request access to a site.
- For logging into additional sites, ask the user to use Auth Contexts or the Interactive Desktop.

### Handling Black Screens
- If the first screenshot shows a black screen:
- Click the center of the screen.
- Take another screenshot.

### Best Practices
- If given a complex task, break it down into smaller steps and ask for details only when necessary.
- Read web pages thoroughly by scrolling down until sufficient information is gathered.
- Explain each action you take and why.
- Avoid asking for confirmation on routine actions (e.g., pressing "Enter" after typing a URL). Seek clarification only for ambiguous or critical actions (e.g., deleting files or submitting sensitive information).
- If a user's request implies the need for external information, assume they want you to search for it and provide the answer directly.

### Date Context
Today's date is {datetime.today().strftime('%A, %B %d, %Y')}."""

# System prompt describing a Chromium-only VM environment driven through the
# computer tool.
# NOTE: this is an f-string, so the "Today's date" line is rendered once, when
# this assignment executes; the value is not refreshed afterwards.
# datetime.today() returns the naive local date/time.
BROWSER_SYSTEM_PROMPT = f"""You have access to a Chromium VM with internet connectivity. Chromium should already be open and running.

### Interacting with Web Pages
- Use the computer tool to interact with web pages.
- Zoom out or scroll to ensure all content is visible.

### Handling Input Fields
- Always clear fields before entering text using `Ctrl+A` and `Delete`.
- After submitting a field by pressing "Enter":
- Take an extra screenshot to confirm the input was properly submitted.
- Move the mouse to the next field.

### Efficiency and Authentication
- Computer function calls take time; optimize by combining related actions when possible.
- You are allowed to take actions on authenticated sites on behalf of the user.
- Assume the user has already authenticated if they request access to a site.
- To log into additional sites, ask the user to use Auth Contexts.

### Handling Black Screens
- If the first screenshot shows a black screen:
- Click the center of the screen.
- Take another screenshot.

### Best Practices
- If given a complex task, break it down into smaller steps and ask for details only when necessary.
- Read web pages thoroughly by scrolling down until sufficient information is gathered.
- Explain each action you take and why.
- Avoid asking for confirmation on routine actions (e.g., pressing "Enter" after typing a URL). Seek clarification only for ambiguous or critical actions (e.g., deleting files or submitting sensitive information).
- If a user's request implies the need for external information, assume they want you to search for it and provide the answer directly.

### Date Context
Today's date is {datetime.today().strftime('%A, %B %d, %Y')}."""

# System prompt describing a Windows VM environment driven through the
# computer tool.
# NOTE: this is an f-string, so the "Today's date" line is rendered once, when
# this assignment executes; the value is not refreshed afterwards.
# datetime.today() returns the naive local date/time.
WINDOWS_SYSTEM_PROMPT = f"""You have access to a Windows VM with internet connectivity and can interact with the Windows desktop using the computer tool.

### Interacting with Applications and Web Pages
- GUI applications may take time to load—confirm with an extra screenshot.
- Microsoft Edge is the default browser.
- When viewing pages:
- Zoom out or scroll to ensure all content is visible.

### Handling Input Fields
- Always clear fields before entering text using `Ctrl+A` and `Delete`.
- After submitting a field by pressing "Enter":
- Take an extra screenshot to confirm the input was properly submitted.
- Move the mouse to the next field.

### Efficiency and Authentication
- Computer function calls take time; optimize by combining related actions when possible.
- You are allowed to take actions on authenticated sites on behalf of the user.
- Assume the user has already authenticated if they request access to a site.
- To log into additional sites, ask the user to use Auth Contexts or the Interactive Desktop.

### Handling Black Screens
- If the first screenshot shows a black screen:
- Click the center of the screen.
- Take another screenshot.

### Best Practices
- If given a complex task, break it down into smaller steps and ask for details only when necessary.
- Read web pages thoroughly by scrolling down until sufficient information is gathered.
- Explain each action you take and why.
- Avoid asking for confirmation on routine actions (e.g., pressing "Enter" after typing a URL). Seek clarification only for ambiguous or critical actions (e.g., deleting files or submitting sensitive information).
- If a user's request implies the need for external information, assume they want you to search for it and provide the answer directly.

### Date Context
Today's date is {datetime.today().strftime('%A, %B %d, %Y')}."""
2 changes: 1 addition & 1 deletion src/scrapybara/types/act.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class ToolMessage(BaseModel):

# Request/Response models
class Model(BaseModel):
provider: Literal["anthropic", "herd"]
provider: Literal["anthropic", "openai", "herd"]
name: str
api_key: Optional[str] = None

Expand Down
84 changes: 76 additions & 8 deletions tests/custom/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,15 @@

from scrapybara.anthropic import (
Anthropic,
UBUNTU_SYSTEM_PROMPT,
BROWSER_SYSTEM_PROMPT,
WINDOWS_SYSTEM_PROMPT,
UBUNTU_SYSTEM_PROMPT as UBUNTU_SYSTEM_PROMPT_ANTHROPIC,
BROWSER_SYSTEM_PROMPT as BROWSER_SYSTEM_PROMPT_ANTHROPIC,
WINDOWS_SYSTEM_PROMPT as WINDOWS_SYSTEM_PROMPT_ANTHROPIC,
)
from scrapybara.openai import (
OpenAI,
UBUNTU_SYSTEM_PROMPT as UBUNTU_SYSTEM_PROMPT_OPENAI,
BROWSER_SYSTEM_PROMPT as BROWSER_SYSTEM_PROMPT_OPENAI,
WINDOWS_SYSTEM_PROMPT as WINDOWS_SYSTEM_PROMPT_OPENAI,
)
from scrapybara.tools import BashTool, ComputerTool, EditTool

Expand Down Expand Up @@ -38,7 +44,40 @@ def test_ubuntu() -> None:
assert cdp_url is not None
response = client.act(
model=Anthropic(),
system=UBUNTU_SYSTEM_PROMPT,
system=UBUNTU_SYSTEM_PROMPT_ANTHROPIC,
prompt="Go to the YC website and get the number of funded startups and combined valuation",
tools=[
ComputerTool(ubuntu_instance),
BashTool(ubuntu_instance),
EditTool(ubuntu_instance),
],
schema=YCStats,
on_step=lambda step: print(step.text, step.tool_calls),
)
print(response.output)
assert response.output is not None
assert response.output.number_of_startups is not None
assert response.output.combined_valuation is not None
ubuntu_instance.browser.stop()
ubuntu_instance.stop()

def test_ubuntu_openai() -> None:
_check_api_key()
client = Scrapybara()

ubuntu_instance = client.start_ubuntu()
print(ubuntu_instance.get_stream_url().stream_url)
assert ubuntu_instance.id is not None
instances = client.get_instances()
assert len(instances) > 0
screenshot_response = ubuntu_instance.screenshot()
assert screenshot_response.base_64_image is not None
ubuntu_instance.browser.start()
cdp_url = ubuntu_instance.browser.get_cdp_url()
assert cdp_url is not None
response = client.act(
model=OpenAI(),
system=UBUNTU_SYSTEM_PROMPT_OPENAI,
prompt="Go to the YC website and get the number of funded startups and combined valuation",
tools=[
ComputerTool(ubuntu_instance),
Expand Down Expand Up @@ -69,7 +108,34 @@ def test_browser() -> None:
assert cdp_url is not None
response = client.act(
model=Anthropic(),
system=BROWSER_SYSTEM_PROMPT,
system=BROWSER_SYSTEM_PROMPT_ANTHROPIC,
prompt="Go to the YC website and get the number of funded startups and combined valuation",
tools=[
ComputerTool(browser_instance),
],
schema=YCStats,
on_step=lambda step: print(step.text, step.tool_calls),
)
print(response.output)
assert response.output is not None
assert response.output.number_of_startups is not None
assert response.output.combined_valuation is not None
browser_instance.stop()

def test_browser_openai() -> None:
_check_api_key()
client = Scrapybara()

browser_instance = client.start_browser()
print(browser_instance.get_stream_url().stream_url)
assert browser_instance.id is not None
screenshot_response = browser_instance.screenshot()
assert screenshot_response.base_64_image is not None
cdp_url = browser_instance.get_cdp_url()
assert cdp_url is not None
response = client.act(
model=OpenAI(),
system=BROWSER_SYSTEM_PROMPT_OPENAI,
prompt="Go to the YC website and get the number of funded startups and combined valuation",
tools=[
ComputerTool(browser_instance),
Expand All @@ -96,7 +162,7 @@ def test_windows() -> None:
assert screenshot_response.base_64_image is not None
response = client.act(
model=Anthropic(),
system=WINDOWS_SYSTEM_PROMPT,
system=WINDOWS_SYSTEM_PROMPT_ANTHROPIC,
prompt="Go to the YC website and get the number of funded startups and combined valuation",
tools=[
ComputerTool(windows_instance),
Expand Down Expand Up @@ -128,7 +194,7 @@ def test_ubuntu_thinking() -> None:
assert cdp_url is not None
response = client.act(
model=Anthropic(name="claude-3-7-sonnet-20250219-thinking"),
system=UBUNTU_SYSTEM_PROMPT,
system=UBUNTU_SYSTEM_PROMPT_ANTHROPIC,
prompt="Go to the YC website and get the number of funded startups and combined valuation",
tools=[
ComputerTool(ubuntu_instance),
Expand Down Expand Up @@ -160,7 +226,7 @@ def test_browser_thinking() -> None:
assert cdp_url is not None
response = client.act(
model=Anthropic(name="claude-3-7-sonnet-20250219-thinking"),
system=BROWSER_SYSTEM_PROMPT,
system=BROWSER_SYSTEM_PROMPT_ANTHROPIC,
prompt="Go to the YC website and get the number of funded startups and combined valuation",
tools=[
ComputerTool(browser_instance),
Expand All @@ -178,6 +244,8 @@ def test_browser_thinking() -> None:
# Manual entry point: when this file is executed as a script, run the selected
# tests one after another. The commented-out calls below are disabled.
if __name__ == "__main__":
    test_ubuntu()
    test_browser()
    test_ubuntu_openai()
    test_browser_openai()
    # test_ubuntu_thinking()
    # test_browser_thinking()
    # test_windows()
Loading