From b98e888c4ae609a7f8ba7f4a636e609e59017bc9 Mon Sep 17 00:00:00 2001 From: Cooper Miller Date: Tue, 11 Mar 2025 10:59:01 -0500 Subject: [PATCH 1/2] feat: oai --- .fernignore | 1 + pyproject.toml | 2 +- src/scrapybara/core/client_wrapper.py | 2 +- src/scrapybara/openai/__init__.py | 140 ++++++++++++++++++++++++++ src/scrapybara/types/act.py | 2 +- tests/custom/test_client.py | 84 ++++++++++++++-- 6 files changed, 220 insertions(+), 11 deletions(-) create mode 100644 src/scrapybara/openai/__init__.py diff --git a/.fernignore b/.fernignore index 3eb9c64..fc13243 100644 --- a/.fernignore +++ b/.fernignore @@ -3,6 +3,7 @@ src/scrapybara/client.py src/scrapybara/anthropic/ src/scrapybara/herd/ +src/scrapybara/openai/ src/scrapybara/prompts/ src/scrapybara/tools/ src/scrapybara/types/__init__.py diff --git a/pyproject.toml b/pyproject.toml index 0333485..909aa43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "scrapybara" [tool.poetry] name = "scrapybara" -version = "2.3.7" +version = "2.4.0" description = "" readme = "README.md" authors = [] diff --git a/src/scrapybara/core/client_wrapper.py b/src/scrapybara/core/client_wrapper.py index a024291..4149dfc 100644 --- a/src/scrapybara/core/client_wrapper.py +++ b/src/scrapybara/core/client_wrapper.py @@ -16,7 +16,7 @@ def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { "X-Fern-Language": "Python", "X-Fern-SDK-Name": "scrapybara", - "X-Fern-SDK-Version": "2.3.7", + "X-Fern-SDK-Version": "2.4.0", } headers["x-api-key"] = self.api_key return headers diff --git a/src/scrapybara/openai/__init__.py b/src/scrapybara/openai/__init__.py new file mode 100644 index 0000000..c7d4c42 --- /dev/null +++ b/src/scrapybara/openai/__init__.py @@ -0,0 +1,140 @@ +from typing import Literal, Optional + +from pydantic import Field + +from ..types.act import Model +from datetime import datetime + + +class OpenAI(Model): + """Model adapter for OpenAI. + + Supported models: + - computer-use + + Args: + name: OpenAI model name, defaults to "computer-use" + api_key: Optional OpenAI API key + + Returns: + A Model configuration object + """ + + provider: Literal["openai"] = Field(default="openai", frozen=True) + + def __init__( + self, + name: Optional[str] = "computer-use", + api_key: Optional[str] = None, + ) -> None: + super().__init__(provider="openai", name=name, api_key=api_key) + + +UBUNTU_SYSTEM_PROMPT = f"""You have access to an Ubuntu VM with internet connectivity. You can install Ubuntu applications using the bash tool (prefer curl over wget). + +### Running GUI Applications +- To run GUI applications with the bash tool, use a subshell: `(DISPLAY=:1 xterm &)` +- GUI apps may take time to load; confirm their appearance with an extra screenshot. +- Chromium is the default browser. Start it using `(DISPLAY=:1 chromium &)` via the bash tool, but interact with it visually via the computer tool. + +### Handling HTML and Large Text Output +- To read an HTML file, open it in Chromium using the address bar. +- For commands with large text output: + - Redirect output to a temp file. + - Use `str_replace_editor` or `grep` with context flags (`-B` and `-A`) to extract relevant sections. + +### Interacting with Web Pages and Forms +- Zoom out or scroll to ensure all content is visible. +- When interacting with input fields: + - Clear the field first using `Ctrl+A` and `Delete`. + - Take an extra screenshot after pressing "Enter" to confirm the input was submitted correctly. + - Move the mouse to the next field after submission. + +### Efficiency and Authentication +- Computer function calls take time; optimize by stringing together related actions when possible. +- You are allowed to take actions on authenticated sites on behalf of the user. +- Assume the user has already authenticated if they request access to a site. +- For logging into additional sites, ask the user to use Auth Contexts or the Interactive Desktop. + +### Handling Black Screens +- If the first screenshot shows a black screen: + - Click the center of the screen. + - Take another screenshot. + +### Best Practices +- If given a complex task, break it down into smaller steps and ask for details only when necessary. +- Read web pages thoroughly by scrolling down until sufficient information is gathered. +- Explain each action you take and why. +- Avoid asking for confirmation on routine actions (e.g., pressing "Enter" after typing a URL). Seek clarification only for ambiguous or critical actions (e.g., deleting files or submitting sensitive information). +- If a user's request implies the need for external information, assume they want you to search for it and provide the answer directly. + +### Date Context +Today's date is {datetime.today().strftime('%A, %B %d, %Y')}.""" + +BROWSER_SYSTEM_PROMPT = f"""You have access to a Chromium VM with internet connectivity. Chromium should already be open and running. + +### Interacting with Web Pages +- Use the computer tool to interact with web pages. +- Zoom out or scroll to ensure all content is visible. + +### Handling Input Fields +- Always clear fields before entering text using `Ctrl+A` and `Delete`. +- After submitting a field by pressing "Enter": + - Take an extra screenshot to confirm the input was properly submitted. + - Move the mouse to the next field. + +### Efficiency and Authentication +- Computer function calls take time; optimize by combining related actions when possible. +- You are allowed to take actions on authenticated sites on behalf of the user. +- Assume the user has already authenticated if they request access to a site. +- To log into additional sites, ask the user to use Auth Contexts. + +### Handling Black Screens +- If the first screenshot shows a black screen: + - Click the center of the screen. + - Take another screenshot. + +### Best Practices +- If given a complex task, break it down into smaller steps and ask for details only when necessary. +- Read web pages thoroughly by scrolling down until sufficient information is gathered. +- Explain each action you take and why. +- Avoid asking for confirmation on routine actions (e.g., pressing "Enter" after typing a URL). Seek clarification only for ambiguous or critical actions (e.g., deleting files or submitting sensitive information). +- If a user's request implies the need for external information, assume they want you to search for it and provide the answer directly. + +### Date Context +Today's date is {datetime.today().strftime('%A, %B %d, %Y')}.""" + +WINDOWS_SYSTEM_PROMPT = f"""You have access to a Windows VM with internet connectivity and can interact with the Windows desktop using the computer tool. + +### Interacting with Applications and Web Pages +- GUI applications may take time to load—confirm with an extra screenshot. +- Microsoft Edge is the default browser. +- When viewing pages: + - Zoom out or scroll to ensure all content is visible. + +### Handling Input Fields +- Always clear fields before entering text using `Ctrl+A` and `Delete`. +- After submitting a field by pressing "Enter": + - Take an extra screenshot to confirm the input was properly submitted. + - Move the mouse to the next field. + +### Efficiency and Authentication +- Computer function calls take time; optimize by combining related actions when possible. +- You are allowed to take actions on authenticated sites on behalf of the user. +- Assume the user has already authenticated if they request access to a site. +- To log into additional sites, ask the user to use Auth Contexts or the Interactive Desktop. + +### Handling Black Screens +- If the first screenshot shows a black screen: + - Click the center of the screen. + - Take another screenshot. + +### Best Practices +- If given a complex task, break it down into smaller steps and ask for details only when necessary. +- Read web pages thoroughly by scrolling down until sufficient information is gathered. +- Explain each action you take and why. +- Avoid asking for confirmation on routine actions (e.g., pressing "Enter" after typing a URL). Seek clarification only for ambiguous or critical actions (e.g., deleting files or submitting sensitive information). +- If a user's request implies the need for external information, assume they want you to search for it and provide the answer directly. + +### Date Context +Today's date is {datetime.today().strftime('%A, %B %d, %Y')}.""" \ No newline at end of file diff --git a/src/scrapybara/types/act.py b/src/scrapybara/types/act.py index a3be307..269551d 100644 --- a/src/scrapybara/types/act.py +++ b/src/scrapybara/types/act.py @@ -61,7 +61,7 @@ class ToolMessage(BaseModel): # Request/Response models class Model(BaseModel): - provider: Literal["anthropic", "herd"] + provider: Literal["anthropic", "openai", "herd"] name: str api_key: Optional[str] = None diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py index 533d843..a949167 100644 --- a/tests/custom/test_client.py +++ b/tests/custom/test_client.py @@ -5,9 +5,15 @@ from scrapybara.anthropic import ( Anthropic, - UBUNTU_SYSTEM_PROMPT, - BROWSER_SYSTEM_PROMPT, - WINDOWS_SYSTEM_PROMPT, + UBUNTU_SYSTEM_PROMPT as UBUNTU_SYSTEM_PROMPT_ANTHROPIC, + BROWSER_SYSTEM_PROMPT as BROWSER_SYSTEM_PROMPT_ANTHROPIC, + WINDOWS_SYSTEM_PROMPT as WINDOWS_SYSTEM_PROMPT_ANTHROPIC, +) +from scrapybara.openai import ( + OpenAI, + UBUNTU_SYSTEM_PROMPT as UBUNTU_SYSTEM_PROMPT_OPENAI, + BROWSER_SYSTEM_PROMPT as BROWSER_SYSTEM_PROMPT_OPENAI, + WINDOWS_SYSTEM_PROMPT as WINDOWS_SYSTEM_PROMPT_OPENAI, ) from scrapybara.tools import BashTool, ComputerTool, EditTool @@ -38,7 +44,40 @@ def test_ubuntu() -> None: assert cdp_url is not None response = client.act( model=Anthropic(), - system=UBUNTU_SYSTEM_PROMPT, + system=UBUNTU_SYSTEM_PROMPT_ANTHROPIC, + prompt="Go to the YC website and get the number of funded startups and combined valuation", + tools=[ + ComputerTool(ubuntu_instance), + BashTool(ubuntu_instance), + EditTool(ubuntu_instance), + ], + schema=YCStats, + on_step=lambda step: print(step.text, step.tool_calls), + ) + print(response.output) + assert response.output is not None + assert response.output.number_of_startups is not None + assert response.output.combined_valuation is not None + ubuntu_instance.browser.stop() + ubuntu_instance.stop() + +def test_ubuntu_openai() -> None: + _check_api_key() + client = Scrapybara() + + ubuntu_instance = client.start_ubuntu() + print(ubuntu_instance.get_stream_url().stream_url) + assert ubuntu_instance.id is not None + instances = client.get_instances() + assert len(instances) > 0 + screenshot_response = ubuntu_instance.screenshot() + assert screenshot_response.base_64_image is not None + ubuntu_instance.browser.start() + cdp_url = ubuntu_instance.browser.get_cdp_url() + assert cdp_url is not None + response = client.act( + model=OpenAI(), + system=UBUNTU_SYSTEM_PROMPT_OPENAI, prompt="Go to the YC website and get the number of funded startups and combined valuation", tools=[ ComputerTool(ubuntu_instance), @@ -69,7 +108,34 @@ def test_browser() -> None: assert cdp_url is not None response = client.act( model=Anthropic(), - system=BROWSER_SYSTEM_PROMPT, + system=BROWSER_SYSTEM_PROMPT_ANTHROPIC, + prompt="Go to the YC website and get the number of funded startups and combined valuation", + tools=[ + ComputerTool(browser_instance), + ], + schema=YCStats, + on_step=lambda step: print(step.text, step.tool_calls), + ) + print(response.output) + assert response.output is not None + assert response.output.number_of_startups is not None + assert response.output.combined_valuation is not None + browser_instance.stop() + +def test_browser_openai() -> None: + _check_api_key() + client = Scrapybara() + + browser_instance = client.start_browser() + print(browser_instance.get_stream_url().stream_url) + assert browser_instance.id is not None + screenshot_response = browser_instance.screenshot() + assert screenshot_response.base_64_image is not None + cdp_url = browser_instance.get_cdp_url() + assert cdp_url is not None + response = client.act( + model=OpenAI(), + system=BROWSER_SYSTEM_PROMPT_OPENAI, prompt="Go to the YC website and get the number of funded startups and combined valuation", tools=[ ComputerTool(browser_instance), @@ -96,7 +162,7 @@ def test_windows() -> None: assert screenshot_response.base_64_image is not None response = client.act( model=Anthropic(), - system=WINDOWS_SYSTEM_PROMPT, + system=WINDOWS_SYSTEM_PROMPT_ANTHROPIC, prompt="Go to the YC website and get the number of funded startups and combined valuation", tools=[ ComputerTool(windows_instance), @@ -128,7 +194,7 @@ def test_ubuntu_thinking() -> None: assert cdp_url is not None response = client.act( model=Anthropic(name="claude-3-7-sonnet-20250219-thinking"), - system=UBUNTU_SYSTEM_PROMPT, + system=UBUNTU_SYSTEM_PROMPT_ANTHROPIC, prompt="Go to the YC website and get the number of funded startups and combined valuation", tools=[ ComputerTool(ubuntu_instance), @@ -160,7 +226,7 @@ def test_browser_thinking() -> None: assert cdp_url is not None response = client.act( model=Anthropic(name="claude-3-7-sonnet-20250219-thinking"), - system=BROWSER_SYSTEM_PROMPT, + system=BROWSER_SYSTEM_PROMPT_ANTHROPIC, prompt="Go to the YC website and get the number of funded startups and combined valuation", tools=[ ComputerTool(browser_instance), @@ -178,6 +244,8 @@ def test_browser_thinking() -> None: if __name__ == "__main__": test_ubuntu() test_browser() + test_ubuntu_openai() + test_browser_openai() # test_ubuntu_thinking() # test_browser_thinking() # test_windows() From 6f6046082ab0d781c841d4fa636f4c8f84e6d6bd Mon Sep 17 00:00:00 2001 From: Cooper Miller Date: Tue, 11 Mar 2025 11:19:53 -0500 Subject: [PATCH 2/2] name --- src/scrapybara/openai/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scrapybara/openai/__init__.py b/src/scrapybara/openai/__init__.py index c7d4c42..796384a 100644 --- a/src/scrapybara/openai/__init__.py +++ b/src/scrapybara/openai/__init__.py @@ -10,10 +10,10 @@ class OpenAI(Model): """Model adapter for OpenAI. Supported models: - - computer-use + - computer-use-preview-2025-02-04 Args: - name: OpenAI model name, defaults to "computer-use" + name: OpenAI model name, defaults to "computer-use-preview-2025-02-04" api_key: Optional OpenAI API key Returns: @@ -24,7 +24,7 @@ class OpenAI(Model): def __init__( self, - name: Optional[str] = "computer-use", + name: Optional[str] = "computer-use-preview-2025-02-04", api_key: Optional[str] = None, ) -> None: super().__init__(provider="openai", name=name, api_key=api_key)