diff --git a/.fernignore b/.fernignore index 17fad2b..ced6301 100644 --- a/.fernignore +++ b/.fernignore @@ -2,7 +2,9 @@ src/scrapybara/client.py src/scrapybara/anthropic/ +src/scrapybara/prompts/ src/scrapybara/tools/ src/scrapybara/types/act.py +src/scrapybara/types/tool.py tests/custom/test_client.py .github/workflows/ci.yml \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 178562f..ee5d06a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scrapybara" -version = "2.1.0" +version = "2.1.1" description = "" readme = "README.md" authors = [] diff --git a/src/scrapybara/core/client_wrapper.py b/src/scrapybara/core/client_wrapper.py index fe993b6..a2eb2f4 100644 --- a/src/scrapybara/core/client_wrapper.py +++ b/src/scrapybara/core/client_wrapper.py @@ -16,7 +16,7 @@ def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { "X-Fern-Language": "Python", "X-Fern-SDK-Name": "scrapybara", - "X-Fern-SDK-Version": "2.1.0", + "X-Fern-SDK-Version": "2.1.1", } headers["x-api-key"] = self.api_key return headers diff --git a/src/scrapybara/prompts/__init__.py b/src/scrapybara/prompts/__init__.py new file mode 100644 index 0000000..3d9668b --- /dev/null +++ b/src/scrapybara/prompts/__init__.py @@ -0,0 +1,43 @@ +from datetime import datetime + +SYSTEM_PROMPT = f""" +* You have access to an Ubuntu virtual machine with internet connectivity +* You can install Ubuntu applications using the bash tool (use curl over wget) +* To run GUI applications with the bash tool: + - Use a subshell, e.g. "(DISPLAY=:1 xterm &)", make sure to include the parantheses + - GUI apps will appear but may take time to load - confirm with an extra screenshot +* Start Chromium (default browser) via the bash tool "(DISPLAY=:1 chromium &)", but interact with it visually via the computer tool +* In Chromium, click the address bar directly to enter URLs/searches +* If you need to read a full PDF after initial screenshot + - Download with curl + - Convert to text using pdftotext + - Read the text file with StrReplaceEditTool +* If you need to read a HTML file: + - Open with the address bar in Chromium +* For commands with large text output: + - Redirect to a temp file + - Use str_replace_editor or grep with context (-B and -A flags) to view output +* When viewing pages: + - Zoom out to see full content, or + - Scroll to ensure you see everything +* Computer function calls take time, string together calls when possible +* You are allowed to take actions on behalf of the user on sites that are authenticated +* If the user asks you to access a site, assume that the user has already authenticated +* For accessibility, you can use and create accounts for communication tools such as email, Discord, Telegram on behalf of the user - e.g. join Discord channels, send emails, etc. +* To login additional sites, ask the user to use Auth Contexts or the Interactive Desktop +* Today's date is {datetime.today().strftime('%A, %B %-d, %Y')} + + + +* If first screenshot shows black screen: + - Click mouse in screen center + - Take another screenshot +* When interacting with a field, always clear the field first using "ctrl+A" and "delete" + - Take an extra screenshot after clicking "enter" to confirm the field is properly submitted and move the mouse to the next field +* If given a complex task, break down into smaller steps and ask the user for details only if necessary +* Research facts with Google searches in Chromium +* Read through web pages thoroughly by scrolling down till the end +* Use more generalized websites during research, e.g. use Google Flights instead of United when searching for flights, only use United when finalizing bookings +* Wait for actions to complete (examine previous screenshots) before taking another action +* Be concise! +""" diff --git a/src/scrapybara/tools/__init__.py b/src/scrapybara/tools/__init__.py index 31e6110..6ceb10b 100644 --- a/src/scrapybara/tools/__init__.py +++ b/src/scrapybara/tools/__init__.py @@ -1,3 +1,5 @@ +import base64 +import json from typing import Any from playwright.sync_api import sync_playwright @@ -179,7 +181,9 @@ def __call__(self, **kwargs: Any) -> Any: return True elif command == "screenshot": - return page.screenshot(type="png") + return image_result( + base64.b64encode(page.screenshot(type="png")).decode("utf-8") + ) elif command == "get_text": element = page.wait_for_selector(selector, timeout=timeout) @@ -201,3 +205,15 @@ def __call__(self, **kwargs: Any) -> Any: finally: browser.close() + + +def image_result(base64: str) -> str: + """Return an image result that is interpretable by the model.""" + return json.dumps( + { + "output": "", + "error": "", + "base64_image": base64, + "system": None, + } + ) diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py index 3e3208d..70f026a 100644 --- a/tests/custom/test_client.py +++ b/tests/custom/test_client.py @@ -2,50 +2,8 @@ import os from scrapybara.anthropic import Anthropic +from scrapybara.prompts import SYSTEM_PROMPT from scrapybara.tools import BashTool, BrowserTool, ComputerTool, EditTool -from datetime import datetime - -SYSTEM_PROMPT = f""" -* You have access to an Ubuntu virtual machine with internet connectivity -* You can install Ubuntu applications using the bash tool (use curl over wget) -* To run GUI applications with the bash tool: - - Use a subshell, e.g. "(DISPLAY=:1 xterm &)", make sure to include the parantheses - - GUI apps will appear but may take time to load - confirm with an extra screenshot -* Start Chromium (default browser) via the bash tool "(DISPLAY=:1 chromium &)", but interact with it visually via the computer tool -* In Chromium, click the address bar directly to enter URLs/searches -* If you need to read a full PDF after initial screenshot - - Download with curl - - Convert to text using pdftotext - - Read the text file with StrReplaceEditTool -* If you need to read a HTML file: - - Open with the address bar in Chromium -* For commands with large text output: - - Redirect to a temp file - - Use str_replace_editor or grep with context (-B and -A flags) to view output -* When viewing pages: - - Zoom out to see full content, or - - Scroll to ensure you see everything -* Computer function calls take time, string together calls when possible -* You are allowed to take actions on behalf of the user on sites that are authenticated -* If the user asks you to access a site, assume that the user has already authenticated -* For accessibility, you can use and create accounts for communication tools such as email, Discord, Telegram on behalf of the user - e.g. join Discord channels, send emails, etc. -* To login additional sites, ask the user to use Auth Contexts or the Interactive Desktop -* Today's date is {datetime.today().strftime('%A, %B %-d, %Y')} - - - -* If first screenshot shows black screen: - - Click mouse in screen center - - Take another screenshot -* When interacting with a field, always clear the field first using "ctrl+A" and "delete" - - Take an extra screenshot after clicking "enter" to confirm the field is properly submitted and move the mouse to the next field -* If given a complex task, break down into smaller steps and ask the user for details only if necessary -* Research facts with Google searches in Chromium -* Read through web pages thoroughly by scrolling down till the end -* Use more generalized websites during research, e.g. use Google Flights instead of United when searching for flights, only use United when finalizing bookings -* Wait for actions to complete (examine previous screenshots) before taking another action -* Be concise! -""" def test_client() -> None: