diff --git a/.fernignore b/.fernignore
index 17fad2b..ced6301 100644
--- a/.fernignore
+++ b/.fernignore
@@ -2,7 +2,9 @@
src/scrapybara/client.py
src/scrapybara/anthropic/
+src/scrapybara/prompts/
src/scrapybara/tools/
src/scrapybara/types/act.py
+src/scrapybara/types/tool.py
tests/custom/test_client.py
.github/workflows/ci.yml
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 178562f..ee5d06a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scrapybara"
-version = "2.1.0"
+version = "2.1.1"
description = ""
readme = "README.md"
authors = []
diff --git a/src/scrapybara/core/client_wrapper.py b/src/scrapybara/core/client_wrapper.py
index fe993b6..a2eb2f4 100644
--- a/src/scrapybara/core/client_wrapper.py
+++ b/src/scrapybara/core/client_wrapper.py
@@ -16,7 +16,7 @@ def get_headers(self) -> typing.Dict[str, str]:
headers: typing.Dict[str, str] = {
"X-Fern-Language": "Python",
"X-Fern-SDK-Name": "scrapybara",
- "X-Fern-SDK-Version": "2.1.0",
+ "X-Fern-SDK-Version": "2.1.1",
}
headers["x-api-key"] = self.api_key
return headers
diff --git a/src/scrapybara/prompts/__init__.py b/src/scrapybara/prompts/__init__.py
new file mode 100644
index 0000000..3d9668b
--- /dev/null
+++ b/src/scrapybara/prompts/__init__.py
@@ -0,0 +1,48 @@
+"""Reusable system prompt describing the Ubuntu VM environment to the model."""
+
+from datetime import datetime
+
+# The date below is rendered once, when this module is imported. ``%d`` plus
+# ``replace`` drops the day's leading zero portably; the ``%-d`` directive is
+# a platform-specific extension that the Windows C runtime does not support.
+SYSTEM_PROMPT = f"""
+* You have access to an Ubuntu virtual machine with internet connectivity
+* You can install Ubuntu applications using the bash tool (use curl over wget)
+* To run GUI applications with the bash tool:
+ - Use a subshell, e.g. "(DISPLAY=:1 xterm &)", make sure to include the parantheses
+ - GUI apps will appear but may take time to load - confirm with an extra screenshot
+* Start Chromium (default browser) via the bash tool "(DISPLAY=:1 chromium &)", but interact with it visually via the computer tool
+* In Chromium, click the address bar directly to enter URLs/searches
+* If you need to read a full PDF after initial screenshot
+ - Download with curl
+ - Convert to text using pdftotext
+ - Read the text file with StrReplaceEditTool
+* If you need to read a HTML file:
+ - Open with the address bar in Chromium
+* For commands with large text output:
+ - Redirect to a temp file
+ - Use str_replace_editor or grep with context (-B and -A flags) to view output
+* When viewing pages:
+ - Zoom out to see full content, or
+ - Scroll to ensure you see everything
+* Computer function calls take time, string together calls when possible
+* You are allowed to take actions on behalf of the user on sites that are authenticated
+* If the user asks you to access a site, assume that the user has already authenticated
+* For accessibility, you can use and create accounts for communication tools such as email, Discord, Telegram on behalf of the user - e.g. join Discord channels, send emails, etc.
+* To login additional sites, ask the user to use Auth Contexts or the Interactive Desktop
+* Today's date is {datetime.today().strftime('%A, %B %d, %Y').replace(' 0', ' ')}
+
+
+
+* If first screenshot shows black screen:
+ - Click mouse in screen center
+ - Take another screenshot
+* When interacting with a field, always clear the field first using "ctrl+A" and "delete"
+ - Take an extra screenshot after clicking "enter" to confirm the field is properly submitted and move the mouse to the next field
+* If given a complex task, break down into smaller steps and ask the user for details only if necessary
+* Research facts with Google searches in Chromium
+* Read through web pages thoroughly by scrolling down till the end
+* Use more generalized websites during research, e.g. use Google Flights instead of United when searching for flights, only use United when finalizing bookings
+* Wait for actions to complete (examine previous screenshots) before taking another action
+* Be concise!
+"""
diff --git a/src/scrapybara/tools/__init__.py b/src/scrapybara/tools/__init__.py
index 31e6110..6ceb10b 100644
--- a/src/scrapybara/tools/__init__.py
+++ b/src/scrapybara/tools/__init__.py
@@ -1,3 +1,5 @@
+import base64
+import json
from typing import Any
from playwright.sync_api import sync_playwright
@@ -179,7 +181,9 @@ def __call__(self, **kwargs: Any) -> Any:
return True
elif command == "screenshot":
- return page.screenshot(type="png")
+ return image_result(
+ base64.b64encode(page.screenshot(type="png")).decode("utf-8")
+ )
elif command == "get_text":
element = page.wait_for_selector(selector, timeout=timeout)
@@ -201,3 +205,15 @@ def __call__(self, **kwargs: Any) -> Any:
finally:
browser.close()
+
+
+def image_result(base64: str) -> str:
+    """Wrap a base64-encoded image in the JSON result string given to the model."""
+    # NOTE: the ``base64`` parameter shadows the module of that name in here.
+    payload = {
+        "output": "",
+        "error": "",
+        "base64_image": base64,
+        "system": None,
+    }
+    return json.dumps(payload)
diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py
index 3e3208d..70f026a 100644
--- a/tests/custom/test_client.py
+++ b/tests/custom/test_client.py
@@ -2,50 +2,8 @@
import os
from scrapybara.anthropic import Anthropic
+from scrapybara.prompts import SYSTEM_PROMPT
from scrapybara.tools import BashTool, BrowserTool, ComputerTool, EditTool
-from datetime import datetime
-
-SYSTEM_PROMPT = f"""
-* You have access to an Ubuntu virtual machine with internet connectivity
-* You can install Ubuntu applications using the bash tool (use curl over wget)
-* To run GUI applications with the bash tool:
- - Use a subshell, e.g. "(DISPLAY=:1 xterm &)", make sure to include the parantheses
- - GUI apps will appear but may take time to load - confirm with an extra screenshot
-* Start Chromium (default browser) via the bash tool "(DISPLAY=:1 chromium &)", but interact with it visually via the computer tool
-* In Chromium, click the address bar directly to enter URLs/searches
-* If you need to read a full PDF after initial screenshot
- - Download with curl
- - Convert to text using pdftotext
- - Read the text file with StrReplaceEditTool
-* If you need to read a HTML file:
- - Open with the address bar in Chromium
-* For commands with large text output:
- - Redirect to a temp file
- - Use str_replace_editor or grep with context (-B and -A flags) to view output
-* When viewing pages:
- - Zoom out to see full content, or
- - Scroll to ensure you see everything
-* Computer function calls take time, string together calls when possible
-* You are allowed to take actions on behalf of the user on sites that are authenticated
-* If the user asks you to access a site, assume that the user has already authenticated
-* For accessibility, you can use and create accounts for communication tools such as email, Discord, Telegram on behalf of the user - e.g. join Discord channels, send emails, etc.
-* To login additional sites, ask the user to use Auth Contexts or the Interactive Desktop
-* Today's date is {datetime.today().strftime('%A, %B %-d, %Y')}
-
-
-
-* If first screenshot shows black screen:
- - Click mouse in screen center
- - Take another screenshot
-* When interacting with a field, always clear the field first using "ctrl+A" and "delete"
- - Take an extra screenshot after clicking "enter" to confirm the field is properly submitted and move the mouse to the next field
-* If given a complex task, break down into smaller steps and ask the user for details only if necessary
-* Research facts with Google searches in Chromium
-* Read through web pages thoroughly by scrolling down till the end
-* Use more generalized websites during research, e.g. use Google Flights instead of United when searching for flights, only use United when finalizing bookings
-* Wait for actions to complete (examine previous screenshots) before taking another action
-* Be concise!
-"""
def test_client() -> None: