Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .fernignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

src/scrapybara/client.py
src/scrapybara/anthropic/
src/scrapybara/prompts/
src/scrapybara/tools/
src/scrapybara/types/act.py
src/scrapybara/types/tool.py
tests/custom/test_client.py
.github/workflows/ci.yml
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scrapybara"
version = "2.1.0"
version = "2.1.1"
description = ""
readme = "README.md"
authors = []
Expand Down
2 changes: 1 addition & 1 deletion src/scrapybara/core/client_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def get_headers(self) -> typing.Dict[str, str]:
headers: typing.Dict[str, str] = {
"X-Fern-Language": "Python",
"X-Fern-SDK-Name": "scrapybara",
"X-Fern-SDK-Version": "2.1.0",
"X-Fern-SDK-Version": "2.1.1",
}
headers["x-api-key"] = self.api_key
return headers
Expand Down
43 changes: 43 additions & 0 deletions src/scrapybara/prompts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from datetime import datetime

# Snapshot of the current local date, taken once when this module is imported.
# The prompt below embeds it, so a long-running process keeps the import-day date.
_today = datetime.today()

# System prompt text, interpolated with the date captured above.
#
# The date is rendered as "<Weekday>, <Month> <day>, <year>" with an unpadded
# day number. It is assembled from `strftime('%A, %B')` plus the integer
# `day`/`year` attributes instead of the `%-d` directive: `%-d` is a
# platform-specific strftime extension and raises ValueError on platforms
# whose C library does not support it (e.g. Windows), which would make this
# module fail at import time.
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
* You have access to an Ubuntu virtual machine with internet connectivity
* You can install Ubuntu applications using the bash tool (use curl over wget)
* To run GUI applications with the bash tool:
- Use a subshell, e.g. "(DISPLAY=:1 xterm &)", make sure to include the parantheses
- GUI apps will appear but may take time to load - confirm with an extra screenshot
* Start Chromium (default browser) via the bash tool "(DISPLAY=:1 chromium &)", but interact with it visually via the computer tool
* In Chromium, click the address bar directly to enter URLs/searches
* If you need to read a full PDF after initial screenshot
- Download with curl
- Convert to text using pdftotext
- Read the text file with StrReplaceEditTool
* If you need to read a HTML file:
- Open with the address bar in Chromium
* For commands with large text output:
- Redirect to a temp file
- Use str_replace_editor or grep with context (-B and -A flags) to view output
* When viewing pages:
- Zoom out to see full content, or
- Scroll to ensure you see everything
* Computer function calls take time, string together calls when possible
* You are allowed to take actions on behalf of the user on sites that are authenticated
* If the user asks you to access a site, assume that the user has already authenticated
* For accessibility, you can use and create accounts for communication tools such as email, Discord, Telegram on behalf of the user - e.g. join Discord channels, send emails, etc.
* To login additional sites, ask the user to use Auth Contexts or the Interactive Desktop
* Today's date is {_today.strftime('%A, %B')} {_today.day}, {_today.year}
</SYSTEM_CAPABILITY>

<IMPORTANT>
* If first screenshot shows black screen:
- Click mouse in screen center
- Take another screenshot
* When interacting with a field, always clear the field first using "ctrl+A" and "delete"
- Take an extra screenshot after clicking "enter" to confirm the field is properly submitted and move the mouse to the next field
* If given a complex task, break down into smaller steps and ask the user for details only if necessary
* Research facts with Google searches in Chromium
* Read through web pages thoroughly by scrolling down till the end
* Use more generalized websites during research, e.g. use Google Flights instead of United when searching for flights, only use United when finalizing bookings
* Wait for actions to complete (examine previous screenshots) before taking another action
* Be concise!
</IMPORTANT>"""
18 changes: 17 additions & 1 deletion src/scrapybara/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import base64
import json
from typing import Any
from playwright.sync_api import sync_playwright

Expand Down Expand Up @@ -179,7 +181,9 @@ def __call__(self, **kwargs: Any) -> Any:
return True

elif command == "screenshot":
return page.screenshot(type="png")
return image_result(
base64.b64encode(page.screenshot(type="png")).decode("utf-8")
)

elif command == "get_text":
element = page.wait_for_selector(selector, timeout=timeout)
Expand All @@ -201,3 +205,15 @@ def __call__(self, **kwargs: Any) -> Any:

finally:
browser.close()


def image_result(base64: str) -> str:
    """Build the JSON string that carries an image back to the model.

    Args:
        base64: The image contents, already base64-encoded as text.

    Returns:
        A JSON object serialized to a string with the keys ``output``,
        ``error``, ``base64_image`` and ``system`` (in that order). Only
        ``base64_image`` is populated; ``output`` and ``error`` are empty
        strings and ``system`` is JSON ``null``.
    """
    # NOTE: the parameter name hides the imported ``base64`` module inside
    # this function; the name is kept because callers may pass it by keyword.
    payload = dict(output="", error="", base64_image=base64, system=None)
    return json.dumps(payload)
44 changes: 1 addition & 43 deletions tests/custom/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,50 +2,8 @@
import os

from scrapybara.anthropic import Anthropic
from scrapybara.prompts import SYSTEM_PROMPT
from scrapybara.tools import BashTool, BrowserTool, ComputerTool, EditTool
from datetime import datetime

# NOTE(review): this assignment rebinds the SYSTEM_PROMPT name that is also
# imported from scrapybara.prompts earlier in this file — confirm which
# definition is meant to be used.

# Snapshot of the current local date, taken once when this module is imported.
_today = datetime.today()

# System prompt text, interpolated with the date captured above.
#
# The date is rendered as "<Weekday>, <Month> <day>, <year>" with an unpadded
# day number. It is assembled from `strftime('%A, %B')` plus the integer
# `day`/`year` attributes instead of the `%-d` directive: `%-d` is a
# platform-specific strftime extension and raises ValueError on platforms
# whose C library does not support it (e.g. Windows).
SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
* You have access to an Ubuntu virtual machine with internet connectivity
* You can install Ubuntu applications using the bash tool (use curl over wget)
* To run GUI applications with the bash tool:
- Use a subshell, e.g. "(DISPLAY=:1 xterm &)", make sure to include the parantheses
- GUI apps will appear but may take time to load - confirm with an extra screenshot
* Start Chromium (default browser) via the bash tool "(DISPLAY=:1 chromium &)", but interact with it visually via the computer tool
* In Chromium, click the address bar directly to enter URLs/searches
* If you need to read a full PDF after initial screenshot
- Download with curl
- Convert to text using pdftotext
- Read the text file with StrReplaceEditTool
* If you need to read a HTML file:
- Open with the address bar in Chromium
* For commands with large text output:
- Redirect to a temp file
- Use str_replace_editor or grep with context (-B and -A flags) to view output
* When viewing pages:
- Zoom out to see full content, or
- Scroll to ensure you see everything
* Computer function calls take time, string together calls when possible
* You are allowed to take actions on behalf of the user on sites that are authenticated
* If the user asks you to access a site, assume that the user has already authenticated
* For accessibility, you can use and create accounts for communication tools such as email, Discord, Telegram on behalf of the user - e.g. join Discord channels, send emails, etc.
* To login additional sites, ask the user to use Auth Contexts or the Interactive Desktop
* Today's date is {_today.strftime('%A, %B')} {_today.day}, {_today.year}
</SYSTEM_CAPABILITY>

<IMPORTANT>
* If first screenshot shows black screen:
- Click mouse in screen center
- Take another screenshot
* When interacting with a field, always clear the field first using "ctrl+A" and "delete"
- Take an extra screenshot after clicking "enter" to confirm the field is properly submitted and move the mouse to the next field
* If given a complex task, break down into smaller steps and ask the user for details only if necessary
* Research facts with Google searches in Chromium
* Read through web pages thoroughly by scrolling down till the end
* Use more generalized websites during research, e.g. use Google Flights instead of United when searching for flights, only use United when finalizing bookings
* Wait for actions to complete (examine previous screenshots) before taking another action
* Be concise!
</IMPORTANT>"""


def test_client() -> None:
Expand Down