From b6a7e6554928816a40419211966ed0be9a50d8a5 Mon Sep 17 00:00:00 2001
From: fern-api <115122769+fern-api[bot]@users.noreply.github.com>
Date: Mon, 13 Jan 2025 22:42:15 +0000
Subject: [PATCH 1/6] SDK regeneration

---
 pyproject.toml                        |  2 +-
 src/scrapybara/core/client_wrapper.py |  2 +-
 src/scrapybara/types/tool.py          | 11 -----------
 3 files changed, 2 insertions(+), 13 deletions(-)
 delete mode 100644 src/scrapybara/types/tool.py

diff --git a/pyproject.toml b/pyproject.toml
index 178562f..ee5d06a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scrapybara"
-version = "2.1.0"
+version = "2.1.1"
 description = ""
 readme = "README.md"
 authors = []
diff --git a/src/scrapybara/core/client_wrapper.py b/src/scrapybara/core/client_wrapper.py
index fe993b6..a2eb2f4 100644
--- a/src/scrapybara/core/client_wrapper.py
+++ b/src/scrapybara/core/client_wrapper.py
@@ -16,7 +16,7 @@ def get_headers(self) -> typing.Dict[str, str]:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "scrapybara",
-            "X-Fern-SDK-Version": "2.1.0",
+            "X-Fern-SDK-Version": "2.1.1",
         }
         headers["x-api-key"] = self.api_key
         return headers
diff --git a/src/scrapybara/types/tool.py b/src/scrapybara/types/tool.py
deleted file mode 100644
index 0a79b94..0000000
--- a/src/scrapybara/types/tool.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from typing import Any, Dict, Optional
-from pydantic import BaseModel
-
-
-class Tool(BaseModel):
-    name: str
-    description: Optional[str] = None
-    parameters: Optional[Dict[str, Any]] = None
-
-    def __call__(self, **kwargs: Any) -> Any:
-        raise NotImplementedError("Tool.__call__ must be implemented by subclasses")

From 3d6cc1f1cb3feb892bd5f43105c42fa9495f98a7 Mon Sep 17 00:00:00 2001
From: Justin Sun <justinsunyt@gmail.com>
Date: Mon, 13 Jan 2025 14:56:49 -0800
Subject: [PATCH 2/6] add SYSTEM_PROMPT + image_result

---
 .fernignore                        |  2 ++
 src/scrapybara/prompts/__init__.py | 43 +++++++++++++++++++++++++++++
 src/scrapybara/tools/__init__.py   | 18 +++++++++++-
 src/scrapybara/types/tool.py       | 11 ++++++++
 tests/custom/test_client.py        | 44 +-----------------------------
 5 files changed, 74 insertions(+), 44 deletions(-)
 create mode 100644 src/scrapybara/prompts/__init__.py
 create mode 100644 src/scrapybara/types/tool.py

diff --git a/.fernignore b/.fernignore
index 17fad2b..ced6301 100644
--- a/.fernignore
+++ b/.fernignore
@@ -2,7 +2,9 @@
 
 src/scrapybara/client.py
 src/scrapybara/anthropic/
+src/scrapybara/prompts/
 src/scrapybara/tools/
 src/scrapybara/types/act.py
+src/scrapybara/types/tool.py
 tests/custom/test_client.py
 .github/workflows/ci.yml
\ No newline at end of file
diff --git a/src/scrapybara/prompts/__init__.py b/src/scrapybara/prompts/__init__.py
new file mode 100644
index 0000000..3d9668b
--- /dev/null
+++ b/src/scrapybara/prompts/__init__.py
@@ -0,0 +1,43 @@
+from datetime import datetime
+
+SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
+* You have access to an Ubuntu virtual machine with internet connectivity
+* You can install Ubuntu applications using the bash tool (use curl over wget)
+* To run GUI applications with the bash tool:
+  - Use a subshell, e.g. "(DISPLAY=:1 xterm &)", make sure to include the parantheses
+  - GUI apps will appear but may take time to load - confirm with an extra screenshot
+* Start Chromium (default browser) via the bash tool "(DISPLAY=:1 chromium &)", but interact with it visually via the computer tool
+* In Chromium, click the address bar directly to enter URLs/searches
+* If you need to read a full PDF after initial screenshot
+  - Download with curl
+  - Convert to text using pdftotext
+  - Read the text file with StrReplaceEditTool
+* If you need to read a HTML file:
+  - Open with the address bar in Chromium
+* For commands with large text output:
+  - Redirect to a temp file
+  - Use str_replace_editor or grep with context (-B and -A flags) to view output
+* When viewing pages:
+  - Zoom out to see full content, or
+  - Scroll to ensure you see everything
+* Computer function calls take time, string together calls when possible
+* You are allowed to take actions on behalf of the user on sites that are authenticated
+* If the user asks you to access a site, assume that the user has already authenticated
+* For accessibility, you can use and create accounts for communication tools such as email, Discord, Telegram on behalf of the user - e.g. join Discord channels, send emails, etc.
+* To login additional sites, ask the user to use Auth Contexts or the Interactive Desktop
+* Today's date is {datetime.today().strftime('%A, %B %d, %Y').replace(' 0', ' ')}
+</SYSTEM_CAPABILITY>
+
+<IMPORTANT>
+* If first screenshot shows black screen:
+  - Click mouse in screen center
+  - Take another screenshot
+* When interacting with a field, always clear the field first using "ctrl+A" and "delete"
+  - Take an extra screenshot after clicking "enter" to confirm the field is properly submitted and move the mouse to the next field
+* If given a complex task, break down into smaller steps and ask the user for details only if necessary
+* Research facts with Google searches in Chromium
+* Read through web pages thoroughly by scrolling down till the end
+* Use more generalized websites during research, e.g. use Google Flights instead of United when searching for flights, only use United when finalizing bookings
+* Wait for actions to complete (examine previous screenshots) before taking another action
+* Be concise!
+</IMPORTANT>"""
diff --git a/src/scrapybara/tools/__init__.py b/src/scrapybara/tools/__init__.py
index 31e6110..6ceb10b 100644
--- a/src/scrapybara/tools/__init__.py
+++ b/src/scrapybara/tools/__init__.py
@@ -1,3 +1,5 @@
+import base64
+import json
 from typing import Any
 from playwright.sync_api import sync_playwright
 
@@ -179,7 +181,9 @@ def __call__(self, **kwargs: Any) -> Any:
                     return True
 
                 elif command == "screenshot":
-                    return page.screenshot(type="png")
+                    return image_result(
+                        base64.b64encode(page.screenshot(type="png")).decode("utf-8")
+                    )
 
                 elif command == "get_text":
                     element = page.wait_for_selector(selector, timeout=timeout)
@@ -201,3 +205,15 @@ def __call__(self, **kwargs: Any) -> Any:
 
             finally:
                 browser.close()
+
+
+def image_result(base64: str) -> str:
+    """Return an image result that is interpretable by the model."""
+    return json.dumps(
+        {
+            "output": "",
+            "error": "",
+            "base64_image": base64,
+            "system": None,
+        }
+    )
diff --git a/src/scrapybara/types/tool.py b/src/scrapybara/types/tool.py
new file mode 100644
index 0000000..0a79b94
--- /dev/null
+++ b/src/scrapybara/types/tool.py
@@ -0,0 +1,11 @@
+from typing import Any, Dict, Optional
+from pydantic import BaseModel
+
+
+class Tool(BaseModel):
+    name: str
+    description: Optional[str] = None
+    parameters: Optional[Dict[str, Any]] = None
+
+    def __call__(self, **kwargs: Any) -> Any:
+        raise NotImplementedError("Tool.__call__ must be implemented by subclasses")
diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py
index 3e3208d..70f026a 100644
--- a/tests/custom/test_client.py
+++ b/tests/custom/test_client.py
@@ -2,50 +2,8 @@
 import os
 
 from scrapybara.anthropic import Anthropic
+from scrapybara.prompts import SYSTEM_PROMPT
 from scrapybara.tools import BashTool, BrowserTool, ComputerTool, EditTool
-from datetime import datetime
-
-SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
-* You have access to an Ubuntu virtual machine with internet connectivity
-* You can install Ubuntu applications using the bash tool (use curl over wget)
-* To run GUI applications with the bash tool:
-  - Use a subshell, e.g. "(DISPLAY=:1 xterm &)", make sure to include the parantheses
-  - GUI apps will appear but may take time to load - confirm with an extra screenshot
-* Start Chromium (default browser) via the bash tool "(DISPLAY=:1 chromium &)", but interact with it visually via the computer tool
-* In Chromium, click the address bar directly to enter URLs/searches
-* If you need to read a full PDF after initial screenshot
-  - Download with curl
-  - Convert to text using pdftotext
-  - Read the text file with StrReplaceEditTool
-* If you need to read a HTML file:
-  - Open with the address bar in Chromium
-* For commands with large text output:
-  - Redirect to a temp file
-  - Use str_replace_editor or grep with context (-B and -A flags) to view output
-* When viewing pages:
-  - Zoom out to see full content, or
-  - Scroll to ensure you see everything
-* Computer function calls take time, string together calls when possible
-* You are allowed to take actions on behalf of the user on sites that are authenticated
-* If the user asks you to access a site, assume that the user has already authenticated
-* For accessibility, you can use and create accounts for communication tools such as email, Discord, Telegram on behalf of the user - e.g. join Discord channels, send emails, etc.
-* To login additional sites, ask the user to use Auth Contexts or the Interactive Desktop
-* Today's date is {datetime.today().strftime('%A, %B %-d, %Y')}
-</SYSTEM_CAPABILITY>
-
-<IMPORTANT>
-* If first screenshot shows black screen:
-  - Click mouse in screen center
-  - Take another screenshot
-* When interacting with a field, always clear the field first using "ctrl+A" and "delete"
-  - Take an extra screenshot after clicking "enter" to confirm the field is properly submitted and move the mouse to the next field
-* If given a complex task, break down into smaller steps and ask the user for details only if necessary
-* Research facts with Google searches in Chromium
-* Read through web pages thoroughly by scrolling down till the end
-* Use more generalized websites during research, e.g. use Google Flights instead of United when searching for flights, only use United when finalizing bookings
-* Wait for actions to complete (examine previous screenshots) before taking another action
-* Be concise!
-</IMPORTANT>"""
 
 
 def test_client() -> None:

From c6a9deb143ae8b8382613c154e68789c0b319b2e Mon Sep 17 00:00:00 2001
From: Justin Sun <justinsunyt@gmail.com>
Date: Thu, 16 Jan 2025 15:21:14 -0800
Subject: [PATCH 3/6] structured outputs + ActResponse

---
 src/scrapybara/client.py         | 159 ++++++++++++++++++++++++++-----
 src/scrapybara/tools/__init__.py |  24 ++---
 src/scrapybara/types/act.py      |  17 +++-
 tests/custom/test_client.py      |  17 +++-
 4 files changed, 173 insertions(+), 44 deletions(-)

diff --git a/src/scrapybara/client.py b/src/scrapybara/client.py
index 7db66c6..8f5f7e6 100644
--- a/src/scrapybara/client.py
+++ b/src/scrapybara/client.py
@@ -5,6 +5,8 @@
     Dict,
     List,
     Sequence,
+    Type,
+    TypeVar,
     Union,
     Literal,
     Generator,
@@ -16,6 +18,7 @@
 import asyncio
 
 import httpx
+from pydantic import BaseModel, ConfigDict
 
 from scrapybara.core.http_client import AsyncHttpClient, HttpClient
 from scrapybara.environment import ScrapybaraEnvironment
@@ -43,8 +46,8 @@
     StopInstanceResponse,
 )
 from .types.act import (
-    ActRequest,
-    ActResponse,
+    SingleActRequest,
+    SingleActResponse,
     Message,
     Model,
     TextPart,
@@ -55,11 +58,38 @@
     UserMessage,
     AssistantMessage,
     Step,
+    ActResponse,
+    TokenUsage,
 )
 from .base_client import BaseClient, AsyncBaseClient
 from .instance.types import Action, Command
 
 OMIT = typing.cast(typing.Any, ...)
+SchemaT = TypeVar("SchemaT", bound=BaseModel)
+
+
+class StructuredOutputTool(Tool):
+    """A tool that allows the agent to output structured data."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    _model: Type[BaseModel]
+
+    def __init__(self, model: Type[BaseModel]):
+        schema = model.model_json_schema()
+        super().__init__(
+            name="structured_output",
+            description="Output structured data according to the provided schema parameters. Only use this tool at the end of your task. The output data is final and will be passed directly back to the user.",
+            parameters={
+                "type": "object",
+                "properties": schema.get("properties", {}),
+                "required": schema.get("required", []),
+            },
+        )
+        self._model = model
+
+    def __call__(self, **kwargs: Any) -> Dict[str, Any]:
+        validated = self._model.model_validate(kwargs)
+        return validated.model_dump()
 
 
 class Browser:
@@ -853,46 +883,56 @@ def act(
         self,
         *,
         model: Model,
+        tools: Optional[List[Tool]] = None,
         system: Optional[str] = None,
         prompt: Optional[str] = None,
         messages: Optional[List[Message]] = None,
-        tools: Optional[List[Tool]] = None,
+        schema: Optional[Type[SchemaT]] = None,
         on_step: Optional[Callable[[Step], None]] = None,
         temperature: Optional[float] = None,
         max_tokens: Optional[int] = None,
         request_options: Optional[RequestOptions] = None,
-    ) -> List[Message]:
+    ) -> ActResponse[SchemaT]:
         """
         Run an agent loop with the given tools and model, returning all messages at the end.
 
         Args:
-            tools: List of tools available to the agent
             model: The model to use for generating responses
+            tools: List of tools available to the agent
             system: System prompt for the agent
             prompt: Initial user prompt
             messages: List of messages to start with
+            schema: Optional Pydantic model class to structure the final output
             on_step: Callback for each step of the conversation
             temperature: Optional temperature parameter for the model
             max_tokens: Optional max tokens parameter for the model
             request_options: Optional request configuration
 
         Returns:
-            List of all messages from the conversation
+            ActResponse containing all messages, steps, final output (structured according to schema if provided), and token usage
         """
         result_messages: List[Message] = []
+        steps: List[Step] = []
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        total_tokens = 0
+
         if messages:
             result_messages.extend(messages)
+
         for step in self.act_stream(
-            tools=tools,
             model=model,
+            tools=tools,
             system=system,
             prompt=prompt,
             messages=messages,
+            schema=schema,
+            on_step=on_step,
             temperature=temperature,
             max_tokens=max_tokens,
-            on_step=on_step,
             request_options=request_options,
         ):
+            steps.append(step)
             assistant_msg = AssistantMessage(
                 content=[TextPart(text=step.text)] + (step.tool_calls or [])
             )
@@ -900,16 +940,40 @@ def act(
             if step.tool_results:
                 tool_msg = ToolMessage(content=step.tool_results)
                 result_messages.append(tool_msg)
-        return result_messages
+
+            if step.usage:
+                total_prompt_tokens += step.usage.prompt_tokens
+                total_completion_tokens += step.usage.completion_tokens
+                total_tokens += step.usage.total_tokens
+
+        text = steps[-1].text if steps else None
+        output = None  # stays None when no schema was requested
+        if schema:
+            last_results = steps[-1].tool_results if steps else None
+            raw_output = last_results[-1].result if last_results else None
+            output = schema.model_validate(raw_output)
+
+        usage = None
+        if total_tokens > 0:
+            usage = TokenUsage(
+                prompt_tokens=total_prompt_tokens,
+                completion_tokens=total_completion_tokens,
+                total_tokens=total_tokens,
+            )
+
+        return ActResponse(
+            messages=result_messages, steps=steps, text=text, output=output, usage=usage
+        )
 
     def act_stream(
         self,
         *,
         model: Model,
+        tools: Optional[List[Tool]] = None,
         system: Optional[str] = None,
         prompt: Optional[str] = None,
         messages: Optional[List[Message]] = None,
-        tools: Optional[List[Tool]] = None,
+        schema: Optional[Type[BaseModel]] = None,
         on_step: Optional[Callable[[Step], None]] = None,
         temperature: Optional[float] = None,
         max_tokens: Optional[int] = None,
@@ -919,11 +983,12 @@ def act_stream(
         Run an interactive agent loop with the given tools and model.
 
         Args:
-            tools: List of tools available to the agent
             model: The model to use for generating responses
+            tools: List of tools available to the agent
             system: System prompt for the agent
             prompt: Initial user prompt
             messages: List of messages to start with
+            schema: Optional Pydantic model class to structure the final output
             on_step: Callback for each step of the conversation
             temperature: Optional temperature parameter for the model
             max_tokens: Optional max tokens parameter for the model
@@ -942,8 +1007,11 @@ def act_stream(
 
         current_tools = [] if tools is None else list(tools)
 
+        if schema:
+            current_tools.append(StructuredOutputTool(schema))
+
         while True:
-            request = ActRequest(
+            request = SingleActRequest(
                 model=model,
                 system=system,
                 messages=current_messages,
@@ -963,7 +1031,7 @@ def act_stream(
             if not 200 <= response.status_code < 300:
                 raise ApiError(status_code=response.status_code, body=response.json())
 
-            act_response = ActResponse.model_validate(response.json())
+            act_response = SingleActResponse.model_validate(response.json())
             current_messages.append(act_response.message)
 
             # Extract text from assistant message
@@ -988,14 +1056,17 @@ def act_stream(
                 usage=act_response.usage,
             )
 
-            # Check if we should continue the loop
+            # Check if there are tool calls
             has_tool_calls = bool(tool_calls)
+            has_structured_output = False
 
             if has_tool_calls:
                 tool_results: List[ToolResultPart] = []
                 for part in tool_calls:
                     tool = next(t for t in current_tools if t.name == part.tool_name)
                     try:
+                        if tool.name == "structured_output" and schema:
+                            has_structured_output = True
                         result = tool(**part.args)
                         tool_results.append(
                             ToolResultPart(
@@ -1021,7 +1092,7 @@ def act_stream(
                 on_step(step)
             yield step
 
-            if not has_tool_calls:
+            if not has_tool_calls or has_structured_output:
                 break
 
 
@@ -1117,16 +1188,17 @@ async def get_auth_states(
     async def act(
         self,
         *,
+        tools: Optional[List[Tool]] = None,
         model: Model,
         system: Optional[str] = None,
         prompt: Optional[str] = None,
         messages: Optional[List[Message]] = None,
-        tools: Optional[List[Tool]] = None,
+        schema: Optional[Type[SchemaT]] = None,
         on_step: Optional[Callable[[Step], None]] = None,
         temperature: Optional[float] = None,
         max_tokens: Optional[int] = None,
         request_options: Optional[RequestOptions] = None,
-    ) -> List[Message]:
+    ) -> ActResponse[SchemaT]:
         """
         Run an agent loop with the given tools and model, returning all messages at the end.
 
@@ -1136,28 +1208,37 @@ async def act(
             system: System prompt for the agent
             prompt: Initial user prompt
             messages: List of messages to start with
+            schema: Optional Pydantic model class to structure the final output
             on_step: Callback for each step of the conversation
             temperature: Optional temperature parameter for the model
             max_tokens: Optional max tokens parameter for the model
             request_options: Optional request configuration
 
         Returns:
-            List of all messages from the conversation
+            ActResponse containing all messages, steps, final output (structured according to schema if provided), and token usage
         """
         result_messages: List[Message] = []
+        steps: List[Step] = []
+        total_prompt_tokens = 0
+        total_completion_tokens = 0
+        total_tokens = 0
+
         if messages:
             result_messages.extend(messages)
+
         async for step in self.act_stream(
             tools=tools,
             model=model,
             system=system,
             prompt=prompt,
             messages=messages,
+            schema=schema,
             temperature=temperature,
             max_tokens=max_tokens,
             on_step=on_step,
             request_options=request_options,
         ):
+            steps.append(step)
             assistant_msg = AssistantMessage(
                 content=[TextPart(text=step.text)] + (step.tool_calls or [])
             )
@@ -1165,16 +1246,40 @@ async def act(
             if step.tool_results:
                 tool_msg = ToolMessage(content=step.tool_results)
                 result_messages.append(tool_msg)
-        return result_messages
+
+            if step.usage:
+                total_prompt_tokens += step.usage.prompt_tokens
+                total_completion_tokens += step.usage.completion_tokens
+                total_tokens += step.usage.total_tokens
+
+        text = steps[-1].text if steps else None
+        output = None  # stays None when no schema was requested
+        if schema:
+            last_results = steps[-1].tool_results if steps else None
+            raw_output = last_results[-1].result if last_results else None
+            output = schema.model_validate(raw_output)
+
+        usage = None
+        if total_tokens > 0:
+            usage = TokenUsage(
+                prompt_tokens=total_prompt_tokens,
+                completion_tokens=total_completion_tokens,
+                total_tokens=total_tokens,
+            )
+
+        return ActResponse(
+            messages=result_messages, steps=steps, text=text, output=output, usage=usage
+        )
 
     async def act_stream(
         self,
         *,
         model: Model,
+        tools: Optional[List[Tool]] = None,
         system: Optional[str] = None,
         prompt: Optional[str] = None,
         messages: Optional[List[Message]] = None,
-        tools: Optional[List[Tool]] = None,
+        schema: Optional[Type[SchemaT]] = None,
         on_step: Optional[Callable[[Step], None]] = None,
         temperature: Optional[float] = None,
         max_tokens: Optional[int] = None,
@@ -1184,11 +1289,12 @@ async def act_stream(
         Run an interactive agent loop with the given tools and model.
 
         Args:
-            tools: List of tools available to the agent
             model: The model to use for generating responses
+            tools: List of tools available to the agent
             system: System prompt for the agent
             prompt: Initial user prompt
             messages: List of messages to start with
+            schema: Optional Pydantic model class to structure the final output
             on_step: Callback for each step of the conversation
             temperature: Optional temperature parameter for the model
             max_tokens: Optional max tokens parameter for the model
@@ -1208,7 +1314,7 @@ async def act_stream(
         current_tools = [] if tools is None else list(tools)
 
         while True:
-            request = ActRequest(
+            request = SingleActRequest(
                 model=model,
                 system=system,
                 messages=current_messages,
@@ -1228,7 +1334,7 @@ async def act_stream(
             if not 200 <= response.status_code < 300:
                 raise ApiError(status_code=response.status_code, body=response.json())
 
-            act_response = ActResponse.model_validate(response.json())
+            act_response = SingleActResponse.model_validate(response.json())
             current_messages.append(act_response.message)
 
             # Extract text from assistant message
@@ -1253,14 +1359,17 @@ async def act_stream(
                 usage=act_response.usage,
             )
 
-            # Check if we should continue the loop
+            # Check if there are tool calls
             has_tool_calls = bool(tool_calls)
+            has_structured_output = False
 
             if has_tool_calls:
                 tool_results: List[ToolResultPart] = []
                 for part in tool_calls:
                     tool = next(t for t in current_tools if t.name == part.tool_name)
                     try:
+                        if tool.name == "structured_output" and schema:
+                            has_structured_output = True
                         loop = asyncio.get_event_loop()
                         result = await loop.run_in_executor(
                             None, lambda: tool(**part.args)
@@ -1289,5 +1398,5 @@ async def act_stream(
                 on_step(step)
             yield step
 
-            if not has_tool_calls:
+            if not has_tool_calls or has_structured_output:
                 break
diff --git a/src/scrapybara/tools/__init__.py b/src/scrapybara/tools/__init__.py
index 6ceb10b..f31379b 100644
--- a/src/scrapybara/tools/__init__.py
+++ b/src/scrapybara/tools/__init__.py
@@ -7,6 +7,18 @@
 from ..client import Instance
 
 
+def image_result(base64: str) -> str:
+    """Return an image result that is interpretable by the model."""
+    return json.dumps(
+        {
+            "output": "",
+            "error": "",
+            "base64_image": base64,
+            "system": None,
+        }
+    )
+
+
 class ComputerTool(Tool):
     """A computer interaction tool that allows the agent to control mouse and keyboard."""
 
@@ -205,15 +217,3 @@ def __call__(self, **kwargs: Any) -> Any:
 
             finally:
                 browser.close()
-
-
-def image_result(base64: str) -> str:
-    """Return an image result that is interpretable by the model."""
-    return json.dumps(
-        {
-            "output": "",
-            "error": "",
-            "base64_image": base64,
-            "system": None,
-        }
-    )
diff --git a/src/scrapybara/types/act.py b/src/scrapybara/types/act.py
index c70899b..6d7c052 100644
--- a/src/scrapybara/types/act.py
+++ b/src/scrapybara/types/act.py
@@ -1,7 +1,9 @@
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union, Generic, TypeVar
 from pydantic import BaseModel
 from .tool import Tool
 
+OutputT = TypeVar("OutputT")
+
 
 # Message part types
 class TextPart(BaseModel):
@@ -55,7 +57,7 @@ class Model(BaseModel):
     api_key: Optional[str] = None
 
 
-class ActRequest(BaseModel):
+class SingleActRequest(BaseModel):
     model: Model
     system: Optional[str] = None
     messages: Optional[List[Message]] = None
@@ -70,7 +72,7 @@ class TokenUsage(BaseModel):
     total_tokens: int
 
 
-class ActResponse(BaseModel):
+class SingleActResponse(BaseModel):
     message: AssistantMessage
     finish_reason: Literal[
         "stop", "length", "content-filter", "tool-calls", "error", "other", "unknown"
@@ -95,3 +97,12 @@ class Step(BaseModel):
         ]
     ] = None
     usage: Optional[TokenUsage] = None
+
+
+# Act response
+class ActResponse(BaseModel, Generic[OutputT]):
+    messages: List[Message]
+    steps: List[Step]
+    text: Optional[str] = None
+    output: OutputT
+    usage: Optional[TokenUsage] = None
diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py
index 70f026a..4747ba0 100644
--- a/tests/custom/test_client.py
+++ b/tests/custom/test_client.py
@@ -1,3 +1,4 @@
+from pydantic import BaseModel
 from scrapybara import Scrapybara
 import os
 
@@ -20,19 +21,27 @@ def test_client() -> None:
     cdp_url = instance.browser.get_cdp_url()
     assert cdp_url is not None
 
-    messages = client.act(
+    class YCStats(BaseModel):
+        number_of_startups: int
+        combined_valuation: int
+
+    response = client.act(
         model=Anthropic(),
         system=SYSTEM_PROMPT,
-        prompt="Go to the YC website and fetch the HTML",
+        prompt="Go to the YC website and get the number of funded startups and combined valuation",
         tools=[
             ComputerTool(instance),
             BashTool(instance),
             EditTool(instance),
             BrowserTool(instance),
         ],
-        on_step=lambda step: print(f"{step}\n"),
+        schema=YCStats,
     )
-    assert len(messages) > 0
+    print(response)
+
+    assert response.output is not None
+    assert response.output.number_of_startups is not None
+    assert response.output.combined_valuation is not None
 
     instance.browser.stop()
     instance.stop()

From 9f1c19ebfae2c62f6925653babaed1a519cba976 Mon Sep 17 00:00:00 2001
From: Justin Sun <justinsunyt@gmail.com>
Date: Thu, 16 Jan 2025 15:26:39 -0800
Subject: [PATCH 4/6] bump version

---
 pyproject.toml                        |  2 +-
 src/scrapybara/core/client_wrapper.py | 20 ++++++++++++++++----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ee5d06a..9552bec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scrapybara"
-version = "2.1.1"
+version = "2.1.2"
 description = ""
 readme = "README.md"
 authors = []
diff --git a/src/scrapybara/core/client_wrapper.py b/src/scrapybara/core/client_wrapper.py
index a2eb2f4..e3442ae 100644
--- a/src/scrapybara/core/client_wrapper.py
+++ b/src/scrapybara/core/client_wrapper.py
@@ -7,7 +7,9 @@
 
 
 class BaseClientWrapper:
-    def __init__(self, *, api_key: str, base_url: str, timeout: typing.Optional[float] = None):
+    def __init__(
+        self, *, api_key: str, base_url: str, timeout: typing.Optional[float] = None
+    ):
         self.api_key = api_key
         self._base_url = base_url
         self._timeout = timeout
@@ -16,7 +18,7 @@ def get_headers(self) -> typing.Dict[str, str]:
         headers: typing.Dict[str, str] = {
             "X-Fern-Language": "Python",
             "X-Fern-SDK-Name": "scrapybara",
-            "X-Fern-SDK-Version": "2.1.1",
+            "X-Fern-SDK-Version": "2.1.2",
         }
         headers["x-api-key"] = self.api_key
         return headers
@@ -30,7 +32,12 @@ def get_timeout(self) -> typing.Optional[float]:
 
 class SyncClientWrapper(BaseClientWrapper):
     def __init__(
-        self, *, api_key: str, base_url: str, timeout: typing.Optional[float] = None, httpx_client: httpx.Client
+        self,
+        *,
+        api_key: str,
+        base_url: str,
+        timeout: typing.Optional[float] = None,
+        httpx_client: httpx.Client
     ):
         super().__init__(api_key=api_key, base_url=base_url, timeout=timeout)
         self.httpx_client = HttpClient(
@@ -43,7 +50,12 @@ def __init__(
 
 class AsyncClientWrapper(BaseClientWrapper):
     def __init__(
-        self, *, api_key: str, base_url: str, timeout: typing.Optional[float] = None, httpx_client: httpx.AsyncClient
+        self,
+        *,
+        api_key: str,
+        base_url: str,
+        timeout: typing.Optional[float] = None,
+        httpx_client: httpx.AsyncClient
     ):
         super().__init__(api_key=api_key, base_url=base_url, timeout=timeout)
         self.httpx_client = AsyncHttpClient(

From 2ad0f64eefe20da232260767d17f25bb7a849fa6 Mon Sep 17 00:00:00 2001
From: Justin Sun <justinsunyt@gmail.com>
Date: Thu, 16 Jan 2025 15:41:49 -0800
Subject: [PATCH 5/6] improve docstrings

---
 src/scrapybara/anthropic/__init__.py | 15 +++++++++++++++
 src/scrapybara/client.py             |  4 ++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/scrapybara/anthropic/__init__.py b/src/scrapybara/anthropic/__init__.py
index b99c7c2..a3cfda2 100644
--- a/src/scrapybara/anthropic/__init__.py
+++ b/src/scrapybara/anthropic/__init__.py
@@ -14,6 +14,21 @@
 
 # New: universal act API
 class Anthropic(Model):
+    """Model adapter for Anthropic.
+
+    Supported models:
+    - claude-3-5-sonnet-20241022 (with computer use beta)
+
+    If an API key is not provided, each call will cost 1 agent credit.
+
+    Args:
+        name: Anthropic model name, defaults to "claude-3-5-sonnet-20241022"
+        api_key: Optional Anthropic API key
+
+    Returns:
+        A Model configuration object
+    """
+
     provider: Literal["anthropic"] = Field(default="anthropic", frozen=True)
 
     def __init__(
diff --git a/src/scrapybara/client.py b/src/scrapybara/client.py
index 8f5f7e6..a4d8f3d 100644
--- a/src/scrapybara/client.py
+++ b/src/scrapybara/client.py
@@ -909,7 +909,7 @@ def act(
             request_options: Optional request configuration
 
         Returns:
-            ActResponse containing all messages, steps, final output (structured according to schema if provided), and token usage
+            ActResponse containing all messages, steps, text, output (if schema is provided), and token usage
         """
         result_messages: List[Message] = []
         steps: List[Step] = []
@@ -1215,7 +1215,7 @@ async def act(
             request_options: Optional request configuration
 
         Returns:
-            ActResponse containing all messages, steps, final output (structured according to schema if provided), and token usage
+            ActResponse containing all messages, steps, text, output (if schema is provided), and token usage
         """
         result_messages: List[Message] = []
         steps: List[Step] = []

From c7171444e5bfa9fe6a1e71a9a63b600db8c99c37 Mon Sep 17 00:00:00 2001
From: justin sun <33591641+justinsunyt@users.noreply.github.com>
Date: Thu, 16 Jan 2025 19:09:53 -0500
Subject: [PATCH 6/6] remove duplicate image_result

---
 src/scrapybara/tools/__init__.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/scrapybara/tools/__init__.py b/src/scrapybara/tools/__init__.py
index 890c152..f31379b 100644
--- a/src/scrapybara/tools/__init__.py
+++ b/src/scrapybara/tools/__init__.py
@@ -217,15 +217,3 @@ def __call__(self, **kwargs: Any) -> Any:
 
             finally:
                 browser.close()
-
-
-def image_result(base64: str) -> str:
-    """Return an image result that is interpretable by the model."""
-    return json.dumps(
-        {
-            "output": "",
-            "error": "",
-            "base64_image": base64,
-            "system": None,
-        }
-    )