From d965b1a2d71cb55a7f90f0d40c2da41d77572208 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 20 Jan 2025 16:13:06 +0000 Subject: [PATCH 1/3] Fix issue #6361: [Feature]: Document the App Browser Feature in the OpenHands Documentation Page --- docs/modules/usage/how-to/gui-mode.md | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/modules/usage/how-to/gui-mode.md b/docs/modules/usage/how-to/gui-mode.md index 483f8869e9eb..80fc12b8526e 100644 --- a/docs/modules/usage/how-to/gui-mode.md +++ b/docs/modules/usage/how-to/gui-mode.md @@ -109,6 +109,34 @@ The main interface consists of several key components: - **Settings Button**: A gear icon that opens the settings modal, allowing you to adjust your configuration at any time. - **Workspace Panel**: Displays the files and folders in your workspace, allowing you to navigate and view files, or the agent's past commands or web browsing history. +### App Browser Feature + +The App Browser is a powerful feature that allows the AI assistant to interact with frontend applications: + +- **Purpose**: Enables the AI to view, navigate, and interact with web-based user interfaces, making it capable of testing and debugging frontend applications. +- **Capabilities**: + - **Navigation**: The AI can browse web pages and navigate through different sections of the application. + - **Interaction**: Supports clicking buttons, filling forms, and other common web interactions. + - **Visual Feedback**: The AI can see and interpret the application's interface, helping with UI-related tasks. + - **Testing**: Facilitates automated testing of frontend applications by allowing the AI to simulate user interactions. + +#### Using the App Browser + +1. **Accessing the Browser**: + - The browser view appears in the workspace panel when the AI is interacting with web applications. + - You can see the current page and the AI's interactions in real-time. + +2. 
**Common Use Cases**: + - Testing frontend applications + - Debugging UI issues + - Automating web-based workflows + - Validating user interface changes + +3. **Browser Controls**: + - The AI automatically handles navigation and interaction + - You can observe the AI's actions in the browser view + - The chat interface allows you to guide the AI's interactions with the application + ### Interacting with the AI 1. Type your question, request, or task description in the input box. From 87fc17d7def937c27410e570ad7a835469b051b6 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 20 Jan 2025 16:41:29 +0000 Subject: [PATCH 2/3] Fix pr #6362: Fix issue #6361: [Feature]: Document the App Browser Feature in the OpenHands Documentation Page --- docs/modules/usage/how-to/app-browser.md | 28 ++++++++++++++++++++++++ docs/sidebars.ts | 5 +++++ 2 files changed, 33 insertions(+) create mode 100644 docs/modules/usage/how-to/app-browser.md diff --git a/docs/modules/usage/how-to/app-browser.md b/docs/modules/usage/how-to/app-browser.md new file mode 100644 index 000000000000..6c4f0c881374 --- /dev/null +++ b/docs/modules/usage/how-to/app-browser.md @@ -0,0 +1,28 @@ +# App Browser + +The App Browser is a feature in OpenHands that allows you to monitor and verify the AI agent's web interactions in real-time. When the agent performs actions in a web browser (like navigating to URLs, clicking buttons, or filling forms), the App Browser displays screenshots of what the agent sees, helping you ensure that the agent is interacting with web pages correctly. + +## Features + +- **URL Display**: Shows the current URL the agent is visiting +- **Live Screenshots**: Displays real-time screenshots of the web pages the agent is interacting with +- **Visual Verification**: Helps you verify that the agent's web interactions are working as intended + +## How It Works + +1. When the agent performs web interactions using the `browser` tool, it captures screenshots of the web pages +2. 
These screenshots are displayed in the App Browser panel in real time +3. You can see exactly what the agent sees, making it easier to debug or verify web interactions + +## Use Cases + +The App Browser is particularly useful when: + +- Debugging web automation tasks +- Verifying that the agent is interacting with the correct elements on a page +- Ensuring web scraping or form filling tasks are working correctly +- Monitoring the agent's progress during web-based tasks + +## Location + +You can find the App Browser panel in the OpenHands UI. It displays "No page loaded" when the agent is not currently performing any web interactions. diff --git a/docs/sidebars.ts b/docs/sidebars.ts index b7def32dd68f..8575fd79650b 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -69,6 +69,11 @@ const sidebars: SidebarsConfig = { label: 'Github Actions', id: 'usage/how-to/github-action', }, + { + type: 'doc', + label: 'App Browser', + id: 'usage/how-to/app-browser', + }, ], }, { From 5423ab98172c89d8ca691abd3d104272e1387a54 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 5 Feb 2025 03:24:00 +0000 Subject: [PATCH 3/3] Fix app browser documentation to correctly describe its purpose as a monitoring tool --- docs/modules/usage/how-to/gui-mode.md | 29 +++++++++++++-------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/docs/modules/usage/how-to/gui-mode.md b/docs/modules/usage/how-to/gui-mode.md index 80fc12b8526e..c276f728da66 100644 --- a/docs/modules/usage/how-to/gui-mode.md +++ b/docs/modules/usage/how-to/gui-mode.md @@ -111,31 +111,30 @@ The main interface consists of several key components: ### App Browser Feature -The App Browser is a powerful feature that allows the AI assistant to interact with frontend applications: +The App Browser is a feature that allows you to monitor and verify the AI agent's web interactions: -- **Purpose**: Enables the AI to view, navigate, and interact with web-based user interfaces, making it capable of testing and
debugging frontend applications. +- **Purpose**: Enables human users to see and verify that the AI agent is correctly implementing web-based tasks and interactions. - **Capabilities**: - - **Navigation**: The AI can browse web pages and navigate through different sections of the application. - - **Interaction**: Supports clicking buttons, filling forms, and other common web interactions. - - **Visual Feedback**: The AI can see and interpret the application's interface, helping with UI-related tasks. - - **Testing**: Facilitates automated testing of frontend applications by allowing the AI to simulate user interactions. + - **Real-time Monitoring**: Watch the agent's web interactions as they happen + - **Visual Verification**: See exactly what the agent sees when interacting with web pages + - **Quality Assurance**: Verify that the agent is performing the correct actions on web pages #### Using the App Browser 1. **Accessing the Browser**: - - The browser view appears in the workspace panel when the AI is interacting with web applications. - - You can see the current page and the AI's interactions in real-time. + - The browser view appears in the workspace panel when the agent is performing web interactions + - You can see the current page and the agent's actions in real time 2. **Common Use Cases**: - - Testing frontend applications - - Debugging UI issues - - Automating web-based workflows - - Validating user interface changes + - Verifying that the agent is interacting with the correct web elements + - Monitoring web automation tasks for accuracy + - Ensuring web-based tasks are being executed as intended + - Debugging issues when web interactions aren't working as expected 3.
**Browser Controls**: - - The AI automatically handles navigation and interaction - - You can observe the AI's actions in the browser view - - The chat interface allows you to guide the AI's interactions with the application + - The browser panel shows live screenshots of web pages the agent is interacting with + - You can see the current URL and page state + - The chat interface allows you to guide or correct the agent if needed ### Interacting with the AI