Skip to content

Commit

Permalink
feat(page_analyzer): infer selectors
Browse files Browse the repository at this point in the history
  • Loading branch information
idiotWu committed Oct 16, 2024
1 parent c97c7d3 commit cccf67c
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 6 deletions.
7 changes: 3 additions & 4 deletions npiai/core/browser/_playwright.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import os
import pathlib
import tempfile
import os
import uuid
from urllib.request import urlretrieve
import traceback

from playwright.async_api import (
async_playwright,
Playwright,
Expand All @@ -13,7 +12,7 @@
FileChooser,
)

__BROWSER_UTILS_VERSION__ = "0.0.3"
__BROWSER_UTILS_VERSION__ = "0.0.4"


def _prepare_browser_utils():
Expand Down
12 changes: 11 additions & 1 deletion npiai/tools/web/page_analyzer/__test__/full_tests.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import asyncio
import json
from textwrap import indent

from npiai.tools.web.page_analyzer import PageAnalyzer

# from npiai.utils.test_utils import DebugContext
Expand All @@ -9,7 +12,7 @@
"https://www.bardeen.ai/playbooks/get-data-from-the-currently-opened-imdb-com-title-page",
"https://ifttt.com/explore",
"https://retool.com/templates",
"https://www.google.com/search?q=test",
"https://www.google.com/search?q=test&hl=ja",
"https://www.amazon.com/s?k=test",
"https://github.com/facebook/react/issues",
]
Expand Down Expand Up @@ -41,6 +44,13 @@ async def main():

print(" - Inferred scraping type:", scraping_type)

if scraping_type == "list-like":
anchors = await analyzer.get_similar_items(ctx, url)

print(
" - Possible anchors:", indent(json.dumps(anchors, indent=2), " ")
)

print()


Expand Down
169 changes: 168 additions & 1 deletion npiai/tools/web/page_analyzer/app.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json
from textwrap import dedent
from typing import Literal
from typing import Literal, List
from typing_extensions import TypedDict


from litellm.types.completion import (
ChatCompletionSystemMessageParam,
Expand All @@ -14,6 +16,12 @@
# Labels for the scraping strategy inferred for a page. Judging by the names,
# "list-like" is a page made of repeated items and "single" is a page about
# one item — TODO confirm against infer_scraping_type's prompt.
_ScrapingType = Literal["list-like", "single"]


class CommonSelectors(TypedDict):
    """Selectors describing a list of similar items found on a page.

    This is the shape of the object returned by
    ``PageAnalyzer.compute_common_selectors``.
    """

    # CSS selector matching every item of the detected list (it is passed
    # to ``document.querySelectorAll`` to count the matches).
    items: str
    # Selector of the items' common ancestor, as reported by the in-page
    # ``getCommonItemsAndAncestor`` helper.
    ancestor: str
    # Comma-separated unique selectors of the first (up to three) matched
    # items.
    anchors: str


class PageAnalyzer(BrowserTool):
name = "page_analyzer"
description = "Analyze a web page for scraping purposes"
Expand Down Expand Up @@ -157,6 +165,65 @@ async def get_selector_of_marker(self, marker_id: int = -1) -> str | None:
marker_id,
)

async def compute_common_selectors(
    self, anchor_ids: List[int]
) -> CommonSelectors | None:
    """
    Expand the anchors with the given IDs and compute the common items and ancestor selector.

    Args:
        anchor_ids: An array of IDs of the elements that are similar to each other and represent a meaningful list of items.

    Returns:
        The common ``items`` and ``ancestor`` selectors plus the ``anchors``
        selector of the first matched items, or None when no IDs are given,
        no common selector can be computed, or the items selector matches
        fewer than 3 or more than 1000 elements.
    """
    if not anchor_ids:
        return None

    # Sample at most three *distinct* IDs, keeping first-seen order. The IDs
    # usually come from an LLM tool call, and a repeated ID would waste a
    # sample slot without adding information about the list.
    sample_ids = list(dict.fromkeys(anchor_ids))[:3]

    # Expand the sampled anchors inside the page and derive the common
    # items/ancestor selectors from them.
    return await self.playwright.page.evaluate(
        """(anchorIds) => {
            try {
                // Skip IDs that do not resolve to an element so that one stale
                // or invalid ID does not discard the remaining anchors.
                const anchorElements = anchorIds
                    .map(id => npi.getElement(id))
                    .filter(Boolean);

                if (anchorElements.length === 0) {
                    return null;
                }

                const expandedAnchors = new Set(anchorElements.flatMap(el => {
                    return npi.selectorUtils.expandAnchorFrom(el) || [];
                }));

                // Prefer the expanded anchors; fall back to the raw anchors
                // when the expansion yields fewer than two elements.
                let selectors;

                if (expandedAnchors.size >= 2) {
                    selectors = npi.selectorUtils.getCommonItemsAndAncestor(...expandedAnchors);
                } else {
                    selectors = npi.selectorUtils.getCommonItemsAndAncestor(...anchorElements);
                }

                if (!selectors) {
                    return null;
                }

                // Reject selectors that match too few or too many elements
                // to be a plausible list.
                const matches = [...document.querySelectorAll(selectors.items)];

                if (matches.length < 3 || matches.length > 1000) {
                    return null;
                }

                const anchorsSelector = matches
                    .slice(0, 3)
                    .map(el => npi.getUniqueSelector(el))
                    .join(", ");

                return {
                    ...selectors,
                    anchors: anchorsSelector,
                }
            } catch (error) {
                // Best effort: report the failure instead of hiding it.
                console.warn("compute_common_selectors failed:", error);
                return null;
            }
        }""",
        sample_ids,
    )

@function
async def support_infinite_scroll(self, url: str) -> bool:
"""
Expand Down Expand Up @@ -375,3 +442,103 @@ async def infer_scraping_type(self, ctx: Context, url: str) -> _ScrapingType:
)

return await self.set_scraping_type(**res.model_dump())

@function
async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None:
    """
    Open the given URL and determine whether there are similar elements representing a meaningful list of items. If there are, return the common selector of the similar elements, the ancestor selector, and the selectors of the anchor elements. Otherwise, return None.

    Args:
        ctx: NPi Context
        url: URL of the page
    """
    await self._load_page(url)

    # use latest page url in case of redirections
    page_url = await self.get_page_url()
    page_title = await self.get_page_title()
    raw_screenshot = await self.get_screenshot(full_page=True)
    elements, _ = await self.get_interactive_elements(
        screenshot=raw_screenshot,
        full_page=True,
    )
    # Captured after get_interactive_elements; this is the image sent to the
    # LLM (presumably it now shows the element markers — TODO confirm).
    annotated_screenshot = await self.get_screenshot(full_page=True)

    # Keep only non-button elements that carry a reasonably long accessible
    # name or description. Missing or None values count as empty text
    # instead of raising on len().
    filtered_elements = [
        elem
        for elem in elements
        if elem.get("role") != "button"
        and (
            len(elem.get("accessibleName") or "") > 10
            or len(elem.get("accessibleDescription") or "") > 10
        )
    ]

    # The LLM answers by "calling" compute_common_selectors; the arguments
    # it picked are replayed on the real method in the return statement.
    res = await llm_tool_call(
        llm=ctx.llm,
        tool=self.compute_common_selectors,
        messages=[
            ChatCompletionSystemMessageParam(
                role="system",
                content=dedent(
                    """
                    Analyze the given page and determine whether there are similar elements representing **the most meaningful** list of items. If there are, use the tool to calculate the common selector of the similar elements.
                    ## Provided Context
                    - An annotated screenshot of the target page where the interactive elements are surrounded with rectangular bounding boxes in different colors. At the top left of each bounding box is a small rectangle in the same color as the bounding box. This is the label and it contains a number indicating the ID of that box. The label number starts from 0.
                    - The URL of the page.
                    - The title of the page.
                    - An array of the interactive elements on the page. The elements are described as JSON objects defined in the Element Object section. Some irrelevant elements are filtered out.
                    ## Element Object
                    The original HTML elements are described as the following JSON objects:
                    type Element = {
                    id: string; // The Marker ID of the element
                    tag: string; // The tag of the element
                    role: string | null; // The WAI-ARIA accessible role of the element
                    accessibleName: string; // The WAI-ARIA accessible name of the element
                    accessibleDescription: string; // The WAI-ARIA accessible description of the element
                    attributes: Record<string, string>; // Some helpful attributes of the element
                    options?: string[]; // Available options of an <select> element. This property is only provided when the element is a <select> element.
                    }
                    ## Instructions
                    Follow the instructions to determine whether there are similar elements representing the most meaningful list of items on the current page:
                    1. Examine the screenshots, the URL, and the title of the page to understand the context, and then think about what the current page is.
                    2. Go through the elements array, pay attention to the `role`, `accessibleName`, and `accessibleDescription` properties to grab semantic information of the elements.
                    3. Check if there are similar elements representing **the most meaningful list** of items. Typically, these elements link to the detail pages of the items. Note that these elements should not be the pagination buttons and should contain enough meaningful information, not just some short phrases.
                    4. If you find meaningful similar elements, call the tool with a list of the IDs of the elements to compute the common selectors. Otherwise, call the tool with an empty list.
                    """
                ),
            ),
            ChatCompletionUserMessageParam(
                role="user",
                content=[
                    {
                        "type": "text",
                        "text": json.dumps(
                            {
                                "url": page_url,
                                "title": page_title,
                                "elements": filtered_elements,
                            },
                            ensure_ascii=False,
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": annotated_screenshot,
                        },
                    },
                ],
            ),
        ],
    )

    return await self.compute_common_selectors(**res.model_dump())

0 comments on commit cccf67c

Please sign in to comment.