From cccf67cfa4d3a052037f5c5f169bd7c0cd62b1ad Mon Sep 17 00:00:00 2001 From: Daofeng Wu Date: Wed, 16 Oct 2024 21:56:32 +0900 Subject: [PATCH] feat(page_analyzer): infer selectors --- npiai/core/browser/_playwright.py | 7 +- .../web/page_analyzer/__test__/full_tests.py | 12 +- npiai/tools/web/page_analyzer/app.py | 169 +++++++++++++++++- 3 files changed, 182 insertions(+), 6 deletions(-) diff --git a/npiai/core/browser/_playwright.py b/npiai/core/browser/_playwright.py index e3aa507f..c8e0700e 100644 --- a/npiai/core/browser/_playwright.py +++ b/npiai/core/browser/_playwright.py @@ -1,9 +1,8 @@ +import os import pathlib import tempfile -import os -import uuid from urllib.request import urlretrieve -import traceback + from playwright.async_api import ( async_playwright, Playwright, @@ -13,7 +12,7 @@ FileChooser, ) -__BROWSER_UTILS_VERSION__ = "0.0.3" +__BROWSER_UTILS_VERSION__ = "0.0.4" def _prepare_browser_utils(): diff --git a/npiai/tools/web/page_analyzer/__test__/full_tests.py b/npiai/tools/web/page_analyzer/__test__/full_tests.py index fa3ecd87..6c040066 100644 --- a/npiai/tools/web/page_analyzer/__test__/full_tests.py +++ b/npiai/tools/web/page_analyzer/__test__/full_tests.py @@ -1,4 +1,7 @@ import asyncio +import json +from textwrap import indent + from npiai.tools.web.page_analyzer import PageAnalyzer # from npiai.utils.test_utils import DebugContext @@ -9,7 +12,7 @@ "https://www.bardeen.ai/playbooks/get-data-from-the-currently-opened-imdb-com-title-page", "https://ifttt.com/explore", "https://retool.com/templates", - "https://www.google.com/search?q=test", + "https://www.google.com/search?q=test&hl=ja", "https://www.amazon.com/s?k=test", "https://github.com/facebook/react/issues", ] @@ -41,6 +44,13 @@ async def main(): print(" - Inferred scraping type:", scraping_type) + if scraping_type == "list-like": + anchors = await analyzer.get_similar_items(ctx, url) + + print( + " - Possible anchors:", indent(json.dumps(anchors, indent=2), " ") + ) + print() diff 
--git a/npiai/tools/web/page_analyzer/app.py b/npiai/tools/web/page_analyzer/app.py index c5e5876d..14fd8179 100644 --- a/npiai/tools/web/page_analyzer/app.py +++ b/npiai/tools/web/page_analyzer/app.py @@ -1,6 +1,8 @@ import json from textwrap import dedent -from typing import Literal +from typing import Literal, List +from typing_extensions import TypedDict + from litellm.types.completion import ( ChatCompletionSystemMessageParam, @@ -14,6 +16,12 @@ _ScrapingType = Literal["list-like", "single"] +class CommonSelectors(TypedDict): + items: str + ancestor: str + anchors: str + + class PageAnalyzer(BrowserTool): name = "page_analyzer" description = "Analyze a web page for scraping purposes" @@ -157,6 +165,65 @@ async def get_selector_of_marker(self, marker_id: int = -1) -> str | None: marker_id, ) + async def compute_common_selectors( + self, anchor_ids: List[int] + ) -> CommonSelectors | None: + """ + Expand the anchors with the given IDs and compute the common items and ancestor selector. + + Args: + anchor_ids: An array of IDs of the elements that are similar to each other and represent a meaningful list of items. 
+ """ + # print("anchor_ids:", anchor_ids) + + if not anchor_ids: + return None + + # extract the first 3 elements and expand their anchors + # to find common items and ancestor selector + return await self.playwright.page.evaluate( + """(anchorIds) => { + try { + const anchorElements = anchorIds.map(id => npi.getElement(id)); + + const expandedAnchors = new Set(anchorElements.flatMap(el => { + return npi.selectorUtils.expandAnchorFrom(el) || []; + })); + + let selectors; + + if (expandedAnchors.size >= 2) { + selectors = npi.selectorUtils.getCommonItemsAndAncestor(...expandedAnchors); + } else { + selectors = npi.selectorUtils.getCommonItemsAndAncestor(...anchorElements); + } + + if (!selectors) { + return null; + } + + const matches = [...document.querySelectorAll(selectors.items)]; + + if (matches.length < 3 || matches.length > 1000) { + return null; + } + + const anchorsSelector = matches + .slice(0, 3) + .map(el => npi.getUniqueSelector(el)) + .join(", "); + + return { + ...selectors, + anchors: anchorsSelector, + } + } catch { + return null; + } + }""", + anchor_ids[:3], + ) + @function async def support_infinite_scroll(self, url: str) -> bool: """ @@ -375,3 +442,103 @@ async def infer_scraping_type(self, ctx: Context, url: str) -> _ScrapingType: ) return await self.set_scraping_type(**res.model_dump()) + + @function + async def get_similar_items(self, ctx: Context, url: str) -> CommonSelectors | None: + """ + Open the given URL and determine whether there are similar elements representing a meaningful list of items. If there are, return the common selector of the similar elements, the ancestor selector, and the selectors of the anchor elements. Otherwise, return None. 
+ + Args: + ctx: NPi Context + url: URL of the page + """ + await self._load_page(url) + + # use latest page url in case of redirections + page_url = await self.get_page_url() + page_title = await self.get_page_title() + raw_screenshot = await self.get_screenshot(full_page=True) + elements, _ = await self.get_interactive_elements( + screenshot=raw_screenshot, + full_page=True, + ) + annotated_screenshot = await self.get_screenshot(full_page=True) + + filtered_elements = [] + + for elem in elements: + if elem["role"] != "button" and ( + len(elem["accessibleName"]) > 10 + or len(elem["accessibleDescription"]) > 10 + ): + filtered_elements.append(elem) + + # print("filtered_elements:", filtered_elements) + + res = await llm_tool_call( + llm=ctx.llm, + tool=self.compute_common_selectors, + messages=[ + ChatCompletionSystemMessageParam( + role="system", + content=dedent( + """ + Analyze the given page and determine whether there are similar elements representing **the most meaningful** list of items. If there are, use the tool to calculate the common selector of the similar elements. + + ## Provided Context + + - An annotated screenshot of the target page where the interactive elements are surrounded with rectangular bounding boxes in different colors. At the top left of each bounding box is a small rectangle in the same color as the bounding box. This is the label and it contains a number indicating the ID of that box. The label number starts from 0. + - The URL of the page. + - The title of the page. + - An array of the interactive elements on the page. The elements are described as JSON objects defined in the Element Object section. Some irrelevant elements are filtered out. 
+ + ## Element Object + + The original HTML elements are described as the following JSON objects: + + type Element = { + id: string; // The Marker ID of the element + tag: string; // The tag of the element + role: string | null; // The WAI-ARIA accessible role of the element + accessibleName: string; // The WAI-ARIA accessible name of the element + accessibleDescription: string; // The WAI-ARIA accessible description of the element + attributes: Record<string, string>; // Some helpful attributes of the element + options?: string[]; // Available options of an element. + } + + ## Instructions + + Follow the instructions to determine whether there are similar elements on the current page that represent the most meaningful list of items: + 1. Examine the screenshots, the URL, and the title of the page to understand the context, and then think about what the current page is. + 2. Go through the elements array, pay attention to the `role`, `accessibleName`, and `accessibleDescription` properties to grab semantic information of the elements. + 3. Check if there are similar elements representing **the most meaningful list** of items. Typically, these elements link to the detail pages of the items. Note that these elements should not be the pagination buttons and should contain enough meaningful information, not just some short phrases. + 4. If you find meaningful similar elements, call the tool with a list of the IDs of the elements to compute the common selectors. Otherwise, call the tool with an empty list. + """ + ), + ), + ChatCompletionUserMessageParam( + role="user", + content=[ + { + "type": "text", + "text": json.dumps( + { + "url": page_url, + "title": page_title, + "elements": filtered_elements, + }, + ensure_ascii=False, + ), + }, + { + "type": "image_url", + "image_url": { + "url": annotated_screenshot, + }, + }, + ], + ), + ], + ) + + return await self.compute_common_selectors(**res.model_dump())