From f252f582c841fc4445bfd6164d1b31e9fd2ec64a Mon Sep 17 00:00:00 2001 From: Hugo Saporetti Junior Date: Fri, 22 Nov 2024 18:48:24 -0300 Subject: [PATCH] Screenshot improvements --- src/main/askai/core/component/camera.py | 6 +- .../askai/core/model/screenshot_result.py | 19 ++++- src/main/askai/core/router/agent_tools.py | 4 +- src/main/askai/core/router/tools/vision.py | 73 ++++++++++++++++--- src/main/askai/core/router/tools/webcam.py | 4 +- .../askai/resources/prompts/ss-caption.txt | 2 +- src/main/askai/resources/rag/accuracy.csv | 1 + 7 files changed, 87 insertions(+), 22 deletions(-) diff --git a/src/main/askai/core/component/camera.py b/src/main/askai/core/component/camera.py index 7b3530fb..3dea8561 100644 --- a/src/main/askai/core/component/camera.py +++ b/src/main/askai/core/component/camera.py @@ -21,7 +21,7 @@ from askai.core.component.image_store import ImageData, ImageFile, ImageMetadata, store from askai.core.model.ai_reply import AIReply from askai.core.model.image_result import ImageResult -from askai.core.router.tools.vision import image_captioner, parse_caption +from askai.core.router.tools.vision import image_captioner, parse_image_caption from askai.core.support.utilities import build_img_path from askai.exception.exceptions import CameraAccessFailure, WebCamInitializationFailure from hspylib.core.metaclass.classpath import AnyPath @@ -127,7 +127,7 @@ def capture( hash_text(basename(final_path)), final_path, store.PHOTO_CATEGORY, - parse_caption(image_captioner(final_path)) if with_caption else msg.no_caption(), + parse_image_caption(image_captioner(final_path)) if with_caption else msg.no_caption(), ) if store_image: store.store_image(photo_file) @@ -222,7 +222,7 @@ def _read_file(img_path: str) -> ImageFile: hash_text(basename(img_path)), img_path, store.IMPORTS_CATEGORY, - parse_caption(image_captioner(img_path)), + parse_image_caption(image_captioner(img_path)), ) def _do_import(*img_path: str) -> None: diff --git a/src/main/askai/core/model/screenshot_result.py b/src/main/askai/core/model/screenshot_result.py index b7b73c08..18fd3449 100644 --- a/src/main/askai/core/model/screenshot_result.py +++ b/src/main/askai/core/model/screenshot_result.py @@ -2,6 +2,7 @@ import json from typing import AnyStr +from kubernetes.watch.watch import SimpleNamespace from pydantic import BaseModel, Field @@ -10,9 +11,23 @@ class ScreenshotResult(BaseModel): using Pydantic's data validation features. """ + class DocumentModel(BaseModel): + """TODO""" + + page_number: int = Field(description="Document page number") + header: str = Field(description="Document header content") + footer: str = Field(description="Document footer content") + content_overview: str = Field(description="Document content overview") + + class WebsiteModel(BaseModel): + """TODO""" + + website_description: str = Field(description="Website description") + website_url: str = Field(description="Website URL") + open_applications: list[str] = Field(description="List of open applications") - docs_description: list[str] = Field(description="List of document descriptions") - web_pages: str = Field(description="Description of visible web pages") + open_documents: list[DocumentModel] = Field(description="List of document descriptions") + web_pages: list[WebsiteModel] = Field(description="Description of visible web pages") user_response: str = Field(description="A response to the user question") @staticmethod diff --git a/src/main/askai/core/router/agent_tools.py b/src/main/askai/core/router/agent_tools.py index 743a299d..85b8e5cd 100644 --- a/src/main/askai/core/router/agent_tools.py +++ b/src/main/askai/core/router/agent_tools.py @@ -32,7 +32,7 @@ from askai.core.router.tools.generation import generate_content, save_content from askai.core.router.tools.summarization import summarize from askai.core.router.tools.terminal import execute_command, list_contents, open_command -from askai.core.router.tools.vision import image_captioner, parse_caption, capture_screenshot +from askai.core.router.tools.vision import image_captioner, parse_image_caption, capture_screenshot from askai.core.router.tools.webcam import webcam_capturer, webcam_identifier, CAPTION_TEMPLATE from askai.exception.exceptions import TerminatingQuery @@ -128,7 +128,7 @@ def image_captioner(self, image_path: str) -> str: :param image_path: The absolute path of the image file to be analyzed. :return: A string containing the generated caption describing the image. """ - image_caption: list[str] = parse_caption(image_captioner(image_path)) + image_caption: list[str] = parse_image_caption(image_captioner(image_path)) return CAPTION_TEMPLATE.substitute( image_path=image_path, image_caption=os.linesep.join(image_caption) if image_caption else "" ) diff --git a/src/main/askai/core/router/tools/vision.py b/src/main/askai/core/router/tools/vision.py index c0c7ac17..0aa2a97d 100644 --- a/src/main/askai/core/router/tools/vision.py +++ b/src/main/askai/core/router/tools/vision.py @@ -1,4 +1,5 @@ import os +from string import Template from textwrap import indent from typing import Literal @@ -21,9 +22,20 @@ from askai.core.engine.ai_vision import AIVision from askai.core.model.ai_reply import AIReply from askai.core.model.image_result import ImageResult +from askai.core.model.screenshot_result import ScreenshotResult from askai.core.router.evaluation import resolve_x_refs from askai.core.support.shared_instances import shared +SCREENSHOT_TEMPLATE: Template = Template( + """\ + +>  Screenshot `${image_path}`: + +${image_caption} + +""" +) + class HFModel(Enumeration): """Available Hugging Face models""" @@ -107,7 +119,7 @@ def image_captioner( return image_caption -def parse_caption(image_caption: str) -> list[str]: +def parse_image_caption(image_caption: str) -> list[str]: """Parse the given image caption. :param image_caption: The caption to parse. :return: The parsed caption as a string. @@ -135,6 +147,43 @@ def parse_caption(image_caption: str) -> list[str]: return [msg.no_caption()] +def parse_screenshot_caption(screenshot_caption: str) -> list[str]: + """Parse the given screenshot caption. + :param screenshot_caption: The caption to parse. + :return: The parsed caption as a string. + """ + if screenshot_caption: + events.reply.emit(reply=AIReply.full(msg.parsing_caption())) + result: ScreenshotResult = ScreenshotResult.of(screenshot_caption) + ln: str = os.linesep + apps_desc: list[str] = [] + docs_desc: list[str] = [] + web_pages: list[str] = [] + user_response_desc: list[str] = [] + if result.open_applications: + apps_desc = [ + f"- **Applications:**", + indent(f"- {'- '.join([f'`{app}{ln}`' + ln for app in result.open_applications])}", " "), + ] + if result.open_documents: + docs_desc = [ + f"- **Documents:**", + indent(f"- {'- '.join([f'`{app}{ln}`' + ln for app in result.open_documents])}", " "), + ] + if result.web_pages: + web_pages = [ + f"- **WebPages:**", + indent(f"- {'- '.join([f'`{app}{ln}`' + ln for app in result.web_pages])}", " "), + ] + if result.user_response: + user_response_desc = [f"- **Answer**: `{result.user_response}`"] + # fmt: off + return apps_desc + docs_desc + web_pages + user_response_desc + # fmt: on + + return [msg.no_caption()] + + def capture_screenshot( path_name: AnyPath | None = None, save_dir: AnyPath | None = None, query: str | None = None ) -> str: @@ -148,23 +197,23 @@ def capture_screenshot( file_path: str = ensure_endswith(path_name or f"ASKAI-SCREENSHOT-{now('%Y%m%d%H%M')}", ".jpeg") posix_path: PathObject = PathObject.of(file_path) check_argument(os.path.exists(posix_path.abs_dir)) - desktop_caption: str = "No screenshot captured" i = 3 + events.reply.emit(reply=AIReply.mute(msg.t(f"Screenshot in: {i}"))) while (i := (i - 1)) >= 0: player.play_sfx("click") pause.seconds(1) - events.reply.emit(reply=AIReply.mute(str(i)), erase_last=True) + events.reply.emit(reply=AIReply.mute(msg.t(f"Screenshot in: {i}")), erase_last=True) player.play_sfx("camera-shutter") events.reply.emit(reply=AIReply.mute(msg.click()), erase_last=True) - if screenshot := pyautogui.screenshot(): - _, ext = os.path.splitext(posix_path.filename) - if ext.casefold().endswith((".jpg", ".jpeg")): - screenshot = screenshot.convert("RGB") - final_path: str = os.path.join(save_dir or posix_path.abs_dir or SCREENSHOTS_DIR, posix_path.filename) - screenshot.save(final_path) - events.reply.emit(reply=AIReply.full(msg.screenshot_saved(final_path))) - desktop_caption = image_captioner(final_path, save_dir, query, "screenshot") + screenshot = pyautogui.screenshot() + _, ext = os.path.splitext(posix_path.filename) + if ext.casefold().endswith((".jpg", ".jpeg")): + screenshot = screenshot.convert("RGB") + final_path: str = os.path.join(save_dir or SCREENSHOTS_DIR, posix_path.filename) + screenshot.save(final_path) + events.reply.emit(reply=AIReply.full(msg.screenshot_saved(final_path))) + desktop_caption = parse_screenshot_caption(image_captioner(final_path, save_dir, query, "screenshot")) - return desktop_caption + return SCREENSHOT_TEMPLATE.substitute(image_path=final_path, image_caption=os.linesep.join(desktop_caption)) diff --git a/src/main/askai/core/router/tools/webcam.py b/src/main/askai/core/router/tools/webcam.py index c63e8a3e..a9a05cfd 100644 --- a/src/main/askai/core/router/tools/webcam.py +++ b/src/main/askai/core/router/tools/webcam.py @@ -8,7 +8,7 @@ from askai.core.askai_messages import msg from askai.core.component.camera import camera from askai.core.model.ai_reply import AIReply -from askai.core.router.tools.vision import image_captioner, parse_caption +from askai.core.router.tools.vision import image_captioner, parse_image_caption PHOTO_TEMPLATE: Template = Template( """\ @@ -72,7 +72,7 @@ def webcam_capturer(photo_name: str | None, detect_faces: bool = False, query: s else [] ) - image_description: list[str] = parse_caption(image_captioner(pic_file.img_path, query=query)) + image_description: list[str] = parse_image_caption(image_captioner(pic_file.img_path, query=query)) return PHOTO_TEMPLATE.substitute( pic_file=pic_file.img_path, diff --git a/src/main/askai/resources/prompts/ss-caption.txt b/src/main/askai/resources/prompts/ss-caption.txt index 96671f47..5f90c764 100644 --- a/src/main/askai/resources/prompts/ss-caption.txt +++ b/src/main/askai/resources/prompts/ss-caption.txt @@ -18,7 +18,7 @@ Given the provided screenshot, please perform the following tasks: - **List all open websites** visible in the screenshot. - For each website, include: - **Website Description**: Offer a detailed description of the website's purpose and content. - - **Identified URLs**: Mention any URLs or web addresses visible. + - **Website URL**: Mention the URLs or web addresses visible. 4. **Respond to Human Questions (If Provided):** - If a **Human Question** is provided at the end of the screenshot, **provide a clear and concise response** to it. diff --git a/src/main/askai/resources/rag/accuracy.csv b/src/main/askai/resources/rag/accuracy.csv index 511726a9..17ef5521 100644 --- a/src/main/askai/resources/rag/accuracy.csv +++ b/src/main/askai/resources/rag/accuracy.csv @@ -1,4 +1,5 @@ Human Input;" AI Response";" Classification";" Reasoning" +Use the screenshot tool to capture the current screen.;{'open_applications': ['PyCharm'], 'open_documents': [], 'content_overview': "This document contains Python code for evaluating the accuracy of an AI's response to a user's question within a defined evaluation framework."}], 'web_pages': '', 'user_response': 'None'};Green;The AI successfully described the screenshot. Do you know what is Gus Fraba?;I don't know;Black;The AI responded negatively, stating that gabiroba song files do not exist and, as a result, cannot proceed with further interactions. Is there any gabiroba song?;No gabiroba song files found.;Black;The AI provided a negative answer, so, interactions should stop. Did I download and gabiroba file?;No files containing 'gabiroba' were found in the specified directory.;Black;The AI responded negatively, stating that files containing 'gabiroba' were not found and, as a result, cannot proceed with further interactions.