Screenshot improvements

yorevs · Nov 22, 2024 · f252f58 · f252f58
1 parent 4c8d91e
commit f252f58
Show file tree

Hide file tree

Showing 7 changed files with 87 additions and 22 deletions.
diff --git a/src/main/askai/core/component/camera.py b/src/main/askai/core/component/camera.py
@@ -21,7 +21,7 @@
 from askai.core.component.image_store import ImageData, ImageFile, ImageMetadata, store
 from askai.core.model.ai_reply import AIReply
 from askai.core.model.image_result import ImageResult
-from askai.core.router.tools.vision import image_captioner, parse_caption
+from askai.core.router.tools.vision import image_captioner, parse_image_caption
 from askai.core.support.utilities import build_img_path
 from askai.exception.exceptions import CameraAccessFailure, WebCamInitializationFailure
 from hspylib.core.metaclass.classpath import AnyPath
@@ -127,7 +127,7 @@ def capture(
                 hash_text(basename(final_path)),
                 final_path,
                 store.PHOTO_CATEGORY,
-                parse_caption(image_captioner(final_path)) if with_caption else msg.no_caption(),
+                parse_image_caption(image_captioner(final_path)) if with_caption else msg.no_caption(),
             )
             if store_image:
                 store.store_image(photo_file)
@@ -222,7 +222,7 @@ def _read_file(img_path: str) -> ImageFile:
                 hash_text(basename(img_path)),
                 img_path,
                 store.IMPORTS_CATEGORY,
-                parse_caption(image_captioner(img_path)),
+                parse_image_caption(image_captioner(img_path)),
             )
 
         def _do_import(*img_path: str) -> None:

diff --git a/src/main/askai/core/model/screenshot_result.py b/src/main/askai/core/model/screenshot_result.py
@@ -2,6 +2,7 @@
 import json
 from typing import AnyStr
 
+from kubernetes.watch.watch import SimpleNamespace
 from pydantic import BaseModel, Field
 
 
@@ -10,9 +11,23 @@ class ScreenshotResult(BaseModel):
     using Pydantic's data validation features.
     """
 
+    class DocumentModel(BaseModel):
+        """Structured description of a single open document reported in a screenshot result."""
+
+        page_number: int = Field(description="Document page number")
+        header: str = Field(description="Document header content")
+        footer: str = Field(description="Document footer content")
+        content_overview: str = Field(description="Document content overview")
+
+    class WebsiteModel(BaseModel):
+        """Structured description of a single web page reported in a screenshot result."""
+
+        website_description: str = Field(description="Website description")
+        website_url: str = Field(description="Website URL")
+
     open_applications: list[str] = Field(description="List of open applications")
-    docs_description: list[str] = Field(description="List of document descriptions")
-    web_pages: str = Field(description="Description of visible web pages")
+    open_documents: list[DocumentModel] = Field(description="List of document descriptions")
+    web_pages: list[WebsiteModel] = Field(description="Description of visible web pages")
     user_response: str = Field(description="A response to the user question")
 
     @staticmethod

diff --git a/src/main/askai/core/router/agent_tools.py b/src/main/askai/core/router/agent_tools.py
@@ -32,7 +32,7 @@
 from askai.core.router.tools.generation import generate_content, save_content
 from askai.core.router.tools.summarization import summarize
 from askai.core.router.tools.terminal import execute_command, list_contents, open_command
-from askai.core.router.tools.vision import image_captioner, parse_caption, capture_screenshot
+from askai.core.router.tools.vision import image_captioner, parse_image_caption, capture_screenshot
 from askai.core.router.tools.webcam import webcam_capturer, webcam_identifier, CAPTION_TEMPLATE
 from askai.exception.exceptions import TerminatingQuery
 
@@ -128,7 +128,7 @@ def image_captioner(self, image_path: str) -> str:
         :param image_path: The absolute path of the image file to be analyzed.
         :return: A string containing the generated caption describing the image.
         """
-        image_caption: list[str] = parse_caption(image_captioner(image_path))
+        image_caption: list[str] = parse_image_caption(image_captioner(image_path))
         return CAPTION_TEMPLATE.substitute(
             image_path=image_path, image_caption=os.linesep.join(image_caption) if image_caption else ""
         )

diff --git a/src/main/askai/core/router/tools/vision.py b/src/main/askai/core/router/tools/vision.py
@@ -1,4 +1,5 @@
 import os
+from string import Template
 from textwrap import indent
 from typing import Literal
 
@@ -21,9 +22,20 @@
 from askai.core.engine.ai_vision import AIVision
 from askai.core.model.ai_reply import AIReply
 from askai.core.model.image_result import ImageResult
+from askai.core.model.screenshot_result import ScreenshotResult
 from askai.core.router.evaluation import resolve_x_refs
 from askai.core.support.shared_instances import shared
 
+# Markdown layout for a screenshot report: a quoted header naming the image file, then the caption text.
+SCREENSHOT_TEMPLATE: Template = Template(
+    """\
+
+>   Screenshot `${image_path}`:
+
+${image_caption}
+
+"""
+)
+
 
 class HFModel(Enumeration):
     """Available Hugging Face models"""
@@ -107,7 +119,7 @@ def image_captioner(
     return image_caption
 
 
-def parse_caption(image_caption: str) -> list[str]:
+def parse_image_caption(image_caption: str) -> list[str]:
     """Parse the given image caption.
     :param image_caption: The caption to parse.
     :return: The parsed caption as a string.
@@ -135,6 +147,43 @@ def parse_caption(image_caption: str) -> list[str]:
     return [msg.no_caption()]
 
 
+def _caption_entry_text(entry: object, *fields: str) -> str:
+    """Render one section entry as text.
+
+    :param entry: The entry to render; either a plain value or an object exposing ``fields``.
+    :param fields: Attribute names to render as ``name: value`` pairs when present on the entry.
+    :return: The ``name: value`` pairs joined by ``; ``, or the entry's string form wrapped in
+        backticks when none of the fields is present (e.g. for plain application names).
+    """
+    pairs: list[str] = [f"{name}: {getattr(entry, name)}" for name in fields if hasattr(entry, name)]
+    return "; ".join(pairs) if pairs else f"`{entry}`"
+
+
+def _caption_section(title: str, entries: list | None, *fields: str) -> list[str]:
+    """Render a titled Markdown bullet followed by one nested bullet per entry.
+
+    :param title: The section title, rendered in bold.
+    :param entries: The section entries; an empty or missing list yields no lines at all.
+    :param fields: Attribute names forwarded to the entry renderer.
+    :return: The Markdown lines of the section, or an empty list when there are no entries.
+    """
+    if not entries:
+        return []
+    # indent() keeps multi-line entries aligned under their own bullet.
+    return [f"- **{title}:**"] + [indent(f"- {_caption_entry_text(entry, *fields)}", "    ") for entry in entries]
+
+
+def parse_screenshot_caption(screenshot_caption: str) -> list[str]:
+    """Parse the given screenshot caption into Markdown lines.
+
+    Every non-empty section of the parsed result (applications, documents, web pages) becomes a
+    top-level bullet with one nested bullet per entry; the response to the user question, when
+    present, is appended as a final bullet.
+
+    :param screenshot_caption: The caption to parse, in the format accepted by ``ScreenshotResult.of``.
+    :return: The parsed caption as a list of Markdown lines, or a single "no caption" message
+        when the caption is empty.
+    """
+    if not screenshot_caption:
+        return [msg.no_caption()]
+
+    events.reply.emit(reply=AIReply.full(msg.parsing_caption()))
+    result: ScreenshotResult = ScreenshotResult.of(screenshot_caption)
+    # Documents and web pages are structured entries: render their fields rather than their repr.
+    lines: list[str] = [
+        *_caption_section("Applications", result.open_applications),
+        *_caption_section("Documents", result.open_documents, "page_number", "header", "footer", "content_overview"),
+        *_caption_section("WebPages", result.web_pages, "website_url", "website_description"),
+    ]
+    if result.user_response:
+        lines.append(f"- **Answer**: `{result.user_response}`")
+
+    return lines
+
+
 def capture_screenshot(
     path_name: AnyPath | None = None, save_dir: AnyPath | None = None, query: str | None = None
 ) -> str:
@@ -148,23 +197,23 @@ def capture_screenshot(
     file_path: str = ensure_endswith(path_name or f"ASKAI-SCREENSHOT-{now('%Y%m%d%H%M')}", ".jpeg")
     posix_path: PathObject = PathObject.of(file_path)
     check_argument(os.path.exists(posix_path.abs_dir))
-    desktop_caption: str = "No screenshot captured"
     i = 3
 
+    events.reply.emit(reply=AIReply.mute(msg.t(f"Screenshot in: {i}")))
     while (i := (i - 1)) >= 0:
         player.play_sfx("click")
         pause.seconds(1)
-        events.reply.emit(reply=AIReply.mute(str(i)), erase_last=True)
+        events.reply.emit(reply=AIReply.mute(msg.t(f"Screenshot in: {i}")), erase_last=True)
     player.play_sfx("camera-shutter")
     events.reply.emit(reply=AIReply.mute(msg.click()), erase_last=True)
 
-    if screenshot := pyautogui.screenshot():
-        _, ext = os.path.splitext(posix_path.filename)
-        if ext.casefold().endswith((".jpg", ".jpeg")):
-            screenshot = screenshot.convert("RGB")
-        final_path: str = os.path.join(save_dir or posix_path.abs_dir or SCREENSHOTS_DIR, posix_path.filename)
-        screenshot.save(final_path)
-        events.reply.emit(reply=AIReply.full(msg.screenshot_saved(final_path)))
-        desktop_caption = image_captioner(final_path, save_dir, query, "screenshot")
+    screenshot = pyautogui.screenshot()
+    _, ext = os.path.splitext(posix_path.filename)
+    if ext.casefold().endswith((".jpg", ".jpeg")):
+        screenshot = screenshot.convert("RGB")
+    final_path: str = os.path.join(save_dir or SCREENSHOTS_DIR, posix_path.filename)
+    screenshot.save(final_path)
+    events.reply.emit(reply=AIReply.full(msg.screenshot_saved(final_path)))
+    desktop_caption = parse_screenshot_caption(image_captioner(final_path, save_dir, query, "screenshot"))
 
-    return desktop_caption
+    return SCREENSHOT_TEMPLATE.substitute(image_path=final_path, image_caption=os.linesep.join(desktop_caption))
diff --git a/src/main/askai/core/router/tools/webcam.py b/src/main/askai/core/router/tools/webcam.py
@@ -8,7 +8,7 @@
 from askai.core.askai_messages import msg
 from askai.core.component.camera import camera
 from askai.core.model.ai_reply import AIReply
-from askai.core.router.tools.vision import image_captioner, parse_caption
+from askai.core.router.tools.vision import image_captioner, parse_image_caption
 
 PHOTO_TEMPLATE: Template = Template(
     """\
@@ -72,7 +72,7 @@ def webcam_capturer(photo_name: str | None, detect_faces: bool = False, query: s
             else []
         )
 
-    image_description: list[str] = parse_caption(image_captioner(pic_file.img_path, query=query))
+    image_description: list[str] = parse_image_caption(image_captioner(pic_file.img_path, query=query))
 
     return PHOTO_TEMPLATE.substitute(
         pic_file=pic_file.img_path,

diff --git a/src/main/askai/resources/prompts/ss-caption.txt b/src/main/askai/resources/prompts/ss-caption.txt
@@ -18,7 +18,7 @@ Given the provided screenshot, please perform the following tasks:
    - **List all open websites** visible in the screenshot.
    - For each website, include:
      - **Website Description**: Offer a detailed description of the website's purpose and content.
-     - **Identified URLs**: Mention any URLs or web addresses visible.
+     - **Website URL**: Mention the URLs or web addresses visible.
 
 4. **Respond to Human Questions (If Provided):**
    - If a **Human Question** is provided at the end of the screenshot, **provide a clear and concise response** to it.

diff --git a/src/main/askai/resources/rag/accuracy.csv b/src/main/askai/resources/rag/accuracy.csv
@@ -1,4 +1,5 @@
 Human Input;" AI Response";" Classification";" Reasoning"
+Use the screenshot tool to capture the current screen.;{'open_applications': ['PyCharm'], 'open_documents': [{'content_overview': "This document contains Python code for evaluating the accuracy of an AI's response to a user's question within a defined evaluation framework."}], 'web_pages': '', 'user_response': 'None'};Green;The AI successfully described the screenshot.
 Do you know what is Gus Fraba?;I don't know;Black;The AI responded negatively, stating that gabiroba song files do not exist and, as a result, cannot proceed with further interactions.
 Is there any gabiroba song?;No gabiroba song files found.;Black;The AI provided a negative answer, so, interactions should stop.
 Did I download and gabiroba file?;No files containing 'gabiroba' were found in the specified directory.;Black;The AI responded negatively, stating that files containing 'gabiroba' were not found and, as a result, cannot proceed with further interactions.