Select among photo or screenshot caption

yorevs · Nov 22, 2024 · 4c8d91e · 4c8d91e
1 parent 83f79b7
commit 4c8d91e
Show file tree

Hide file tree

Showing 6 changed files with 138 additions and 22 deletions.
diff --git a/src/main/askai/core/engine/ai_vision.py b/src/main/askai/core/engine/ai_vision.py
@@ -13,17 +13,24 @@
    Copyright (c) 2024, HomeSetup
 """
 from hspylib.core.metaclass.classpath import AnyPath
-from typing import Protocol
+from typing import Protocol, Literal
 
 
 class AIVision(Protocol):
     """Provide an interface for AI vision."""
 
-    def caption(self, filename: AnyPath, load_dir: AnyPath | None, query: str | None = None) -> str:
+    def caption(
+        self,
+        filename: AnyPath,
+        load_dir: AnyPath | None,
+        query: str | None = None,
+        image_type: Literal["photo", "screenshot"] = "photo",
+    ) -> str:
         """Generate a caption for the provided image.
         :param filename: File name of the image for which the caption is to be generated.
         :param load_dir: Optional directory path for loading related resources.
         :param query: Optional question about details of the image.
-        :return: A dictionary containing the generated caption.
+        :param image_type: The type of the image to be captioned; one of 'photo' or 'screenshot'.
+        :return: A string containing the generated caption.
         """
         ...
diff --git a/src/main/askai/core/engine/openai/openai_vision.py b/src/main/askai/core/engine/openai/openai_vision.py
@@ -12,7 +12,7 @@
 
    Copyright (c) 2024, HomeSetup
 """
-from typing import TypeAlias
+from typing import TypeAlias, Literal
 import os
 
 from langchain_core.prompts import PromptTemplate
@@ -25,10 +25,10 @@
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import chain
 from langchain_openai import ChatOpenAI
-from retry import retry
 
 from askai.core.askai_prompt import prompt
 from askai.core.model.image_result import ImageResult
+from askai.core.model.screenshot_result import ScreenshotResult
 from askai.core.support.utilities import encode_image, find_file
 
 Base64Image: TypeAlias = dict[str, str]
@@ -39,8 +39,6 @@
 class OpenAIVision:
     """Provide a base class for OpenAI vision features. This class implements the AIVision protocol."""
 
-    _OUT_PARSER = JsonOutputParser(pydantic_object=ImageResult)
-
     @staticmethod
     def _encode_image(inputs: dict) -> dict[str, str]:
         """Load an image from file and encode it as a base64 string.
@@ -65,33 +63,69 @@ def create_image_caption_chain(inputs: dict) -> MessageContent:
                 HumanMessage(
                     content=[
                         {"type": "text", "text": inputs["prompt"]},
-                        {"type": "text", "text": OpenAIVision._OUT_PARSER.get_format_instructions()},
+                        {"type": "text", "text": inputs["parser_guides"]},
                         {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"}},
                     ]
                 )
             ]
         )
         return msg.content
 
-    def template(self, question: str = "") -> str:
-        return PromptTemplate(input_variables=["question"], template=prompt.read_prompt("vision")).format(
+    def image_template(self, question: str = "") -> str:
+        return PromptTemplate(input_variables=["question"], template=prompt.read_prompt("img-caption")).format(
+            question=question
+        )
+
+    def screenshot_template(self, question: str = "") -> str:
+        return PromptTemplate(input_variables=["question"], template=prompt.read_prompt("ss-caption")).format(
             question=question
         )
 
-    @retry()
-    def caption(self, filename: AnyPath, load_dir: AnyPath | None, query: str | None = None) -> str:
+    def caption(
+        self,
+        filename: AnyPath,
+        load_dir: AnyPath | None,
+        query: str | None = None,
+        image_type: Literal["photo", "screenshot"] = "photo",
+    ) -> str:
         """Generate a caption for the provided image.
         :param filename: File name of the image for which the caption is to be generated.
         :param load_dir: Optional directory path for loading related resources.
         :param query: Optional question about details of the image.
+        :param image_type: The type of the image to be captioned; one of 'photo' or 'screenshot'.
         :return: A string containing the generated caption.
         """
         final_path: str = os.path.join(load_dir, filename) if load_dir else os.getcwd()
         check_argument(len((final_path := str(find_file(final_path) or ""))) > 0, f"Invalid image path: {final_path}")
-        vision_prompt: str = self.template(query)
+        vision_prompt: str = self._get_vision_prompt(query, image_type)
         load_image_chain = TransformChain(
-            input_variables=["image_path"], output_variables=["image"], transform=self._encode_image
+            input_variables=["image_path", "parser_guides"], output_variables=["image"], transform=self._encode_image
         )
-        vision_chain = load_image_chain | self.create_image_caption_chain | OpenAIVision._OUT_PARSER
-        args: dict[str, str] = {"image_path": f"{final_path}", "prompt": vision_prompt}
+        out_parser: JsonOutputParser = self._get_out_parser(image_type)
+        vision_chain = load_image_chain | self.create_image_caption_chain | out_parser
+        args: dict[str, str] = {
+            "image_path": f"{final_path}",
+            "prompt": vision_prompt,
+            "parser_guides": out_parser.get_format_instructions(),
+        }
         return str(vision_chain.invoke(args))
+
+    def _get_out_parser(self, image_type: Literal["photo", "screenshot"]) -> JsonOutputParser:
+        """Return a JsonOutputParser for the given image type; raise ValueError if the type is unknown."""
+        match image_type:
+            case "photo":
+                return JsonOutputParser(pydantic_object=ImageResult)
+            case "screenshot":
+                return JsonOutputParser(pydantic_object=ScreenshotResult)
+            case _:
+                raise ValueError(f"Parser not found for: {image_type}")
+
+    def _get_vision_prompt(self, query: str, image_type: Literal["photo", "screenshot"]) -> str:
+        """Return the caption prompt rendered for the given image type; raise ValueError if the type is unknown."""
+        match image_type:
+            case "photo":
+                return self.image_template(query)
+            case "screenshot":
+                return self.screenshot_template(query)
+            case _:
+                raise ValueError(f"Prompt not found for: {image_type}")
diff --git a/src/main/askai/core/model/screenshot_result.py b/src/main/askai/core/model/screenshot_result.py
@@ -0,0 +1,36 @@
+import ast
+import json
+from typing import AnyStr
+
+from pydantic import BaseModel, Field
+
+
+class ScreenshotResult(BaseModel):
+    """Information about a screenshot. This class provides a schema for storing and validating screenshot-related
+    information (open applications, documents, web pages, user response) using Pydantic's data validation features.
+    """
+
+    open_applications: list[str] = Field(description="List of open applications")
+    docs_description: list[str] = Field(description="List of document descriptions")
+    web_pages: str = Field(description="Description of visible web pages")
+    user_response: str = Field(description="A response to the user question")
+
+    @staticmethod
+    def of(image_caption: AnyStr) -> "ScreenshotResult":
+        """Parses a string into a ScreenshotResult instance, trying a Python literal first and then JSON.
+        :param image_caption: The string to parse.
+        :return: An instance of ScreenshotResult populated with the parsed data.
+        :raises ValueError: If the string cannot be parsed as a Python literal or JSON, or does not fit the schema.
+        """
+
+        try:
+            parsed_data = ast.literal_eval(image_caption)
+        except (ValueError, SyntaxError):
+            try:
+                parsed_data = json.loads(image_caption)
+            except json.JSONDecodeError as e_json:
+                raise ValueError("String could not be parsed as Python object or JSON.") from e_json
+        try:
+            return ScreenshotResult(**parsed_data)
+        except Exception as e_pydantic:
+            raise ValueError("Parsed data does not conform to ScreenshotResult schema.") from e_pydantic
diff --git a/src/main/askai/core/router/tools/vision.py b/src/main/askai/core/router/tools/vision.py
@@ -1,6 +1,6 @@
 import os
-from fileinput import filename
 from textwrap import indent
+from typing import Literal
 
 import pause
 import pyautogui
@@ -74,11 +74,17 @@ def offline_captioner(path_name: AnyPath) -> str:
     return caption
 
 
-def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None, query: str | None = None) -> str:
+def image_captioner(
+    path_name: AnyPath,
+    load_dir: AnyPath | None = None,
+    query: str | None = None,
+    image_type: Literal["photo", "screenshot"] = "photo",
+) -> str:
     """This tool is used to describe an image.
     :param path_name: The path of the image to describe.
     :param load_dir: Optional directory path for loading related resources.
     :param query: Optional query about the photo taken.
+    :param image_type: The type of the image to be captioned; one of 'photo' or 'screenshot'.
     :return: A string containing the description of the image, or "Unavailable" if no description was generated.
     """
     image_caption: str = "Unavailable"
@@ -94,7 +100,9 @@ def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None, query:
     if posix_path.exists:
         events.reply.emit(reply=AIReply.full(msg.describe_image(posix_path)))
         vision: AIVision = shared.engine.vision()
-        image_caption = vision.caption(posix_path.filename, load_dir or posix_path.abs_dir or PICTURE_DIR, query)
+        image_caption = vision.caption(
+            posix_path.filename, load_dir or posix_path.abs_dir or PICTURE_DIR, query, image_type
+        )
 
     return image_caption
 
@@ -127,10 +135,13 @@ def parse_caption(image_caption: str) -> list[str]:
     return [msg.no_caption()]
 
 
-def capture_screenshot(path_name: AnyPath | None = None, save_dir: AnyPath | None = None) -> str:
+def capture_screenshot(
+    path_name: AnyPath | None = None, save_dir: AnyPath | None = None, query: str | None = None
+) -> str:
     """Capture a screenshot and save it to the specified path.
     :param path_name: Optional path name of the captured screenshot.
     :param save_dir: Optional directory to save the screenshot.
+    :param query: Optional query about the screenshot taken.
     :return: The caption generated for the captured screenshot.
     """
 
@@ -143,6 +154,7 @@ def capture_screenshot(path_name: AnyPath | None = None, save_dir: AnyPath | Non
     while (i := (i - 1)) >= 0:
         player.play_sfx("click")
         pause.seconds(1)
+        events.reply.emit(reply=AIReply.mute(str(i)), erase_last=True)
     player.play_sfx("camera-shutter")
     events.reply.emit(reply=AIReply.mute(msg.click()), erase_last=True)
 
@@ -153,6 +165,6 @@ def capture_screenshot(path_name: AnyPath | None = None, save_dir: AnyPath | Non
         final_path: str = os.path.join(save_dir or posix_path.abs_dir or SCREENSHOTS_DIR, posix_path.filename)
         screenshot.save(final_path)
         events.reply.emit(reply=AIReply.full(msg.screenshot_saved(final_path)))
-        desktop_caption = image_captioner(final_path, save_dir)
+        desktop_caption = image_captioner(final_path, save_dir, query, "screenshot")
 
     return desktop_caption
diff --git a/src/main/askai/resources/prompts/vision.txt → ...n/askai/resources/prompts/img-caption.txt b/src/main/askai/resources/prompts/vision.txt → ...n/askai/resources/prompts/img-caption.txt
@@ -26,7 +26,7 @@ Given the provided image, please perform the following tasks:
      - Consider elements such as lighting, weather conditions, color tones, and textures.
      - Mention any emotions or feelings the scene might evoke (e.g., serene, chaotic, eerie).
 
-5. If the Human question is provided in the end, Also, respond to it.
+5. If a Human question is provided at the end, also respond to it.
 
 
 Human Question: "{question}"
diff --git a/src/main/askai/resources/prompts/ss-caption.txt b/src/main/askai/resources/prompts/ss-caption.txt
@@ -0,0 +1,27 @@
+You are an Image Captioner specialized in describing Screenshots.
+
+**Instructions:**
+
+Given the provided screenshot, please perform the following tasks:
+
+1. **Identify Open Applications:**
+   - **List all open applications** visible in the screenshot.
+
+2. **Detailed Descriptions of Documents:**
+   - For each open document identified, provide a comprehensive description including:
+     - **Page Number**: Indicate the current page number.
+     - **Header/Footer**: Describe any headers or footers present.
+     - **Headlines**: Summarize the main headlines or titles.
+     - **Content Overview**: Provide an overview of the document's content.
+
+3. **Detailed Descriptions of Web Pages:**
+   - **List all open websites** visible in the screenshot.
+   - For each website, include:
+     - **Website Description**: Offer a detailed description of the website's purpose and content.
+     - **Identified URLs**: Mention any URLs or web addresses visible.
+
+4. **Respond to Human Questions (If Provided):**
+   - If a **Human Question** is provided at the end of this prompt, **provide a clear and concise response** to it.
+
+
+Human Question: "{question}"