Skip to content

Commit

Permalink
Select among photo or screenshot caption
Browse files Browse the repository at this point in the history
  • Loading branch information
yorevs committed Nov 22, 2024
1 parent 83f79b7 commit 4c8d91e
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 22 deletions.
13 changes: 10 additions & 3 deletions src/main/askai/core/engine/ai_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,24 @@
Copyright (c) 2024, HomeSetup
"""
from hspylib.core.metaclass.classpath import AnyPath
from typing import Protocol
from typing import Protocol, Literal


class AIVision(Protocol):
    """Provide an interface for AI vision."""

    def caption(
        self,
        filename: AnyPath,
        load_dir: AnyPath | None,
        query: str | None = None,
        image_type: Literal["photo", "screenshot"] = "photo",
    ) -> str:
        """Generate a caption for the provided image.
        :param filename: File name of the image for which the caption is to be generated.
        :param load_dir: Optional directory path for loading related resources.
        :param query: Optional question about details of the image.
        :param image_type: The type of the image to be captioned; one of 'photo' or 'screenshot'.
        :return: A string containing the generated caption.
        """
        ...
60 changes: 47 additions & 13 deletions src/main/askai/core/engine/openai/openai_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
Copyright (c) 2024, HomeSetup
"""
from typing import TypeAlias
from typing import TypeAlias, Literal
import os

from langchain_core.prompts import PromptTemplate
Expand All @@ -25,10 +25,10 @@
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import chain
from langchain_openai import ChatOpenAI
from retry import retry

from askai.core.askai_prompt import prompt
from askai.core.model.image_result import ImageResult
from askai.core.model.screenshot_result import ScreenshotResult
from askai.core.support.utilities import encode_image, find_file

Base64Image: TypeAlias = dict[str, str]
Expand All @@ -39,8 +39,6 @@
class OpenAIVision:
"""Provide a base class for OpenAI vision features. This class implements the AIVision protocol."""

_OUT_PARSER = JsonOutputParser(pydantic_object=ImageResult)

@staticmethod
def _encode_image(inputs: dict) -> dict[str, str]:
"""Load an image from file and encode it as a base64 string.
Expand All @@ -65,33 +63,69 @@ def create_image_caption_chain(inputs: dict) -> MessageContent:
HumanMessage(
content=[
{"type": "text", "text": inputs["prompt"]},
{"type": "text", "text": OpenAIVision._OUT_PARSER.get_format_instructions()},
{"type": "text", "text": inputs["parser_guides"]},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"}},
]
)
]
)
return msg.content

def image_template(self, question: str = "") -> str:
    """Render the photo-captioning prompt.
    :param question: Optional human question substituted for the template's 'question' variable.
    :return: The formatted "img-caption" prompt text.
    """
    raw_template: str = prompt.read_prompt("img-caption")
    return PromptTemplate(input_variables=["question"], template=raw_template).format(question=question)

def screenshot_template(self, question: str = "") -> str:
    """Render the screenshot-captioning prompt.
    :param question: Optional human question substituted for the template's 'question' variable.
    :return: The formatted "ss-caption" prompt text.
    """
    raw_template: str = prompt.read_prompt("ss-caption")
    ss_prompt = PromptTemplate(input_variables=["question"], template=raw_template)
    return ss_prompt.format(question=question)

# Bounded retries: a bare @retry() keeps retrying forever with no delay, so an invalid path or a persistent
# API failure would never return to the caller.
@retry(tries=3, delay=1, backoff=2)
def caption(
    self,
    filename: AnyPath,
    load_dir: AnyPath | None,
    query: str | None = None,
    image_type: Literal["photo", "screenshot"] = "photo",
) -> str:
    """Generate a caption for the provided image.
    :param filename: File name of the image for which the caption is to be generated.
    :param load_dir: Optional directory containing the image; the current working directory is used when omitted.
    :param query: Optional question about details of the image.
    :param image_type: The type of the image to be captioned; one of 'photo' or 'screenshot'.
    :return: The string form (str()) of the chain's parsed output.
    :raises ValueError: If image_type is not supported (raised by the prompt/parser selectors).
    """
    # The file name must always be part of the path, even when falling back to the working directory.
    requested_path: str = os.path.join(load_dir or os.getcwd(), filename)
    final_path: str = str(find_file(requested_path) or "")
    # Report the path that was requested; the resolved one is empty exactly when this check fails.
    check_argument(len(final_path) > 0, f"Invalid image path: {requested_path}")
    vision_prompt: str = self._get_vision_prompt(query, image_type)
    out_parser: JsonOutputParser = self._get_out_parser(image_type)
    load_image_chain = TransformChain(
        input_variables=["image_path", "parser_guides"], output_variables=["image"], transform=self._encode_image
    )
    vision_chain = load_image_chain | self.create_image_caption_chain | out_parser
    args: dict[str, str] = {
        "image_path": final_path,
        "prompt": vision_prompt,
        # The format instructions depend on the parser chosen for this image type.
        "parser_guides": out_parser.get_format_instructions(),
    }
    return str(vision_chain.invoke(args))

def _get_out_parser(self, image_type: Literal["photo", "screenshot"]) -> JsonOutputParser:
    """Select the JSON output parser that matches the image type.
    :param image_type: The type of the image to be captioned; one of 'photo' or 'screenshot'.
    :return: A JsonOutputParser bound to ImageResult for photos or to ScreenshotResult for screenshots.
    :raises ValueError: If image_type is neither 'photo' nor 'screenshot'.
    """
    if image_type == "photo":
        schema = ImageResult
    elif image_type == "screenshot":
        schema = ScreenshotResult
    else:
        raise ValueError(f"Parser not found for: {image_type}")
    return JsonOutputParser(pydantic_object=schema)

def _get_vision_prompt(self, query: str, image_type: Literal["photo", "screenshot"]) -> str:
"""TODO"""
match image_type:
case "photo":
return self.image_template(query)
case "screenshot":
return self.screenshot_template(query)
case _:
raise ValueError(f"Prompt not found for: {image_type}")
36 changes: 36 additions & 0 deletions src/main/askai/core/model/screenshot_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import ast
import json
from typing import AnyStr

from pydantic import BaseModel, Field


class ScreenshotResult(BaseModel):
    """Information extracted from a screenshot. This class provides a schema for storing and validating
    screenshot-related information using Pydantic's data validation features.
    """

    open_applications: list[str] = Field(description="List of open applications")
    docs_description: list[str] = Field(description="List of document descriptions")
    web_pages: str = Field(description="Description of visible web pages")
    user_response: str = Field(description="A response to the user question")

    @staticmethod
    def of(image_caption: AnyStr) -> "ScreenshotResult":
        """Parse a string into a ScreenshotResult instance, accepting Python-literal or JSON syntax.
        :param image_caption: The string to parse.
        :return: An instance of ScreenshotResult populated with the parsed data.
        :raises ValueError: If the string cannot be parsed as a Python object or JSON, or if the parsed
            data does not match the ScreenshotResult schema.
        """
        try:
            # Python-literal syntax (e.g. a single-quoted dict) is tried first; strict JSON is the fallback.
            parsed_data = ast.literal_eval(image_caption)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval may raise any of these on malformed input; all of them mean "try JSON instead".
            try:
                parsed_data = json.loads(image_caption)
            except (ValueError, TypeError) as e_json:
                raise ValueError("String could not be parsed as Python object or JSON.") from e_json
        # Only a mapping can be expanded into the model's keyword arguments.
        if not isinstance(parsed_data, dict):
            raise ValueError("Parsed data does not conform to ScreenshotResult schema.")
        try:
            return ScreenshotResult(**parsed_data)
        except (TypeError, ValueError) as e_pydantic:
            # Pydantic's ValidationError derives from ValueError; TypeError covers non-string keys.
            raise ValueError("Parsed data does not conform to ScreenshotResult schema.") from e_pydantic
22 changes: 17 additions & 5 deletions src/main/askai/core/router/tools/vision.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from fileinput import filename
from textwrap import indent
from typing import Literal

import pause
import pyautogui
Expand Down Expand Up @@ -74,11 +74,17 @@ def offline_captioner(path_name: AnyPath) -> str:
return caption


def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None, query: str | None = None) -> str:
def image_captioner(
path_name: AnyPath,
load_dir: AnyPath | None = None,
query: str | None = None,
image_type: Literal["photo", "screenshot"] = "photo",
) -> str:
"""This tool is used to describe an image.
:param path_name: The path of the image to describe.
:param load_dir: Optional directory path for loading related resources.
:param query: Optional query about the photo taken.
:param image_type: The type of the image to be captioned; one of 'photo' or 'screenshot'.
:return: A string containing the description of the image, or None if the description could not be generated.
"""
image_caption: str = "Unavailable"
Expand All @@ -94,7 +100,9 @@ def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None, query:
if posix_path.exists:
events.reply.emit(reply=AIReply.full(msg.describe_image(posix_path)))
vision: AIVision = shared.engine.vision()
image_caption = vision.caption(posix_path.filename, load_dir or posix_path.abs_dir or PICTURE_DIR, query)
image_caption = vision.caption(
posix_path.filename, load_dir or posix_path.abs_dir or PICTURE_DIR, query, image_type
)

return image_caption

Expand Down Expand Up @@ -127,10 +135,13 @@ def parse_caption(image_caption: str) -> list[str]:
return [msg.no_caption()]


def capture_screenshot(path_name: AnyPath | None = None, save_dir: AnyPath | None = None) -> str:
def capture_screenshot(
path_name: AnyPath | None = None, save_dir: AnyPath | None = None, query: str | None = None
) -> str:
"""Capture a screenshot and save it to the specified path.
:param path_name: Optional path name of the captured screenshot.
:param save_dir: Optional directory to save the screenshot.
:param query: Optional query about the screenshot taken.
:return: The path to the saved screenshot.
"""

Expand All @@ -143,6 +154,7 @@ def capture_screenshot(path_name: AnyPath | None = None, save_dir: AnyPath | Non
while (i := (i - 1)) >= 0:
player.play_sfx("click")
pause.seconds(1)
events.reply.emit(reply=AIReply.mute(str(i)), erase_last=True)
player.play_sfx("camera-shutter")
events.reply.emit(reply=AIReply.mute(msg.click()), erase_last=True)

Expand All @@ -153,6 +165,6 @@ def capture_screenshot(path_name: AnyPath | None = None, save_dir: AnyPath | Non
final_path: str = os.path.join(save_dir or posix_path.abs_dir or SCREENSHOTS_DIR, posix_path.filename)
screenshot.save(final_path)
events.reply.emit(reply=AIReply.full(msg.screenshot_saved(final_path)))
desktop_caption = image_captioner(final_path, save_dir)
desktop_caption = image_captioner(final_path, save_dir, query, "screenshot")

return desktop_caption
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Given the provided image, please perform the following tasks:
- Consider elements such as lighting, weather conditions, color tones, and textures.
- Mention any emotions or feelings the scene might evoke (e.g., serene, chaotic, eerie).

5. If the Human question is provided in the end, Also, respond to it.
5. If a Human Question is provided at the end, also respond to it.


Human Question: "{question}"
27 changes: 27 additions & 0 deletions src/main/askai/resources/prompts/ss-caption.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
You are an Image Captioner specialized in describing Screenshots.

**Instructions:**

Given the provided screenshot, please perform the following tasks:

1. **Identify Open Applications:**
- **List all open applications** visible in the screenshot.

2. **Detailed Descriptions of Documents:**
- For each open document identified, provide a comprehensive description including:
- **Page Number**: Indicate the current page number.
- **Header/Footer**: Describe any headers or footers present.
- **Headlines**: Summarize the main headlines or titles.
- **Content Overview**: Provide an overview of the document's content.

3. **Detailed Descriptions of Web Pages:**
- **List all open websites** visible in the screenshot.
- For each website, include:
- **Website Description**: Offer a detailed description of the website's purpose and content.
- **Identified URLs**: Mention any URLs or web addresses visible.

4. **Respond to Human Questions (If Provided):**
- If a **Human Question** is provided at the end of these instructions, **provide a clear and concise response** to it.


Human Question: "{question}"

0 comments on commit 4c8d91e

Please sign in to comment.