Skip to content

Commit

Permalink
Screenshot improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
yorevs committed Nov 22, 2024
1 parent 4c8d91e commit f252f58
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 22 deletions.
6 changes: 3 additions & 3 deletions src/main/askai/core/component/camera.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from askai.core.component.image_store import ImageData, ImageFile, ImageMetadata, store
from askai.core.model.ai_reply import AIReply
from askai.core.model.image_result import ImageResult
from askai.core.router.tools.vision import image_captioner, parse_caption
from askai.core.router.tools.vision import image_captioner, parse_image_caption
from askai.core.support.utilities import build_img_path
from askai.exception.exceptions import CameraAccessFailure, WebCamInitializationFailure
from hspylib.core.metaclass.classpath import AnyPath
Expand Down Expand Up @@ -127,7 +127,7 @@ def capture(
hash_text(basename(final_path)),
final_path,
store.PHOTO_CATEGORY,
parse_caption(image_captioner(final_path)) if with_caption else msg.no_caption(),
parse_image_caption(image_captioner(final_path)) if with_caption else msg.no_caption(),
)
if store_image:
store.store_image(photo_file)
Expand Down Expand Up @@ -222,7 +222,7 @@ def _read_file(img_path: str) -> ImageFile:
hash_text(basename(img_path)),
img_path,
store.IMPORTS_CATEGORY,
parse_caption(image_captioner(img_path)),
parse_image_caption(image_captioner(img_path)),
)

def _do_import(*img_path: str) -> None:
Expand Down
19 changes: 17 additions & 2 deletions src/main/askai/core/model/screenshot_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
from typing import AnyStr

from kubernetes.watch.watch import SimpleNamespace
from pydantic import BaseModel, Field


Expand All @@ -10,9 +11,23 @@ class ScreenshotResult(BaseModel):
using Pydantic's data validation features.
"""

class DocumentModel(BaseModel):
"""Structured description of a single open document (the item type of ``open_documents``):
its page number, header, footer and an overview of the content.
"""

page_number: int = Field(description="Document page number")
header: str = Field(description="Document header content")
footer: str = Field(description="Document footer content")
content_overview: str = Field(description="Document content overview")

class WebsiteModel(BaseModel):
"""Structured description of a single visible web page (the item type of ``web_pages``):
a description of the website and its URL.
"""

website_description: str = Field(description="Website description")
website_url: str = Field(description="Website URL")

open_applications: list[str] = Field(description="List of open applications")
docs_description: list[str] = Field(description="List of document descriptions")
web_pages: str = Field(description="Description of visible web pages")
open_documents: list[DocumentModel] = Field(description="List of document descriptions")
web_pages: list[WebsiteModel] = Field(description="Description of visible web pages")
user_response: str = Field(description="A response to the user question")

@staticmethod
Expand Down
4 changes: 2 additions & 2 deletions src/main/askai/core/router/agent_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from askai.core.router.tools.generation import generate_content, save_content
from askai.core.router.tools.summarization import summarize
from askai.core.router.tools.terminal import execute_command, list_contents, open_command
from askai.core.router.tools.vision import image_captioner, parse_caption, capture_screenshot
from askai.core.router.tools.vision import image_captioner, parse_image_caption, capture_screenshot
from askai.core.router.tools.webcam import webcam_capturer, webcam_identifier, CAPTION_TEMPLATE
from askai.exception.exceptions import TerminatingQuery

Expand Down Expand Up @@ -128,7 +128,7 @@ def image_captioner(self, image_path: str) -> str:
:param image_path: The absolute path of the image file to be analyzed.
:return: A string containing the generated caption describing the image.
"""
image_caption: list[str] = parse_caption(image_captioner(image_path))
image_caption: list[str] = parse_image_caption(image_captioner(image_path))
return CAPTION_TEMPLATE.substitute(
image_path=image_path, image_caption=os.linesep.join(image_caption) if image_caption else ""
)
Expand Down
73 changes: 61 additions & 12 deletions src/main/askai/core/router/tools/vision.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from string import Template
from textwrap import indent
from typing import Literal

Expand All @@ -21,9 +22,20 @@
from askai.core.engine.ai_vision import AIVision
from askai.core.model.ai_reply import AIReply
from askai.core.model.image_result import ImageResult
from askai.core.model.screenshot_result import ScreenshotResult
from askai.core.router.evaluation import resolve_x_refs
from askai.core.support.shared_instances import shared

SCREENSHOT_TEMPLATE: Template = Template(
"""\
>  Screenshot `${image_path}`:
${image_caption}
"""
)


class HFModel(Enumeration):
"""Available Hugging Face models"""
Expand Down Expand Up @@ -107,7 +119,7 @@ def image_captioner(
return image_caption


def parse_caption(image_caption: str) -> list[str]:
def parse_image_caption(image_caption: str) -> list[str]:
"""Parse the given image caption.
:param image_caption: The caption to parse.
:return: The parsed caption as a string.
Expand Down Expand Up @@ -135,6 +147,43 @@ def parse_caption(image_caption: str) -> list[str]:
return [msg.no_caption()]


def parse_screenshot_caption(screenshot_caption: str) -> list[str]:
    """Parse the given screenshot caption into Markdown lines.

    :param screenshot_caption: The raw caption to parse (handed to ``ScreenshotResult.of``).
    :return: A list of Markdown lines (one list item per element) covering the open applications,
             open documents, visible web pages and the answer to the user question, in that order.
             When the caption is empty, a single "no caption" message is returned instead.
    """
    if not screenshot_caption:
        return [msg.no_caption()]

    events.reply.emit(reply=AIReply.full(msg.parsing_caption()))
    result: ScreenshotResult = ScreenshotResult.of(screenshot_caption)
    nested: str = "    "  # Markdown nesting prefix for second-level list items.
    lines: list[str] = []

    if result.open_applications:
        lines.append("- **Applications:**")
        # Keep the backticks tight around the name; a line break inside them breaks inline code.
        lines.extend(indent(f"- `{app}`", nested) for app in result.open_applications)

    if result.open_documents:
        lines.append("- **Documents:**")
        for doc in result.open_documents:
            # Documents are structured models, so render their fields rather than their repr.
            lines.append(indent(f"- Page `{doc.page_number}`: {doc.content_overview}", nested))
            if doc.header:
                lines.append(indent(f"- Header: `{doc.header}`", nested * 2))
            if doc.footer:
                lines.append(indent(f"- Footer: `{doc.footer}`", nested * 2))

    if result.web_pages:
        lines.append("- **WebPages:**")
        lines.extend(
            indent(f"- `{page.website_url}`: {page.website_description}", nested)
            for page in result.web_pages
        )

    if result.user_response:
        lines.append(f"- **Answer**: `{result.user_response}`")

    return lines


def capture_screenshot(
path_name: AnyPath | None = None, save_dir: AnyPath | None = None, query: str | None = None
) -> str:
Expand All @@ -148,23 +197,23 @@ def capture_screenshot(
file_path: str = ensure_endswith(path_name or f"ASKAI-SCREENSHOT-{now('%Y%m%d%H%M')}", ".jpeg")
posix_path: PathObject = PathObject.of(file_path)
check_argument(os.path.exists(posix_path.abs_dir))
desktop_caption: str = "No screenshot captured"
i = 3

events.reply.emit(reply=AIReply.mute(msg.t(f"Screenshot in: {i}")))
while (i := (i - 1)) >= 0:
player.play_sfx("click")
pause.seconds(1)
events.reply.emit(reply=AIReply.mute(str(i)), erase_last=True)
events.reply.emit(reply=AIReply.mute(msg.t(f"Screenshot in: {i}")), erase_last=True)
player.play_sfx("camera-shutter")
events.reply.emit(reply=AIReply.mute(msg.click()), erase_last=True)

if screenshot := pyautogui.screenshot():
_, ext = os.path.splitext(posix_path.filename)
if ext.casefold().endswith((".jpg", ".jpeg")):
screenshot = screenshot.convert("RGB")
final_path: str = os.path.join(save_dir or posix_path.abs_dir or SCREENSHOTS_DIR, posix_path.filename)
screenshot.save(final_path)
events.reply.emit(reply=AIReply.full(msg.screenshot_saved(final_path)))
desktop_caption = image_captioner(final_path, save_dir, query, "screenshot")
screenshot = pyautogui.screenshot()
_, ext = os.path.splitext(posix_path.filename)
if ext.casefold().endswith((".jpg", ".jpeg")):
screenshot = screenshot.convert("RGB")
final_path: str = os.path.join(save_dir or SCREENSHOTS_DIR, posix_path.filename)
screenshot.save(final_path)
events.reply.emit(reply=AIReply.full(msg.screenshot_saved(final_path)))
desktop_caption = parse_screenshot_caption(image_captioner(final_path, save_dir, query, "screenshot"))

return desktop_caption
return SCREENSHOT_TEMPLATE.substitute(image_path=final_path, image_caption=os.linesep.join(desktop_caption))
4 changes: 2 additions & 2 deletions src/main/askai/core/router/tools/webcam.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from askai.core.askai_messages import msg
from askai.core.component.camera import camera
from askai.core.model.ai_reply import AIReply
from askai.core.router.tools.vision import image_captioner, parse_caption
from askai.core.router.tools.vision import image_captioner, parse_image_caption

PHOTO_TEMPLATE: Template = Template(
"""\
Expand Down Expand Up @@ -72,7 +72,7 @@ def webcam_capturer(photo_name: str | None, detect_faces: bool = False, query: s
else []
)

image_description: list[str] = parse_caption(image_captioner(pic_file.img_path, query=query))
image_description: list[str] = parse_image_caption(image_captioner(pic_file.img_path, query=query))

return PHOTO_TEMPLATE.substitute(
pic_file=pic_file.img_path,
Expand Down
2 changes: 1 addition & 1 deletion src/main/askai/resources/prompts/ss-caption.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Given the provided screenshot, please perform the following tasks:
- **List all open websites** visible in the screenshot.
- For each website, include:
- **Website Description**: Offer a detailed description of the website's purpose and content.
- **Identified URLs**: Mention any URLs or web addresses visible.
- **Website URL**: Mention the URLs or web addresses visible.

4. **Respond to Human Questions (If Provided):**
- If a **Human Question** is provided at the end of the screenshot, **provide a clear and concise response** to it.
Expand Down
1 change: 1 addition & 0 deletions src/main/askai/resources/rag/accuracy.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Human Input;" AI Response";" Classification";" Reasoning"
Use the screenshot tool to capture the current screen.;{'open_applications': ['PyCharm'], 'open_documents': [{'content_overview': "This document contains Python code for evaluating the accuracy of an AI's response to a user's question within a defined evaluation framework."}], 'web_pages': '', 'user_response': 'None'};Green;The AI successfully described the screenshot.
Do you know what is Gus Fraba?;I don't know;Black;The AI responded negatively, stating that gabiroba song files do not exist and, as a result, cannot proceed with further interactions.
Is there any gabiroba song?;No gabiroba song files found.;Black;The AI provided a negative answer, so, interactions should stop.
Did I download and gabiroba file?;No files containing 'gabiroba' were found in the specified directory.;Black;The AI responded negatively, stating that files containing 'gabiroba' were not found and, as a result, cannot proceed with further interactions.
Expand Down

0 comments on commit f252f58

Please sign in to comment.