From 176b9b191b484c6bacfd1d7dba597b9606b1ed14 Mon Sep 17 00:00:00 2001 From: Hugo Saporetti Junior Date: Fri, 1 Nov 2024 21:13:47 -0300 Subject: [PATCH] Vision features improvements, specially formatting --- src/demo/components/webcam_demo.py | 6 +- src/demo/utils.py | 3 +- src/main/askai/core/askai_messages.py | 7 +- src/main/askai/core/component/camera.py | 2 +- src/main/askai/core/engine/ai_vision.py | 3 +- .../askai/core/engine/openai/openai_vision.py | 13 ++- src/main/askai/core/router/agent_tools.py | 35 ++++---- src/main/askai/core/router/tools/vision.py | 46 +++++----- src/main/askai/core/router/tools/webcam.py | 86 +++++++++++-------- 9 files changed, 112 insertions(+), 89 deletions(-) diff --git a/src/demo/components/webcam_demo.py b/src/demo/components/webcam_demo.py index 960ed034..290b967d 100644 --- a/src/demo/components/webcam_demo.py +++ b/src/demo/components/webcam_demo.py @@ -12,7 +12,7 @@ Copyright (c) 2024, HomeSetup """ -from askai.core.router.tools.webcam import webcam_capturer +from askai.core.router.tools.webcam import * from hspylib.core.tools.commons import sysout from utils import init_context @@ -21,6 +21,6 @@ sysout("-=" * 40) sysout("AskAI WebCam Demo") sysout("-=" * 40) - info: str = webcam_capturer("hugo", True) - # info: str = webcam_identifier() + # info: str = webcam_capturer("hugo", True) + info: str = webcam_identifier() sysout(info, markdown=True) diff --git a/src/demo/utils.py b/src/demo/utils.py index 801f57cf..25ef6124 100644 --- a/src/demo/utils.py +++ b/src/demo/utils.py @@ -2,6 +2,7 @@ from askai.core.askai_events import events from askai.core.commander.commander import commands from askai.core.component.cache_service import cache +from askai.core.enums.router_mode import RouterMode from askai.core.support.shared_instances import shared from askai.core.support.utilities import display_text from clitt.core.tui.line_input.keyboard_input import KeyboardInput @@ -39,7 +40,7 @@ def init_context( console_enable=console_enable, ) KeyboardInput.preload_history(cache.load_input_history(commands())) - shared.create_engine(engine_name=engine_name, model_name=model_name) + shared.create_engine(engine_name=engine_name, model_name=model_name, mode=RouterMode.default()) shared.create_context(context_size) events.reply.subscribe(cb_event_handler=lambda ev: display_text(ev.args.reply)) atexit.register(cache.save_input_history) diff --git a/src/main/askai/core/askai_messages.py b/src/main/askai/core/askai_messages.py index b2930bef..a9864cdc 100644 --- a/src/main/askai/core/askai_messages.py +++ b/src/main/askai/core/askai_messages.py @@ -90,10 +90,13 @@ def goodbye(self) -> str: return "Goodbye, have a nice day !" def smile(self, countdown: int) -> str: - return f"\nSmile {str(countdown)} " + return f"\n Smile {str(countdown)}…" + + def click(self) -> str: + return " !!! Click !!!" def look_at_camera(self) -> str: - return "Look at the camera…" + return " Look at the camera…" def cmd_success(self, command_line: AnyStr) -> str: return f"OK, command `{command_line}` succeeded" diff --git a/src/main/askai/core/component/camera.py b/src/main/askai/core/component/camera.py index 418a561e..04575db4 100644 --- a/src/main/askai/core/component/camera.py +++ b/src/main/askai/core/component/camera.py @@ -70,7 +70,7 @@ def _countdown(count: int) -> None: pause.seconds(1) events.reply.emit(reply=AIReply.mute(msg.smile(i)), erase_last=True) player.play_sfx("camera-shutter") - events.reply.emit(reply=AIReply.mute(" !!!Click!!!"), erase_last=True) + events.reply.emit(reply=AIReply.mute(msg.click()), erase_last=True) def __init__(self): self._cam = None diff --git a/src/main/askai/core/engine/ai_vision.py b/src/main/askai/core/engine/ai_vision.py index 315c4f06..ffaacca2 100644 --- a/src/main/askai/core/engine/ai_vision.py +++ b/src/main/askai/core/engine/ai_vision.py @@ -19,10 +19,11 @@ class AIVision(Protocol): """Provide an interface for AI vision.""" - def caption(self, filename: AnyPath, load_dir: AnyPath | None = None) -> str: + def caption(self, filename: AnyPath, load_dir: AnyPath | None, query: str | None = None) -> str: """Generate a caption for the provided image. :param filename: File name of the image for which the caption is to be generated. :param load_dir: Optional directory path for loading related resources. + :param query: Optional question about details of the image. :return: A dictionary containing the generated caption. """ ... diff --git a/src/main/askai/core/engine/openai/openai_vision.py b/src/main/askai/core/engine/openai/openai_vision.py index 5762bb9f..0c0a46fe 100644 --- a/src/main/askai/core/engine/openai/openai_vision.py +++ b/src/main/askai/core/engine/openai/openai_vision.py @@ -72,26 +72,25 @@ def create_image_caption_chain(inputs: dict) -> MessageContent: return msg.content def template(self, question: str | None = None) -> str: - return dedent( - f""" + return dedent(f"""\ Given the image, provide the following information: - A count of how many living beings are in the image. - A list of the main objects present in the image. - A description the atmosphere of the environment. - A list of detailed descriptions all living beings you find in the image. - {'- ' + question if question else ''}""" - ).strip() + {'- ' + question if question else ''}""").strip() @retry() - def caption(self, filename: AnyPath, load_dir: AnyPath | None, question: str | None = None) -> str: + def caption(self, filename: AnyPath, load_dir: AnyPath | None, query: str | None = None) -> str: """Generate a caption for the provided image. :param filename: File name of the image for which the caption is to be generated. :param load_dir: Optional directory path for loading related resources. - :return: A dictionary containing the generated caption. + :param query: Optional question about details of the image. + :return: A string containing the generated caption. """ final_path: str = os.path.join(load_dir, filename) if load_dir else os.getcwd() check_argument(len((final_path := str(find_file(final_path) or ""))) > 0, f"Invalid image path: {final_path}") - vision_prompt = self.template() + vision_prompt = self.template(query) load_image_chain = TransformChain( input_variables=["image_path"], output_variables=["image"], transform=self._encode_image ) diff --git a/src/main/askai/core/router/agent_tools.py b/src/main/askai/core/router/agent_tools.py index 824d395f..be622453 100644 --- a/src/main/askai/core/router/agent_tools.py +++ b/src/main/askai/core/router/agent_tools.py @@ -12,6 +12,19 @@ Copyright (c) 2024, HomeSetup """ +import inspect +import logging as log +import os +import re +from functools import lru_cache +from textwrap import dedent +from typing import Callable, Optional + +from clitt.core.tui.line_input.line_input import line_input +from hspylib.core.metaclass.classpath import AnyPath +from hspylib.core.metaclass.singleton import Singleton +from langchain_core.tools import BaseTool, StructuredTool + from askai.core.askai_messages import msg from askai.core.router.tools.analysis import query_output from askai.core.router.tools.browser import browse, open_url @@ -20,21 +33,8 @@ from askai.core.router.tools.summarization import summarize from askai.core.router.tools.terminal import execute_command, list_contents, open_command from askai.core.router.tools.vision import image_captioner, parse_caption -from askai.core.router.tools.webcam import webcam_capturer, webcam_identifier +from askai.core.router.tools.webcam import webcam_capturer, webcam_identifier, CAPTION_TEMPLATE from askai.exception.exceptions import TerminatingQuery -from clitt.core.tui.line_input.line_input import line_input -from functools import lru_cache -from hspylib.core.metaclass.classpath import AnyPath -from hspylib.core.metaclass.singleton import Singleton -from hspylib.core.tools.text_tools import ensure_endswith, ensure_startswith -from langchain_core.tools import BaseTool, StructuredTool -from textwrap import dedent -from typing import Callable, Optional - -import inspect -import logging as log -import os -import re class AgentTools(metaclass=Singleton): @@ -128,9 +128,10 @@ def image_captioner(self, image_path: str) -> str: :param image_path: The absolute path of the image file to be analyzed. :return: A string containing the generated caption describing the image. """ - return ensure_endswith( - ensure_startswith(parse_caption(image_captioner(image_path)), f"\n>  Description of '{image_path}':\n"), - "\n", + image_caption: list[str] = parse_caption(image_captioner(image_path)) + return CAPTION_TEMPLATE.substitute( + image_path=image_path, + image_caption=os.linesep.join(image_caption) if image_caption else '' ) def webcam_capturer(self, photo_name: str | None, detect_faces: bool = False) -> str: diff --git a/src/main/askai/core/router/tools/vision.py b/src/main/askai/core/router/tools/vision.py index 010f2615..69a29945 100644 --- a/src/main/askai/core/router/tools/vision.py +++ b/src/main/askai/core/router/tools/vision.py @@ -1,3 +1,15 @@ +import os +from textwrap import indent + +import pyautogui +import torch +from PIL import Image +from hspylib.core.config.path_object import PathObject +from hspylib.core.enums.enumeration import Enumeration +from hspylib.core.metaclass.classpath import AnyPath +from hspylib.core.preconditions import check_argument +from transformers import BlipForConditionalGeneration, BlipProcessor + from askai.core.askai_events import events from askai.core.askai_messages import msg from askai.core.component.cache_service import PICTURE_DIR @@ -6,17 +18,6 @@ from askai.core.model.image_result import ImageResult from askai.core.router.evaluation import resolve_x_refs from askai.core.support.shared_instances import shared -from hspylib.core.config.path_object import PathObject -from hspylib.core.enums.enumeration import Enumeration -from hspylib.core.metaclass.classpath import AnyPath -from hspylib.core.preconditions import check_argument -from PIL import Image -from textwrap import indent -from transformers import BlipForConditionalGeneration, BlipProcessor - -import os -import pyautogui -import torch class HFModel(Enumeration): @@ -92,7 +93,7 @@ def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None) -> str: return image_caption -def parse_caption(image_caption: str) -> str: +def parse_caption(image_caption: str) -> list[str]: """Parse the given image caption. :param image_caption: The caption to parse. :return: The parsed caption as a string. @@ -102,17 +103,16 @@ def parse_caption(image_caption: str) -> str: ln: str = os.linesep people_desc: str = "" if result.people_description: - people_desc: str = f"- **People:** `({result.people_count})`\n" + indent( - f"- {'- '.join([f'`{ppl}{ln}`' + ln for ppl in result.people_description])}", " " - ) - # fmt: off - return ( - f"- **Description:** `{result.env_description}`\n" - f"- **Objects:** `{', '.join(result.main_objects)}`\n" - f"{people_desc or ''}" - ) # fmt: on - - return msg.no_caption() + people_desc: list[str] = [ + f"- **People:** `({result.people_count})`", + indent(f"- {'- '.join([f'`{ppl}{ln}`' + ln for ppl in result.people_description])}", " ") + ] + return [ + f"- **Description:** `{result.env_description}`", + f"- **Objects:** `{', '.join(result.main_objects)}`", + ] + people_desc + + return [msg.no_caption()] def take_screenshot(path_name: AnyPath, load_dir: AnyPath | None = None) -> str: diff --git a/src/main/askai/core/router/tools/webcam.py b/src/main/askai/core/router/tools/webcam.py index 30ebfeb1..2d4b2049 100644 --- a/src/main/askai/core/router/tools/webcam.py +++ b/src/main/askai/core/router/tools/webcam.py @@ -1,14 +1,40 @@ +import os +from os.path import basename +from string import Template +from textwrap import indent + from askai.core.askai_configs import configs from askai.core.askai_events import events from askai.core.askai_messages import msg from askai.core.component.camera import camera from askai.core.model.ai_reply import AIReply from askai.core.router.tools.vision import image_captioner, parse_caption -from hspylib.core.tools.text_tools import ensure_endswith, ensure_startswith -from os.path import basename -from textwrap import indent -import os +PHOTO_TEMPLATE: Template = Template("""\ + +>  Photo Taken -> ${pic_file} + +${image_description} +${face_description} + +""") + +ID_TEMPLATE: Template = Template("""\ + +>  Person Identified -> ${photo_uri} + +- **Distance:** `${distance}` +${photo_caption} + +""") + +CAPTION_TEMPLATE: Template = Template("""\ + +>  Description of `${image_path}`: + +${image_caption} + +""") def webcam_capturer(photo_name: str | None, detect_faces: bool = False) -> str: @@ -19,34 +45,28 @@ def webcam_capturer(photo_name: str | None, detect_faces: bool = False) -> str: """ pic_file, pic_data = camera.capture(photo_name, with_caption=False) - face_description: str | None = None + face_description: list[str] = [] ln: str = os.linesep if detect_faces: face_files, face_datas = camera.detect_faces(pic_data, photo_name) faces: int = len(face_files) - face_description = ( - ( - f"- **Faces:** `({faces})`\n" - + indent(f"- {'- '.join([f'`{ff.img_path}` {ln}' for ff in face_files])}", " ") - + f"- **Face-Captions:** `({faces})`\n" - + indent( - f"- {'- '.join([f'*{basename(ff.img_path)}*: `{ff.img_caption}` {ln}' for ff in face_files])}", - " ", - ) - ) - if faces - else "" - ) + face_description: list[str] = [ + f"- **Faces:** `({faces})`", + indent(f"- {'- '.join([f'`{ff.img_path}` {ln}' for ff in face_files])}", " "), + f"- **Face-Captions:** `({faces})`", + indent( + f"- {'- '.join([f'*{basename(ff.img_path)}*: `{ff.img_caption}` {ln}' for ff in face_files])}", + " "), + ] if faces else [] - image_description: str = parse_caption(image_captioner(pic_file.img_path)) + image_description: list[str] = parse_caption(image_captioner(pic_file.img_path)) - # fmt: off - return ensure_endswith(ensure_startswith( - f"\n>  Photo Taken -> {pic_file.img_path}\n\n" - f"{image_description or ''}\n" - f"{face_description or ''}", "\n" - ), "\n") # fmt: on + return PHOTO_TEMPLATE.substitute( + pic_file=pic_file.img_path, + image_description=os.linesep.join(image_description) if image_description else '', + face_description=os.linesep.join(face_description) if face_description else '' + ) def webcam_identifier(max_distance: int = configs.max_id_distance) -> str: @@ -54,14 +74,12 @@ def webcam_identifier(max_distance: int = configs.max_id_distance) -> str: :param max_distance: The maximum distance for identifying the person based on image similarity. :return: A description of the identified person. """ - identity: str = "%ORANGE% No identification was possible!%NC%" events.reply.emit(reply=AIReply.debug(msg.look_at_camera())) if photo := camera.identify(3, max_distance): - # fmt: off - identity = ensure_endswith(ensure_startswith( - f"\n>  Person Identified -> {photo.uri}\n\n" - f"- **Distance:** `{round(photo.distance, 4):.4f}/{round(max_distance, 4):.4f}`\n" - f"{photo.caption}", "\n" - ), "\n") # fmt: on - - return identity + return ID_TEMPLATE.substitute( + photo_uri=photo.uri, + distance=f"{round(photo.distance, 4):.4f}/{round(max_distance, 4):.4f}", + photo_caption=photo.caption + ) + + return "%ORANGE% No identification was possible!%NC%"