Vision feature improvements, especially formatting

yorevs · Nov 2, 2024 · 176b9b1 · 176b9b1
1 parent b28439d
commit 176b9b1
Show file tree

Hide file tree

Showing 9 changed files with 112 additions and 89 deletions.
diff --git a/src/demo/components/webcam_demo.py b/src/demo/components/webcam_demo.py
@@ -12,7 +12,7 @@
 
    Copyright (c) 2024, HomeSetup
 """
-from askai.core.router.tools.webcam import webcam_capturer
+from askai.core.router.tools.webcam import *
 from hspylib.core.tools.commons import sysout
 from utils import init_context
 
@@ -21,6 +21,6 @@
     sysout("-=" * 40)
     sysout("AskAI WebCam Demo")
     sysout("-=" * 40)
-    info: str = webcam_capturer("hugo", True)
-    # info: str = webcam_identifier()
+    # info: str = webcam_capturer("hugo", True)
+    info: str = webcam_identifier()
     sysout(info, markdown=True)
diff --git a/src/demo/utils.py b/src/demo/utils.py
@@ -2,6 +2,7 @@
 from askai.core.askai_events import events
 from askai.core.commander.commander import commands
 from askai.core.component.cache_service import cache
+from askai.core.enums.router_mode import RouterMode
 from askai.core.support.shared_instances import shared
 from askai.core.support.utilities import display_text
 from clitt.core.tui.line_input.keyboard_input import KeyboardInput
@@ -39,7 +40,7 @@ def init_context(
             console_enable=console_enable,
         )
     KeyboardInput.preload_history(cache.load_input_history(commands()))
-    shared.create_engine(engine_name=engine_name, model_name=model_name)
+    shared.create_engine(engine_name=engine_name, model_name=model_name, mode=RouterMode.default())
     shared.create_context(context_size)
     events.reply.subscribe(cb_event_handler=lambda ev: display_text(ev.args.reply))
     atexit.register(cache.save_input_history)

diff --git a/src/main/askai/core/askai_messages.py b/src/main/askai/core/askai_messages.py
@@ -90,10 +90,13 @@ def goodbye(self) -> str:
         return "Goodbye, have a nice day !"
 
     def smile(self, countdown: int) -> str:
-        return f"\nSmile {str(countdown)} "
+        return f"\n  Smile {str(countdown)}…"
+
+    def click(self) -> str:
+        return "  !!! Click !!!"
 
     def look_at_camera(self) -> str:
-        return "Look at the camera…"
+        return "  Look at the camera…"
 
     def cmd_success(self, command_line: AnyStr) -> str:
         return f"OK, command `{command_line}` succeeded"

diff --git a/src/main/askai/core/component/camera.py b/src/main/askai/core/component/camera.py
@@ -70,7 +70,7 @@ def _countdown(count: int) -> None:
                 pause.seconds(1)
                 events.reply.emit(reply=AIReply.mute(msg.smile(i)), erase_last=True)
             player.play_sfx("camera-shutter")
-            events.reply.emit(reply=AIReply.mute("  !!!Click!!!"), erase_last=True)
+            events.reply.emit(reply=AIReply.mute(msg.click()), erase_last=True)
 
     def __init__(self):
         self._cam = None

diff --git a/src/main/askai/core/engine/ai_vision.py b/src/main/askai/core/engine/ai_vision.py
@@ -19,10 +19,11 @@
 class AIVision(Protocol):
     """Provide an interface for AI vision."""
 
-    def caption(self, filename: AnyPath, load_dir: AnyPath | None = None) -> str:
+    def caption(self, filename: AnyPath, load_dir: AnyPath | None, query: str | None = None) -> str:
         """Generate a caption for the provided image.
         :param filename: File name of the image for which the caption is to be generated.
         :param load_dir: Optional directory path for loading related resources.
+        :param query: Optional question about details of the image.
         :return: A dictionary containing the generated caption.
         """
         ...
diff --git a/src/main/askai/core/engine/openai/openai_vision.py b/src/main/askai/core/engine/openai/openai_vision.py
@@ -72,26 +72,25 @@ def create_image_caption_chain(inputs: dict) -> MessageContent:
         return msg.content
 
     def template(self, question: str | None = None) -> str:
-        return dedent(
-            f"""
+        return dedent(f"""\
         Given the image, provide the following information:
         - A count of how many living beings are in the image.
         - A list of the main objects present in the image.
         - A description the atmosphere of the environment.
         - A list of detailed descriptions all living beings you find in the image.
-        {'- ' + question if question else ''}"""
-        ).strip()
+        {'- ' + question if question else ''}""").strip()
 
     @retry()
-    def caption(self, filename: AnyPath, load_dir: AnyPath | None, question: str | None = None) -> str:
+    def caption(self, filename: AnyPath, load_dir: AnyPath | None, query: str | None = None) -> str:
         """Generate a caption for the provided image.
         :param filename: File name of the image for which the caption is to be generated.
         :param load_dir: Optional directory path for loading related resources.
-        :return: A dictionary containing the generated caption.
+        :param query: Optional question about details of the image.
+        :return: A string containing the generated caption.
         """
         final_path: str = os.path.join(load_dir, filename) if load_dir else os.getcwd()
         check_argument(len((final_path := str(find_file(final_path) or ""))) > 0, f"Invalid image path: {final_path}")
-        vision_prompt = self.template()
+        vision_prompt = self.template(query)
         load_image_chain = TransformChain(
             input_variables=["image_path"], output_variables=["image"], transform=self._encode_image
         )

diff --git a/src/main/askai/core/router/agent_tools.py b/src/main/askai/core/router/agent_tools.py
@@ -12,6 +12,19 @@
 
    Copyright (c) 2024, HomeSetup
 """
+import inspect
+import logging as log
+import os
+import re
+from functools import lru_cache
+from textwrap import dedent
+from typing import Callable, Optional
+
+from clitt.core.tui.line_input.line_input import line_input
+from hspylib.core.metaclass.classpath import AnyPath
+from hspylib.core.metaclass.singleton import Singleton
+from langchain_core.tools import BaseTool, StructuredTool
+
 from askai.core.askai_messages import msg
 from askai.core.router.tools.analysis import query_output
 from askai.core.router.tools.browser import browse, open_url
@@ -20,21 +33,8 @@
 from askai.core.router.tools.summarization import summarize
 from askai.core.router.tools.terminal import execute_command, list_contents, open_command
 from askai.core.router.tools.vision import image_captioner, parse_caption
-from askai.core.router.tools.webcam import webcam_capturer, webcam_identifier
+from askai.core.router.tools.webcam import webcam_capturer, webcam_identifier, CAPTION_TEMPLATE
 from askai.exception.exceptions import TerminatingQuery
-from clitt.core.tui.line_input.line_input import line_input
-from functools import lru_cache
-from hspylib.core.metaclass.classpath import AnyPath
-from hspylib.core.metaclass.singleton import Singleton
-from hspylib.core.tools.text_tools import ensure_endswith, ensure_startswith
-from langchain_core.tools import BaseTool, StructuredTool
-from textwrap import dedent
-from typing import Callable, Optional
-
-import inspect
-import logging as log
-import os
-import re
 
 
 class AgentTools(metaclass=Singleton):
@@ -128,9 +128,10 @@ def image_captioner(self, image_path: str) -> str:
         :param image_path: The absolute path of the image file to be analyzed.
         :return: A string containing the generated caption describing the image.
         """
-        return ensure_endswith(
-            ensure_startswith(parse_caption(image_captioner(image_path)), f"\n>   Description of '{image_path}':\n"),
-            "\n",
+        image_caption: list[str] = parse_caption(image_captioner(image_path))
+        return CAPTION_TEMPLATE.substitute(
+            image_path=image_path,
+            image_caption=os.linesep.join(image_caption) if image_caption else ''
         )
 
     def webcam_capturer(self, photo_name: str | None, detect_faces: bool = False) -> str:

diff --git a/src/main/askai/core/router/tools/vision.py b/src/main/askai/core/router/tools/vision.py
@@ -1,3 +1,15 @@
+import os
+from textwrap import indent
+
+import pyautogui
+import torch
+from PIL import Image
+from hspylib.core.config.path_object import PathObject
+from hspylib.core.enums.enumeration import Enumeration
+from hspylib.core.metaclass.classpath import AnyPath
+from hspylib.core.preconditions import check_argument
+from transformers import BlipForConditionalGeneration, BlipProcessor
+
 from askai.core.askai_events import events
 from askai.core.askai_messages import msg
 from askai.core.component.cache_service import PICTURE_DIR
@@ -6,17 +18,6 @@
 from askai.core.model.image_result import ImageResult
 from askai.core.router.evaluation import resolve_x_refs
 from askai.core.support.shared_instances import shared
-from hspylib.core.config.path_object import PathObject
-from hspylib.core.enums.enumeration import Enumeration
-from hspylib.core.metaclass.classpath import AnyPath
-from hspylib.core.preconditions import check_argument
-from PIL import Image
-from textwrap import indent
-from transformers import BlipForConditionalGeneration, BlipProcessor
-
-import os
-import pyautogui
-import torch
 
 
 class HFModel(Enumeration):
@@ -92,7 +93,7 @@ def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None) -> str:
     return image_caption
 
 
-def parse_caption(image_caption: str) -> str:
+def parse_caption(image_caption: str) -> list[str]:
     """Parse the given image caption.
     :param image_caption: The caption to parse.
     :return: The parsed caption as a string.
@@ -102,17 +103,16 @@ def parse_caption(image_caption: str) -> str:
         ln: str = os.linesep
         people_desc: str = ""
         if result.people_description:
-            people_desc: str = f"- **People:** `({result.people_count})`\n" + indent(
-                f"- {'- '.join([f'`{ppl}{ln}`' + ln for ppl in result.people_description])}", "    "
-            )
-        # fmt: off
-        return (
-            f"- **Description:** `{result.env_description}`\n"
-            f"- **Objects:** `{', '.join(result.main_objects)}`\n"
-            f"{people_desc or ''}"
-        )  # fmt: on
-
-    return msg.no_caption()
+            people_desc: list[str] = [
+                f"- **People:** `({result.people_count})`",
+                indent(f"- {'- '.join([f'`{ppl}{ln}`' + ln for ppl in result.people_description])}", "    ")
+            ]
+        return [
+            f"- **Description:** `{result.env_description}`",
+            f"- **Objects:** `{', '.join(result.main_objects)}`",
+        ] + people_desc
+
+    return [msg.no_caption()]
 
 
 def take_screenshot(path_name: AnyPath, load_dir: AnyPath | None = None) -> str:

diff --git a/src/main/askai/core/router/tools/webcam.py b/src/main/askai/core/router/tools/webcam.py
@@ -1,14 +1,40 @@
+import os
+from os.path import basename
+from string import Template
+from textwrap import indent
+
 from askai.core.askai_configs import configs
 from askai.core.askai_events import events
 from askai.core.askai_messages import msg
 from askai.core.component.camera import camera
 from askai.core.model.ai_reply import AIReply
 from askai.core.router.tools.vision import image_captioner, parse_caption
-from hspylib.core.tools.text_tools import ensure_endswith, ensure_startswith
-from os.path import basename
-from textwrap import indent
 
-import os
+PHOTO_TEMPLATE: Template = Template("""\
+
+>   Photo Taken -> ${pic_file}
+
+${image_description}
+${face_description}
+
+""")
+
+ID_TEMPLATE: Template = Template("""\
+
+>   Person Identified -> ${photo_uri}
+
+- **Distance:** `${distance}`
+${photo_caption}
+
+""")
+
+CAPTION_TEMPLATE: Template = Template("""\
+
+>   Description of `${image_path}`:
+
+${image_caption}
+
+""")
 
 
 def webcam_capturer(photo_name: str | None, detect_faces: bool = False) -> str:
@@ -19,49 +45,41 @@ def webcam_capturer(photo_name: str | None, detect_faces: bool = False) -> str:
     """
 
     pic_file, pic_data = camera.capture(photo_name, with_caption=False)
-    face_description: str | None = None
+    face_description: list[str] = []
     ln: str = os.linesep
 
     if detect_faces:
         face_files, face_datas = camera.detect_faces(pic_data, photo_name)
         faces: int = len(face_files)
-        face_description = (
-            (
-                f"- **Faces:** `({faces})`\n"
-                + indent(f"- {'- '.join([f'`{ff.img_path}` {ln}' for ff in face_files])}", "    ")
-                + f"- **Face-Captions:** `({faces})`\n"
-                + indent(
-                    f"- {'- '.join([f'*{basename(ff.img_path)}*: `{ff.img_caption}` {ln}' for ff in face_files])}",
-                    "    ",
-                )
-            )
-            if faces
-            else ""
-        )
+        face_description: list[str] = [
+            f"- **Faces:** `({faces})`",
+            indent(f"- {'- '.join([f'`{ff.img_path}` {ln}' for ff in face_files])}", "    "),
+            f"- **Face-Captions:** `({faces})`",
+            indent(
+                f"- {'- '.join([f'*{basename(ff.img_path)}*: `{ff.img_caption}` {ln}' for ff in face_files])}",
+                "    "),
+        ] if faces else []
 
-    image_description: str = parse_caption(image_captioner(pic_file.img_path))
+    image_description: list[str] = parse_caption(image_captioner(pic_file.img_path))
 
-    # fmt: off
-    return ensure_endswith(ensure_startswith(
-        f"\n>   Photo Taken -> {pic_file.img_path}\n\n"
-        f"{image_description or ''}\n"
-        f"{face_description or ''}", "\n"
-    ), "\n")  # fmt: on
+    return PHOTO_TEMPLATE.substitute(
+        pic_file=pic_file.img_path,
+        image_description=os.linesep.join(image_description) if image_description else '',
+        face_description=os.linesep.join(face_description) if face_description else ''
+    )
 
 
 def webcam_identifier(max_distance: int = configs.max_id_distance) -> str:
     """Identifies the person in front of the webcam and provides a description of them.
     :param max_distance: The maximum distance for identifying the person based on image similarity.
     :return: A description of the identified person.
     """
-    identity: str = "%ORANGE%  No identification was possible!%NC%"
     events.reply.emit(reply=AIReply.debug(msg.look_at_camera()))
     if photo := camera.identify(3, max_distance):
-        # fmt: off
-        identity = ensure_endswith(ensure_startswith(
-            f"\n>   Person Identified -> {photo.uri}\n\n"
-            f"- **Distance:** `{round(photo.distance, 4):.4f}/{round(max_distance, 4):.4f}`\n"
-            f"{photo.caption}", "\n"
-        ), "\n")  # fmt: on
-
-    return identity
+        return ID_TEMPLATE.substitute(
+            photo_uri=photo.uri,
+            distance=f"{round(photo.distance, 4):.4f}/{round(max_distance, 4):.4f}",
+            photo_caption=photo.caption
+        )
+
+    return "%ORANGE%  No identification was possible!%NC%"