Commit ef6122d

Minor bugfixes, Improved captioning - 2
1 parent 892d89c commit ef6122d

File tree

11 files changed: +116 -74 lines changed

dependencies.hspd

Lines changed: 2 additions & 2 deletions
@@ -38,9 +38,9 @@ package: pause, version: 0.3, mode: ge
 package: requests, version: 2.32.3, mode: ge
 package: urllib3, version: 1.26.20, mode: ge
 package: protobuf, version: 4.25.4, mode: ge
-package: torch, version: 2.4.0, mode: ge
 package: tqdm, version: 4.66.5, mode: ge
-package: torchvision, version: 0.19.0, mode: ge
+package: torch, version: 2.2.0, mode: ge
+package: torchvision, version: 0.17.2, mode: ge
 package: open-clip-torch, version 2.26.1, mode: ge
 package: transformers, version: 4.44.2, mode: ge
 package: pyperclip, version: 1.9.0, mode: ge
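torch is pinned back to 2.2.0 together with torchvision 0.17.2, the matching release pair for that torch line. A quick post-install check, as a sketch (the assertions simply mirror the minimum versions declared above; `mode: ge` allows newer releases as well):

# Sanity-check sketch: confirm the installed torch/torchvision pair satisfies the pins above.
import torch
import torchvision

print(torch.__version__, torchvision.__version__)
assert tuple(map(int, torch.__version__.split("+")[0].split(".")[:2])) >= (2, 2)
assert tuple(map(int, torchvision.__version__.split("+")[0].split(".")[:2])) >= (0, 17)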

src/demo/components/vision_demo.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 if __name__ == '__main__':
     init_context("vision-demo")
     vision: AIVision = shared.engine.vision()
-    load_dir: str = "/Users/hjunior/.config/hhs/askai/cache/pictures/photos"
+    load_dir: str = "${HOME}/.config/hhs/askai/cache/pictures/photos"
     image_file: str = "eu-edvaldo-suecia.jpg"
     result = vision.caption(image_file, load_dir)
     print(result)
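Swapping the developer-specific absolute path for a ${HOME} placeholder keeps the demo portable, but the literal string still has to be expanded before it reaches the filesystem. If the underlying loader does not already do this, a small expansion step such as the following sketch (names here are illustrative, not part of the library) would resolve it:

# Hypothetical expansion step for the demo's load_dir placeholder.
import os

load_dir = os.path.expandvars("${HOME}/.config/hhs/askai/cache/pictures/photos")
print(load_dir)  # e.g. /home/alice/.config/hhs/askai/cache/pictures/photos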

src/main/askai/core/commander/commander.py

Lines changed: 25 additions & 26 deletions
@@ -12,8 +12,17 @@

    Copyright (c) 2024, HomeSetup
 """
+import os
+import re
+from functools import partial
+from os.path import dirname
+from pathlib import Path
+from string import Template
+from textwrap import dedent
+
+import click
 from askai.core.askai_configs import configs
-from askai.core.askai_events import ASKAI_BUS_NAME, AskAiEvents, events, REPLY_ERROR_EVENT, REPLY_EVENT
+from askai.core.askai_events import ASKAI_BUS_NAME, AskAiEvents, REPLY_ERROR_EVENT, REPLY_EVENT
 from askai.core.commander.commands.cache_cmd import CacheCmd
 from askai.core.commander.commands.camera_cmd import CameraCmd
 from askai.core.commander.commands.general_cmd import GeneralCmd

@@ -26,18 +35,9 @@
 from askai.language.language import AnyLocale, Language
 from click import Command, Group
 from clitt.core.term.cursor import cursor
-from functools import partial
 from hspylib.core.enums.charset import Charset
 from hspylib.core.tools.commons import sysout, to_bool
 from hspylib.modules.eventbus.event import Event
-from os.path import dirname
-from pathlib import Path
-from string import Template
-from textwrap import dedent
-
-import click
-import os
-import re

 COMMANDER_HELP_TPL = Template(
     dedent(

@@ -135,18 +135,6 @@ def _init_context(context_size: int = 1000, engine_name: str = "openai", model_n
     :param engine_name: The name of the engine to initialize (default is "openai").
     :param model_name: The model name of the engine to initialize (default is "gpt-4o-mini").
     """
-    if not (shared.engine and shared.context):
-        shared.create_engine(engine_name=engine_name, model_name=model_name)
-        shared.create_context(context_size)
-        events.reply.subscribe(cb_event_handler=lambda ev: display_text(ev.args.message))
-
-
-@click.group()
-@click.pass_context
-def ask_commander(_) -> None:
-    """AskAI commands group. This function serves as the entry point for the AskAI command-line interface (CLI)
-    commands, grouping related commands together.
-    """

 def _reply_event(ev: Event, error: bool = False) -> None:
     """Callback for handling the reply event.

@@ -161,10 +149,21 @@ def _reply_event(ev: Event, error: bool = False) -> None:
             cursor.erase_line()
         display_text(message)

+    if shared.engine is None and shared.context is None:
+        shared.create_engine(engine_name=engine_name, model_name=model_name)
+        shared.create_context(context_size)
+        askai_bus = AskAiEvents.bus(ASKAI_BUS_NAME)
+        askai_bus.subscribe(REPLY_EVENT, _reply_event)
+        askai_bus.subscribe(REPLY_ERROR_EVENT, partial(_reply_event, error=True))
+
+
+@click.group()
+@click.pass_context
+def ask_commander(_) -> None:
+    """AskAI commands group. This function serves as the entry point for the AskAI command-line interface (CLI)
+    commands, grouping related commands together.
+    """
     _init_context()
-    askai_bus = AskAiEvents.bus(ASKAI_BUS_NAME)
-    askai_bus.subscribe(REPLY_EVENT, _reply_event)
-    askai_bus.subscribe(REPLY_ERROR_EVENT, partial(_reply_event, error=True))


 @ask_commander.command()

@@ -431,4 +430,4 @@ def camera(operation: str, args: tuple[str, ...]) -> None:


 if __name__ == "__main__":
-    ask_commander(["help", "camera"], standalone_mode=False)
+    ask_commander(["camera", "identify"], standalone_mode=False)

src/main/askai/core/commander/commands/camera_cmd.py

Lines changed: 7 additions & 9 deletions
@@ -1,26 +1,24 @@
 from abc import ABC
+
 from askai.core.askai_configs import configs
 from askai.core.component.camera import camera
-from askai.core.features.router.tools.webcam import webcam_identifier
+from askai.core.features.router.tools.webcam import webcam_identifier, webcam_capturer
 from askai.core.support.text_formatter import text_formatter
+from askai.core.support.utilities import display_text
 from hspylib.core.metaclass.classpath import AnyPath


 class CameraCmd(ABC):
     """Provides camera command functionalities."""

     @staticmethod
-    def capture(filename: AnyPath = None, detect_faces: bool = True, countdown: int = 3) -> None:
+    def capture(filename: AnyPath = None, detect_faces: bool = True) -> None:
         """Take a photo using the webcam.
         :param filename: The filename to save the photo under (optional).
         :param detect_faces: Whether to detect faces in the photo (default is True).
-        :param countdown: The countdown in seconds before the photo is taken (default is 3).
         """
-        if photo := camera.capture(filename, countdown):
-            text_formatter.cmd_print(f"Photo taken: %GREEN%{photo[0]}%NC%")
-            if detect_faces:
-                if len(faces := camera.detect_faces(photo[1], filename)) > 0:
-                    text_formatter.cmd_print(f"Faces detected: %GREEN%{len(faces[0])}%NC%")
+        if photo_description := webcam_capturer(filename, detect_faces):
+            display_text(photo_description)
         else:
             text_formatter.cmd_print("%RED%Unable to take photo!%NC%")

@@ -30,7 +28,7 @@ def identify(max_distance: int = configs.max_id_distance) -> None:
         :param max_distance: The maximum allowable distance for face recognition. A lower value means closer matching
         to the real face (default is configs.max_id_distance).
         """
-        text_formatter.cmd_print(webcam_identifier(max_distance))
+        display_text(webcam_identifier(max_distance))

     @staticmethod
     def import_images(pathname: AnyPath = None, detect_faces: bool = True) -> None:
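Both commands now delegate to the webcam tools and print their Markdown reports through `display_text`. A usage sketch, assuming the AskAI engine and context have already been initialized (for example via the CLI entry point above):

# Usage sketch: both commands are static, so no CameraCmd instance is needed.
from askai.core.commander.commands.camera_cmd import CameraCmd

CameraCmd.capture(filename=None, detect_faces=True)  # prints the webcam_capturer report
CameraCmd.identify()                                  # prints the webcam_identifier report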

src/main/askai/core/component/camera.py

Lines changed: 5 additions & 1 deletion
@@ -31,10 +31,12 @@
 from askai.core.component.cache_service import FACE_DIR, IMG_IMPORTS_DIR, PHOTO_DIR
 from askai.core.component.image_store import ImageData, ImageFile, ImageMetadata, store
 from askai.core.features.router.tools.vision import image_captioner, parse_caption
+from askai.core.model.image_result import ImageResult
 from askai.core.support.utilities import build_img_path
 from askai.exception.exceptions import CameraAccessFailure, WebCamInitializationFailure
 from hspylib.core.metaclass.classpath import AnyPath
 from hspylib.core.metaclass.singleton import Singleton
+from hspylib.core.tools.dict_tools import get_or_default
 from hspylib.core.tools.text_tools import hash_text
 from hspylib.core.zoned_datetime import now_ms
 from retry import retry

@@ -155,15 +157,17 @@ def detect_faces(
         if len(faces) == 0:
             return face_files, face_datas

+        filename: str = filename or str(now_ms())
         for x, y, w, h in faces:
             cropped_face: ImageData = photo[y : y + h, x : x + w]
             final_path: str = build_img_path(FACE_DIR, str(filename), f"-FACE-{len(face_files)}.jpg")
             if final_path and cv2.imwrite(final_path, cropped_face):
+                result: ImageResult = ImageResult.of(image_captioner(final_path))
                 face_file = ImageFile(
                     hash_text(basename(final_path)),
                     final_path,
                     store.FACE_CATEGORY,
-                    parse_caption(image_captioner(final_path)) if with_caption else msg.no_caption(),
+                    get_or_default(result.people_description, 0, '<N/A>') if with_caption else msg.no_caption(),
                 )
                 face_files.append(face_file)
                 face_datas.append(cropped_face)
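`detect_faces` now defaults the target filename to a timestamp and takes each cropped face's caption from the first entry of `ImageResult.people_description`, falling back to `<N/A>`. The `get_or_default` helper comes from hspylib; the sketch below only illustrates its assumed index-or-default behaviour and is not the library source:

# Assumed behaviour of hspylib's get_or_default (sketch): return the element at
# `index` when it exists, otherwise the supplied default value.
def get_or_default(items: list, index: int, default=None):
    return items[index] if 0 <= index < len(items) else default

print(get_or_default(["a man wearing glasses"], 0, "<N/A>"))  # -> a man wearing glasses
print(get_or_default([], 0, "<N/A>"))                         # -> <N/A>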

src/main/askai/core/engine/openai/openai_vision.py

Lines changed: 6 additions & 7 deletions
@@ -39,7 +39,7 @@ class OpenAIVision:
     _OUT_PARSER = JsonOutputParser(pydantic_object=ImageResult)

     @staticmethod
-    def _encode_image(inputs: dict) -> dict:
+    def _encode_image(inputs: dict) -> dict[str, str]:
         """Load an image from file and encode it as a base64 string.
         :param inputs: Dictionary containing the file path under a specific key.
         :return: Dictionary with the base64 encoded image string.

@@ -67,25 +67,24 @@ def create_image_caption_chain(inputs: dict) -> MessageContent:
         )
         return msg.content

-    @property
-    def template(self) -> str:
-        return dedent("""
+    def template(self, question: str | None = None) -> str:
+        return dedent(f"""
            Given the image, provide the following information:
            - A count of how many living beings are in the image.
            - A list of the main objects present in the image.
            - A description the atmosphere of the environment.
            - A list of detailed descriptions all living beings you find in the image.
-            """).strip()
+            {'- ' + question if question else ''}""").strip()

-    def caption(self, filename: AnyPath, load_dir: AnyPath | None) -> str:
+    def caption(self, filename: AnyPath, load_dir: AnyPath | None, question: str | None = None) -> str:
         """Generate a caption for the provided image.
         :param filename: File name of the image for which the caption is to be generated.
         :param load_dir: Optional directory path for loading related resources.
         :return: A dictionary containing the generated caption.
         """
         final_path: str = os.path.join(load_dir, filename) if load_dir else os.getcwd()
         check_argument(len((final_path := str(find_file(final_path) or ""))) > 0, f"Invalid image path: {final_path}")
-        vision_prompt = self.template
+        vision_prompt = self.template()
         load_image_chain = TransformChain(
             input_variables=["image_path"],
             output_variables=["image"],
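`template` changes from a read-only property to a method that can append an optional follow-up question as one extra bullet, and `caption` gains a matching `question` parameter (note that, as committed, `caption` still calls `self.template()` without forwarding it). A minimal reproduction of the new prompt building, with a made-up question and a shortened prompt body:

# Minimal reproduction of the new template() behaviour; the question is an example value.
from textwrap import dedent

def template(question: str | None = None) -> str:
    return dedent(f"""
        Given the image, provide the following information:
        - A count of how many living beings are in the image.
        - A list of the main objects present in the image.
        {'- ' + question if question else ''}""").strip()

print(template("Is the person wearing glasses?"))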

src/main/askai/core/features/router/tools/vision.py

Lines changed: 52 additions & 4 deletions
@@ -1,6 +1,8 @@
 import os
 from textwrap import indent

+import torch
+from PIL import Image
 from askai.core.askai_events import events
 from askai.core.askai_messages import msg
 from askai.core.component.cache_service import PICTURE_DIR

@@ -9,7 +11,53 @@
 from askai.core.model.image_result import ImageResult
 from askai.core.support.shared_instances import shared
 from hspylib.core.config.path_object import PathObject
+from hspylib.core.enums.enumeration import Enumeration
 from hspylib.core.metaclass.classpath import AnyPath
+from transformers import BlipForConditionalGeneration, BlipProcessor
+
+
+def offline_captioner(path_name: AnyPath) -> str:
+    """This tool is used to describe an image.
+    :param path_name: The path of the image to describe.
+    """
+
+    class HFModel(Enumeration):
+        """Available Hugging Face models"""
+
+        # fmt: off
+        SF_BLIP_BASE = "Salesforce/blip-image-captioning-base"
+        SF_BLIP_LARGE = "Salesforce/blip-image-captioning-large"
+        # fmt: on
+
+        @staticmethod
+        def default() -> "HFModel":
+            """Return the default HF model."""
+            return HFModel.SF_BLIP_LARGE
+
+    caption: str = "Not available"
+
+    posix_path: PathObject = PathObject.of(path_name)
+    if not posix_path.exists:
+        # Attempt to resolve cross-references
+        if history := str(shared.context.flat("HISTORY") or ""):
+            if (x_referenced := resolve_x_refs(path_name, history)) and x_referenced != shared.UNCERTAIN_ID:
+                x_ref_path: PathObject = PathObject.of(x_referenced)
+                posix_path: PathObject = x_ref_path if x_ref_path.exists else posix_path
+
+    if posix_path.exists:
+        events.reply.emit(message=msg.describe_image(str(posix_path)))
+        hf_model: HFModel = HFModel.default()
+        # Use GPU if it's available
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        image = Image.open(str(posix_path)).convert("RGB")
+        model = BlipForConditionalGeneration.from_pretrained(hf_model.value).to(device)
+        processor = BlipProcessor.from_pretrained(hf_model.value)
+        inputs = processor(images=image, return_tensors="pt").to(device)
+        outputs = model.generate(**inputs)
+        caption = processor.decode(outputs[0], skip_special_tokens=True)
+        caption = caption.title() if caption else "I could not caption the image"
+
+    return caption


 def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None) -> str:

@@ -30,7 +78,7 @@ def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None) -> str:
             posix_path: PathObject = x_ref_path if x_ref_path.exists else posix_path

     if posix_path.exists:
-        events.reply.emit(message=msg.describe_image(str(posix_path)))
+        events.reply.emit(message=msg.describe_image(str(posix_path)), verbosity="debug")
         vision: AIVision = shared.engine.vision()
         caption = vision.caption(posix_path.filename, load_dir or posix_path.abs_dir or PICTURE_DIR)

@@ -43,13 +91,13 @@ def parse_caption(image_caption: str) -> str:
     :return: The parsed caption as a string.
     """
     if image_caption:
-        result: ImageResult = ImageResult.model_validate_json(image_caption.replace("'", '"'))
+        result: ImageResult = ImageResult.of(image_caption)
         ln: str = os.linesep
         people_desc: str = ''
         if result.people_description:
             people_desc: str = (
-                f"- **People ({result.people_count}):**\n"
-                + indent(f"- {'- '.join([ppl + ln for ppl in result.people_description])}", " ")
+                f"- **People:** `({result.people_count})`\n"
+                + indent(f"- {'- '.join([f'`{ppl}{ln}`' for ppl in result.people_description])}", " ")
             )
         return (
             f"- **Description:** `{result.env_description}`\n"

src/main/askai/core/features/router/tools/webcam.py

Lines changed: 10 additions & 11 deletions
@@ -16,26 +16,25 @@ def webcam_capturer(photo_name: str | None, detect_faces: bool = False) -> str:
     """

     pic_file, pic_data = camera.capture(photo_name, with_caption=False)
-    face_desc: str | None = None
+    face_description: str | None = None
     ln: str = os.linesep

     if detect_faces:
         face_files, face_datas = camera.detect_faces(pic_data, photo_name)
         faces: int = len(face_files)
-        face_desc = (
-            f"- **Faces ({faces}):**\n"
-            + indent(f"- {'- '.join([ff.img_path + ln for ff in face_files])}", " ")
-            + f"- **Face Captions ({faces}):**\n"
-            + indent(f"- {'- '.join([basename(ff.img_path) + ': ' + ff.img_caption + ln for ff in face_files])}",
-                     " ")
+        face_description = (
+            f"- **Faces:** `({faces})`\n"
+            + indent(f"- {'- '.join([f'`{ff.img_path}` {ln}' for ff in face_files])}", " ")
+            + f"- **Face-Captions:** `({faces})`\n"
+            + indent(f"- {'- '.join([f'*{basename(ff.img_path)}*: `{ff.img_caption}` {ln}' for ff in face_files])}", " ")
         ) if faces else ''

-    people_desc: str = parse_caption(image_captioner(pic_file.img_path))
+    image_description: str = parse_caption(image_captioner(pic_file.img_path))

     return (
         f">  Photo Taken -> {pic_file.img_path}\n\n"
-        f"{people_desc or ''}"
-        f"{face_desc or ''}"
+        f"{image_description or ''}\n"
+        f"{face_description or ''}"
     )


@@ -46,8 +45,8 @@ def webcam_identifier(max_distance: int = configs.max_id_distance) -> str:
     if photo := camera.identify(3, max_distance):
         identity = (
             f">  Person Identified -> {photo.uri}\n\n"
-            f"- **Description:** `{photo.caption}`\n"
             f"- **Distance:** `{round(photo.distance, 4):.4f}/{round(max_distance, 4):.4f}`\n"
+            f"{photo.caption}\n"
         )

     return identity
src/main/askai/core/model/image_result.py

Lines changed: 3 additions & 7 deletions
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import AnyStr

 from pydantic import BaseModel, Field

@@ -13,9 +13,5 @@ class ImageResult(BaseModel):
     people_description: list[str] = Field(description="List of people description")

     @staticmethod
-    def to_image_result(from_dict: dict[str, Any]) -> 'ImageResult':
-        return ImageResult.MyModel.parse_obj(
-            from_dict['people_count'],
-            from_dict['main_objects'],
-            from_dict['env_description'],
-            from_dict['people_description'])
+    def of(image_caption: AnyStr) -> 'ImageResult':
+        return ImageResult.model_validate_json(str(image_caption).replace("'", '"'))
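`ImageResult.of` replaces the hand-rolled `to_image_result` builder: it swaps single quotes for double quotes and hands the string to pydantic's `model_validate_json`. A self-contained sketch with a made-up caption payload; every field except `people_description` is an assumption inferred from how `parse_caption` reads the model:

# ImageResult.of() sketch; field names/types beyond people_description are assumptions.
from pydantic import BaseModel, Field

class ImageResult(BaseModel):
    people_count: int = Field(default=0)
    main_objects: list[str] = Field(default_factory=list)
    env_description: str = Field(default="")
    people_description: list[str] = Field(default_factory=list)

    @staticmethod
    def of(image_caption: str) -> "ImageResult":
        # Vision models often answer with single-quoted pseudo-JSON; the naive quote
        # swap makes it parseable (it would break on values containing apostrophes).
        return ImageResult.model_validate_json(str(image_caption).replace("'", '"'))

caption = "{'people_count': 1, 'main_objects': ['laptop'], 'env_description': 'a bright office', 'people_description': ['a man wearing glasses']}"
print(ImageResult.of(caption).people_description[0])  # -> a man wearing glasses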

src/main/askai/core/support/utilities.py

Lines changed: 3 additions & 4 deletions
@@ -23,6 +23,9 @@
 from typing import Optional, AnyStr

 import pause
+from askai.core.support.presets import Presets
+from askai.core.support.text_formatter import text_formatter
+from askai.language.language import Language
 from clitt.core.term.cursor import Cursor
 from hspylib.core.config.path_object import PathObject
 from hspylib.core.enums.charset import Charset

@@ -33,10 +36,6 @@
 from hspylib.core.zoned_datetime import now_ms
 from hspylib.modules.cli.vt100.vt_color import VtColor

-from askai.core.support.presets import Presets
-from askai.core.support.text_formatter import text_formatter
-from askai.language.language import Language
-

 def read_stdin() -> Optional[str]:
     """Read input from the standard input (stdin).
