Skip to content

Commit

Permalink
Vision feature improvements, especially formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
yorevs committed Nov 2, 2024
1 parent b28439d commit 176b9b1
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 89 deletions.
6 changes: 3 additions & 3 deletions src/demo/components/webcam_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
Copyright (c) 2024, HomeSetup
"""
from askai.core.router.tools.webcam import webcam_capturer
from askai.core.router.tools.webcam import *
from hspylib.core.tools.commons import sysout
from utils import init_context

Expand All @@ -21,6 +21,6 @@
sysout("-=" * 40)
sysout("AskAI WebCam Demo")
sysout("-=" * 40)
info: str = webcam_capturer("hugo", True)
# info: str = webcam_identifier()
# info: str = webcam_capturer("hugo", True)
info: str = webcam_identifier()
sysout(info, markdown=True)
3 changes: 2 additions & 1 deletion src/demo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from askai.core.askai_events import events
from askai.core.commander.commander import commands
from askai.core.component.cache_service import cache
from askai.core.enums.router_mode import RouterMode
from askai.core.support.shared_instances import shared
from askai.core.support.utilities import display_text
from clitt.core.tui.line_input.keyboard_input import KeyboardInput
Expand Down Expand Up @@ -39,7 +40,7 @@ def init_context(
console_enable=console_enable,
)
KeyboardInput.preload_history(cache.load_input_history(commands()))
shared.create_engine(engine_name=engine_name, model_name=model_name)
shared.create_engine(engine_name=engine_name, model_name=model_name, mode=RouterMode.default())
shared.create_context(context_size)
events.reply.subscribe(cb_event_handler=lambda ev: display_text(ev.args.reply))
atexit.register(cache.save_input_history)
Expand Down
7 changes: 5 additions & 2 deletions src/main/askai/core/askai_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,13 @@ def goodbye(self) -> str:
return "Goodbye, have a nice day !"

def smile(self, countdown: int) -> str:
return f"\nSmile {str(countdown)} "
return f"\n Smile {str(countdown)}…"

def click(self) -> str:
    """Message displayed at the moment the picture is taken (emitted right after the shutter sound effect).
    :return: The "click" notification text.
    """
    return " !!! Click !!!"

def look_at_camera(self) -> str:
return "Look at the camera…"
return "Look at the camera…"

def cmd_success(self, command_line: AnyStr) -> str:
    """Build the confirmation message reporting that a command succeeded.
    :param command_line: The command line to mention in the message.
    :return: The success message, with the command line quoted in backticks.
    """
    return f"OK, command `{command_line}` succeeded"
Expand Down
2 changes: 1 addition & 1 deletion src/main/askai/core/component/camera.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def _countdown(count: int) -> None:
pause.seconds(1)
events.reply.emit(reply=AIReply.mute(msg.smile(i)), erase_last=True)
player.play_sfx("camera-shutter")
events.reply.emit(reply=AIReply.mute(" !!!Click!!!"), erase_last=True)
events.reply.emit(reply=AIReply.mute(msg.click()), erase_last=True)

def __init__(self):
self._cam = None
Expand Down
3 changes: 2 additions & 1 deletion src/main/askai/core/engine/ai_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@
class AIVision(Protocol):
"""Provide an interface for AI vision."""

def caption(self, filename: AnyPath, load_dir: AnyPath | None = None) -> str:
def caption(self, filename: AnyPath, load_dir: AnyPath | None, query: str | None = None) -> str:
"""Generate a caption for the provided image.
:param filename: File name of the image for which the caption is to be generated.
:param load_dir: Optional directory path for loading related resources.
:param query: Optional question about details of the image.
:return: A string containing the generated caption.
"""
...
13 changes: 6 additions & 7 deletions src/main/askai/core/engine/openai/openai_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,26 +72,25 @@ def create_image_caption_chain(inputs: dict) -> MessageContent:
return msg.content

def template(self, question: str | None = None) -> str:
return dedent(
f"""
return dedent(f"""\
Given the image, provide the following information:
- A count of how many living beings are in the image.
- A list of the main objects present in the image.
- A description the atmosphere of the environment.
- A list of detailed descriptions all living beings you find in the image.
{'- ' + question if question else ''}"""
).strip()
{'- ' + question if question else ''}""").strip()

@retry()
def caption(self, filename: AnyPath, load_dir: AnyPath | None, question: str | None = None) -> str:
def caption(self, filename: AnyPath, load_dir: AnyPath | None, query: str | None = None) -> str:
"""Generate a caption for the provided image.
:param filename: File name of the image for which the caption is to be generated.
:param load_dir: Optional directory path for loading related resources.
:return: A dictionary containing the generated caption.
:param query: Optional question about details of the image.
:return: A string containing the generated caption.
"""
final_path: str = os.path.join(load_dir, filename) if load_dir else os.getcwd()
check_argument(len((final_path := str(find_file(final_path) or ""))) > 0, f"Invalid image path: {final_path}")
vision_prompt = self.template()
vision_prompt = self.template(query)
load_image_chain = TransformChain(
input_variables=["image_path"], output_variables=["image"], transform=self._encode_image
)
Expand Down
35 changes: 18 additions & 17 deletions src/main/askai/core/router/agent_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@
Copyright (c) 2024, HomeSetup
"""
import inspect
import logging as log
import os
import re
from functools import lru_cache
from textwrap import dedent
from typing import Callable, Optional

from clitt.core.tui.line_input.line_input import line_input
from hspylib.core.metaclass.classpath import AnyPath
from hspylib.core.metaclass.singleton import Singleton
from langchain_core.tools import BaseTool, StructuredTool

from askai.core.askai_messages import msg
from askai.core.router.tools.analysis import query_output
from askai.core.router.tools.browser import browse, open_url
Expand All @@ -20,21 +33,8 @@
from askai.core.router.tools.summarization import summarize
from askai.core.router.tools.terminal import execute_command, list_contents, open_command
from askai.core.router.tools.vision import image_captioner, parse_caption
from askai.core.router.tools.webcam import webcam_capturer, webcam_identifier
from askai.core.router.tools.webcam import webcam_capturer, webcam_identifier, CAPTION_TEMPLATE
from askai.exception.exceptions import TerminatingQuery
from clitt.core.tui.line_input.line_input import line_input
from functools import lru_cache
from hspylib.core.metaclass.classpath import AnyPath
from hspylib.core.metaclass.singleton import Singleton
from hspylib.core.tools.text_tools import ensure_endswith, ensure_startswith
from langchain_core.tools import BaseTool, StructuredTool
from textwrap import dedent
from typing import Callable, Optional

import inspect
import logging as log
import os
import re


class AgentTools(metaclass=Singleton):
Expand Down Expand Up @@ -128,9 +128,10 @@ def image_captioner(self, image_path: str) -> str:
:param image_path: The absolute path of the image file to be analyzed.
:return: A string containing the generated caption describing the image.
"""
return ensure_endswith(
ensure_startswith(parse_caption(image_captioner(image_path)), f"\n>  Description of '{image_path}':\n"),
"\n",
image_caption: list[str] = parse_caption(image_captioner(image_path))
return CAPTION_TEMPLATE.substitute(
image_path=image_path,
image_caption=os.linesep.join(image_caption) if image_caption else ''
)

def webcam_capturer(self, photo_name: str | None, detect_faces: bool = False) -> str:
Expand Down
46 changes: 23 additions & 23 deletions src/main/askai/core/router/tools/vision.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
import os
from textwrap import indent

import pyautogui
import torch
from PIL import Image
from hspylib.core.config.path_object import PathObject
from hspylib.core.enums.enumeration import Enumeration
from hspylib.core.metaclass.classpath import AnyPath
from hspylib.core.preconditions import check_argument
from transformers import BlipForConditionalGeneration, BlipProcessor

from askai.core.askai_events import events
from askai.core.askai_messages import msg
from askai.core.component.cache_service import PICTURE_DIR
Expand All @@ -6,17 +18,6 @@
from askai.core.model.image_result import ImageResult
from askai.core.router.evaluation import resolve_x_refs
from askai.core.support.shared_instances import shared
from hspylib.core.config.path_object import PathObject
from hspylib.core.enums.enumeration import Enumeration
from hspylib.core.metaclass.classpath import AnyPath
from hspylib.core.preconditions import check_argument
from PIL import Image
from textwrap import indent
from transformers import BlipForConditionalGeneration, BlipProcessor

import os
import pyautogui
import torch


class HFModel(Enumeration):
Expand Down Expand Up @@ -92,7 +93,7 @@ def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None) -> str:
return image_caption


def parse_caption(image_caption: str) -> str:
def parse_caption(image_caption: str) -> list[str]:
"""Parse the given image caption.
:param image_caption: The caption to parse.
:return: The parsed caption as a list of strings.
Expand All @@ -102,17 +103,16 @@ def parse_caption(image_caption: str) -> str:
ln: str = os.linesep
people_desc: str = ""
if result.people_description:
people_desc: str = f"- **People:** `({result.people_count})`\n" + indent(
f"- {'- '.join([f'`{ppl}{ln}`' + ln for ppl in result.people_description])}", " "
)
# fmt: off
return (
f"- **Description:** `{result.env_description}`\n"
f"- **Objects:** `{', '.join(result.main_objects)}`\n"
f"{people_desc or ''}"
) # fmt: on

return msg.no_caption()
people_desc: list[str] = [
f"- **People:** `({result.people_count})`",
indent(f"- {'- '.join([f'`{ppl}{ln}`' + ln for ppl in result.people_description])}", " ")
]
return [
f"- **Description:** `{result.env_description}`",
f"- **Objects:** `{', '.join(result.main_objects)}`",
] + people_desc

return [msg.no_caption()]


def take_screenshot(path_name: AnyPath, load_dir: AnyPath | None = None) -> str:
Expand Down
86 changes: 52 additions & 34 deletions src/main/askai/core/router/tools/webcam.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,40 @@
import os
from os.path import basename
from string import Template
from textwrap import indent

from askai.core.askai_configs import configs
from askai.core.askai_events import events
from askai.core.askai_messages import msg
from askai.core.component.camera import camera
from askai.core.model.ai_reply import AIReply
from askai.core.router.tools.vision import image_captioner, parse_caption
from hspylib.core.tools.text_tools import ensure_endswith, ensure_startswith
from os.path import basename
from textwrap import indent

import os
# Reply layout for a captured webcam photo: the picture path, followed by the
# image description lines and the (possibly empty) face-detection lines.
# The backslash after the opening quotes suppresses the leading newline.
PHOTO_TEMPLATE: Template = Template("""\
>  Photo Taken -> ${pic_file}
${image_description}
${face_description}
""")

# Reply layout for a successful webcam identification: the matched photo URI,
# the measured distance (rendered by the caller as `distance/max_distance`)
# and the caption stored with the photo.
ID_TEMPLATE: Template = Template("""\
>  Person Identified -> ${photo_uri}
- **Distance:** `${distance}`
${photo_caption}
""")

# Reply layout for an image description: the image path followed by the
# caption lines joined by the caller.
CAPTION_TEMPLATE: Template = Template("""\
>  Description of `${image_path}`:
${image_caption}
""")


def webcam_capturer(photo_name: str | None, detect_faces: bool = False) -> str:
Expand All @@ -19,49 +45,41 @@ def webcam_capturer(photo_name: str | None, detect_faces: bool = False) -> str:
"""

pic_file, pic_data = camera.capture(photo_name, with_caption=False)
face_description: str | None = None
face_description: list[str] = []
ln: str = os.linesep

if detect_faces:
face_files, face_datas = camera.detect_faces(pic_data, photo_name)
faces: int = len(face_files)
face_description = (
(
f"- **Faces:** `({faces})`\n"
+ indent(f"- {'- '.join([f'`{ff.img_path}` {ln}' for ff in face_files])}", " ")
+ f"- **Face-Captions:** `({faces})`\n"
+ indent(
f"- {'- '.join([f'*{basename(ff.img_path)}*: `{ff.img_caption}` {ln}' for ff in face_files])}",
" ",
)
)
if faces
else ""
)
face_description: list[str] = [
f"- **Faces:** `({faces})`",
indent(f"- {'- '.join([f'`{ff.img_path}` {ln}' for ff in face_files])}", " "),
f"- **Face-Captions:** `({faces})`",
indent(
f"- {'- '.join([f'*{basename(ff.img_path)}*: `{ff.img_caption}` {ln}' for ff in face_files])}",
" "),
] if faces else []

image_description: str = parse_caption(image_captioner(pic_file.img_path))
image_description: list[str] = parse_caption(image_captioner(pic_file.img_path))

# fmt: off
return ensure_endswith(ensure_startswith(
f"\n>  Photo Taken -> {pic_file.img_path}\n\n"
f"{image_description or ''}\n"
f"{face_description or ''}", "\n"
), "\n") # fmt: on
return PHOTO_TEMPLATE.substitute(
pic_file=pic_file.img_path,
image_description=os.linesep.join(image_description) if image_description else '',
face_description=os.linesep.join(face_description) if face_description else ''
)


def webcam_identifier(max_distance: int = configs.max_id_distance) -> str:
"""Identifies the person in front of the webcam and provides a description of them.
:param max_distance: The maximum distance for identifying the person based on image similarity.
:return: A description of the identified person.
"""
identity: str = "%ORANGE% No identification was possible!%NC%"
events.reply.emit(reply=AIReply.debug(msg.look_at_camera()))
if photo := camera.identify(3, max_distance):
# fmt: off
identity = ensure_endswith(ensure_startswith(
f"\n>  Person Identified -> {photo.uri}\n\n"
f"- **Distance:** `{round(photo.distance, 4):.4f}/{round(max_distance, 4):.4f}`\n"
f"{photo.caption}", "\n"
), "\n") # fmt: on

return identity
return ID_TEMPLATE.substitute(
photo_uri=photo.uri,
distance=f"{round(photo.distance, 4):.4f}/{round(max_distance, 4):.4f}",
photo_caption=photo.caption
)

return "%ORANGE% No identification was possible!%NC%"

0 comments on commit 176b9b1

Please sign in to comment.