Commit ef6122d

Minor bugfixes, Improved captioning - 2
1 parent 892d89c commit ef6122d

File tree

11 files changed: +116 -74 lines changed

dependencies.hspd

Lines changed: 2 additions & 2 deletions
@@ -38,9 +38,9 @@ package: pause, version: 0.3, mode: ge
 package: requests, version: 2.32.3, mode: ge
 package: urllib3, version: 1.26.20, mode: ge
 package: protobuf, version: 4.25.4, mode: ge
-package: torch, version: 2.4.0, mode: ge
 package: tqdm, version: 4.66.5, mode: ge
-package: torchvision, version: 0.19.0, mode: ge
+package: torch, version: 2.2.0, mode: ge
+package: torchvision, version: 0.17.2, mode: ge
 package: open-clip-torch, version 2.26.1, mode: ge
 package: transformers, version: 4.44.2, mode: ge
 package: pyperclip, version: 1.9.0, mode: ge
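torch is pinned back to 2.2.0 together with torchvision 0.17.2, the matching release pair for that torch line. A quick post-install check, as a sketch (the assertions simply mirror the minimum versions declared above; `mode: ge` allows newer releases as well):

# Sanity-check sketch: confirm the installed torch/torchvision pair satisfies the pins above.
import torch
import torchvision

print(torch.__version__, torchvision.__version__)
assert tuple(map(int, torch.__version__.split("+")[0].split(".")[:2])) >= (2, 2)
assert tuple(map(int, torchvision.__version__.split("+")[0].split(".")[:2])) >= (0, 17)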

src/demo/components/vision_demo.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 if __name__ == '__main__':
     init_context("vision-demo")
     vision: AIVision = shared.engine.vision()
-    load_dir: str = "/Users/hjunior/.config/hhs/askai/cache/pictures/photos"
+    load_dir: str = "${HOME}/.config/hhs/askai/cache/pictures/photos"
     image_file: str = "eu-edvaldo-suecia.jpg"
     result = vision.caption(image_file, load_dir)
     print(result)
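Swapping the developer-specific absolute path for a ${HOME} placeholder keeps the demo portable, but the literal string still has to be expanded before it reaches the filesystem. If the underlying loader does not already do this, a small expansion step such as the following sketch (names here are illustrative, not part of the library) would resolve it:

# Hypothetical expansion step for the demo's load_dir placeholder.
import os

load_dir = os.path.expandvars("${HOME}/.config/hhs/askai/cache/pictures/photos")
print(load_dir)  # e.g. /home/alice/.config/hhs/askai/cache/pictures/photos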

src/main/askai/core/commander/commander.py

Lines changed: 25 additions & 26 deletions
@@ -12,8 +12,17 @@

    Copyright (c) 2024, HomeSetup
 """
+import os
+import re
+from functools import partial
+from os.path import dirname
+from pathlib import Path
+from string import Template
+from textwrap import dedent
+
+import click
 from askai.core.askai_configs import configs
-from askai.core.askai_events import ASKAI_BUS_NAME, AskAiEvents, events, REPLY_ERROR_EVENT, REPLY_EVENT
+from askai.core.askai_events import ASKAI_BUS_NAME, AskAiEvents, REPLY_ERROR_EVENT, REPLY_EVENT
 from askai.core.commander.commands.cache_cmd import CacheCmd
 from askai.core.commander.commands.camera_cmd import CameraCmd
 from askai.core.commander.commands.general_cmd import GeneralCmd

@@ -26,18 +35,9 @@
 from askai.language.language import AnyLocale, Language
 from click import Command, Group
 from clitt.core.term.cursor import cursor
-from functools import partial
 from hspylib.core.enums.charset import Charset
 from hspylib.core.tools.commons import sysout, to_bool
 from hspylib.modules.eventbus.event import Event
-from os.path import dirname
-from pathlib import Path
-from string import Template
-from textwrap import dedent
-
-import click
-import os
-import re

 COMMANDER_HELP_TPL = Template(
     dedent(

@@ -135,18 +135,6 @@ def _init_context(context_size: int = 1000, engine_name: str = "openai", model_n
     :param engine_name: The name of the engine to initialize (default is "openai").
     :param model_name: The model name of the engine to initialize (default is "gpt-4o-mini").
     """
-    if not (shared.engine and shared.context):
-        shared.create_engine(engine_name=engine_name, model_name=model_name)
-        shared.create_context(context_size)
-        events.reply.subscribe(cb_event_handler=lambda ev: display_text(ev.args.message))
-
-
-@click.group()
-@click.pass_context
-def ask_commander(_) -> None:
-    """AskAI commands group. This function serves as the entry point for the AskAI command-line interface (CLI)
-    commands, grouping related commands together.
-    """

 def _reply_event(ev: Event, error: bool = False) -> None:
     """Callback for handling the reply event.

@@ -161,10 +149,21 @@ def _reply_event(ev: Event, error: bool = False) -> None:
             cursor.erase_line()
         display_text(message)

+    if shared.engine is None and shared.context is None:
+        shared.create_engine(engine_name=engine_name, model_name=model_name)
+        shared.create_context(context_size)
+        askai_bus = AskAiEvents.bus(ASKAI_BUS_NAME)
+        askai_bus.subscribe(REPLY_EVENT, _reply_event)
+        askai_bus.subscribe(REPLY_ERROR_EVENT, partial(_reply_event, error=True))
+
+
+@click.group()
+@click.pass_context
+def ask_commander(_) -> None:
+    """AskAI commands group. This function serves as the entry point for the AskAI command-line interface (CLI)
+    commands, grouping related commands together.
+    """
     _init_context()
-    askai_bus = AskAiEvents.bus(ASKAI_BUS_NAME)
-    askai_bus.subscribe(REPLY_EVENT, _reply_event)
-    askai_bus.subscribe(REPLY_ERROR_EVENT, partial(_reply_event, error=True))


 @ask_commander.command()

@@ -431,4 +430,4 @@ def camera(operation: str, args: tuple[str, ...]) -> None:


 if __name__ == "__main__":
-    ask_commander(["help", "camera"], standalone_mode=False)
+    ask_commander(["camera", "identify"], standalone_mode=False)

src/main/askai/core/commander/commands/camera_cmd.py

Lines changed: 7 additions & 9 deletions
@@ -1,26 +1,24 @@
 from abc import ABC
+
 from askai.core.askai_configs import configs
 from askai.core.component.camera import camera
-from askai.core.features.router.tools.webcam import webcam_identifier
+from askai.core.features.router.tools.webcam import webcam_identifier, webcam_capturer
 from askai.core.support.text_formatter import text_formatter
+from askai.core.support.utilities import display_text
 from hspylib.core.metaclass.classpath import AnyPath


 class CameraCmd(ABC):
     """Provides camera command functionalities."""

     @staticmethod
-    def capture(filename: AnyPath = None, detect_faces: bool = True, countdown: int = 3) -> None:
+    def capture(filename: AnyPath = None, detect_faces: bool = True) -> None:
         """Take a photo using the webcam.
         :param filename: The filename to save the photo under (optional).
         :param detect_faces: Whether to detect faces in the photo (default is True).
-        :param countdown: The countdown in seconds before the photo is taken (default is 3).
         """
-        if photo := camera.capture(filename, countdown):
-            text_formatter.cmd_print(f"Photo taken: %GREEN%{photo[0]}%NC%")
-            if detect_faces:
-                if len(faces := camera.detect_faces(photo[1], filename)) > 0:
-                    text_formatter.cmd_print(f"Faces detected: %GREEN%{len(faces[0])}%NC%")
+        if photo_description := webcam_capturer(filename, detect_faces):
+            display_text(photo_description)
         else:
             text_formatter.cmd_print("%RED%Unable to take photo!%NC%")

@@ -30,7 +28,7 @@ def identify(max_distance: int = configs.max_id_distance) -> None:
         :param max_distance: The maximum allowable distance for face recognition. A lower value means closer matching
         to the real face (default is configs.max_id_distance).
         """
-        text_formatter.cmd_print(webcam_identifier(max_distance))
+        display_text(webcam_identifier(max_distance))

     @staticmethod
     def import_images(pathname: AnyPath = None, detect_faces: bool = True) -> None:
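Both commands now delegate to the webcam tools and print their Markdown reports through `display_text`. A usage sketch, assuming the AskAI engine and context have already been initialized (for example via the CLI entry point above):

# Usage sketch: both commands are static, so no CameraCmd instance is needed.
from askai.core.commander.commands.camera_cmd import CameraCmd

CameraCmd.capture(filename=None, detect_faces=True)  # prints the webcam_capturer report
CameraCmd.identify()                                  # prints the webcam_identifier report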

src/main/askai/core/component/camera.py

Lines changed: 5 additions & 1 deletion
@@ -31,10 +31,12 @@
 from askai.core.component.cache_service import FACE_DIR, IMG_IMPORTS_DIR, PHOTO_DIR
 from askai.core.component.image_store import ImageData, ImageFile, ImageMetadata, store
 from askai.core.features.router.tools.vision import image_captioner, parse_caption
+from askai.core.model.image_result import ImageResult
 from askai.core.support.utilities import build_img_path
 from askai.exception.exceptions import CameraAccessFailure, WebCamInitializationFailure
 from hspylib.core.metaclass.classpath import AnyPath
 from hspylib.core.metaclass.singleton import Singleton
+from hspylib.core.tools.dict_tools import get_or_default
 from hspylib.core.tools.text_tools import hash_text
 from hspylib.core.zoned_datetime import now_ms
 from retry import retry

@@ -155,15 +157,17 @@ def detect_faces(
         if len(faces) == 0:
             return face_files, face_datas

+        filename: str = filename or str(now_ms())
         for x, y, w, h in faces:
             cropped_face: ImageData = photo[y : y + h, x : x + w]
             final_path: str = build_img_path(FACE_DIR, str(filename), f"-FACE-{len(face_files)}.jpg")
             if final_path and cv2.imwrite(final_path, cropped_face):
+                result: ImageResult = ImageResult.of(image_captioner(final_path))
                 face_file = ImageFile(
                     hash_text(basename(final_path)),
                     final_path,
                     store.FACE_CATEGORY,
-                    parse_caption(image_captioner(final_path)) if with_caption else msg.no_caption(),
+                    get_or_default(result.people_description, 0, '<N/A>') if with_caption else msg.no_caption(),
                 )
                 face_files.append(face_file)
                 face_datas.append(cropped_face)
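`detect_faces` now defaults the target filename to a timestamp and takes each cropped face's caption from the first entry of `ImageResult.people_description`, falling back to `<N/A>`. The `get_or_default` helper comes from hspylib; the sketch below only illustrates its assumed index-or-default behaviour and is not the library source:

# Assumed behaviour of hspylib's get_or_default (sketch): return the element at
# `index` when it exists, otherwise the supplied default value.
def get_or_default(items: list, index: int, default=None):
    return items[index] if 0 <= index < len(items) else default

print(get_or_default(["a man wearing glasses"], 0, "<N/A>"))  # -> a man wearing glasses
print(get_or_default([], 0, "<N/A>"))                         # -> <N/A>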

src/main/askai/core/engine/openai/openai_vision.py

Lines changed: 6 additions & 7 deletions
@@ -39,7 +39,7 @@ class OpenAIVision:
     _OUT_PARSER = JsonOutputParser(pydantic_object=ImageResult)

     @staticmethod
-    def _encode_image(inputs: dict) -> dict:
+    def _encode_image(inputs: dict) -> dict[str, str]:
         """Load an image from file and encode it as a base64 string.
         :param inputs: Dictionary containing the file path under a specific key.
         :return: Dictionary with the base64 encoded image string.

@@ -67,25 +67,24 @@ def create_image_caption_chain(inputs: dict) -> MessageContent:
         )
         return msg.content

-    @property
-    def template(self) -> str:
-        return dedent("""
+    def template(self, question: str | None = None) -> str:
+        return dedent(f"""
            Given the image, provide the following information:
            - A count of how many living beings are in the image.
            - A list of the main objects present in the image.
            - A description the atmosphere of the environment.
            - A list of detailed descriptions all living beings you find in the image.
-            """).strip()
+            {'- ' + question if question else ''}""").strip()

-    def caption(self, filename: AnyPath, load_dir: AnyPath | None) -> str:
+    def caption(self, filename: AnyPath, load_dir: AnyPath | None, question: str | None = None) -> str:
         """Generate a caption for the provided image.
         :param filename: File name of the image for which the caption is to be generated.
         :param load_dir: Optional directory path for loading related resources.
         :return: A dictionary containing the generated caption.
         """
         final_path: str = os.path.join(load_dir, filename) if load_dir else os.getcwd()
         check_argument(len((final_path := str(find_file(final_path) or ""))) > 0, f"Invalid image path: {final_path}")
-        vision_prompt = self.template
+        vision_prompt = self.template()
         load_image_chain = TransformChain(
             input_variables=["image_path"],
             output_variables=["image"],
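`template` changes from a read-only property to a method that can append an optional follow-up question as one extra bullet, and `caption` gains a matching `question` parameter (note that, as committed, `caption` still calls `self.template()` without forwarding it). A minimal reproduction of the new prompt building, with a made-up question and a shortened prompt body:

# Minimal reproduction of the new template() behaviour; the question is an example value.
from textwrap import dedent

def template(question: str | None = None) -> str:
    return dedent(f"""
        Given the image, provide the following information:
        - A count of how many living beings are in the image.
        - A list of the main objects present in the image.
        {'- ' + question if question else ''}""").strip()

print(template("Is the person wearing glasses?"))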

src/main/askai/core/features/router/tools/vision.py

Lines changed: 52 additions & 4 deletions
@@ -1,6 +1,8 @@
 import os
 from textwrap import indent

+import torch
+from PIL import Image
 from askai.core.askai_events import events
 from askai.core.askai_messages import msg
 from askai.core.component.cache_service import PICTURE_DIR

@@ -9,7 +11,53 @@
 from askai.core.model.image_result import ImageResult
 from askai.core.support.shared_instances import shared
 from hspylib.core.config.path_object import PathObject
+from hspylib.core.enums.enumeration import Enumeration
 from hspylib.core.metaclass.classpath import AnyPath
+from transformers import BlipForConditionalGeneration, BlipProcessor
+
+
+def offline_captioner(path_name: AnyPath) -> str:
+    """This tool is used to describe an image.
+    :param path_name: The path of the image to describe.
+    """
+
+    class HFModel(Enumeration):
+        """Available Hugging Face models"""
+
+        # fmt: off
+        SF_BLIP_BASE = "Salesforce/blip-image-captioning-base"
+        SF_BLIP_LARGE = "Salesforce/blip-image-captioning-large"
+        # fmt: on
+
+        @staticmethod
+        def default() -> "HFModel":
+            """Return the default HF model."""
+            return HFModel.SF_BLIP_LARGE
+
+    caption: str = "Not available"
+
+    posix_path: PathObject = PathObject.of(path_name)
+    if not posix_path.exists:
+        # Attempt to resolve cross-references
+        if history := str(shared.context.flat("HISTORY") or ""):
+            if (x_referenced := resolve_x_refs(path_name, history)) and x_referenced != shared.UNCERTAIN_ID:
+                x_ref_path: PathObject = PathObject.of(x_referenced)
+                posix_path: PathObject = x_ref_path if x_ref_path.exists else posix_path
+
+    if posix_path.exists:
+        events.reply.emit(message=msg.describe_image(str(posix_path)))
+        hf_model: HFModel = HFModel.default()
+        # Use GPU if it's available
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        image = Image.open(str(posix_path)).convert("RGB")
+        model = BlipForConditionalGeneration.from_pretrained(hf_model.value).to(device)
+        processor = BlipProcessor.from_pretrained(hf_model.value)
+        inputs = processor(images=image, return_tensors="pt").to(device)
+        outputs = model.generate(**inputs)
+        caption = processor.decode(outputs[0], skip_special_tokens=True)
+        caption = caption.title() if caption else "I could not caption the image"
+
+    return caption


 def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None) -> str:

@@ -30,7 +78,7 @@ def image_captioner(path_name: AnyPath, load_dir: AnyPath | None = None) -> str:
             posix_path: PathObject = x_ref_path if x_ref_path.exists else posix_path

     if posix_path.exists:
-        events.reply.emit(message=msg.describe_image(str(posix_path)))
+        events.reply.emit(message=msg.describe_image(str(posix_path)), verbosity="debug")
         vision: AIVision = shared.engine.vision()
         caption = vision.caption(posix_path.filename, load_dir or posix_path.abs_dir or PICTURE_DIR)

@@ -43,13 +91,13 @@ def parse_caption(image_caption: str) -> str:
     :return: The parsed caption as a string.
     """
     if image_caption:
-        result: ImageResult = ImageResult.model_validate_json(image_caption.replace("'", '"'))
+        result: ImageResult = ImageResult.of(image_caption)
         ln: str = os.linesep
         people_desc: str = ''
         if result.people_description:
             people_desc: str = (
-                f"- **People ({result.people_count}):**\n"
-                + indent(f"- {'- '.join([ppl + ln for ppl in result.people_description])}", " ")
+                f"- **People:** `({result.people_count})`\n"
+                + indent(f"- {'- '.join([f'`{ppl}{ln}`' for ppl in result.people_description])}", " ")
             )
         return (
             f"- **Description:** `{result.env_description}`\n"

src/main/askai/core/features/router/tools/webcam.py

Lines changed: 10 additions & 11 deletions
@@ -16,26 +16,25 @@ def webcam_capturer(photo_name: str | None, detect_faces: bool = False) -> str:
     """

     pic_file, pic_data = camera.capture(photo_name, with_caption=False)
-    face_desc: str | None = None
+    face_description: str | None = None
     ln: str = os.linesep

     if detect_faces:
         face_files, face_datas = camera.detect_faces(pic_data, photo_name)
         faces: int = len(face_files)
-        face_desc = (
-            f"- **Faces ({faces}):**\n"
-            + indent(f"- {'- '.join([ff.img_path + ln for ff in face_files])}", " ")
-            + f"- **Face Captions ({faces}):**\n"
-            + indent(f"- {'- '.join([basename(ff.img_path) + ': ' + ff.img_caption + ln for ff in face_files])}",
-                     " ")
+        face_description = (
+            f"- **Faces:** `({faces})`\n"
+            + indent(f"- {'- '.join([f'`{ff.img_path}` {ln}' for ff in face_files])}", " ")
+            + f"- **Face-Captions:** `({faces})`\n"
+            + indent(f"- {'- '.join([f'*{basename(ff.img_path)}*: `{ff.img_caption}` {ln}' for ff in face_files])}", " ")
         ) if faces else ''

-    people_desc: str = parse_caption(image_captioner(pic_file.img_path))
+    image_description: str = parse_caption(image_captioner(pic_file.img_path))

     return (
         f">  Photo Taken -> {pic_file.img_path}\n\n"
-        f"{people_desc or ''}"
-        f"{face_desc or ''}"
+        f"{image_description or ''}\n"
+        f"{face_description or ''}"
     )


@@ -46,8 +45,8 @@ def webcam_identifier(max_distance: int = configs.max_id_distance) -> str:
     if photo := camera.identify(3, max_distance):
         identity = (
             f">  Person Identified -> {photo.uri}\n\n"
-            f"- **Description:** `{photo.caption}`\n"
             f"- **Distance:** `{round(photo.distance, 4):.4f}/{round(max_distance, 4):.4f}`\n"
+            f"{photo.caption}\n"
         )

     return identity
src/main/askai/core/model/image_result.py

Lines changed: 3 additions & 7 deletions
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import AnyStr

 from pydantic import BaseModel, Field

@@ -13,9 +13,5 @@ class ImageResult(BaseModel):
     people_description: list[str] = Field(description="List of people description")

     @staticmethod
-    def to_image_result(from_dict: dict[str, Any]) -> 'ImageResult':
-        return ImageResult.MyModel.parse_obj(
-            from_dict['people_count'],
-            from_dict['main_objects'],
-            from_dict['env_description'],
-            from_dict['people_description'])
+    def of(image_caption: AnyStr) -> 'ImageResult':
+        return ImageResult.model_validate_json(str(image_caption).replace("'", '"'))
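`ImageResult.of` replaces the hand-rolled `to_image_result` builder: it swaps single quotes for double quotes and hands the string to pydantic's `model_validate_json`. A self-contained sketch with a made-up caption payload; every field except `people_description` is an assumption inferred from how `parse_caption` reads the model:

# ImageResult.of() sketch; field names/types beyond people_description are assumptions.
from pydantic import BaseModel, Field

class ImageResult(BaseModel):
    people_count: int = Field(default=0)
    main_objects: list[str] = Field(default_factory=list)
    env_description: str = Field(default="")
    people_description: list[str] = Field(default_factory=list)

    @staticmethod
    def of(image_caption: str) -> "ImageResult":
        # Vision models often answer with single-quoted pseudo-JSON; the naive quote
        # swap makes it parseable (it would break on values containing apostrophes).
        return ImageResult.model_validate_json(str(image_caption).replace("'", '"'))

caption = "{'people_count': 1, 'main_objects': ['laptop'], 'env_description': 'a bright office', 'people_description': ['a man wearing glasses']}"
print(ImageResult.of(caption).people_description[0])  # -> a man wearing glasses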

src/main/askai/core/support/utilities.py

Lines changed: 3 additions & 4 deletions
@@ -23,6 +23,9 @@
 from typing import Optional, AnyStr

 import pause
+from askai.core.support.presets import Presets
+from askai.core.support.text_formatter import text_formatter
+from askai.language.language import Language
 from clitt.core.term.cursor import Cursor
 from hspylib.core.config.path_object import PathObject
 from hspylib.core.enums.charset import Charset

@@ -33,10 +36,6 @@
 from hspylib.core.zoned_datetime import now_ms
 from hspylib.modules.cli.vt100.vt_color import VtColor

-from askai.core.support.presets import Presets
-from askai.core.support.text_formatter import text_formatter
-from askai.language.language import Language
-

 def read_stdin() -> Optional[str]:
     """Read input from the standard input (stdin).
