Skip to content

Commit

Permalink
Fix the image captioner (openai vision)
Browse files Browse the repository at this point in the history
  • Loading branch information
yorevs committed Nov 29, 2024
1 parent 73d5abe commit 4fffabc
Show file tree
Hide file tree
Showing 11 changed files with 140 additions and 46 deletions.
17 changes: 9 additions & 8 deletions dependencies.hspd
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,22 @@ package: pause, version: 0.3, mode: eq
package: tqdm, version: 4.66.5, mode: eq
package: pyperclip, version: 1.9.0, mode: eq
package: python-magic, version: 0.4.27, mode: eq
package: pytz, version: 2024.1, mode: eq
package: pytz, version: 2024.2, mode: eq
package: transitions, version: 0.9.2, mode: eq
package: pydantic, version: 2.10.2, mode: eq

/* LangChain */
package: langchain, version: 0.3.4, mode: eq
package: langchain-openai, version: 0.2.3, mode: eq
package: langchain-community, version: 0.3.3, mode: eq
package: langchain-google-community, version: 2.0.1, mode: eq
package: langchain, version: 0.3.9, mode: eq
package: langchain-openai, version: 0.2.10, mode: ge
package: langchain-community, version: 0.3.8, mode: eq
package: langchain-google-community, version: 2.0.3, mode: eq

/* OpenAI */
package: openai-whisper, version: 20240930, mode: eq
package: openai, version: 1.52.1, mode: eq
package: openai, version: 1.55.3, mode: eq

/* Google */
package: google-api-python-client, version: 2.149.0, mode: eq
package: google-api-python-client, version: 2.154.0, mode: eq

/* Web */
package: fake_useragent, version: 1.5.1, mode: eq
Expand All @@ -44,7 +45,7 @@ package: html2text, version: 2024.2.26, mode: eq

/* CLI/TUI */
package: rich, version: 13.8.1, mode: eq
package: textual, version: 0.86.3, mode: eq
package: textual, version: 0.87.1, mode: eq

/* Audio */
package: soundfile, version: 0.12.1, mode: eq
Expand Down
19 changes: 12 additions & 7 deletions src/demo/components/vision_demo.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
from askai.core.engine.ai_vision import AIVision
from askai.core.router.tools import vision
from askai.core.router.tools.vision import offline_captioner

import os

from askai.core.support.shared_instances import shared
from utils import init_context

if __name__ == "__main__":
# init_context("vision-demo")
# vision: AIVision = shared.engine.vision()
load_dir: str = "${HOME}/.config/hhs/askai/cache/pictures/photos"
init_context("vision-demo")
vision: AIVision = shared.engine.vision()
load_dir: str = "/Users/hjunior/Library/CloudStorage/Dropbox/Media"
image_file: str = "eu-edvaldo-suecia.jpg"
# result = vision.caption(image_file, load_dir)
# print(result)
result2 = offline_captioner(os.path.join(load_dir, image_file))
print(result2)
result = vision.caption(image_file, load_dir)
print(result)
# result2 = offline_captioner(os.path.join(load_dir, image_file))
# print(result2)
74 changes: 74 additions & 0 deletions src/demo/devel/vision-tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import base64

from langchain.chains.transform import TransformChain
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import chain
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from pydantic.v1 import Field


def load_image(inputs: dict) -> dict:
    """Read the image referenced by ``inputs`` and return it base64-encoded.

    :param inputs: Mapping holding the file location under the ``"image_path"`` key.
    :return: A dict with a single ``"image"`` entry: the file's bytes as base64 text.
    """
    with open(inputs["image_path"], "rb") as fh:
        raw_bytes = fh.read()
    encoded = base64.b64encode(raw_bytes)
    return {"image": encoded.decode("utf-8")}


# Bind pydantic v2's Field here: the class derives from the v2 BaseModel, which does not
# recognise FieldInfo objects created by pydantic.v1.Field (it would treat them as plain
# default values, dropping the descriptions and making every field optional).
from pydantic import Field


class ImageInformation(BaseModel):
    """Structured information requested from the vision model about an image.

    The model is handed to ``JsonOutputParser``; the per-field descriptions below are the
    guidance for each key of the expected JSON answer.
    """

    # Free-text summary of what the picture shows.
    image_description: str = Field(description="a short description of the image")
    # How many humans are visible.
    people_count: int = Field(description="number of humans on the picture")
    # Names of the most prominent objects.
    main_objects: list[str] = Field(description="list of the main objects on the picture")


# Chain step that runs load_image over the inputs, producing the base64 "image" output.
load_image_chain = TransformChain(
    transform=load_image,
    input_variables=["image_path", "parser_guides"],
    output_variables=["image"],
)

# JSON output parser configured with the ImageInformation model.
parser = JsonOutputParser(pydantic_object=ImageInformation)


def get_image_informations(image_path: str) -> dict:
    """Run the vision pipeline for one image and return the parsed answer.

    :param image_path: Location of the image file to analyse.
    :return: The value produced by the last pipeline step (the JSON output parser).
    """
    # The leading and trailing newlines are part of the prompt text.
    vision_prompt = (
        "\n"
        "Given the image, provide the following information:\n"
        "- A count of how many people are in the image\n"
        "- A list of the main objects present in the image\n"
        "- A description of the image\n"
    )
    # load image -> query the model -> parse the reply
    pipeline = load_image_chain | image_model | parser
    chain_inputs = {
        'image_path': f'{image_path}',
        "parser_guides": parser.get_format_instructions(),
        'prompt': vision_prompt,
    }
    return pipeline.invoke(chain_inputs)


@chain
def image_model(inputs: dict) -> str | list[str] | dict:
    """Send the prompt, the parser's format instructions and the image to the chat model.

    :param inputs: Mapping providing the ``"prompt"`` text and the base64 ``"image"`` payload.
    :return: The ``content`` of the message returned by the model.
    """
    llm = ChatOpenAI(temperature=0.5, model="gpt-4o-mini", max_tokens=1024)
    prompt_part = {"type": "text", "text": inputs["prompt"]}
    format_part = {"type": "text", "text": parser.get_format_instructions()}
    # The image is embedded inline as a data URL; the MIME type is fixed to JPEG.
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"},
    }
    reply = llm.invoke([HumanMessage(content=[prompt_part, format_part, image_part])])
    return reply.content


if __name__ == '__main__':
    import sys

    # Sample image used when no path is given; pass another image path as the first
    # command-line argument to analyse a different file.
    default_image = "/Users/hjunior/Library/CloudStorage/Dropbox/Media/eu-edvaldo-suecia.jpg"
    image_file = sys.argv[1] if len(sys.argv) > 1 else default_image
    result = get_image_informations(image_file)
    print(result)
6 changes: 4 additions & 2 deletions src/main/askai/core/engine/openai/openai_vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,14 @@ def caption(
check_argument(len((final_path := str(find_file(final_path) or ""))) > 0, f"Invalid image path: {final_path}")
vision_prompt: str = self._get_vision_prompt(query, image_type)
load_image_chain = TransformChain(
input_variables=["image_path", "parser_guides"], output_variables=["image"], transform_cb=self._encode_image
input_variables=["image_path", "parser_guides"],
output_variables=["image"],
transform=self._encode_image
)
out_parser: JsonOutputParser = self._get_out_parser(image_type)
vision_chain = load_image_chain | self.create_image_caption_chain | out_parser
args: dict[str, str] = {
"image_path": f"{final_path}",
"image_path": final_path,
"prompt": vision_prompt,
"parser_guides": out_parser.get_format_instructions(),
}
Expand Down
1 change: 0 additions & 1 deletion src/main/askai/core/model/screenshot_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import json
from typing import AnyStr

from kubernetes.watch.watch import SimpleNamespace
from pydantic import BaseModel, Field


Expand Down
2 changes: 2 additions & 0 deletions src/main/askai/core/processors/splitter/splitter_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from askai.core.model.action_plan import ActionPlan
from askai.core.model.ai_reply import AIReply
from askai.core.model.model_result import ModelResult
from askai.core.router.agent_tools import features
from askai.core.router.task_agent import agent
from askai.core.router.tools.general import final_answer
from askai.core.support.langchain_support import lc_llm
Expand Down Expand Up @@ -138,6 +139,7 @@ def splitter_template(self, query: str) -> ChatPromptTemplate:
shell=prompt.shell,
datetime=geo_location.datetime,
home=Path.home(),
agent_tools=features.available_tools,
rag=self._rag.get_rag_examples(query),
),
),
Expand Down
3 changes: 3 additions & 0 deletions src/main/askai/core/router/agent_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ def tools(self) -> list[BaseTool]:

@property
def available_tools(self) -> str:
"""Returns a formatted string of available tools and their descriptions.
:return: A string listing tools with their descriptions, formatted for readability.
"""
avail_list: list[str] = list()
for t in self.tools():
if match := re.search(r"^```(.*?)^\s*Usage:", t.description, re.DOTALL | re.MULTILINE):
Expand Down
39 changes: 20 additions & 19 deletions src/main/askai/core/router/task_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,18 @@
Copyright (c) 2024, HomeSetup
"""
from typing import AnyStr, Optional
import logging as log

from pydantic import ValidationError
from hspylib.core.config.path_object import PathObject
from hspylib.core.metaclass.singleton import Singleton
from langchain.agents import AgentExecutor, create_structured_chat_agent
from langchain.memory.chat_memory import BaseChatMemory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import Runnable
import openai

from askai.core.askai_configs import configs
from askai.core.askai_events import events
from askai.core.askai_messages import msg
Expand All @@ -21,17 +33,6 @@
from askai.core.router.agent_tools import features
from askai.core.support.langchain_support import lc_llm
from askai.core.support.shared_instances import shared
from hspylib.core.config.path_object import PathObject
from hspylib.core.metaclass.singleton import Singleton
from langchain.agents import AgentExecutor, create_structured_chat_agent
from langchain.memory.chat_memory import BaseChatMemory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import Runnable
from langchain_core.runnables.utils import Output
from typing import AnyStr, Optional

import logging as log
import openai


class TaskAgent(metaclass=Singleton):
Expand Down Expand Up @@ -83,9 +84,9 @@ def _create_lc_agent(self, temperature: Temperature = Temperature.COLDEST) -> Ru
tools = features.tools()
llm = lc_llm.create_chat_model(temperature.temp)
chat_memory: BaseChatMemory = shared.memory
lc_agent = create_structured_chat_agent(llm, tools, self.agent_template)
chat_agent = create_structured_chat_agent(llm, tools, self.agent_template)
lc_agent: Runnable = AgentExecutor(
agent=lc_agent,
agent=chat_agent,
tools=tools,
max_iterations=configs.max_agent_retries,
memory=chat_memory,
Expand All @@ -96,21 +97,21 @@ def _create_lc_agent(self, temperature: Temperature = Temperature.COLDEST) -> Ru

return lc_agent

def _exec_task(self, task: AnyStr) -> Optional[Output]:
def _exec_task(self, task: AnyStr) -> Optional[dict[str, str]]:
"""Execute the specified agent task.
:param task: The task to be executed by the agent.
:return: A dictionary with the result of the task (on failure, its 'output' entry holds the error message),
or None if the agent returns nothing.
"""
output: dict[str, str] | None = None
try:
lc_agent: Runnable = self._create_lc_agent()
return lc_agent.invoke({"input": task})
except openai.APIError as err:
log.error(str(err))
except ValueError as err:
output: dict[str, str] = lc_agent.invoke({"input": task})
except (openai.APIError, ValueError, ValidationError) as err:
log.error(str(err))
output: dict[str, str] = {'output': str(err)}

return None
return output


assert (agent := TaskAgent().INSTANCE) is not None
8 changes: 7 additions & 1 deletion src/main/askai/resources/prompts/task-splitter.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,14 @@ Step 5. Separate tasks clearly, avoiding the use of conjunctions like 'and', 'or

11. When you lack an answer, need user assistance, or require real-time access, the "tasks" field will be: "[{{{{ "id": "id", "task": "Search google for: '<Your advanced google search query with filters> <Current Month> <Current Year>'" }}}}]".

12. Prefer using the available agent tools over executing a terminal command.

**Available Resources:**
**Available Agent Tools:**

{agent_tools}


**Available Computer Resources:**

1. WebCam access (to take photos and capture videos).
2. Desktop (via screenshots).
Expand Down
1 change: 1 addition & 0 deletions src/main/askai/resources/rag/task-splitter.csv
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Identify who is in front of the webcam;N/A;[{{1. Use the webcam_identifier too t
Describe me;N/A;[{{1. Use the webcam to take a photo and query about the person in front of it.}}]
Describe the first image in my Downloads folder.;N/A;[{{1. List the user downloads folder}},{{2. Identify image files}},{{3. Describe the first image file}}]
Describe what you see on my desktop.;N/A;[{{1. Use the screenshot tool to capture and describe the user's desktop. }}]
Describe what you see on my screen.;N/A;[{{1. Use the screenshot tool to capture and describe the user's screen. }}]
What is the size of the Moon?;Direct: The moon has a diameter of about 3.474 kilometers (2.159 miles).;N/A
Hello, who are you?;Direct: Hello, I am Taius the AskAI helpful and kind assistant.;N/A
find . -mxdepth 1 -type f -nme *.png;N/A;[{{1. Execute on terminal: 'find . -maxdepth 1 -type f -name *.png'}}]
Expand Down
16 changes: 8 additions & 8 deletions src/main/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,23 @@ pause==0.3
tqdm==4.66.5
pyperclip==1.9.0
python-magic==0.4.27
pytz==2024.1
pytz==2024.2
transitions==0.9.2
langchain==0.3.4
langchain-openai==0.2.3
langchain-community==0.3.3
langchain-google-community==2.0.1
langchain==0.3.9
langchain-openai>=0.2.10
langchain-community==0.3.8
langchain-google-community==2.0.3
openai-whisper==20240930
openai==1.52.1
google-api-python-client==2.149.0
openai==1.55.3
google-api-python-client==2.154.0
fake_useragent==1.5.1
requests==2.32.3
urllib3==2.2.3
protobuf==4.25.4
aiohttp==3.10.5
html2text==2024.2.26
rich==13.8.1
textual==0.86.3
textual==0.87.1
soundfile==0.12.1
PyAudio==0.2.14
SpeechRecognition==3.10.4
Expand Down

0 comments on commit 4fffabc

Please sign in to comment.