Fix the image captioner (OpenAI vision)

yorevs · Nov 29, 2024 · 4fffabc · 4fffabc
1 parent 73d5abe
commit 4fffabc
Show file tree

Hide file tree

Showing 11 changed files with 140 additions and 46 deletions.
diff --git a/dependencies.hspd b/dependencies.hspd
@@ -18,21 +18,22 @@ package: pause, version: 0.3, mode: eq
 package: tqdm, version: 4.66.5, mode: eq
 package: pyperclip, version: 1.9.0, mode: eq
 package: python-magic, version: 0.4.27, mode: eq
-package: pytz, version: 2024.1, mode: eq
+package: pytz, version: 2024.2, mode: eq
 package: transitions, version: 0.9.2, mode: eq
+package: pydantic, version: 2.10.2, mode: eq
 
 /* LangChain */
-package: langchain, version: 0.3.4, mode: eq
-package: langchain-openai, version: 0.2.3, mode: eq
-package: langchain-community, version: 0.3.3, mode: eq
-package: langchain-google-community, version: 2.0.1, mode: eq
+package: langchain, version: 0.3.9, mode: eq
+package: langchain-openai, version: 0.2.10, mode: ge
+package: langchain-community, version: 0.3.8, mode: eq
+package: langchain-google-community, version: 2.0.3, mode: eq
 
 /* OpenAI */
 package: openai-whisper, version: 20240930, mode: eq
-package: openai, version: 1.52.1, mode: eq
+package: openai, version: 1.55.3, mode: eq
 
 /* Google */
-package: google-api-python-client, version: 2.149.0, mode: eq
+package: google-api-python-client, version: 2.154.0, mode: eq
 
 /* Web */
 package: fake_useragent, version: 1.5.1, mode: eq
@@ -44,7 +45,7 @@ package: html2text, version: 2024.2.26, mode: eq
 
 /* CLI/TUI */
 package: rich, version: 13.8.1, mode: eq
-package: textual, version: 0.86.3, mode: eq
+package: textual, version: 0.87.1, mode: eq
 
 /* Audio */
 package: soundfile, version: 0.12.1, mode: eq

diff --git a/src/demo/components/vision_demo.py b/src/demo/components/vision_demo.py
@@ -1,13 +1,18 @@
+from askai.core.engine.ai_vision import AIVision
+from askai.core.router.tools import vision
 from askai.core.router.tools.vision import offline_captioner
 
 import os
 
+from askai.core.support.shared_instances import shared
+from utils import init_context
+
 if __name__ == "__main__":
-    # init_context("vision-demo")
-    # vision: AIVision = shared.engine.vision()
-    load_dir: str = "${HOME}/.config/hhs/askai/cache/pictures/photos"
+    init_context("vision-demo")
+    vision: AIVision = shared.engine.vision()
+    load_dir: str = "/Users/hjunior/Library/CloudStorage/Dropbox/Media"
     image_file: str = "eu-edvaldo-suecia.jpg"
-    # result = vision.caption(image_file, load_dir)
-    # print(result)
-    result2 = offline_captioner(os.path.join(load_dir, image_file))
-    print(result2)
+    result = vision.caption(image_file, load_dir)
+    print(result)
+    # result2 = offline_captioner(os.path.join(load_dir, image_file))
+    # print(result2)
diff --git a/src/demo/devel/vision-tests.py b/src/demo/devel/vision-tests.py
@@ -0,0 +1,74 @@
+import base64
+
+from langchain.chains.transform import TransformChain
+from langchain_core.messages import HumanMessage
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.runnables import chain
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+from pydantic.v1 import Field
+
+
+def load_image(inputs: dict) -> dict:
+    """Load image from file and encode it as base64."""
+    image_path = inputs["image_path"]
+
+    def encode_image(image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    image_base64 = encode_image(image_path)
+    return {"image": image_base64}
+
+
+class ImageInformation(BaseModel):
+    """Information about an image."""
+    image_description: str = Field(description="a short description of the image")
+    people_count: int = Field(description="number of humans on the picture")
+    main_objects: list[str] = Field(description="list of the main objects on the picture")
+
+
+load_image_chain = TransformChain(
+    input_variables=["image_path", "parser_guides"],
+    output_variables=["image"],
+    transform=load_image
+)
+
+# JSON output parser; ImageInformation supplies the schema used for the format instructions
+parser = JsonOutputParser(pydantic_object=ImageInformation)
+
+
+def get_image_informations(image_path: str) -> dict:
+    vision_prompt = """
+    Given the image, provide the following information:
+    - A count of how many people are in the image
+    - A list of the main objects present in the image
+    - A description of the image
+    """
+    vision_chain = load_image_chain | image_model | parser
+    return vision_chain.invoke(
+        {
+            'image_path': f'{image_path}',
+            "parser_guides": parser.get_format_instructions(),
+            'prompt': vision_prompt
+        })
+
+
+@chain
+def image_model(inputs: dict) -> str | list[str] | dict:
+    """Invoke model with image and prompt."""
+    model = ChatOpenAI(temperature=0.5, model="gpt-4o-mini", max_tokens=1024)
+    msg = model.invoke(
+        [HumanMessage(
+            content=[
+                {"type": "text", "text": inputs["prompt"]},
+                {"type": "text", "text": parser.get_format_instructions()},
+                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"}},
+            ])]
+    )
+    return msg.content
+
+
+if __name__ == '__main__':
+    result = get_image_informations("/Users/hjunior/Library/CloudStorage/Dropbox/Media/eu-edvaldo-suecia.jpg")
+    print(result)
diff --git a/src/main/askai/core/engine/openai/openai_vision.py b/src/main/askai/core/engine/openai/openai_vision.py
@@ -99,12 +99,14 @@ def caption(
         check_argument(len((final_path := str(find_file(final_path) or ""))) > 0, f"Invalid image path: {final_path}")
         vision_prompt: str = self._get_vision_prompt(query, image_type)
         load_image_chain = TransformChain(
-            input_variables=["image_path", "parser_guides"], output_variables=["image"], transform_cb=self._encode_image
+            input_variables=["image_path", "parser_guides"],
+            output_variables=["image"],
+            transform=self._encode_image
         )
         out_parser: JsonOutputParser = self._get_out_parser(image_type)
         vision_chain = load_image_chain | self.create_image_caption_chain | out_parser
         args: dict[str, str] = {
-            "image_path": f"{final_path}",
+            "image_path": final_path,
             "prompt": vision_prompt,
             "parser_guides": out_parser.get_format_instructions(),
         }

diff --git a/src/main/askai/core/model/screenshot_result.py b/src/main/askai/core/model/screenshot_result.py
@@ -2,7 +2,6 @@
 import json
 from typing import AnyStr
 
-from kubernetes.watch.watch import SimpleNamespace
 from pydantic import BaseModel, Field
 
 

diff --git a/src/main/askai/core/processors/splitter/splitter_actions.py b/src/main/askai/core/processors/splitter/splitter_actions.py
@@ -35,6 +35,7 @@
 from askai.core.model.action_plan import ActionPlan
 from askai.core.model.ai_reply import AIReply
 from askai.core.model.model_result import ModelResult
+from askai.core.router.agent_tools import features
 from askai.core.router.task_agent import agent
 from askai.core.router.tools.general import final_answer
 from askai.core.support.langchain_support import lc_llm
@@ -138,6 +139,7 @@ def splitter_template(self, query: str) -> ChatPromptTemplate:
                         shell=prompt.shell,
                         datetime=geo_location.datetime,
                         home=Path.home(),
+                        agent_tools=features.available_tools,
                         rag=self._rag.get_rag_examples(query),
                     ),
                 ),

diff --git a/src/main/askai/core/router/agent_tools.py b/src/main/askai/core/router/agent_tools.py
@@ -67,6 +67,9 @@ def tools(self) -> list[BaseTool]:
 
     @property
     def available_tools(self) -> str:
+        """Returns a formatted string of available tools and their descriptions.
+        :return: A string listing tools with their descriptions, formatted for readability.
+        """
         avail_list: list[str] = list()
         for t in self.tools():
             if match := re.search(r"^```(.*?)^\s*Usage:", t.description, re.DOTALL | re.MULTILINE):

diff --git a/src/main/askai/core/router/task_agent.py b/src/main/askai/core/router/task_agent.py
@@ -12,6 +12,18 @@
 
    Copyright (c) 2024, HomeSetup
 """
+from typing import AnyStr, Optional
+import logging as log
+
+from pydantic import ValidationError
+from hspylib.core.config.path_object import PathObject
+from hspylib.core.metaclass.singleton import Singleton
+from langchain.agents import AgentExecutor, create_structured_chat_agent
+from langchain.memory.chat_memory import BaseChatMemory
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.runnables import Runnable
+import openai
+
 from askai.core.askai_configs import configs
 from askai.core.askai_events import events
 from askai.core.askai_messages import msg
@@ -21,17 +33,6 @@
 from askai.core.router.agent_tools import features
 from askai.core.support.langchain_support import lc_llm
 from askai.core.support.shared_instances import shared
-from hspylib.core.config.path_object import PathObject
-from hspylib.core.metaclass.singleton import Singleton
-from langchain.agents import AgentExecutor, create_structured_chat_agent
-from langchain.memory.chat_memory import BaseChatMemory
-from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-from langchain_core.runnables import Runnable
-from langchain_core.runnables.utils import Output
-from typing import AnyStr, Optional
-
-import logging as log
-import openai
 
 
 class TaskAgent(metaclass=Singleton):
@@ -83,9 +84,9 @@ def _create_lc_agent(self, temperature: Temperature = Temperature.COLDEST) -> Ru
         tools = features.tools()
         llm = lc_llm.create_chat_model(temperature.temp)
         chat_memory: BaseChatMemory = shared.memory
-        lc_agent = create_structured_chat_agent(llm, tools, self.agent_template)
+        chat_agent = create_structured_chat_agent(llm, tools, self.agent_template)
         lc_agent: Runnable = AgentExecutor(
-            agent=lc_agent,
+            agent=chat_agent,
             tools=tools,
             max_iterations=configs.max_agent_retries,
             memory=chat_memory,
@@ -96,21 +97,21 @@ def _create_lc_agent(self, temperature: Temperature = Temperature.COLDEST) -> Ru
 
         return lc_agent
 
-    def _exec_task(self, task: AnyStr) -> Optional[Output]:
+    def _exec_task(self, task: AnyStr) -> Optional[dict[str, str]]:
         """Execute the specified agent task.
         :param task: The task to be executed by the agent.
         :return: An instance of Output containing the result of the task, or None if the task fails or produces
         no output.
         """
+        output: dict[str, str] | None = None
         try:
             lc_agent: Runnable = self._create_lc_agent()
-            return lc_agent.invoke({"input": task})
-        except openai.APIError as err:
-            log.error(str(err))
-        except ValueError as err:
+            output: dict[str, str] = lc_agent.invoke({"input": task})
+        except (openai.APIError, ValueError, ValidationError) as err:
             log.error(str(err))
+            output: dict[str, str] = {'output': str(err)}
 
-        return None
+        return output
 
 
 assert (agent := TaskAgent().INSTANCE) is not None
diff --git a/src/main/askai/resources/prompts/task-splitter.txt b/src/main/askai/resources/prompts/task-splitter.txt
@@ -52,8 +52,14 @@ Step 5. Separate tasks clearly, avoiding the use of conjunctions like 'and', 'or
 
 11. When you lack an answer, need user assistance, or require real-time access, the "tasks" field will be: "[{{{{ "id": "id", "task": "Search google for: '<Your advanced google search query with filters> <Current Month> <Current Year>'" }}}}]".
 
+12. Prefer using the available agent tools over executing a terminal command.
 
-**Available Resources:**
+**Available Agent Tools:**
+
+{agent_tools}
+
+
+**Available Computer Resources:**
 
 1. WebCam access (to take photos and capture videos).
 2. Desktop (via screenshots).

diff --git a/src/main/askai/resources/rag/task-splitter.csv b/src/main/askai/resources/rag/task-splitter.csv
@@ -6,6 +6,7 @@ Identify who is in front of the webcam;N/A;[{{1. Use the webcam_identifier too t
 Describe me;N/A;[{{1. Use the webcam to take a photo and query about the person in front of it.}}]
 Describe the first image in my Downloads folder.;N/A;[{{1. List the user downloads folder}},{{2. Identify image files}},{{3. Describe the first image file}}]
 Describe what you see on my desktop.;N/A;[{{1. Use the screenshot tool to capture and describe the user's desktop. }}]
+Describe what you see on my screen.;N/A;[{{1. Use the screenshot tool to capture and describe the user's screen. }}]
 What is the size of the Moon?;Direct: The moon has a diameter of about 3.474 kilometers (2.159 miles).;N/A
 Hello, who are you?;Direct: Hello, I am Taius the AskAI helpful and kind assistant.;N/A
 find . -mxdepth 1 -type f -nme *.png;N/A;[{{1. Execute on terminal: 'find . -maxdepth 1 -type f -name *.png'}}]

diff --git a/src/main/requirements.txt b/src/main/requirements.txt
@@ -8,23 +8,23 @@ pause==0.3
 tqdm==4.66.5
 pyperclip==1.9.0
 python-magic==0.4.27
-pytz==2024.1
+pytz==2024.2
 transitions==0.9.2
-langchain==0.3.4
-langchain-openai==0.2.3
-langchain-community==0.3.3
-langchain-google-community==2.0.1
+langchain==0.3.9
+langchain-openai>=0.2.10
+langchain-community==0.3.8
+langchain-google-community==2.0.3
 openai-whisper==20240930
-openai==1.52.1
-google-api-python-client==2.149.0
+openai==1.55.3
+google-api-python-client==2.154.0
 fake_useragent==1.5.1
 requests==2.32.3
 urllib3==2.2.3
 protobuf==4.25.4
 aiohttp==3.10.5
 html2text==2024.2.26
 rich==13.8.1
-textual==0.86.3
+textual==0.87.1
 soundfile==0.12.1
 PyAudio==0.2.14
 SpeechRecognition==3.10.4