commit

LohiyaH · Jan 8, 2025 · c895226 · c895226
commit c895226
Show file tree

Hide file tree

Showing 4 changed files with 234 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+.env
+/.venv
diff --git a/README.md b/README.md
@@ -0,0 +1,51 @@
+# Voice Assistant
+
+This project is a sample AI assistant that uses OpenAI and Google Generative AI models to provide responses based on user prompts and webcam images. The assistant can also convert text responses to speech.
+
+## Prerequisites
+
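+1. **API Keys**: You need an `OPENAI_API_KEY` and a `GOOGLE_API_KEY` to run this code. The OpenAI key is required even when Gemini is the chat model, because speech synthesis uses OpenAI's text-to-speech API. Store them in a `.env` file in the root directory of the project, or set them as environment variables.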
+
+2. **Apple Silicon Users**: If you are running the code on Apple Silicon, install `portaudio` by running the following command:
+    ```sh
+    brew install portaudio
+    ```
+
+## Setup
+
+1. **Create a Virtual Environment**:
+    ```sh
+    python3 -m venv .venv
+    ```
+
+2. **Activate the Virtual Environment**:
+    ```sh
+    source .venv/bin/activate
+    ```
+
+3. **Update pip and Install Required Packages**:
+    ```sh
+    pip install -U pip
+    pip install -r requirements.txt
+    ```
+
+## Running the Assistant
+
+To start the assistant, run the following command:
+```sh
+python3 assistant.py
+```
+
+## Usage
+
+1. **Webcam Stream**: The assistant uses your webcam to capture images.
+2. **Voice Input**: Speak into your microphone to provide prompts.
+3. **Text-to-Speech**: The assistant will respond with synthesized speech.
+
+## Stopping the Assistant
+
+To stop the assistant, close the webcam window or press `Esc` or `q` while the webcam window has keyboard focus.
+
+## License
+
+This project is licensed under the MIT License.
diff --git a/assistant.py b/assistant.py
@@ -0,0 +1,171 @@
+import base64
+from threading import Lock, Thread
+
+import cv2
+import openai
+from cv2 import VideoCapture, imencode
+from dotenv import load_dotenv
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.schema.messages import SystemMessage
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_openai import ChatOpenAI
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pyaudio import PyAudio, paInt16
+from speech_recognition import Microphone, Recognizer, UnknownValueError
+
+load_dotenv()
+
+
+class WebcamStream:
+    def __init__(self):
+        self.stream = VideoCapture(index=0)
+        _, self.frame = self.stream.read()
+        self.running = False
+        self.lock = Lock()
+
+    def start(self):
+        if self.running:
+            return self
+
+        self.running = True
+
+        self.thread = Thread(target=self.update, args=())
+        self.thread.start()
+        return self
+
+    def update(self):
+        while self.running:
+            _, frame = self.stream.read()
+
+            self.lock.acquire()
+            self.frame = frame
+            self.lock.release()
+
+    def read(self, encode=False):
+        self.lock.acquire()
+        frame = self.frame.copy()
+        self.lock.release()
+
+        if encode:
+            _, buffer = imencode(".jpeg", frame)
+            return base64.b64encode(buffer)
+
+        return frame
+
+    def stop(self):
+        self.running = False
+        if self.thread.is_alive():
+            self.thread.join()
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        self.stream.release()
+
+
+class Assistant:
+    def __init__(self, model):
+        self.chain = self._create_inference_chain(model)
+
+    def answer(self, prompt, image):
+        if not prompt:
+            return
+
+        print("Prompt:", prompt)
+
+        response = self.chain.invoke(
+            {"prompt": prompt, "image_base64": image.decode()},
+            config={"configurable": {"session_id": "unused"}},
+        ).strip()
+
+        print("Response:", response)
+
+        if response:
+            self._tts(response)
+
+    def _tts(self, response):
+        player = PyAudio().open(format=paInt16, channels=1, rate=24000, output=True)
+
+        with openai.audio.speech.with_streaming_response.create(
+            model="tts-1",
+            voice="alloy",
+            response_format="pcm",
+            input=response,
+        ) as stream:
+            for chunk in stream.iter_bytes(chunk_size=1024):
+                player.write(chunk)
+
+    def _create_inference_chain(self, model):
+        SYSTEM_PROMPT = """
+        You are a witty assistant that will use the chat history and the image 
+        provided by the user to answer its questions.
+
+        Use few words on your answers. Go straight to the point. Do not use any
+        emoticons or emojis. Do not ask the user any questions.
+
+        Be friendly and helpful. Show some personality. Do not be too formal.
+        """
+
+        prompt_template = ChatPromptTemplate.from_messages(
+            [
+                SystemMessage(content=SYSTEM_PROMPT),
+                MessagesPlaceholder(variable_name="chat_history"),
+                (
+                    "human",
+                    [
+                        {"type": "text", "text": "{prompt}"},
+                        {
+                            "type": "image_url",
+                            "image_url": "data:image/jpeg;base64,{image_base64}",
+                        },
+                    ],
+                ),
+            ]
+        )
+
+        chain = prompt_template | model | StrOutputParser()
+
+        chat_message_history = ChatMessageHistory()
+        return RunnableWithMessageHistory(
+            chain,
+            lambda _: chat_message_history,
+            input_messages_key="prompt",
+            history_messages_key="chat_history",
+        )
+
+
+# Open the webcam and start the background capture thread at import time.
+webcam_stream = WebcamStream().start()
+
+# Default chat model: Gemini 1.5 Flash through langchain_google_genai.
+model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
+
+# You can use OpenAI's GPT-4o model instead of Gemini Flash
+# by uncommenting the following line:
+# model = ChatOpenAI(model="gpt-4o")
+
+# Single assistant instance used by the speech-recognition callback below.
+assistant = Assistant(model)
+
+
+def audio_callback(recognizer, audio):
+    try:
+        prompt = recognizer.recognize_whisper(audio, model="base", language="english")
+        assistant.answer(prompt, webcam_stream.read(encode=True))
+
+    except UnknownValueError:
+        print("There was an error processing the audio.")
+
+
+recognizer = Recognizer()
+microphone = Microphone()
+with microphone as source:
+    recognizer.adjust_for_ambient_noise(source)
+
+stop_listening = recognizer.listen_in_background(microphone, audio_callback)
+
+while True:
+    cv2.imshow("webcam", webcam_stream.read())
+    if cv2.waitKey(1) in [27, ord("q")]:
+        break
+
+webcam_stream.stop()
+cv2.destroyAllWindows()
+stop_listening(wait_for_stop=False)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,10 @@
+opencv-python
+langchain
+langchain-openai
+langchain_google_genai
+langchain-community
+python-dotenv
+pyaudio
+soundfile
+SpeechRecognition
+git+https://github.com/openai/whisper.git