Skip to content

Commit

Permalink
commit
Browse files Browse the repository at this point in the history
  • Loading branch information
LohiyaH committed Jan 8, 2025
0 parents commit c895226
Show file tree
Hide file tree
Showing 4 changed files with 234 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.env
/.venv
51 changes: 51 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Voice Assistant

This project is a sample AI assistant that uses OpenAI and Google Generative AI models to provide responses based on user prompts and webcam images. The assistant can also convert text responses to speech.

## Prerequisites

1. **API Keys**: You need an `OPENAI_API_KEY` and a `GOOGLE_API_KEY` to run this code. Store them in a `.env` file in the root directory of the project, or set them as environment variables.

2. **Apple Silicon Users**: If you are running the code on Apple Silicon, install `portaudio` by running the following command:
```sh
brew install portaudio
```

## Setup

1. **Create a Virtual Environment**:
```sh
python3 -m venv .venv
```

2. **Activate the Virtual Environment**:
```sh
source .venv/bin/activate
```

3. **Update pip and Install Required Packages**:
```sh
pip install -U pip
pip install -r requirements.txt
```

## Running the Assistant

To start the assistant, run the following command:
```sh
python3 assistant.py
```

## Usage

1. **Webcam Stream**: The assistant uses your webcam to capture images.
2. **Voice Input**: Speak into your microphone to provide prompts.
3. **Text-to-Speech**: The assistant will respond with synthesized speech.

## Stopping the Assistant

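To stop the assistant, press `Esc` or `q` while the webcam window is focused. (Closing the webcam window alone does not stop it; the window is redrawn on the next frame.)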

## License

This project is licensed under the MIT License.
171 changes: 171 additions & 0 deletions assistant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import base64
from threading import Lock, Thread

import cv2
import openai
from cv2 import VideoCapture, imencode
from dotenv import load_dotenv
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import SystemMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from pyaudio import PyAudio, paInt16
from speech_recognition import Microphone, Recognizer, UnknownValueError

load_dotenv()


class WebcamStream:
    """Keep the most recent webcam frame available to other threads.

    A background thread started by ``start()`` repeatedly reads from the
    capture device and stores the newest frame in ``self.frame``. Access to
    that attribute is serialized through ``self.lock``.

    The class can be used as a context manager; leaving the ``with`` block
    stops the capture thread and releases the device.
    """

    def __init__(self):
        """Open capture device 0 and try to grab an initial frame."""
        self.stream = VideoCapture(index=0)
        # ``frame`` stays None when this first grab fails; read() reports
        # that case explicitly.
        _, self.frame = self.stream.read()
        self.running = False
        self.lock = Lock()
        # Created by start(); initialised here so stop() is safe to call
        # even when the stream was never started.
        self.thread = None

    def start(self):
        """Start the background capture thread (no-op if already running).

        Returns:
            This instance, so the call can be chained after construction.
        """
        if self.running:
            return self

        self.running = True

        self.thread = Thread(target=self.update, args=())
        self.thread.start()
        return self

    def update(self):
        """Capture loop executed on the background thread until stopped."""
        while self.running:
            grabbed, frame = self.stream.read()

            # Keep the last good frame instead of replacing it with None
            # when a grab fails, so read() keeps working.
            if not grabbed or frame is None:
                continue

            with self.lock:
                self.frame = frame

    def read(self, encode=False):
        """Return a copy of the latest frame.

        Args:
            encode: When true, JPEG-encode the frame and return it as
                base64 bytes instead of returning the raw frame.

        Returns:
            A copy of the current frame, or its base64-encoded JPEG bytes.

        Raises:
            RuntimeError: If no frame has been captured yet.
        """
        with self.lock:
            if self.frame is None:
                raise RuntimeError("No frame has been captured from the webcam.")
            frame = self.frame.copy()

        if encode:
            _, buffer = imencode(".jpeg", frame)
            return base64.b64encode(buffer)

        return frame

    def stop(self):
        """Ask the capture loop to finish and wait for its thread to exit."""
        self.running = False
        if self.thread is not None and self.thread.is_alive():
            self.thread.join()

    def __enter__(self):
        """Return the stream itself; callers decide when to ``start()``."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Stop capturing and release the underlying capture device."""
        # Join the capture thread first so it never reads a released device.
        self.stop()
        self.stream.release()


class Assistant:
    """Answer spoken prompts about a webcam image and speak the reply."""

    def __init__(self, model):
        """Build the inference chain around the given chat model.

        Args:
            model: Chat model that is piped after the prompt template.
        """
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Send the prompt and image to the model and speak the response.

        Args:
            prompt: Transcribed user request. Empty, ``None`` or
                whitespace-only prompts are ignored.
            image: Base64-encoded JPEG as bytes; decoded to text before
                being inserted into the data URL of the prompt template.
        """
        # Nothing worth sending to the model: skip silently.
        if not prompt or not prompt.strip():
            return

        print("Prompt:", prompt)

        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image.decode()},
            # The history factory ignores the session id, but the
            # history-aware runnable is still configured with one.
            config={"configurable": {"session_id": "unused"}},
        ).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        """Stream synthesized speech for ``response`` to the audio output.

        The audio device and output stream are always closed afterwards,
        including when the speech request or playback fails, so repeated
        answers do not accumulate open audio handles.
        """
        audio = PyAudio()
        try:
            # Mono 16-bit output at 24 kHz; presumably this matches the raw
            # "pcm" format requested below (confirm against the TTS docs).
            player = audio.open(format=paInt16, channels=1, rate=24000, output=True)
            try:
                with openai.audio.speech.with_streaming_response.create(
                    model="tts-1",
                    voice="alloy",
                    response_format="pcm",
                    input=response,
                ) as stream:
                    for chunk in stream.iter_bytes(chunk_size=1024):
                        player.write(chunk)
            finally:
                player.stop_stream()
                player.close()
        finally:
            audio.terminate()

    def _create_inference_chain(self, model):
        """Create the prompt -> model -> text chain with shared chat history.

        Args:
            model: Chat model placed between the prompt template and the
                string output parser.

        Returns:
            A history-aware runnable that expects ``prompt`` and
            ``image_base64`` inputs.
        """
        SYSTEM_PROMPT = """
You are a witty assistant that will use the chat history and the image
provided by the user to answer its questions.
Use few words on your answers. Go straight to the point. Do not use any
emoticons or emojis. Do not ask the user any questions.
Be friendly and helpful. Show some personality. Do not be too formal.
"""

        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        {
                            "type": "image_url",
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        chain = prompt_template | model | StrOutputParser()

        # A single in-memory history is shared by every call, regardless of
        # the session id handed to the factory.
        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )


# Begin capturing webcam frames in the background right away.
webcam_stream = WebcamStream()
webcam_stream.start()

# Gemini Flash is the default chat model. To use OpenAI's GPT-4o instead,
# replace the assignment below with:
#     model = ChatOpenAI(model="gpt-4o")
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")

assistant = Assistant(model=model)


def audio_callback(recognizer, audio):
    """Transcribe captured audio and let the assistant answer it.

    Registered with ``listen_in_background``, so it is invoked by the
    recognizer's listener for every captured phrase.

    Args:
        recognizer: Recognizer used to transcribe ``audio`` with Whisper.
        audio: Captured audio data handed over by the listener.
    """
    # Only the transcription can raise UnknownValueError, so keep the
    # handler scoped to that call.
    try:
        prompt = recognizer.recognize_whisper(audio, model="base", language="english")
    except UnknownValueError:
        print("There was an error processing the audio.")
        return

    # This callback is the top-level boundary of the listener: report any
    # failure (network, model, audio output) instead of letting it escape
    # and end background listening.
    try:
        assistant.answer(prompt, webcam_stream.read(encode=True))
    except Exception as error:
        print("There was an error answering the prompt:", error)


# Set up speech recognition, show the live webcam preview until the user
# presses Esc or "q", and always release the microphone listener, capture
# thread, camera and windows on the way out.
stop_listening = None

try:
    recognizer = Recognizer()
    microphone = Microphone()
    with microphone as source:
        # Calibrate the energy threshold before listening starts.
        recognizer.adjust_for_ambient_noise(source)

    stop_listening = recognizer.listen_in_background(microphone, audio_callback)

    while True:
        cv2.imshow("webcam", webcam_stream.read())
        # Mask to the low byte so the key code compares equal even when the
        # backend sets additional high bits; 27 is Esc.
        if cv2.waitKey(1) & 0xFF in (27, ord("q")):
            break
finally:
    # Runs on normal exit, on errors and on Ctrl-C, so the non-daemon
    # capture thread cannot keep the process alive.
    if stop_listening is not None:
        stop_listening(wait_for_stop=False)
    webcam_stream.stop()
    webcam_stream.stream.release()
    cv2.destroyAllWindows()
10 changes: 10 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
opencv-python
langchain
langchain-openai
langchain_google_genai
langchain-community
python-dotenv
pyaudio
soundfile
SpeechRecognition
git+https://github.com/openai/whisper.git

0 comments on commit c895226

Please sign in to comment.