-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit c895226
Showing
4 changed files
with
234 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
.env | ||
/.venv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Voice Assistant | ||
|
||
This project is a sample AI assistant that uses OpenAI and Google Generative AI models to provide responses based on user prompts and webcam images. The assistant can also convert text responses to speech. | ||
|
||
## Prerequisites | ||
|
||
1. **API Keys**: You need an `OPENAI_API_KEY` and a `GOOGLE_API_KEY` to run this code. Store them in a `.env` file in the root directory of the project, or set them as environment variables. | ||
|
||
2. **Apple Silicon Users**: If you are running the code on Apple Silicon, install `portaudio` by running the following command: | ||
```sh | ||
brew install portaudio | ||
``` | ||
|
||
## Setup | ||
|
||
1. **Create a Virtual Environment**: | ||
```sh | ||
python3 -m venv .venv | ||
``` | ||
|
||
2. **Activate the Virtual Environment**: | ||
```sh | ||
source .venv/bin/activate | ||
``` | ||
|
||
3. **Update pip and Install Required Packages**: | ||
```sh | ||
pip install -U pip | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Running the Assistant | ||
|
||
To start the assistant, run the following command: | ||
```sh | ||
python3 assistant.py | ||
``` | ||
|
||
## Usage | ||
|
||
1. **Webcam Stream**: The assistant uses your webcam to capture images. | ||
2. **Voice Input**: Speak into your microphone to provide prompts. | ||
3. **Text-to-Speech**: The assistant will respond with synthesized speech. | ||
|
||
## Stopping the Assistant | ||
|
||
To stop the assistant, press `Esc` or `q` while the webcam window has focus. | ||
|
||
## License | ||
|
||
This project is licensed under the MIT License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
import base64 | ||
from threading import Lock, Thread | ||
|
||
import cv2 | ||
import openai | ||
from cv2 import VideoCapture, imencode | ||
from dotenv import load_dotenv | ||
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder | ||
from langchain.schema.messages import SystemMessage | ||
from langchain_community.chat_message_histories import ChatMessageHistory | ||
from langchain_core.output_parsers import StrOutputParser | ||
from langchain_core.runnables.history import RunnableWithMessageHistory | ||
from langchain_openai import ChatOpenAI | ||
from langchain_google_genai import ChatGoogleGenerativeAI | ||
from pyaudio import PyAudio, paInt16 | ||
from speech_recognition import Microphone, Recognizer, UnknownValueError | ||
|
||
load_dotenv() | ||
|
||
|
||
class WebcamStream:
    """Keep the most recent webcam frame available to other threads.

    A background thread repeatedly reads from capture device 0 and stores
    the latest frame in ``self.frame``; ``self.lock`` guards every access to
    that attribute. The object can be driven manually with ``start()`` /
    ``stop()`` or used as a context manager.
    """

    def __init__(self):
        self.stream = VideoCapture(index=0)
        # Prime ``frame`` so ``read()`` can be served before the worker runs.
        # ``frame`` is None when the device returned nothing.
        _, self.frame = self.stream.read()
        self.running = False
        # Created by ``start()``; kept as None so ``stop()`` is always safe.
        self.thread = None
        self.lock = Lock()

    def start(self):
        """Start the capture thread (no-op if already running); return self."""
        if self.running:
            return self

        self.running = True
        self.thread = Thread(target=self.update, args=())
        self.thread.start()
        return self

    def update(self):
        """Worker loop: publish every successfully grabbed frame."""
        while self.running:
            grabbed, frame = self.stream.read()
            # A failed grab must not overwrite the last good frame,
            # otherwise ``read()`` would fail on ``None.copy()``.
            if not grabbed or frame is None:
                continue

            with self.lock:
                self.frame = frame

    def read(self, encode=False):
        """Return a copy of the latest frame.

        Args:
            encode: When true, return the frame JPEG-encoded and then
                base64-encoded (as ``bytes``) instead of the raw frame.

        Raises:
            RuntimeError: If no frame has been captured so far.
        """
        with self.lock:
            if self.frame is None:
                raise RuntimeError("No frame has been captured from the webcam.")
            frame = self.frame.copy()

        if encode:
            _, buffer = imencode(".jpeg", frame)
            return base64.b64encode(buffer)

        return frame

    def stop(self):
        """Ask the worker to finish and wait for it; safe before ``start()``."""
        self.running = False
        worker = getattr(self, "thread", None)
        if worker is not None and worker.is_alive():
            worker.join()

    def __enter__(self):
        return self.start()

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Stop the reader first so the device is not released mid-read.
        self.stop()
        self.stream.release()
|
||
|
||
class Assistant:
    """Answer spoken prompts about a webcam image and speak the reply.

    Wraps a LangChain chat model in a prompt + history chain and plays the
    model's text reply through OpenAI text-to-speech.
    """

    def __init__(self, model):
        self.chain = self._create_inference_chain(model)

    def answer(self, prompt, image):
        """Send ``prompt`` and ``image`` to the model and speak the reply.

        Args:
            prompt: Transcribed user text. Empty, ``None`` or whitespace-only
                prompts are ignored.
            image: Base64-encoded JPEG as ``bytes`` (it is ``.decode()``-d
                before being placed in the prompt).
        """
        # Skip blank transcripts so no request is made for them.
        if not prompt or not prompt.strip():
            return

        print("Prompt:", prompt)

        response = self.chain.invoke(
            {"prompt": prompt, "image_base64": image.decode()},
            config={"configurable": {"session_id": "unused"}},
        ).strip()

        print("Response:", response)

        if response:
            self._tts(response)

    def _tts(self, response):
        """Stream ``response`` as speech to the default audio output."""
        audio = PyAudio()
        try:
            # Mono, 16-bit samples at 24000 Hz, fed with the "pcm" response
            # format requested from the speech endpoint below.
            player = audio.open(
                format=paInt16, channels=1, rate=24000, output=True
            )
            try:
                with openai.audio.speech.with_streaming_response.create(
                    model="tts-1",
                    voice="alloy",
                    response_format="pcm",
                    input=response,
                ) as stream:
                    for chunk in stream.iter_bytes(chunk_size=1024):
                        player.write(chunk)
            finally:
                # Release the output stream even if playback failed.
                player.stop_stream()
                player.close()
        finally:
            # One PyAudio instance per call; terminate it to avoid a leak.
            audio.terminate()

    def _create_inference_chain(self, model):
        """Build the prompt | model | parser chain with shared chat history."""
        SYSTEM_PROMPT = """
        You are a witty assistant that will use the chat history and the image
        provided by the user to answer its questions.

        Use few words on your answers. Go straight to the point. Do not use any
        emoticons or emojis. Do not ask the user any questions.

        Be friendly and helpful. Show some personality. Do not be too formal.
        """

        prompt_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=SYSTEM_PROMPT),
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "human",
                    [
                        {"type": "text", "text": "{prompt}"},
                        {
                            "type": "image_url",
                            "image_url": "data:image/jpeg;base64,{image_base64}",
                        },
                    ],
                ),
            ]
        )

        chain = prompt_template | model | StrOutputParser()

        # A single in-memory history is returned for every session id, which
        # is why ``answer`` passes a placeholder session id.
        chat_message_history = ChatMessageHistory()
        return RunnableWithMessageHistory(
            chain,
            lambda _: chat_message_history,
            input_messages_key="prompt",
            history_messages_key="chat_history",
        )
|
||
|
||
# Open the camera and begin grabbing frames in the background.
webcam_stream = WebcamStream()
webcam_stream.start()

# Gemini Flash is the default vision-capable chat model.
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")

# To use OpenAI's GPT-4o model instead of Gemini Flash,
# uncomment the following line:
# model = ChatOpenAI(model="gpt-4o")

# The assistant wraps the chosen model with prompt, history and speech.
assistant = Assistant(model)
|
||
|
||
def audio_callback(recognizer, audio):
    """Transcribe a captured phrase and hand it to the assistant.

    Passed to ``listen_in_background``. The phrase is transcribed with
    Whisper, then forwarded to the assistant together with an encoded
    snapshot of the current webcam frame.
    """
    try:
        transcript = recognizer.recognize_whisper(
            audio, model="base", language="english"
        )
        snapshot = webcam_stream.read(encode=True)
        assistant.answer(transcript, snapshot)
    except UnknownValueError:
        print("There was an error processing the audio.")
|
||
|
||
recognizer = Recognizer()
microphone = Microphone()

# Sample the room once so the recognizer can adapt to ambient noise.
with microphone as source:
    recognizer.adjust_for_ambient_noise(source)

# Returns a callable that stops the background listener.
stop_listening = recognizer.listen_in_background(microphone, audio_callback)

try:
    while True:
        cv2.imshow("webcam", webcam_stream.read())
        # Keep only the low byte so extra high bits in the key code
        # do not hide Esc (27) or "q".
        key = cv2.waitKey(1) & 0xFF
        if key in (27, ord("q")):
            break
finally:
    # Always clean up, including on Ctrl+C or a display error; otherwise
    # the capture thread keeps the process alive.
    webcam_stream.stop()
    webcam_stream.stream.release()
    cv2.destroyAllWindows()
    stop_listening(wait_for_stop=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
opencv-python | ||
langchain | ||
langchain-openai | ||
langchain_google_genai | ||
langchain-community | ||
python-dotenv | ||
pyaudio | ||
soundfile | ||
SpeechRecognition | ||
git+https://github.com/openai/whisper.git |