Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
## Building a voice agent

An exciting assignment on building a voice agent.

We start by building the interface, which uses the FastAPI library to create a `POST` API endpoint, `/chat/`, that accepts an audio file.

You can use [Postman](https://www.postman.com/downloads/) to test this out.

---

### Running the project locally

To run the project locally, start a server using `uvicorn`:

``` bash
uvicorn main:app --reload
```

---

### Transcription

I transcribed my audio using the [openai-whisper](https://github.com/openai/whisper?tab=readme-ov-file) library.

---

### LLM

Using the Hugging Face [Transformers](https://huggingface.co/docs/transformers/en/installation) library.

Note: I decided to use a lightweight LLM served locally through Ollama.


### TTS

[Index TTS2](https://github.com/index-tts/index-tts)

[PYTTSX3](https://pypi.org/project/pyttsx3/)

[Bento TTS](https://github.com/bentoml/BentoXTTS)

``` bash
git clone https://github.com/index-tts/index-tts.git && cd index-tts
git lfs pull # download large repository files
```

### Dependencies

Found in the `requirements.txt` file.
Binary file added audio_outputs/voice_sample2.wav
Binary file not shown.
Binary file added audio_outputs/voice_sample2.wav:Zone.Identifier
Binary file not shown.
Binary file added audio_outputs/voice_sample3.wav
Binary file not shown.
Binary file added audio_outputs/voice_sample3.wav:Zone.Identifier
Binary file not shown.
Binary file added audio_outputs/voice_sample_3things.wav
Binary file not shown.
Binary file not shown.
113 changes: 113 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os
import time
import logging


from fastapi import FastAPI, Response, UploadFile, Request, File

# Import necessary libraries
from fastapi.responses import FileResponse



# transcribe audio
from utils.asr import transcribe_audio

# llm response
from utils.response_gen import generate_response, generate_response_ollama

# tts
from utils.tts import bentoml_ttx_get_audio, save_audio_to_folder

# Working directories: raw uploads, audio artifacts, and text transcripts.
UPLOAD_DIR = "uploads/temp"
TRANSCR_DIR = "uploads/audio"
TRANSCR_TEXT_DIR = "uploads/text"

# Create each working directory up front so request handlers can write to
# them without checking for existence first.
for _directory in (UPLOAD_DIR, TRANSCR_DIR, TRANSCR_TEXT_DIR):
    os.makedirs(_directory, exist_ok=True)

app = FastAPI()

# Root logger: DEBUG level with timestamped, level-tagged messages.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logging.info("Week 3 Assignment: Voice Agent Development 😀")

@app.post("/chat/")
async def chat_endpoint(request: Request, file: UploadFile = File(...)):
    """
    Accepts audio file and returns audio file.

    Pipeline: save the upload -> transcribe it (ASR) -> generate a reply
    (LLM) -> synthesize the reply (TTS) -> return the synthesized audio.

    Args:
        request: The incoming request; used only to count the form fields.
        file: The uploaded audio file.

    Returns:
        A ``FileResponse`` streaming the synthesized reply, or a dict with an
        ``error`` key when the form carries more than one field.
    """
    # Reject requests whose multipart form carries more than one field.
    form = await request.form()
    if len(form) > 1:
        logging.warning("Only one audio file is allowed per request.")
        return {"error": "Only one audio file is allowed per request."}

    # Read the upload exactly once. A second `await file.read()` returns b""
    # because the stream is already exhausted, which previously caused every
    # saved upload to be an empty file.
    audio_bytes = await file.read()

    # The filename is client-controlled: keep only its final path component so
    # a name such as "../../main.py" cannot escape the upload directory.
    safe_name = os.path.basename(file.filename or "") or "upload"

    uploaded_file = os.path.join(UPLOAD_DIR, safe_name)
    with open(uploaded_file, "wb") as f:
        logging.info("Saving uploaded file.🆗")
        f.write(audio_bytes)

    logging.debug("Reading the uploaded file.")

    # ASR: transcribe the uploaded audio.
    # NOTE(review): transcription, LLM and TTS calls are synchronous and run
    # on the event loop; consider offloading them if concurrency matters.
    user_instruction_text = transcribe_audio(audio_bytes=audio_bytes, original_filename=safe_name)
    logging.debug(f"\n👌: {user_instruction_text}\n")

    # Save the transcription to a timestamped text file, e.g. 20251024-075230.
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    transcript_path = os.path.join(
        TRANSCR_TEXT_DIR,
        f"{timestamp}_{os.path.splitext(safe_name)[0]}_transcript.txt",
    )
    # Explicit UTF-8: transcripts may contain non-ASCII text, and the platform
    # default encoding is not guaranteed to handle it.
    with open(transcript_path, "w", encoding="utf-8") as transcript_file:
        transcript_file.write(user_instruction_text)

    # LLM: get the response for the transcribed instruction.
    llm_response = generate_response_ollama(user_instruction_text)
    logging.debug(f"\nLlm Response:\n{llm_response}\n")

    # TTS: synthesize the LLM response.
    logging.debug("Connecting to bentoml ttx")
    api_url = "http://localhost:3000/synthesize"
    lang = "en"

    speech_bytes = bentoml_ttx_get_audio(llm_response, lang, api_url)
    logging.debug("Got the bytes")

    output_dir = "audio_outputs"
    output_name = "voice_sample.wav"
    save_audio_to_folder(speech_bytes, output_dir, output_name)

    logging.debug("Saved audio")

    # Return the audio that was just synthesized instead of the former
    # "output.wav" placeholder (which was also mislabelled as audio/mpeg).
    # NOTE(review): assumes save_audio_to_folder writes to
    # <folder>/<filename> and that the TTS service emits WAV data - confirm.
    # The output name is shared, so concurrent requests overwrite each other.
    return FileResponse(
        path=os.path.join(output_dir, output_name),
        media_type="audio/wav",
        filename=output_name,
    )


# test
#logging.debug(f"{generate_response('Hello there, what is your name?')}")
#logging.debug(f"{generate_response_ollama('Hello there, what is your name?')}")
#logging.debug(f"{generate_response_ollama('Can we go home together?')}")
#logging.debug(f"{generate_response_ollama('Remind me your name again? I am Ian Too')}")



if __name__ == "__main__":
    # Manual smoke test for the LLM -> TTS path. It is guarded so that it only
    # runs via `python main.py`; importing the module (e.g. `uvicorn main:app`)
    # no longer triggers an LLM call and a TTS request on every start/reload.
    api_url = "http://localhost:3000/synthesize"
    lang = "en"
    sample_audio = bentoml_ttx_get_audio(
        generate_response_ollama("Name 3 items you like. Keept it short."),
        lang,
        api_url,
    )
    # Log only after the audio has actually been fetched.
    logging.debug("Got the bytes")
    save_audio_to_folder(sample_audio, "audio_outputs", "voice_sample_3things.wav")
Binary file added test_data/.DS_Store
Binary file not shown.
Binary file added test_data/.DS_Store:Zone.Identifier
Binary file not shown.
Binary file added test_data/audio/sample-0.mp3
Binary file not shown.
Binary file added test_data/audio/sample-0.mp3:Zone.Identifier
Binary file not shown.
Binary file added test_data/audio/sample-1.mp3
Binary file not shown.
Binary file added test_data/audio/sample-1.mp3:Zone.Identifier
Binary file not shown.
Binary file added test_data/audio/sample-4.mp3
Binary file not shown.
Binary file added test_data/audio/sample-4.mp3:Zone.Identifier
Binary file not shown.
11 changes: 11 additions & 0 deletions test_data/data/Fake_Pretraining_Texts.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Raw Text
Hello! Contact us at support@data.org or call 123-456-7890. Your credit card 4111111111111111 was declined. This message is intended only for the recipient. Visit <a href='https://fake.site'>our site</a> for more.
Hola! Este artículo está completamente en español. Teléfono: 11-2222-3333
<html><body><div><h1>Breaking News</h1><p>This is a major event!</p></div><footer>Contact us</footer></body></html>
Buy now! Best product ever. Best product ever. Best product ever.
Python 3.14 introduces several improvements including better error messages. Learn more on the official site.
Python 3.14 introduces several improvements including better error messages. Learn more on the official docs.
"<div>For inquiries, email jane_doe@example.com or visit our site. Card number: 378282246310005.</div>"
Large Language Models are transforming the AI landscape with few-shot capabilities.
这是一个包含有用技术信息的中文段落。电话号码:010-12345678
Buy now! Best product ever. Best product ever. Best product ever.
Binary file not shown.
Binary file added test_data/image/image.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added test_data/image/image.png:Zone.Identifier
Binary file not shown.
Empty file added uploads/temp/Birds.m4a
Empty file.
Binary file added uploads/temp/Birds.m4a:Zone.Identifier
Binary file not shown.
Empty file.
Binary file added uploads/temp/Kind of weather.m4a:Zone.Identifier
Binary file not shown.
Empty file added uploads/temp/Order a pie.m4a
Empty file.
Binary file added uploads/temp/Order a pie.m4a:Zone.Identifier
Binary file not shown.
Empty file.
Binary file added uploads/temp/Reporting an issue.m4a:Zone.Identifier
Binary file not shown.
Empty file.
Binary file not shown.
Empty file added uploads/temp/Thanks.m4a
Empty file.
Binary file added uploads/temp/Thanks.m4a:Zone.Identifier
Binary file not shown.
Empty file.
Binary file not shown.
Empty file.
Binary file not shown.
Empty file.
Binary file added uploads/temp/What is your name.m4a:Zone.Identifier
Binary file not shown.
Empty file added uploads/temp/sample-1.mp3
Empty file.
Binary file added uploads/temp/sample-1.mp3:Zone.Identifier
Binary file not shown.
1 change: 1 addition & 0 deletions uploads/text/20251024-080325_sample-1_transcript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
the way the page of that, they didn't, you know, that you some people said it's on kren the benso instead of court and a musical or just not hurry up which one just understand in China that's me.
Binary file not shown.
1 change: 1 addition & 0 deletions uploads/text/20251024-080722_sample-1_transcript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
We pay for that, you know that, you see people's numbers on our voice are not very important for you in the taken. Of what it says we call it being an signing dit and telling- will be through a marriage but no marriage. But no, it is not there in the same class you know the previous parliament, why? If it was categorised there then we had to convince. So some of it is the thing that we are trying to get through s一起 may also be generated. FIRST How to deal carry a head on a quota of all 12rió He calls it wogs,
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Okay, so for East California Avenue, what are we doing? Oh, he's riding notes. Are you riding notes? Okay, then that's on the other side. I can't walk as well. Yeah. It's very good notes, bro. Just say sidewalk is too. Oh, yeah.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
So today is July 18th, 2025, it's a Friday, it's sunny, I mean Sun Huzi in California, where I live at the grud and this is a message to myself. So this is a message to your teacher's self, he and dear he and he did not give up and you pass a view. You came here and you actually made it so congratulations. Congratulations, take back, take a break, you know, just close your eyes and be thankful for where you've got him. We're thankful for the people who supported you to get there. The people who challenged you, the people who are with you all the way, your family, your babe, wifey. So this is just to remind you that you did it, you met enjoy enjoy this new life. Enjoy this new life.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
So today is July 18th, 2025, it's a Friday, it's sunny, I mean Sun Huzi in California, where I live at the grud and this is a message to myself. So this is a message to your teacher's self, he and dear he and he did not give up and you pass a view. You came here and you actually made it so congratulations. Congratulations, take back, take a break, you know, just close your eyes and be thankful for where you've got him. We're thankful for the people who supported you to get there. The people who challenged you, the people who are with you all the way, your family, your babe, wifey. So this is just to remind you that you did it, you met enjoy enjoy this new life. Enjoy this new life.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
What is your name?
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
What is your name?
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
What is your name?
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
What is your name?
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
What is your name?
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
What is your name?
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
What is your name?
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
What kind of weather is it outside?
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
So this is a test. Tell me what you think.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I am reporting an issue. Can you call me back?
Binary file not shown.
1 change: 1 addition & 0 deletions uploads/text/20251024-131232_Order a pie_transcript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Go to www.google.com and order a pie.
Binary file not shown.
1 change: 1 addition & 0 deletions uploads/text/20251024-131423_Thanks_transcript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
No, thank you. Seems my issue has been resolved. Thank you so much for your help.
Binary file not shown.
1 change: 1 addition & 0 deletions uploads/text/20251024-140128_Birds_transcript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I'm deal of birds
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
What kind of weather is it outside?
Binary file not shown.
1 change: 1 addition & 0 deletions uploads/text/20251024-185255_Order a pie_transcript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Go to www.google.com and order a pie.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I am reporting an issue. Can you call me back?
Binary file not shown.
1 change: 1 addition & 0 deletions uploads/text/20251024-193709_Thanks_transcript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
No, thank you. Seems my issue has been resolved. Thank you so much for your help.
Binary file not shown.
1 change: 1 addition & 0 deletions uploads/text/20251024-193856_Birds_transcript.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I'm deal of birds
Binary file not shown.
Empty file added utils/__init__.py
Empty file.
Binary file added utils/__init__.py:Zone.Identifier
Binary file not shown.
24 changes: 24 additions & 0 deletions utils/asr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os
import logging
import tempfile
import whisper

# Load the Whisper "tiny" ASR model once at import time; transcribe_audio()
# below reuses this single module-level instance for every call.
asr_model = whisper.load_model("tiny")

def transcribe_audio(audio_bytes, original_filename):
    """
    Transcribe raw audio bytes with the module-level Whisper model.

    The bytes are written to a temporary file (keeping the original file
    extension), the file is passed to the model by path, and the temporary
    file is removed afterwards.

    Args:
        audio_bytes: Raw contents of the audio file.
        original_filename: Name of the uploaded file; only its extension is
            used. May be None or empty, in which case no suffix is applied.

    Returns:
        The transcribed text, or "" when the audio is empty or the model
        result carries no "text" entry.
    """
    # Nothing to decode: skip the model instead of handing it an empty file.
    if not audio_bytes:
        logging.warning("Received empty audio; skipping transcription.")
        return ""

    ext = os.path.splitext(original_filename or "")[1]

    # delete=False plus explicit cleanup: the file is closed before the model
    # reopens it by name, which (unlike reopening a still-open
    # NamedTemporaryFile) also works on Windows.
    temp_audio = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
    try:
        with temp_audio:
            temp_audio.write(audio_bytes)

        # Transcribe using Whisper
        result = asr_model.transcribe(temp_audio.name)
    finally:
        try:
            os.remove(temp_audio.name)
        except OSError:
            logging.debug("Could not remove temporary audio file %s", temp_audio.name)

    return result.get("text", "")
Binary file added utils/asr.py:Zone.Identifier
Binary file not shown.
Loading