diff --git a/screen recording.mkv b/screen recording.mkv new file mode 100644 index 0000000..444c41f Binary files /dev/null and b/screen recording.mkv differ diff --git a/voice_chat/.env.example b/voice_chat/.env.example new file mode 100644 index 0000000..adbeb3d --- /dev/null +++ b/voice_chat/.env.example @@ -0,0 +1,3 @@ +# Hugging Face API Token +# Get your token from: https://huggingface.co/settings/tokens +HF_TOKEN=your_hugging_face_token_here diff --git a/voice_chat/.gitignore b/voice_chat/.gitignore new file mode 100644 index 0000000..7f48a3c --- /dev/null +++ b/voice_chat/.gitignore @@ -0,0 +1,47 @@ +# Environment variables +.env +.env.local +*.env + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Temporary files +temp/ +*.wav +*.mp3 +*.flac +*.tmp + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db diff --git a/voice_chat/QUICK_START.md b/voice_chat/QUICK_START.md new file mode 100644 index 0000000..86c2507 --- /dev/null +++ b/voice_chat/QUICK_START.md @@ -0,0 +1,427 @@ +# ๐ First-Time User Quick Start + +## 30-Second Startup (Windows) + +1. **Open File Explorer** + - Navigate to: `d:\MLE\Week3_HW\voice_chat` + +2. **Double-click `run.bat`** + - A terminal window appears + - Wait for it to say "Uvicorn running on..." + +3. **Open Browser** + - Go to: `http://localhost:8000` + - You should see the Voice Chat interface! + +**That's it! You're running.** ๐ + +--- + +## 30-Second Startup (macOS/Linux) + +1. **Open Terminal** + - Navigate to project: `cd ~/path/to/voice_chat` + +2. **Run:** + ```bash + bash run.sh + ``` + - Wait for: "Uvicorn running on..." + +3. **Open Browser** + - Go to: `http://localhost:8000` + - Ready to chat! + +--- + +## First Test (Do This Now!) + +### Step 1: Allow Microphone +``` +1. Page might ask for microphone permission +2. 
Click "Allow" button +3. If no permission prompt, you're good to go! +``` + +### Step 2: Record & Send +``` +1. Click blue ๐ค button +2. Say: "Hello, how are you?" +3. Wait 2 seconds +4. Click ๐ค button again (stops recording) +5. Click ๐ค "Send" button +``` + +### Step 3: Wait for Response +``` +โ Loading spinner appears +โ Backend processes (wait 5-15 seconds) +โ Your message appears in blue on RIGHT +โ Bot message appears in purple on LEFT +โ Audio plays automatically +``` + +### Step 4: Success! ๐ +``` +If all above worked: Application is running perfectly! +If something failed: See troubleshooting below โ +``` + +--- + +## โ How to Know It's Working + +### Good Signs โ +- Page loads immediately +- Microphone icon visible +- Buttons are clickable +- Recording starts when clicked +- Message sends successfully +- Bot responds within 15 seconds +- Audio plays automatically +- No red error messages + +### Bad Signs โ +- Page doesn't load or shows error +- Buttons don't respond +- Microphone permission denied +- Recording doesn't start +- Message doesn't send +- Backend doesn't respond after 30 seconds +- Red error message appears +- Audio doesn't play + +--- + +## ๐ง Quick Troubleshooting + +### Problem: "Page won't load" + +**Fix:** +1. Make sure terminal shows "Uvicorn running on..." +2. Try refreshing page (F5) +3. Try different URL: `http://127.0.0.1:8000` +4. Close and restart backend + +### Problem: "Microphone permission denied" + +**Fix:** +1. Refresh page +2. Click the permission prompt +3. If still denied: + - Chrome: Click ๐ icon โ Permissions โ Allow microphone + - Firefox: Check privacy settings + - Safari: Settings โ Privacy + +### Problem: "Bot doesn't respond" + +**Fix:** +1. Make sure you recorded audio (see blue message first) +2. Wait 15 seconds (backend processing) +3. Check terminal for [CHAT] messages +4. Make sure internet is connected +5. Restart backend + +### Problem: "No audio output" + +**Fix:** +1. Check if muted: Click ๐ button +2. 
Check system volume (bottom right taskbar) +3. Test speakers with YouTube +4. Refresh page and try again + +### Problem: "Error message appears" + +**Fix:** +1. Read the error message carefully +2. It tells you what went wrong +3. Most common: "Backend not running" +4. Restart using run.bat or run.sh + +--- + +## ๐ Expected Behavior + +### What Should Happen: + +``` +Timeline of Events: + +T=0s You click ๐ค (Record) + โข Button turns RED + โข Indicator starts pulsing + โข Status shows "Recording..." + +T=2s You finish speaking + You click ๐ค (Stop) + โข Button turns BLUE + โข Status shows "Ready" + +T=2.5s You click ๐ค (Send) + โข Loading spinner appears + โข Button is now disabled + โข Send button disabled + +T=3-5s Frontend sends audio to backend + โข Request sent as FormData + +T=5-15s Backend processes + โข ASR converts audio to text + โข LLM generates response + โข TTS converts response to audio + +T=15-16s Response received + โข Your message appears (BLUE, RIGHT) + โข Bot message appears (PURPLE, LEFT) + โข Audio plays automatically (if unmuted) + โข Loading spinner gone + โข Buttons re-enabled + +T=17s Ready for next message! + Click Record again to continue +``` + +--- + +## ๐ฑ Mobile Testing (Optional) + +### Test on Your Phone + +**Step 1: Get Your Computer's IP** +```powershell +# Windows PowerShell +ipconfig +# Look for: "IPv4 Address" like 192.168.1.100 + +# macOS/Linux Terminal +ifconfig +# Look for "inet" like 192.168.1.100 +``` + +**Step 2: Start Backend (Different Command)** +```bash +uvicorn app.main:app --host 0.0.0.0 --port 8000 +``` + +**Step 3: On Your Phone** +Open browser and go to: +``` +http://192.168.1.100:8000 +``` +(Replace with YOUR IP address from Step 1) + +**Step 4: Test** +- Everything works the same as desktop! 
+- Try sending messages +- Check if responsive +- Verify touch buttons work + +--- + +## ๐งช Simple Test Cases + +Try these to verify everything works: + +### Test 1: Basic Greeting +``` +Say: "Hi" +Expected: Greeting response +``` + +### Test 2: Question +``` +Say: "What is Python?" +Expected: Answer about Python +``` + +### Test 3: Math +``` +Say: "What is 2 plus 2?" +Expected: Answer "4" or "equals 4" +``` + +### Test 4: Information Request +``` +Say: "Tell me about AI" +Expected: Information about AI +``` + +### Test 5: Multiple Messages +``` +Message 1: "Hello" +Response: "Hi, how can I help?" +Message 2: "What's the weather?" +Response: Weather-related answer +``` + +All 5 work โ โ Perfect! +3-4 work โ โ Good! +1-2 work โ โ ๏ธ Check setup +0 work โ โ Need troubleshooting + +--- + +## ๐ก Pro Tips + +1. **Speak clearly and slowly** + - Better transcription accuracy + +2. **Wait for microphone permission** + - Don't click buttons before allowing + +3. **Be patient with backend** + - 5-15 seconds is normal processing time + +4. **Check browser console if issues** + - Press F12 + - Go to Console tab + - Look for red error messages + +5. **Read error messages** + - They tell you exactly what went wrong + +6. **Try different phrases** + - Bot is smarter than you might think! + +7. **Use on quiet place** + - Less noise = better transcription + +8. **Test with friends** + - They might say things you wouldn't + +--- + +## ๐ Understanding the Flow + +### How It Works (Simple Version) + +``` +You (User) + โ +Click Record & Speak + โ +Click Send + โ +Your audio goes to backend + โ +Backend does: + โข Hears: "What is Python?" + โข Understands: It's a question about Python + โข Thinks: "I should explain Python" + โข Speaks: Generated answer + โ +Answer comes back as audio + โ +Your speaker plays it + โ +Chat shows both sides + โ +You can send another message + โ +Repeat! 
+``` + +--- + +## ๐ฏ Success Indicators + +### โ Everything is Working If: +- [ ] Page loads without errors +- [ ] Microphone permission granted +- [ ] Recording captures audio +- [ ] Message sends successfully +- [ ] Bot responds within 15 seconds +- [ ] Audio plays clearly +- [ ] Chat displays messages correctly +- [ ] Can send multiple messages +- [ ] Clear chat button works +- [ ] Mute button works + +### โ ๏ธ Minor Issues If: +- Microphone permission takes time +- Backend response is slow (10-15s) +- Audio transcription not 100% accurate +- Bot response seems generic + +### โ Major Issues If: +- Page won't load at all +- Can't record audio +- Bot never responds +- Backend crashes +- Audio doesn't play at all + +--- + +## ๐ When You Get Stuck + +### Read These (In Order): +1. **QUICK_REFERENCE.md** - Quick fixes +2. **SETUP_GUIDE.md** - Detailed setup help +3. **USER_TESTING_GUIDE.md** - Testing procedures + +### Check These (Also in Order): +1. Terminal window (backend logs) +2. Browser console (F12) +3. Browser address bar (correct URL?) +4. System volume (not muted?) +5. Internet connection (is it working?) + +--- + +## ๐ Next Steps After Success + +1. **Try React Version** (Optional) + ```bash + npm install + npm run dev + ``` + +2. **Test on Mobile** (Optional) + - Follow Mobile Testing steps above + +3. **Try Different Inputs** + - Long sentences + - Questions + - Commands + - Different languages/accents + +4. **Test Limits** + - Very quiet voice + - Very loud voice + - Background noise + - Multiple speakers + +5. **Read Documentation** + - INDEX.md (overview) + - FRONTEND_README.md (features) + - TESTING_GUIDE.md (detailed testing) + +--- + +## ๐ Congratulations! + +You're now successfully running the Voice Chat Application! + +**What to do now:** +- Try sending several messages +- Test with different types of inputs +- Show it to friends +- Explore the codebase +- Try the React version +- Customize if you want + +**You've got this! 
๐ช** + +--- + +**Questions?** Check the documentation files in the project folder. + +**Found a bug?** Document it and let the team know! + +**Enjoying it?** Share it with others! + +--- + +Version: 1.0 +Last Updated: November 2024 +Status: Ready for Users โ diff --git a/voice_chat/app/__init__.py b/voice_chat/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/voice_chat/app/asr.py b/voice_chat/app/asr.py new file mode 100644 index 0000000..daebe27 --- /dev/null +++ b/voice_chat/app/asr.py @@ -0,0 +1,54 @@ +# app/asr.py +import torch +from transformers import WhisperProcessor, WhisperForConditionalGeneration +import soundfile as sf +from io import BytesIO +from typing import Union + +processor = WhisperProcessor.from_pretrained("openai/whisper-base") +model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base") +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) +print(f"Device set to use {device}") + +def transcribe_audio(audio_input: Union[str, bytes]) -> str: + """ + Transcribe audio to text + + Args: + audio_input: Audio file path (str) or audio bytes data (bytes) + + Returns: + Transcribed text + """ + # Determine input type + if isinstance(audio_input, bytes): + # Read from bytes + audio_np, samplerate = sf.read(BytesIO(audio_input)) + elif isinstance(audio_input, str): + # Read from file path + audio_np, samplerate = sf.read(audio_input) + else: + raise TypeError(f"Expected str or bytes, got {type(audio_input)}") + + # Resample to 16kHz + if samplerate != 16000: + import librosa + audio_np = librosa.resample(audio_np, orig_sr=samplerate, target_sr=16000) + + # Process audio + input_features = processor( + audio_np, + sampling_rate=16000, + return_tensors="pt" + ).input_features + input_features = input_features.to(device) + + # Generate transcription + with torch.no_grad(): + predicted_ids = model.generate(input_features) + + # Decode + transcription = 
processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] + + return transcription \ No newline at end of file diff --git a/voice_chat/app/llm.py b/voice_chat/app/llm.py new file mode 100644 index 0000000..535ccd9 --- /dev/null +++ b/voice_chat/app/llm.py @@ -0,0 +1,29 @@ +from huggingface_hub import InferenceClient +import os +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Load token from environment variable for security +HF_TOKEN = os.getenv("HF_TOKEN") +if not HF_TOKEN: + raise ValueError("HF_TOKEN environment variable not set. Please set it in .env file or as environment variable.") + +client = InferenceClient(token=HF_TOKEN) + +def generate_response(text: str) -> str: + try: + # chat completion API + response = client.chat.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[{"role": "user", "content": text}], + max_tokens=200 + ) + # response is ChatCompletionOutput object + if response.choices and len(response.choices) > 0: + return response.choices[0].message["content"] + else: + return str(response) + except Exception as e: + return f"Error generating response: {str(e)}" diff --git a/voice_chat/app/main.py b/voice_chat/app/main.py new file mode 100644 index 0000000..43e662e --- /dev/null +++ b/voice_chat/app/main.py @@ -0,0 +1,102 @@ +# app/main.py +from fastapi import FastAPI, UploadFile, File +from fastapi.responses import FileResponse +from fastapi.staticfiles import StaticFiles +from fastapi.middleware.cors import CORSMiddleware +import json +from pathlib import Path + +from app.asr import transcribe_audio +from app.tts import synthesize_speech +from app.llm import generate_response + +app = FastAPI() + +# Add CORS middleware to allow requests from frontend +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Create temporary file directory +TEMP_DIR = Path("temp") 
@app.get("/")
def read_root():
    """Serve the main HTML file"""
    return FileResponse("index.html", media_type="text/html")


@app.post("/chat/")
async def chat_endpoint(audio: UploadFile = File(...)):
    """
    Voice chat endpoint
    1. Receive audio file
    2. Transcribe to text (ASR)
    3. Generate response (LLM)
    4. Synthesize speech (TTS)
    5. Return JSON with transcribed text, response text, and audio

    On success the JSON has "user_text", "bot_response" and "audio"
    (base64-encoded WAV bytes). On any failure it has a single "error" key.
    """
    import base64
    import traceback
    import uuid

    # Each request gets its own pair of temp files. Fixed names would let
    # concurrent requests overwrite each other's input and output audio.
    request_id = uuid.uuid4().hex
    audio_path = TEMP_DIR / f"input_{request_id}.wav"
    output_path = TEMP_DIR / f"output_{request_id}.wav"

    try:
        # Save uploaded audio file with .wav extension
        content = await audio.read()
        with open(audio_path, "wb") as f:
            f.write(content)

        print(f"[CHAT] Saved audio to: {audio_path}")

        # 1. ASR: Audio -> Text
        print("[CHAT] Transcribing audio...")
        user_text = transcribe_audio(str(audio_path))
        print(f"[CHAT] Transcribed: {user_text}")

        # 2. LLM: Generate response
        print("[CHAT] Generating response...")
        bot_response = generate_response(user_text)
        print(f"[CHAT] Response: {bot_response}")

        # 3. TTS: Text -> Audio
        print("[CHAT] Synthesizing speech...")
        synthesize_speech(bot_response, str(output_path))
        print(f"[CHAT] Synthesized to: {output_path}")

        # 4. Read audio file and return with text
        with open(output_path, "rb") as f:
            audio_bytes = f.read()

        # Return JSON response with text and audio data
        audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')

        return {
            "user_text": user_text,
            "bot_response": bot_response,
            "audio": audio_b64
        }

    except Exception as e:
        print(f"[CHAT ERROR] {str(e)}")
        traceback.print_exc()
        return {"error": str(e)}

    finally:
        # Clean up both temporary files; the output file may exist even
        # after a failure because TTS writes a fallback file before raising.
        for temp_path in (audio_path, output_path):
            if temp_path.exists():
                temp_path.unlink()


@app.get("/health")
def health_check():
    """Health check endpoint"""
    return {"status": "healthy"}


def synthesize_speech(text: str, output_file: str) -> str:
    """
    Generate speech with SpeechT5 and write it to a WAV file.

    The text is truncated to 500 characters, synthesized with a fixed
    speaker embedding, peak-normalized if it exceeds 1.0, stretched to
    115% of its length by linear interpolation, and saved at 16 kHz.

    Args:
        text: Text to convert to speech
        output_file: Output WAV file path

    Returns:
        Output file path

    Raises:
        Exception: If synthesis fails. One second of silence is written to
            ``output_file`` before the error is re-raised.
    """
    sample_rate = 16000   # rate used for every file this function writes
    slowdown_factor = 1.15

    try:
        # Limit text length (slicing is a no-op for shorter text)
        text = text[:500]

        # Process input text
        inputs = processor(text=text, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)

        # Fixed speaker embedding (512 dims): a 40-value pattern repeated
        # 13 times and cut to 512.
        # NOTE(review): this is a hand-written pattern, not an x-vector
        # from a real speaker - confirm the resulting voice is acceptable.
        speaker_embeddings = torch.FloatTensor([
            0.1, -0.2, 0.3, -0.15, 0.25, -0.1, 0.2, -0.25, 0.15, -0.3,
            -0.1, 0.2, -0.25, 0.15, -0.3, 0.1, -0.2, 0.3, -0.15, 0.25,
            0.05, -0.15, 0.2, -0.1, 0.25, -0.2, 0.15, -0.25, 0.1, -0.3,
            -0.15, 0.1, -0.2, 0.15, -0.25, 0.2, -0.1, 0.3, -0.05, 0.2,
        ] * 13)[:512]  # Ensure exactly 512 dimensions
        speaker_embeddings = speaker_embeddings.unsqueeze(0).to(device)

        # Generate speech (inference only, no gradients needed)
        with torch.no_grad():
            speech = model.generate_speech(
                input_ids,
                speaker_embeddings,
                vocoder=vocoder
            )

        # Convert to numpy array
        speech_np = speech.cpu().numpy()

        # Ensure audio is a flat waveform
        if speech_np.ndim > 1:
            speech_np = speech_np.squeeze()

        # Normalize only when the peak would clip
        max_val = np.max(np.abs(speech_np))
        if max_val > 1.0:
            speech_np = speech_np / max_val

        # Slow down speech by 15%: resample the waveform onto a grid with
        # 15% more points and play it back at the same sample rate.
        speech_slow = np.interp(
            np.linspace(0, len(speech_np) - 1, int(len(speech_np) * slowdown_factor)),
            np.arange(len(speech_np)),
            speech_np
        )

        # Save as WAV file
        sf.write(output_file, speech_slow, samplerate=sample_rate)
        print(f"[TTS] Audio saved to {output_file}")

        return output_file

    except Exception as e:
        print(f"[TTS Error] {str(e)}")
        import traceback
        traceback.print_exc()

        # Fallback: create silent audio (one second)
        silent_audio = np.zeros(sample_rate)
        sf.write(output_file, silent_audio, samplerate=sample_rate)
        # Chain the original error so its traceback is not lost.
        raise Exception(f"TTS generation failed: {str(e)}") from e
+ + +Talk to your AI assistant
+Processing...
+