diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..1000218 --- /dev/null +++ b/.env.example @@ -0,0 +1,15 @@ +# PersonaPlex Environment Configuration +# Copy this file to .env and fill in your values + +# Hugging Face API Token (Required) +# Get your token from: https://huggingface.co/settings/tokens +# Required to download PersonaPlex models +HF_TOKEN=your_token_here + +# Optional: Custom voices directory +# Specify a custom location for your voice embeddings +# CUSTOM_VOICE_DIR=/path/to/my/voices + +# Optional: Custom cache directory for Hugging Face models +# By default, models are cached in ~/.cache/huggingface/ +# HF_HOME=/path/to/custom/cache diff --git a/.gitignore b/.gitignore index 3278df4..9d432a4 100644 --- a/.gitignore +++ b/.gitignore @@ -183,3 +183,14 @@ mlx-trace.json # Include everything in assets !assets/ !assets/** + +# Custom voice files (keep directory structure in git, ignore voice files) +custom_voices/*.pt +custom_voices/*.wav +# But keep the README +!custom_voices/README.md + +# Claude Code and personal tooling files (should never be committed) +.agent/ +Agents.md +Claude.local.md diff --git a/FRONTEND_DEVELOPMENT.md b/FRONTEND_DEVELOPMENT.md new file mode 100644 index 0000000..519fd5e --- /dev/null +++ b/FRONTEND_DEVELOPMENT.md @@ -0,0 +1,284 @@ +# Frontend Development Guide + +This guide explains how to develop and test custom UI changes for PersonaPlex. + +## Understanding Smart Auto-Detection + +**PersonaPlex now automatically detects and serves your custom UI!** You no longer need to use the `--static` flag for development. + +### How Auto-Detection Works + +When you start the server, it checks: +1. Does `client/dist` exist in the project directory? +2. **YES** → Automatically serves your custom UI +3. **NO** → Downloads and serves the default UI from HuggingFace + +### Starting the Server (Auto-Detection) + +```bash +# Just start the server normally - no flags needed! 
+conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +**Log output with custom UI detected:** +``` +Found custom UI at /home/.../personaplex-blackwell/client/dist, using it instead of default +static_path = /home/.../personaplex-blackwell/client/dist +serving static content from /home/.../personaplex-blackwell/client/dist +``` + +**Log output without custom UI:** +``` +retrieving the static content +static_path = /home/.../.cache/huggingface/.../dist +serving static content from /home/.../.cache/huggingface/.../dist +``` + +### Manual Override (Optional) + +You can still manually specify the UI source if needed: + +```bash +# Force specific directory +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static /path/to/custom/dist + +# Disable static serving +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static none +``` + +## Frontend Development Workflow + +### Prerequisites + +1. Install Node.js and npm (if not already installed): + ```bash + # Check if already installed + node --version + npm --version + ``` + +2. Install frontend dependencies: + ```bash + cd client + npm install + ``` + +### Development Steps (Simplified!) + +#### 1. Make Your Changes +Edit files in the `client/src/` directory: +- `client/src/components/` - React components +- `client/src/styles/` - CSS and styling +- `client/src/App.tsx` - Main application component + +#### 2. Build the Frontend +```bash +cd client +npm run build +cd .. +``` + +This creates/updates the `client/dist` directory with your compiled code. + +#### 3. Start Server (Auto-Detection!) +```bash +# From project root - server auto-detects custom UI! +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +#### 4. 
Verify Custom UI is Loaded +Check the server logs for: +``` +Found custom UI at .../client/dist, using it instead of default +static_path = /home/.../personaplex-blackwell/client/dist +``` + +If you see `retrieving the static content`, the build might not exist. Go back to step 2. + +#### 5. Test Your Changes +1. Open the Web UI: https://localhost:8998 +2. Hard refresh (Ctrl+Shift+R or Cmd+Shift+R) to clear browser cache +3. Test your modifications + +#### 6. Iterate +Repeat steps 1-5 for each change: +```bash +# Make changes to client/src/... +cd client && npm run build && cd .. + +# Restart server (Ctrl+C to stop first) - auto-detects custom UI! +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +**That's it! No `--static` flag needed anymore.** + +## Troubleshooting + +### Changes Not Appearing + +**Problem:** You rebuilt the frontend but don't see changes in the browser. + +**Solutions:** +1. **Verify server is using custom UI:** + - Check logs for `static_path = client/dist` + - If not, restart with `--static client/dist` + +2. **Clear browser cache:** + - Hard refresh: Ctrl+Shift+R (Windows/Linux) or Cmd+Shift+R (Mac) + - Or open DevTools (F12) → Network tab → Check "Disable cache" + +3. **Verify build completed successfully:** + ```bash + cd client + npm run build + ls -la dist/ # Should show recent timestamps + ``` + +4. **Check for build errors:** + ```bash + cd client + npm run build 2>&1 | grep -i error + ``` + +### Server Won't Start with --static Flag + +**Problem:** Error when starting server with `--static client/dist` + +**Solutions:** +1. **Verify dist directory exists:** + ```bash + ls -la client/dist/ + ``` + If missing, build the frontend first: `cd client && npm run build` + +2. **Check path is correct:** + - Use relative path: `--static client/dist` + - From project root, not from client/ directory + +### Frontend Build Fails + +**Problem:** `npm run build` fails with errors + +**Solutions:** +1. 
**Check Node.js version:** + ```bash + node --version + # Should be 16.x or higher + ``` + +2. **Reinstall dependencies:** + ```bash + cd client + rm -rf node_modules package-lock.json + npm install + npm run build + ``` + +3. **Check for TypeScript errors:** + ```bash + cd client + npm run type-check + ``` + +## Development Tips + +### Shell Alias for Quick Development +Add to your `~/.bashrc` or `~/.zshrc`: + +```bash +# Quick start with custom UI +alias moshi-dev='conda activate personaplex && SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist' + +# Quick frontend rebuild +alias moshi-build='cd client && npm run build && cd ..' +``` + +Usage: +```bash +# Make changes to client/src/... +moshi-build # Rebuild frontend +moshi-dev # Start server with custom UI +``` + +### Watch Mode for Live Development + +For faster iteration, use the frontend in development mode: + +```bash +# Terminal 1: Start backend server (without static flag) +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + +# Terminal 2: Start frontend dev server with hot reload +cd client +npm run dev +``` + +Then access the UI at the Vite dev server URL (usually http://localhost:5173). + +**Note:** This requires configuring CORS in the backend. Check `client/vite.config.ts` for proxy settings. + +## Production Deployment + +When ready to deploy your custom UI: + +1. Build the production bundle: + ```bash + cd client + npm run build + ``` + +2. Test the production build: + ```bash + SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist + ``` + +3. Verify everything works correctly + +4. 
Commit your changes: + ```bash + git add client/src/ client/dist/ + git commit -m "Add custom UI feature: [description]" + ``` + +## File Structure + +``` +personaplex-blackwell/ +├── client/ # Frontend source code +│ ├── src/ # Source files (edit these) +│ │ ├── components/ # React components +│ │ ├── styles/ # CSS files +│ │ ├── App.tsx # Main app +│ │ └── main.tsx # Entry point +│ ├── dist/ # Built files (generated) +│ │ ├── index.html # HTML entry +│ │ ├── assets/ # JS/CSS bundles +│ │ └── ... +│ ├── package.json # Dependencies +│ ├── vite.config.ts # Build config +│ └── tsconfig.json # TypeScript config +└── moshi/ # Backend Python code +``` + +## Quick Reference + +| Task | Command | +|------|---------| +| Install dependencies | `cd client && npm install` | +| Build frontend | `cd client && npm run build` | +| Start with custom UI | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist` | +| Start with default UI | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR"` | +| Dev server (hot reload) | `cd client && npm run dev` | +| Type check | `cd client && npm run type-check` | +| Lint code | `cd client && npm run lint` | + +## Getting Help + +If you encounter issues not covered here: +1. Check [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common problems +2. Verify your Node.js and npm versions +3. Check the browser console (F12) for JavaScript errors +4. Review server logs for static file serving errors diff --git a/QUICKSTART.md b/QUICKSTART.md new file mode 100644 index 0000000..2e1f2d8 --- /dev/null +++ b/QUICKSTART.md @@ -0,0 +1,125 @@ +# PersonaPlex Quick Start Guide + +This guide provides the essential steps to get PersonaPlex running quickly. + +## Prerequisites + +1. Install [Opus audio codec](https://github.com/xiph/opus) development library: + ```bash + # Ubuntu/Debian + sudo apt install libopus-dev + + # macOS + brew install opus + ``` + +2. 
Accept the [PersonaPlex model license](https://huggingface.co/nvidia/personaplex-7b-v1) on Hugging Face + +## Installation + +### Step 1: Create Conda Environment + +```bash +# Create and activate conda environment +conda create -n personaplex python=3.10 -y +conda activate personaplex +``` + +### Step 2: Install Moshi Package + +**For most GPUs:** +```bash +cd moshi +pip install -e . +cd .. +``` + +**For Blackwell GPUs (RTX 50 series):** +```bash +# Install PyTorch with CUDA 13.0+ support FIRST (required for Blackwell) +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + +# Then install moshi +cd moshi +pip install -e . +cd .. +``` + +### Step 3: Set Up Hugging Face Token + +**Recommended: Use .env file (persists across sessions)** +```bash +# Copy the template and add your token +cp .env.example .env +# Edit .env and replace 'your_token_here' with your actual Hugging Face token +``` + +**Alternative: Use export (temporary, only for current session)** +```bash +export HF_TOKEN=your_token_here +``` + +## Running the Web UI + +**CRITICAL: Always activate the conda environment first!** + +```bash +# 1. Activate the environment +conda activate personaplex + +# 2. Launch the server (automatically detects custom UI if it exists) +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + +# 3. Access the Web UI at: https://localhost:8998 +``` + +### Smart Auto-Detection + +The server now **automatically detects and uses your custom UI** if you've built it! 
+ +- If `client/dist` exists → Your custom UI is served automatically +- If `client/dist` doesn't exist → Default UI is downloaded from HuggingFace + +**Verify which UI loaded** by checking the server logs: +- Custom UI: `Found custom UI at .../client/dist, using it instead of default` +- Default UI: `retrieving the static content` (downloads from HuggingFace) + +### Building Custom UI (If Modified) + +Only needed if you've changed the frontend code: + +```bash +cd client +npm run build +cd .. + +# Now start the server - it will auto-detect your custom build +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +## Quick Command Reference + +| Task | Command | +|------|---------| +| Activate environment | `conda activate personaplex` | +| Start Web UI | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR"` | +| Start with CPU offload | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --cpu-offload` | +| Start with local frontend | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist` | + +## Troubleshooting + +**Error: "No module named 'moshi'"** +- Solution: Activate the conda environment: `conda activate personaplex` + +**Error: "Access denied" when downloading model** +- Solution: Accept the model license and set up your HF token in `.env` file (see Step 3) + +For more issues, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md). + +## Next Steps + +- See [README.md](README.md) for detailed documentation +- Explore voice customization options +- Try different persona prompts +- Check out offline evaluation mode diff --git a/README.md b/README.md index 5509a74..fb01d40 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ [![Demo](https://img.shields.io/badge/🎮-Demo-green)](https://research.nvidia.com/labs/adlr/personaplex/) [![Discord](https://img.shields.io/badge/Discord-Join-purple?logo=discord)](https://discord.gg/5jAXrrbwRb) +**🚀 New to PersonaPlex? 
See [QUICKSTART.md](QUICKSTART.md) for a fast setup guide!** + +**🎨 Developing custom UI? See [FRONTEND_DEVELOPMENT.md](FRONTEND_DEVELOPMENT.md) for frontend development workflow!** + PersonaPlex is a real-time, full-duplex speech-to-speech conversational model that enables persona control through text-based role prompts and audio-based voice conditioning. Trained on a combination of synthetic and real conversations, it produces natural, low-latency spoken interactions with a consistent persona. PersonaPlex is based on the [Moshi](https://arxiv.org/abs/2410.00037) architecture and weights.

@@ -31,31 +35,109 @@ brew install opus ### Installation -Download this repository and install with: +Download this repository and set up the environment: + +#### Option 1: Using Conda (Recommended) ```bash -pip install moshi/. +# Create and activate conda environment +conda create -n personaplex python=3.10 -y +conda activate personaplex + +# Install the moshi package in editable mode (for development) +cd moshi +pip install -e . +cd .. ``` -Extra step for Blackwell based GPUs as suggested in (See https://github.com/NVIDIA/personaplex/issues/2): +**Note:** Use `pip install -e .` (editable mode) during development so code changes are immediately reflected without reinstalling. + +#### Option 2: For Blackwell GPUs (RTX 50 series) +Blackwell GPUs require PyTorch with CUDA 13.0+ support. Install PyTorch first, then the moshi package: ```bash +# Create and activate conda environment +conda create -n personaplex python=3.10 -y +conda activate personaplex + +# Install PyTorch with CUDA 13.0+ support FIRST (required for Blackwell) pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + +# Then install the moshi package (will use existing PyTorch) +pip install moshi/. ``` +See https://github.com/NVIDIA/personaplex/issues/2 for more details on Blackwell GPU support. + ### Accept Model License -Log in to your Huggingface account and accept the PersonaPlex model license [here](https://huggingface.co/nvidia/personaplex-7b-v1).
-Then set up your Huggingface authentication: + +Log in to your Huggingface account and accept the PersonaPlex model license [here](https://huggingface.co/nvidia/personaplex-7b-v1). + +Then set up your Huggingface authentication using one of these methods: + +**Option 1: .env file (Recommended)** +```bash +# Copy the template and add your token +cp .env.example .env +# Edit .env and replace 'your_token_here' with your actual token +``` + +**Option 2: Environment variable** ```bash export HF_TOKEN= ``` +**Option 3: Hugging Face CLI** +```bash +pip install huggingface_hub +huggingface-cli login +``` + + ### Launch Server -Launch server for live interaction (temporary SSL certs for https): +**IMPORTANT: First activate the conda environment:** +```bash +conda activate personaplex +``` + +#### Smart Auto-Detection (Recommended) + +The server **automatically detects and serves your custom UI** if `client/dist` exists: ```bash +# If client/dist exists, it will be used automatically! SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" ``` +**Auto-detection behavior:** +1. Checks if `client/dist` exists in your project +2. If yes → serves custom UI from `./client/dist` +3. 
If no → downloads and serves default UI from HuggingFace + +**How to verify which UI is loading:** +Check the server logs: +- **Custom UI (auto-detected)**: + ``` + Found custom UI at .../client/dist, using it instead of default + static_path = /home/.../personaplex-blackwell/client/dist + ``` +- **Default UI (no custom build)**: + ``` + retrieving the static content + static_path = /home/.../.cache/huggingface/.../dist + ``` + +#### Manual Override (Optional) + +You can still explicitly specify which UI to use with the `--static` flag: +```bash +# Force use of custom UI from specific directory +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist + +# Disable static serving entirely +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static none +``` + + **CPU Offload:** If your GPU has insufficient memory, use the `--cpu-offload` flag to offload model layers to CPU. This requires the `accelerate` package (`pip install accelerate`): ```bash SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --cpu-offload @@ -74,7 +156,6 @@ Add `--cpu-offload` to any command below if your GPU has insufficient memory (re **Assistant example:** ```bash -HF_TOKEN= \ python -m moshi.offline \ --voice-prompt "NATF2.pt" \ --input-wav "assets/test/input_assistant.wav" \ @@ -85,7 +166,6 @@ python -m moshi.offline \ **Service example:** ```bash -HF_TOKEN= \ python -m moshi.offline \ --voice-prompt "NATM1.pt" \ --text-prompt "$(cat assets/test/prompt_service.txt)" \ @@ -105,6 +185,118 @@ Variety(female): VARF0, VARF1, VARF2, VARF3, VARF4 Variety(male): VARM0, VARM1, VARM2, VARM3, VARM4 ``` +### Custom Voices + +PersonaPlex supports **dynamic custom voice loading** - add new voices and they automatically appear in the Web UI without code changes! + +#### Quick Start + +**Step 1: Prepare your audio file** + +Record a ~10 second WAV file of clear speech. 
Convert it to mono 24kHz format: +```bash +ffmpeg -i your_recording.wav -ac 1 -ar 24000 my_voice.wav +``` + +**Step 2: Copy to voices directory** + +Copy the converted audio to the voices directory: +```bash +cp my_voice.wav ~/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/*/voices/ +``` + +**Step 3: Generate voice embeddings** + +Run the offline script with `--save-voice-embeddings` to generate the `.pt` file: +```bash +python -m moshi.offline \ + --voice-prompt "my_voice.wav" \ + --save-voice-embeddings \ + --input-wav "assets/test/input_assistant.wav" \ + --output-wav "/tmp/test_output.wav" \ + --output-text "/tmp/test_output.json" +``` + +This creates `my_voice.pt` in the voices directory. + +**Step 4: Use your custom voice** + +**With the Web UI:** Restart the server and your custom voice automatically appears in the voice dropdown! Custom voices appear first in the list. +```bash +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +**With offline mode:** Use the `.pt` file directly: +```bash +python -m moshi.offline --voice-prompt "my_voice.pt" ... +``` + +#### Custom Voices Directory + +For easier management, you can place custom voices in the `custom_voices/` directory: + +```bash +# Create your custom voices directory (if it doesn't exist) +mkdir -p custom_voices + +# Place voice files here +cp my_voice.wav custom_voices/ +# Generate embeddings... +# The generated my_voice.pt will appear in the Web UI! 
+``` + +**Configure custom location (optional):** + +Preferred method - add to your `.env` file: +```bash +CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +Or use environment variable (temporary): +```bash +export CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +#### Voice File Formats + +- **`.pt` files**: Voice embeddings - these are the actual selectable voices in the Web UI +- **`.wav` files**: Source audio (24kHz mono) - used only to GENERATE the `.pt` embeddings + +**Important:** Only `.pt` files appear in the voice selector dropdown. The `.wav` files are intermediate source files used during voice generation. + +#### API Access + +List all available voices programmatically: +```bash +curl http://localhost:8998/api/voices +``` + +Returns JSON with all voices, their types, and categories. + +## Example Usage + +### Auto-Detection +```bash +# Build frontend +cd client && npm run build && cd .. + +# Server auto-detects - no flag needed! +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +# Logs: "Found custom UI at .../client/dist, using it instead of default" +``` + +### Custom Voices +```bash +# Add voice file +cp my_voice.wav custom_voices/ + +# Generate embeddings +python -m moshi.offline --voice-prompt "my_voice.wav" \ + --save-voice-embeddings --input-wav "assets/test/input_assistant.wav" --output-wav "/tmp/out.wav" + +# Restart server - voice appears in UI automatically! +``` + ## Prompting Guide The model is trained on synthetic conversations for a fixed assistant role and varying customer service roles. @@ -159,6 +351,14 @@ Personaplex finetunes Moshi and benefits from the generalization capabilities of You enjoy having a good conversation. Have a technical discussion about fixing a reactor core on a spaceship to Mars. You are an astronaut on a Mars mission. Your name is Alex. You are already dealing with a reactor core meltdown on a Mars mission. Several ship systems are failing, and continued instability will lead to catastrophic failure. 
You explain what is happening and you urgently ask for help thinking through how to stabilize the reactor. ``` +## Troubleshooting + +For common issues and solutions, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md), including: +- Code changes not reflected when running server (editable install issue) +- Custom voices not appearing in Web UI +- Frontend build and development issues +- Environment and dependency problems + ## License The present code is provided under the MIT license. The weights for the models are released under the NVIDIA Open Model license. diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 0000000..d1af55a --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,250 @@ +# Troubleshooting Guide + +## Common Issues + +### Module Not Found: 'moshi' + +**Symptom:** When running `python -m moshi.server`, you get: +``` +ModuleNotFoundError: No module named 'moshi' +``` + +**Root Cause:** The conda environment is not activated, or moshi is not installed in the active environment. + +**Solution:** + +1. Activate the conda environment: + ```bash + conda activate personaplex + ``` + +2. Verify the environment is active (you should see `(personaplex)` in your prompt): + ```bash + conda info --envs + # Should show * next to personaplex + ``` + +3. If moshi is not installed, install it: + ```bash + cd moshi + pip install -e . + cd .. + ``` + +4. Try running the server again: + ```bash + SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + ``` + +**Prevention:** Always activate the conda environment before running PersonaPlex commands. Add a reminder to your workflow or shell configuration. + +## Development Issues + +### Code Changes Not Reflected When Running Server + +**Symptom:** You made changes to backend code (e.g., added new endpoints, modified routes), but when you restart the server with `python -m moshi.server`, the changes don't appear. New API endpoints return 404, and debug logging doesn't show up. 
+ +**Root Cause:** The moshi-personaplex package was installed in regular mode (`pip install .`) instead of editable mode (`pip install -e .`). When installed normally, pip copies the code to site-packages (e.g., `/path/to/envs/personaplex/lib/python3.10/site-packages/moshi`), and Python loads from there instead of your source directory. + +**Solution:** + +1. Check if the package is installed: + ```bash + pip list | grep moshi + ``` + +2. Check installation location: + ```bash + pip show moshi-personaplex + ``` + + If `Location` shows `site-packages`, it's not in editable mode. + +3. Uninstall and reinstall in editable mode: + ```bash + pip uninstall -y moshi-personaplex + cd moshi + pip install -e . + ``` + +4. Verify editable install: + ```bash + pip show moshi-personaplex + ``` + + The `Location` should show something like `/path/to/repo/moshi` instead of `site-packages`. + +5. Restart the server: + ```bash + python -m moshi.server + ``` + +**Prevention:** Always use `pip install -e .` (with the `-e` flag) when installing packages for development. + +## Server Issues + +### Custom UI Not Loading (Server Uses Default UI) + +**Symptom:** You've modified the frontend (client/ directory), rebuilt it, but when you start the server, your changes don't appear. + +**Root Cause:** The `client/dist` directory doesn't exist or is empty. The server auto-detects custom UI by checking if `client/dist` exists. + +**Solution:** + +1. **Verify the build exists:** + ```bash + ls -la client/dist/ + ``` + If this directory doesn't exist or is empty, you need to build the frontend first. + +2. **Build the frontend:** + ```bash + cd client + npm install # If you haven't already + npm run build + cd .. + ``` + +3. **Restart the server** (it will now auto-detect the custom UI): + ```bash + conda activate personaplex + SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + ``` + +4. 
**Verify auto-detection worked** by checking the logs: + ``` + # SUCCESS - Custom UI detected: + Found custom UI at .../client/dist, using it instead of default + static_path = /home/.../personaplex-blackwell/client/dist + + # FAIL - No custom UI found: + retrieving the static content + static_path = /home/.../.cache/huggingface/.../dist + ``` + +5. Hard refresh your browser (Ctrl+Shift+R or Cmd+Shift+R) to clear cached assets + +**When auto-detection won't work:** +- `client/dist` directory doesn't exist +- `client/dist` exists but is empty +- Permissions prevent reading the directory + +**Manual override (if needed):** +If auto-detection fails but you know the build exists, use the `--static` flag: +```bash +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist +``` + +**Development tip:** +After making frontend changes, rebuild and the server will auto-detect: +```bash +cd client && npm run build && cd .. +# Restart server - custom UI detected automatically! +``` + +### Server Returns 404 for API Endpoints + +If specific API endpoints return 404: + +1. Check if routes are registered correctly by looking for debug logging at server startup +2. Verify the package is installed in editable mode (see above) +3. Check that static routes are registered AFTER API routes in server.py +4. Clear Python cache and restart: + ```bash + find moshi -type f -name '*.pyc' -delete + find moshi -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null + python -m moshi.server + ``` + +### Build Directory Conflicts + +If you suspect the `moshi/build/` directory contains old code: + +1. Move it out of the way: + ```bash + mv moshi/build moshi/build.bak + ``` + +2. Clear Python cache: + ```bash + find moshi -type f -name '*.pyc' -delete + find moshi -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null + ``` + +3. 
Restart the server
+
+## Frontend Issues
+
+### Frontend Not Showing New Features
+
+If you modified React components but don't see changes:
+
+1. Rebuild the frontend:
+   ```bash
+   cd client
+   npm run build
+   ```
+
+2. Restart the server (it serves the static files from client/dist)
+
+3. Hard refresh your browser (Ctrl+Shift+R or Cmd+Shift+R)
+
+### Voice Dropdown Shows "Error loading voices"
+
+1. Check if the server is running:
+   ```bash
+   ps aux | grep moshi.server
+   ```
+
+2. Test the API endpoint directly (the server runs over HTTPS with a self-signed certificate, so use `-k`):
+   ```bash
+   curl -k https://localhost:8998/api/voices
+   ```
+
+3. Check server logs for errors
+
+4. Verify VoiceDiscovery can find voice files:
+   ```bash
+   python -c "from moshi.voice_discovery import VoiceDiscovery; print(len(VoiceDiscovery.list_voices()))"
+   ```
+
+## Environment Issues
+
+### Missing HuggingFace Token
+
+If models fail to download:
+
+1. Create a `.env` file in the repository root:
+   ```bash
+   cp .env.example .env
+   # Edit .env and add your token: HF_TOKEN=your_token_here
+   ```
+2. Or set environment variable:
+   ```bash
+   export HF_TOKEN=your_token_here
+   ```
+3. Or use the Hugging Face CLI:
+   ```bash
+   pip install huggingface_hub
+   huggingface-cli login
+   ```
+
+### ffmpeg Not Found
+
+If voice generation fails with "Command 'ffmpeg' not found":
+
+```bash
+sudo apt install ffmpeg # Ubuntu/Debian
+brew install ffmpeg # macOS
+```
+
+## Getting Help
+
+If you encounter issues not covered here:
+
+1. Check the README.md for setup instructions
+2. Review recent commits for breaking changes
+3. 
Open an issue at https://github.com/nvidia/personaplex-7b-v1/issues with: + - Your environment (OS, Python version, conda/venv) + - Steps to reproduce the issue + - Complete error messages and logs diff --git a/client/package-lock.json b/client/package-lock.json index 3997642..b1b8f01 100644 --- a/client/package-lock.json +++ b/client/package-lock.json @@ -1315,6 +1315,7 @@ "integrity": "sha512-cisd7gxkzjBKU2GgdYrTdtQx1SORymWyaAFhaxQPK9bYO9ot3Y5OikQRvY0VYQtvwjeQnizCINJAenh/V7MK2w==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@types/prop-types": "*", "csstype": "^3.2.2" @@ -1769,6 +1770,7 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", + "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -2169,6 +2171,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "baseline-browser-mapping": "^2.9.0", "caniuse-lite": "^1.0.30001759", @@ -2911,6 +2914,7 @@ "deprecated": "This version is no longer supported. 
Please see https://eslint.org/version-support for other options.", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.6.1", @@ -2967,6 +2971,7 @@ "integrity": "sha512-iI1f+D2ViGn+uvv5HuHVUamg8ll4tN+JRHGc6IJi4TP9Kl976C57fzPXgseXNs8v0iA8aSJpHsTWjDb9QJamGQ==", "dev": true, "license": "MIT", + "peer": true, "bin": { "eslint-config-prettier": "bin/cli.js" }, @@ -4270,6 +4275,7 @@ "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==", "dev": true, "license": "MIT", + "peer": true, "bin": { "jiti": "bin/jiti.js" } @@ -5017,6 +5023,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -5197,6 +5204,7 @@ "integrity": "sha512-v6UNi1+3hSlVvv8fSaoUbggEM5VErKmmpGA7Pl3HF8V6uKY7rvClBOJlH6yNwQtfTueNkGVpOv/mtWL9L4bgRA==", "dev": true, "license": "MIT", + "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -5424,6 +5432,7 @@ "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -5436,6 +5445,7 @@ "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0", "scheduler": "^0.23.2" @@ -6283,6 +6293,7 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, @@ -6450,6 +6461,7 @@ "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "dev": true, "license": "Apache-2.0", + "peer": true, 
"bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -6525,6 +6537,7 @@ "integrity": "sha512-4Z+L8I2OqhZV8qA132M4wNL30ypZGYOQVBfMgxDH/K5UX0PNqTu1c6za9ST5r9+tavvHiTWmBnKzpCJ/GlVFtg==", "dev": true, "license": "BSD-2-Clause", + "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "7.18.0", "@typescript-eslint/types": "7.18.0", @@ -6767,6 +6780,7 @@ "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", diff --git a/client/src/hooks/useVoices.ts b/client/src/hooks/useVoices.ts new file mode 100644 index 0000000..2f269c5 --- /dev/null +++ b/client/src/hooks/useVoices.ts @@ -0,0 +1,52 @@ +import { useState, useEffect } from 'react'; + +export interface Voice { + name: string; + type: 'embeddings' | 'audio'; + category: 'custom' | 'natural-female' | 'natural-male' | 'variety-female' | 'variety-male' | 'other'; + path: string; +} + +export interface UseVoicesReturn { + voices: Voice[]; + loading: boolean; + error: string | null; + refresh: () => void; +} + +export function useVoices(): UseVoicesReturn { + const [voices, setVoices] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + + const fetchVoices = async () => { + setLoading(true); + setError(null); + + try { + const response = await fetch('/api/voices'); + if (!response.ok) { + throw new Error(`Failed to fetch voices: ${response.statusText}`); + } + + const data = await response.json(); + setVoices(data.voices || []); + } catch (err) { + setError(err instanceof Error ? 
err.message : 'Unknown error'); + console.error('Error fetching voices:', err); + } finally { + setLoading(false); + } + }; + + useEffect(() => { + fetchVoices(); + }, []); + + return { + voices, + loading, + error, + refresh: fetchVoices, + }; +} diff --git a/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx b/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx index 07afedb..66f6396 100644 --- a/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx +++ b/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx @@ -1,6 +1,7 @@ import { FC, RefObject, useState } from "react"; import { useModelParams } from "../../hooks/useModelParams"; import { Button } from "../../../../components/Button/Button"; +import { useVoices } from "../../../../hooks/useVoices"; type ModelParamsProps = { isConnected: boolean; @@ -24,6 +25,7 @@ export const ModelParams:FC = ({ }) => { const [modalVoicePrompt, setModalVoicePrompt] = useState(voicePrompt); const [modalTextPrompt, setModalTextPrompt] = useState(textPrompt); + const { voices, loading: voicesLoading, error: voicesError } = useVoices(); return (

@@ -37,25 +39,25 @@ export const ModelParams:FC = ({ diff --git a/client/src/pages/Queue/Queue.tsx b/client/src/pages/Queue/Queue.tsx index 7d0d44b..5acef1d 100644 --- a/client/src/pages/Queue/Queue.tsx +++ b/client/src/pages/Queue/Queue.tsx @@ -7,13 +7,7 @@ import { Button } from "../../components/Button/Button"; import { useModelParams } from "../Conversation/hooks/useModelParams"; import { env } from "../../env"; import { prewarmDecoderWorker } from "../../decoder/decoderWorker"; - -const VOICE_OPTIONS = [ - "NATF0.pt", "NATF1.pt", "NATF2.pt", "NATF3.pt", - "NATM0.pt", "NATM1.pt", "NATM2.pt", "NATM3.pt", - "VARF0.pt", "VARF1.pt", "VARF2.pt", "VARF3.pt", "VARF4.pt", - "VARM0.pt", "VARM1.pt", "VARM2.pt", "VARM3.pt", "VARM4.pt", -]; +import { useVoices } from "../../hooks/useVoices"; const TEXT_PROMPT_PRESETS = [ { @@ -41,6 +35,9 @@ interface HomepageProps { setTextPrompt: (value: string) => void; voicePrompt: string; setVoicePrompt: (value: string) => void; + voicesLoading: boolean; + voicesError: string | null; + voices: Array<{ name: string; type: string; category: string; path: string }>; } const Homepage = ({ @@ -50,6 +47,9 @@ const Homepage = ({ setTextPrompt, voicePrompt, setVoicePrompt, + voicesLoading, + voicesError, + voices, }: HomepageProps) => { return (
@@ -102,16 +102,24 @@ const Homepage = ({ name="voice-prompt" value={voicePrompt} onChange={(e) => setVoicePrompt(e.target.value)} - className="w-full p-3 bg-white text-black border border-gray-300 rounded focus:outline-none focus:ring-2 focus:ring-[#76b900] focus:border-transparent" + disabled={voicesLoading} + className="w-full p-3 bg-white text-black border border-gray-300 rounded focus:outline-none focus:ring-2 focus:ring-[#76b900] focus:border-transparent disabled:bg-gray-100 disabled:cursor-not-allowed" > - {VOICE_OPTIONS.map((voice) => ( - - ))} + {voicesLoading ? ( + + ) : voicesError ? ( + + ) : ( + voices.map((voice) => ( + + )) + )}
@@ -132,6 +140,7 @@ export const Queue:FC = () => { const [hasMicrophoneAccess, setHasMicrophoneAccess] = useState(false); const [showMicrophoneAccessMessage, setShowMicrophoneAccessMessage] = useState(false); const modelParams = useModelParams(); + const { voices, loading: voicesLoading, error: voicesError } = useVoices(); const audioContext = useRef(null); const worklet = useRef(null); @@ -209,6 +218,9 @@ export const Queue:FC = () => { setTextPrompt={modelParams.setTextPrompt} voicePrompt={modelParams.voicePrompt} setVoicePrompt={modelParams.setVoicePrompt} + voicesLoading={voicesLoading} + voicesError={voicesError} + voices={voices} /> )} diff --git a/custom_voices/README.md b/custom_voices/README.md new file mode 100644 index 0000000..6f12f93 --- /dev/null +++ b/custom_voices/README.md @@ -0,0 +1,99 @@ +# Custom Voices Directory + +This directory is for storing your custom voice files. Any voice files (.pt or .wav) placed here will automatically appear in the PersonaPlex web interface voice selector. + +## Quick Start + +1. **Prepare your audio file** (10+ seconds of clear speech): + ```bash + ffmpeg -i your_recording.mp3 -ac 1 -ar 24000 your_voice.wav + ``` + +2. **Generate voice embeddings**: + ```bash + # Copy the WAV file to the voices directory + VOICES_DIR=$(python -c "from pathlib import Path; from huggingface_hub import snapshot_download; print(Path(snapshot_download(repo_id='nvidia/personaplex-7b-v1', allow_patterns=['voices/*'])) / 'voices')") + cp your_voice.wav "$VOICES_DIR/" + + # Generate embeddings + python -m moshi.offline \ + --voice-prompt "your_voice.wav" \ + --save-voice-embeddings \ + --input-wav "assets/test/input_assistant.wav" \ + --output-wav "/tmp/test_output.wav" \ + --output-text "/tmp/test_output.json" + ``` + +3. **Restart the server** and your voice will appear in the dropdown! 
+ +## File Formats + +- **`.pt` files**: Voice embeddings - these are the actual selectable voices that appear in the UI dropdown +- **`.wav` files**: Source audio recordings (24kHz mono) - used only to GENERATE the .pt embeddings, not selectable as voices + +**Important**: Only `.pt` files appear in the voice selector dropdown. `.wav` files are intermediate source files used during voice generation. + +## Configuration + +By default, PersonaPlex looks for voices in: +1. HuggingFace cache: `~/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/*/voices/` +2. Custom directory: `./custom_voices/` (this directory) + +To use a different custom voices directory: + +Add to your `.env` file (recommended): +```bash +CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +Or use environment variable: +```bash +export CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +## Voice Naming Convention + +Pre-packaged voices follow this naming: +- `NATF*` = Natural Female +- `NATM*` = Natural Male +- `VARF*` = Variety Female +- `VARM*` = Variety Male + +Custom voices (any other name) will appear first in the dropdown, followed by the categorized pre-packaged voices. + +## API Access + +You can list all available voices programmatically: +```bash +curl http://localhost:8998/api/voices +``` + +Returns: +```json +{ + "voices": [ + {"name": "your_voice.pt", "type": "embeddings", "category": "custom", "path": "..."}, + {"name": "NATF0.pt", "type": "embeddings", "category": "natural-female", "path": "..."}, + ... + ], + "count": 20 +} +``` + +## Tips + +- Use high-quality audio recordings (clear speech, minimal background noise) +- 10-30 seconds of audio is usually sufficient +- The voice will reflect the speaking style and characteristics of the input audio +- Experiment with different recordings to find the best voice for your use case + +## Troubleshooting + +If your custom voice doesn't appear: +1. Verify the file is in the correct directory (`ls custom_voices/`) +2. 
Check the file extension is `.pt` or `.wav` +3. Restart the PersonaPlex server +4. Test the API endpoint: `curl http://localhost:8998/api/voices` +5. Check server logs for errors + +For more help, see `TROUBLESHOOTING.md` in the repository root. diff --git a/moshi/moshi/offline.py b/moshi/moshi/offline.py index f690620..904895c 100644 --- a/moshi/moshi/offline.py +++ b/moshi/moshi/offline.py @@ -51,6 +51,10 @@ import sentencepiece import sphn from huggingface_hub import hf_hub_download +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() from .client_utils import make_log from .models import loaders, LMGen, MimiModel @@ -381,9 +385,21 @@ def main(): help="Offload LM model layers to CPU when GPU memory is insufficient. " "Requires 'accelerate' package.") parser.add_argument("--seed", type=int, default=-1, help="Seed for reproducibility (-1 disables)") + parser.add_argument( + "--save-voice-embeddings", action="store_true", + help="Save voice prompt embeddings to a .pt file for faster reuse" + ) args = parser.parse_args() + # Warn if .env exists but HF_TOKEN is not set + env_file = Path(__file__).parent.parent.parent / ".env" + if env_file.exists() and not os.getenv("HF_TOKEN"): + log("warning", + "Found .env file but HF_TOKEN is not set. " + "Models requiring authentication may fail to download. " + "See .env.example for configuration details.") + # If --voice-prompt-dir is omitted, voices.tgz is downloaded from HF and extracted. 
voice_prompt_dir = _get_voice_prompt_dir( args.voice_prompt_dir, @@ -403,6 +419,7 @@ def main(): # Normalize greedy flag behavior (True if present, False otherwise) greedy = bool(args.greedy) + save_embeddings = bool(args.save_voice_embeddings) with torch.no_grad(): run_inference( @@ -422,7 +439,7 @@ def main(): topk_audio=args.topk_audio, topk_text=args.topk_text, greedy=greedy, - save_voice_prompt_embeddings=False, + save_voice_prompt_embeddings=save_embeddings, cpu_offload=args.cpu_offload, ) diff --git a/moshi/moshi/server.py b/moshi/moshi/server.py index 771f491..669a2ec 100644 --- a/moshi/moshi/server.py +++ b/moshi/moshi/server.py @@ -44,11 +44,16 @@ import sphn import torch import random +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() from .client_utils import make_log, colorize from .models import loaders, MimiModel, LMModel, LMGen from .utils.connection import create_ssl_context, get_lan_ip from .utils.logging import setup_logger, ColorizedLog +from .voice_discovery import VoiceDiscovery logger = setup_logger(__name__) @@ -308,6 +313,20 @@ async def is_alive(): clog.log("info", "done with connection") return ws + async def handle_list_voices(self, request): + """List all available voices from configured directories.""" + try: + voices = VoiceDiscovery.list_voices() + return web.json_response({ + 'voices': voices, + 'count': len(voices) + }) + except Exception as e: + logger.error(f"Error listing voices: {e}") + return web.json_response({ + 'error': str(e) + }, status=500) + def _get_voice_prompt_dir(voice_prompt_dir: Optional[str], hf_repo: str) -> Optional[str]: """ @@ -338,8 +357,51 @@ def _get_voice_prompt_dir(voice_prompt_dir: Optional[str], hf_repo: str) -> Opti return str(voices_dir) +def _is_valid_ui_build(dist_path: Path) -> bool: + """ + Validate that a directory contains a valid UI build. 
+ + Args: + dist_path: Path to the dist directory + + Returns: + True if the directory contains a valid build (has index.html), False otherwise + """ + if not dist_path.is_dir(): + return False + + # Check for essential file - index.html must exist and be non-empty + index_html = dist_path / "index.html" + try: + return index_html.exists() and index_html.stat().st_size > 0 + except (OSError, PermissionError): + return False + + def _get_static_path(static: Optional[str]) -> Optional[str]: if static is None: + # Auto-detect: prefer local custom UI (client/dist) if it exists + try: + # Priority 1: Check current working directory (works for all install modes) + cwd_dist = Path.cwd() / "client" / "dist" + if _is_valid_ui_build(cwd_dist): + logger.info(f"Found custom UI at {cwd_dist}, using it instead of default") + return str(cwd_dist) + + # Priority 2: Check project root relative to __file__ (works for editable installs) + # server.py is in moshi/moshi/, so project root is 2 levels up + project_root = Path(__file__).parent.parent.parent + local_dist = project_root / "client" / "dist" + + if _is_valid_ui_build(local_dist): + logger.info(f"Found custom UI at {local_dist}, using it instead of default") + return str(local_dist) + + except (OSError, PermissionError) as e: + logger.warning(f"Could not check for custom UI: {e}. Falling back to default.") + # Fall through to HuggingFace download + + # Fall back to HuggingFace default UI logger.info("retrieving the static content") dist_tgz = hf_hub_download("nvidia/personaplex-7b-v1", "dist.tgz") dist_tgz = Path(dist_tgz) @@ -392,6 +454,16 @@ def main(): ) args = parser.parse_args() + + # Warn if .env exists but HF_TOKEN is not set + env_file = Path(__file__).parent.parent.parent / ".env" + if env_file.exists() and not os.getenv("HF_TOKEN"): + logger.warning( + "Found .env file but HF_TOKEN is not set. " + "Models requiring authentication may fail to download. " + "See .env.example for configuration details." 
+ ) + args.voice_prompt_dir = _get_voice_prompt_dir( args.voice_prompt_dir, args.hf_repo, @@ -457,7 +529,19 @@ def main(): logger.info("warming up the model") state.warmup() app = web.Application() + + # Register API routes FIRST before static catch-all + async def test_endpoint(request): + return web.json_response({"status": "ok", "test": True}) + + app.router.add_get("/api/test", test_endpoint) app.router.add_get("/api/chat", state.handle_chat) + app.router.add_get("/api/voices", state.handle_list_voices) + + # Debug: log registered routes + logger.info(f"Registered routes so far: {[r.resource.canonical for r in app.router.routes()]}") + + # Register static routes AFTER API routes if static_path is not None: async def handle_root(_): return web.FileResponse(os.path.join(static_path, "index.html")) @@ -467,6 +551,9 @@ async def handle_root(_): app.router.add_static( "/", path=static_path, follow_symlinks=True, name="static" ) + + # Debug: log all routes after registration + logger.info(f"All registered routes: {[(r.method, r.resource.canonical) for r in app.router.routes()]}") protocol = "http" ssl_context = None if args.ssl is not None: diff --git a/moshi/moshi/voice_discovery.py b/moshi/moshi/voice_discovery.py new file mode 100644 index 0000000..40cb406 --- /dev/null +++ b/moshi/moshi/voice_discovery.py @@ -0,0 +1,105 @@ +# Copyright (c) Kyutai, all rights reserved. +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Voice discovery service for listing available voices.""" +from pathlib import Path +from typing import List, Dict +import os + + +class VoiceDiscovery: + """Discovers and lists available voice files.""" + + @staticmethod + def get_voice_directories() -> List[Path]: + """Get all directories where voices can be stored. + + Returns: + List of Path objects for directories containing voice files + """ + dirs = [] + + # 1. 
HuggingFace cache voices directory + hf_cache = os.environ.get('HF_HOME', str(Path.home() / '.cache/huggingface')) + hf_voices = Path(hf_cache) / 'hub' + + # Find personaplex model snapshot + for model_dir in hf_voices.glob('models--nvidia--personaplex-7b-v1/snapshots/*'): + voices_dir = model_dir / 'voices' + if voices_dir.exists(): + dirs.append(voices_dir) + + # 2. Custom voices directory (from .env or default) + custom_dir = os.environ.get('CUSTOM_VOICE_DIR', './custom_voices') + custom_path = Path(custom_dir) + if custom_path.exists(): + dirs.append(custom_path) + + return dirs + + @staticmethod + def list_voices() -> List[Dict[str, str]]: + """List all available voices. + + Only returns .pt embedding files, not .wav source audio files. + .wav files are used to generate embeddings and should not be listed as voices. + + Returns: + List of voice info dicts with keys: name, type, category, path + Sorted with custom voices first, then by category, then alphabetically + """ + voices = [] + seen_names = set() + + for voice_dir in VoiceDiscovery.get_voice_directories(): + # Find .pt files (voice embeddings only) + for pt_file in voice_dir.glob('*.pt'): + name = pt_file.name + if name not in seen_names: + category = VoiceDiscovery._categorize_voice(name) + voices.append({ + 'name': name, + 'type': 'embeddings', + 'category': category, + 'path': str(pt_file) + }) + seen_names.add(name) + + # Sort: custom first, then by category, then by name + def sort_key(v): + cat_order = { + 'custom': 0, + 'natural-female': 1, + 'natural-male': 2, + 'variety-female': 3, + 'variety-male': 4, + 'other': 5 + } + return (cat_order.get(v['category'], 99), v['name']) + + return sorted(voices, key=sort_key) + + @staticmethod + def _categorize_voice(filename: str) -> str: + """Categorize voice by filename pattern. 
+ + Args: + filename: Voice filename (.pt extension) + + Returns: + Category string: custom, natural-female, natural-male, + variety-female, variety-male, or other + """ + name = filename.replace('.pt', '') + + if name.startswith('NATF'): + return 'natural-female' + elif name.startswith('NATM'): + return 'natural-male' + elif name.startswith('VARF'): + return 'variety-female' + elif name.startswith('VARM'): + return 'variety-male' + else: + return 'custom' diff --git a/moshi/pyproject.toml b/moshi/pyproject.toml index ead71e9..2ff6ca7 100644 --- a/moshi/pyproject.toml +++ b/moshi/pyproject.toml @@ -10,8 +10,10 @@ dependencies = [ "sentencepiece == 0.2", "sounddevice == 0.5", "sphn >= 0.1.4, < 0.2", - "torch >= 2.2.0, < 2.5", + "torch >= 2.2.0", "aiohttp>=3.10.5, <3.11", + "pyloudnorm >= 0.1.0", + "python-dotenv >= 1.0.0, < 2.0", ] authors = [{name="Rajarshi Roy", email="rajarshir@nvidia.com"}] maintainers = [{name="Rajarshi Roy", email="rajarshir@nvidia.com"}] diff --git a/moshi/requirements.txt b/moshi/requirements.txt index e060822..90cfb09 100644 --- a/moshi/requirements.txt +++ b/moshi/requirements.txt @@ -5,5 +5,6 @@ einops==0.7 sentencepiece==0.2 sounddevice==0.5 sphn>=0.1.4,<0.2 -torch>=2.2.0,<2.5 -aiohttp>=3.10.5,<3.11 \ No newline at end of file +torch>=2.2.0 +aiohttp>=3.10.5,<3.11 +pyloudnorm>=0.1.0 \ No newline at end of file
Voice Prompt: {modalVoicePrompt} - setModalVoicePrompt(e.target.value)} + > + {voicesLoading ? ( + + ) : voicesError ? ( + + ) : ( + voices.map((voice) => ( + + )) + )}