From 617b2f4693f5e0074aba05097f909472160a7527 Mon Sep 17 00:00:00 2001 From: rajarshiroy-nvidia Date: Thu, 15 Jan 2026 10:52:28 -0800 Subject: [PATCH 01/24] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 165a05b..76045cb 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Weights](https://img.shields.io/badge/🤗-Weights-yellow)](https://huggingface.co/nvidia/personaplex-7b-v1) [![Paper](https://img.shields.io/badge/📄-Paper-blue)](PAPER_LINK) -[![Demo](https://img.shields.io/badge/🎮-Demo-green)](DEMO_LINK) +[![Demo](https://img.shields.io/badge/🎮-Demo-green)](https://research.nvidia.com/labs/adlr/personaplex/) [![Discord](https://img.shields.io/badge/Discord-Join-purple?logo=discord)](https://discord.gg/ZrkX72mp) PersonaPlex is a real-time, full-duplex speech-to-speech conversational model that enables persona control through text-based role prompts and audio-based voice conditioning. Trained on a combination of synthetic and real conversations, it produces natural, low-latency spoken interactions with a consistent persona. PersonaPlex is based on the [Moshi](https://arxiv.org/abs/2410.00037) architecture and weights. @@ -131,4 +131,4 @@ The present code is provided under the MIT license. The weights for the models a ## Citation -`TBD` \ No newline at end of file +`TBD` From 1def0c694bf63fc9f317e21c20bd7072f309bb1c Mon Sep 17 00:00:00 2001 From: rajarshiroy-nvidia Date: Thu, 15 Jan 2026 20:48:50 -0800 Subject: [PATCH 02/24] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 76045cb..ab38412 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,13 @@ Download this repository and install with: pip install moshi/. ``` +### Accept Model License +Log in to your Huggingface account and accept the PersonaPlex model license [here](https://huggingface.co/nvidia/personaplex-7b-v1).
+Then set up your Huggingface authentication: +```bash +export HF_TOKEN= +``` + ### Launch Server Launch server for live interaction (temporary SSL certs for https): From 1535caa6fcbd42b14bddf87a1ffae528bdc6a8ed Mon Sep 17 00:00:00 2001 From: Brandon Tuttle Date: Fri, 16 Jan 2026 11:04:04 -0500 Subject: [PATCH 03/24] add containerized support for personaplex --- .dockerignore | 1 + Dockerfile | 19 +++++++++++++++++++ docker-compose.yaml | 20 ++++++++++++++++++++ package-lock.json | 10 +++++----- 4 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 docker-compose.yaml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1998c29 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +.cache \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..87d9d0c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +ARG BASE_IMAGE="nvcr.io/nvidia/cuda" +ARG BASE_IMAGE_TAG="12.4.1-runtime-ubuntu22.04" + +FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base + +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +WORKDIR /app/moshi/ + +COPY moshi/ /app/moshi/ +RUN uv venv /app/moshi/.venv --python 3.12 +RUN uv sync + +RUN mkdir -p /app/ssl + +EXPOSE 8998 + +ENTRYPOINT [] +CMD ["/app/moshi/.venv/bin/python", "-m", "moshi.server", "--ssl", "/app/ssl"] \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..da88c3f --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,20 @@ +services: + personaplex: + build: + context: . + dockerfile: Dockerfile + ports: + - "8998:8998" + environment: + - NO_TORCH_COMPILE=1 + env_file: + - .env + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [ gpu ] + volumes: + - ./.cache:/root/.cache diff --git a/package-lock.json b/package-lock.json index 5e0e826..c0c63ed 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { - "name": "moshi", - "lockfileVersion": 3, - "requires": true, - "packages": {} -} + "name": "moshi", + "lockfileVersion": 3, + "requires": true, + "packages": {} + } \ No newline at end of file From 62ae4f7aa957ef99af41465bec99e85f4c73a6a2 Mon Sep 17 00:00:00 2001 From: Rajarshi Roy Date: Sun, 18 Jan 2026 02:59:14 -0800 Subject: [PATCH 04/24] Temporary fix for HF downloads tracking. 
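Hugging Face's download statistics are incremented when `config.json` is
fetched, so `offline.py` and `server.py` now request that file explicitly
before loading weights. The pattern is just the following (sketch; the
actual call sites are in the diff below, and because `hf_hub_download`
caches the file, repeat runs are not double-counted):

```python
from huggingface_hub import hf_hub_download

# The first fetch counts as a download; subsequent calls hit the local cache.
hf_hub_download("nvidia/personaplex-7b-v1", "config.json")
```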
--- moshi/moshi/offline.py | 4 ++++ moshi/moshi/server.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/moshi/moshi/offline.py b/moshi/moshi/offline.py index 34ef98d..f7b89d0 100644 --- a/moshi/moshi/offline.py +++ b/moshi/moshi/offline.py @@ -181,6 +181,10 @@ def run_inference( if seed is not None and seed != -1: seed_all(seed) + # Download config.json to increment download counter + # No worries about double-counting since config.json will be cached the second time + hf_hub_download(hf_repo, "config.json") + # 1) Load Mimi encoders/decoders (same as server.py) log("info", "loading mimi") if mimi_weight is None: diff --git a/moshi/moshi/server.py b/moshi/moshi/server.py index 9cf5e05..0b0d6b7 100644 --- a/moshi/moshi/server.py +++ b/moshi/moshi/server.py @@ -421,6 +421,10 @@ def main(): else: tunnel_token = args.gradio_tunnel_token + # Download config.json to increment download counter + # No worries about double-counting since config.json will be cached the second time + hf_hub_download(args.hf_repo, "config.json") + logger.info("loading mimi") if args.mimi_weight is None: args.mimi_weight = hf_hub_download(args.hf_repo, loaders.MIMI_NAME) From e4e8f4c5f11a0e7cfc004e7858304eed680a8d87 Mon Sep 17 00:00:00 2001 From: rajarshiroy-nvidia Date: Tue, 20 Jan 2026 15:54:50 -0800 Subject: [PATCH 05/24] Update README.md --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ab38412..7f7f5c0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # PersonaPlex: Voice and Role Control for Full Duplex Conversational Speech Models [![Weights](https://img.shields.io/badge/🤗-Weights-yellow)](https://huggingface.co/nvidia/personaplex-7b-v1) -[![Paper](https://img.shields.io/badge/📄-Paper-blue)](PAPER_LINK) +[![Paper](https://img.shields.io/badge/📄-Paper-blue)](https://research.nvidia.com/labs/adlr/files/personaplex/personaplex_preprint.pdf) [![Demo](https://img.shields.io/badge/🎮-Demo-green)](https://research.nvidia.com/labs/adlr/personaplex/) [![Discord](https://img.shields.io/badge/Discord-Join-purple?logo=discord)](https://discord.gg/ZrkX72mp) @@ -138,4 +138,11 @@ The present code is provided under the MIT license. 
The weights for the models a ## Citation -`TBD` +If you use PersonaPlex in your research, please cite our paper: +```bibtex +@article{roy2026personaplex, + title={PersonaPlex: Voice and Role Control for Full Duplex Conversational Speech Models}, + author={Roy, Rajarshi and Raiman, Jonathan and Lee, Sang-gil and Ene, Teodor-Dumitru and Kirby, Robert and Kim, Sungwon and Kim, Jaehyeon and Catanzaro, Bryan}, + year={2026} +} +``` From 0947e855543be81a5e0d623964480f59e548604b Mon Sep 17 00:00:00 2001 From: wachawo Date: Thu, 22 Jan 2026 00:38:55 +0000 Subject: [PATCH 06/24] Install build dependencies and libopus-dev to fix Docker build --- Dockerfile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 87d9d0c..2cc6f1f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,14 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + pkg-config \ + clang \ + ca-certificates \ + libopus-dev \ + && rm -rf /var/lib/apt/lists/* + WORKDIR /app/moshi/ COPY moshi/ /app/moshi/ @@ -16,4 +24,4 @@ RUN mkdir -p /app/ssl EXPOSE 8998 ENTRYPOINT [] -CMD ["/app/moshi/.venv/bin/python", "-m", "moshi.server", "--ssl", "/app/ssl"] \ No newline at end of file +CMD ["/app/moshi/.venv/bin/python", "-m", "moshi.server", "--ssl", "/app/ssl"] From b0b78e49c9e661324ca9595e08df0768e0e29463 Mon Sep 17 00:00:00 2001 From: wachawo Date: Thu, 22 Jan 2026 00:57:17 +0000 Subject: [PATCH 07/24] Removed unused dependencies --- Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2cc6f1f..e146f9b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,8 +8,6 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ pkg-config \ - clang \ - ca-certificates \ libopus-dev \ && rm -rf /var/lib/apt/lists/* From 828e15fc4652062fed4aa2a343daca463d9e2e2b Mon Sep 17 00:00:00 2001 From: rajarshiroy-nvidia Date: Thu, 22 Jan 2026 16:25:01 -0800 Subject: [PATCH 08/24] Update README.md to update permanent discord link and installation fix for Blackwell GPUs. --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7f7f5c0..800ba4d 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Weights](https://img.shields.io/badge/🤗-Weights-yellow)](https://huggingface.co/nvidia/personaplex-7b-v1) [![Paper](https://img.shields.io/badge/📄-Paper-blue)](https://research.nvidia.com/labs/adlr/files/personaplex/personaplex_preprint.pdf) [![Demo](https://img.shields.io/badge/🎮-Demo-green)](https://research.nvidia.com/labs/adlr/personaplex/) -[![Discord](https://img.shields.io/badge/Discord-Join-purple?logo=discord)](https://discord.gg/ZrkX72mp) +[![Discord](https://img.shields.io/badge/Discord-Join-purple?logo=discord)](https://discord.gg/5jAXrrbwRb) PersonaPlex is a real-time, full-duplex speech-to-speech conversational model that enables persona control through text-based role prompts and audio-based voice conditioning. Trained on a combination of synthetic and real conversations, it produces natural, low-latency spoken interactions with a consistent persona. PersonaPlex is based on the [Moshi](https://arxiv.org/abs/2410.00037) architecture and weights. @@ -22,6 +22,12 @@ Download this repository and install with: pip install moshi/. 
```
+Extra step for Blackwell-based GPUs, as suggested in https://github.com/NVIDIA/personaplex/issues/2:
+```bash
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+```
+
+
 ### Accept Model License
 Log in to your Huggingface account and accept the PersonaPlex model license [here](https://huggingface.co/nvidia/personaplex-7b-v1).
Then set up your Huggingface authentication: From 84e209c1609c7a326b269847cc7da802b355ac6f Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Fri, 23 Jan 2026 15:20:12 +0900 Subject: [PATCH 09/24] Add custom voice support and fix Blackwell GPU compatibility - Add --save-voice-embeddings CLI flag to offline.py for generating custom voice prompt embeddings from WAV files - Remove torch < 2.5 upper bound to allow PyTorch 2.10+ for RTX 5090 - Add missing pyloudnorm dependency required for audio normalization - Update README with conda setup instructions, Blackwell GPU guide, and custom voice creation tutorial - Update .gitignore for Claude Code local settings --- .gitignore | 10 +++++++ README.md | 63 ++++++++++++++++++++++++++++++++++++++++-- moshi/moshi/offline.py | 7 ++++- moshi/pyproject.toml | 3 +- moshi/requirements.txt | 5 ++-- 5 files changed, 81 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 3278df4..d5ee5c6 100644 --- a/.gitignore +++ b/.gitignore @@ -183,3 +183,13 @@ mlx-trace.json # Include everything in assets !assets/ !assets/** + + +# Claude Code local settings (added by init-team-ai) +.claude/ +*.local.json +Agents.md +Claude.local.md +.agent/ +.DS_Store +Thumbs.db diff --git a/README.md b/README.md index 800ba4d..c6215b5 100644 --- a/README.md +++ b/README.md @@ -17,16 +17,34 @@ PersonaPlex is a real-time, full-duplex speech-to-speech conversational model th ### Installation -Download this repository and install with: +Download this repository and set up the environment: + +#### Option 1: Using Conda (Recommended) ```bash +# Create and activate conda environment +conda create -n personaplex python=3.10 -y +conda activate personaplex + +# Install the moshi package pip install moshi/. ``` -Extra step for Blackwell based GPUs as suggested in (See https://github.com/NVIDIA/personaplex/issues/2): +#### Option 2: For Blackwell GPUs (RTX 50 series) +Blackwell GPUs require PyTorch with CUDA 12.8. Install PyTorch first, then the moshi package: ```bash -pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 +# Create and activate conda environment +conda create -n personaplex python=3.10 -y +conda activate personaplex + +# Install PyTorch with CUDA 12.8 FIRST +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + +# Then install the moshi package (will use existing PyTorch) +pip install moshi/. ``` +See https://github.com/NVIDIA/personaplex/issues/2 for more details on Blackwell GPU support. + ### Accept Model License Log in to your Huggingface account and accept the PersonaPlex model license [here](https://huggingface.co/nvidia/personaplex-7b-v1).
@@ -84,6 +102,45 @@ Variety(female): VARF0, VARF1, VARF2, VARF3, VARF4 Variety(male): VARM0, VARM1, VARM2, VARM3, VARM4 ``` +### Custom Voices + +You can create custom voice embeddings from your own audio recordings: + +**Step 1: Prepare your audio file** + +Record a ~10 second WAV file of clear speech. Convert it to mono 24kHz format: +```bash +ffmpeg -i your_recording.wav -ac 1 -ar 24000 my_voice.wav +``` + +**Step 2: Copy to voices directory** + +Copy the converted audio to the voices directory: +```bash +cp my_voice.wav ~/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/*/voices/ +``` + +**Step 3: Generate voice embeddings** + +Run the offline script with `--save-voice-embeddings` to generate the `.pt` file: +```bash +python -m moshi.offline \ + --voice-prompt "my_voice.wav" \ + --save-voice-embeddings \ + --input-wav "assets/test/input_assistant.wav" \ + --output-wav "/tmp/test_output.wav" \ + --output-text "/tmp/test_output.json" +``` + +This creates `my_voice.pt` in the voices directory. You can now use it with the server or offline mode: +```bash +# With the server (select from dropdown in Web UI) +python -m moshi.server --ssl "$SSL_DIR" + +# With offline mode +python -m moshi.offline --voice-prompt "my_voice.pt" ... +``` + ## Prompting Guide The model is trained on synthetic conversations for a fixed assistant role and varying customer service roles. diff --git a/moshi/moshi/offline.py b/moshi/moshi/offline.py index f7b89d0..537585d 100644 --- a/moshi/moshi/offline.py +++ b/moshi/moshi/offline.py @@ -377,6 +377,10 @@ def main(): "--device", type=str, default="cuda", help="Device on which to run, defaults to 'cuda'." ) parser.add_argument("--seed", type=int, default=-1, help="Seed for reproducibility (-1 disables)") + parser.add_argument( + "--save-voice-embeddings", action="store_true", + help="Save voice prompt embeddings to a .pt file for faster reuse" + ) args = parser.parse_args() @@ -399,6 +403,7 @@ def main(): # Normalize greedy flag behavior (True if present, False otherwise) greedy = bool(args.greedy) + save_embeddings = bool(args.save_voice_embeddings) with torch.no_grad(): run_inference( @@ -418,7 +423,7 @@ def main(): topk_audio=args.topk_audio, topk_text=args.topk_text, greedy=greedy, - save_voice_prompt_embeddings=False, + save_voice_prompt_embeddings=save_embeddings, ) diff --git a/moshi/pyproject.toml b/moshi/pyproject.toml index ead71e9..bd96473 100644 --- a/moshi/pyproject.toml +++ b/moshi/pyproject.toml @@ -10,8 +10,9 @@ dependencies = [ "sentencepiece == 0.2", "sounddevice == 0.5", "sphn >= 0.1.4, < 0.2", - "torch >= 2.2.0, < 2.5", + "torch >= 2.2.0", "aiohttp>=3.10.5, <3.11", + "pyloudnorm >= 0.1.0", ] authors = [{name="Rajarshi Roy", email="rajarshir@nvidia.com"}] maintainers = [{name="Rajarshi Roy", email="rajarshir@nvidia.com"}] diff --git a/moshi/requirements.txt b/moshi/requirements.txt index e060822..90cfb09 100644 --- a/moshi/requirements.txt +++ b/moshi/requirements.txt @@ -5,5 +5,6 @@ einops==0.7 sentencepiece==0.2 sounddevice==0.5 sphn>=0.1.4,<0.2 -torch>=2.2.0,<2.5 -aiohttp>=3.10.5,<3.11 \ No newline at end of file +torch>=2.2.0 +aiohttp>=3.10.5,<3.11 +pyloudnorm>=0.1.0 \ No newline at end of file From aaa06927deffaf21458805fddf7bd7210f56f78d Mon Sep 17 00:00:00 2001 From: Rafael Gomes <7319476+grafael@users.noreply.github.com> Date: Fri, 23 Jan 2026 09:42:17 -0300 Subject: [PATCH 10/24] Add --cpu-offload flag for both server and offline modes, which offloads model layers to CPU when GPU memory is insufficient. 
--- README.md | 7 ++ moshi/moshi/models/loaders.py | 124 ++++++++++++++++++++++++++++++++++ moshi/moshi/offline.py | 7 +- moshi/moshi/server.py | 5 +- 4 files changed, 141 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7f7f5c0..746b478 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,11 @@ Launch server for live interaction (temporary SSL certs for https): SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" ``` +**CPU Offload:** If your GPU has insufficient memory, use the `--cpu-offload` flag to offload model layers to CPU. This requires the `accelerate` package (`pip install accelerate`): +```bash +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --cpu-offload +``` + Access the Web UI from a browser at `localhost:8998` if running locally, otherwise look for the access link printed by the script: ``` Access the Web UI directly at https://11.54.401.33:8998 @@ -45,6 +50,8 @@ Access the Web UI directly at https://11.54.401.33:8998 For offline evaluation use the offline script that streams in an input wav file and produces an output wav file from the captured output stream. The output file will be the same duration as the input file. +Add `--cpu-offload` to any command below if your GPU has insufficient memory (requires `accelerate` package). + **Assistant example:** ```bash HF_TOKEN= \ diff --git a/moshi/moshi/models/loaders.py b/moshi/moshi/models/loaders.py index 7443f69..49ffc7f 100644 --- a/moshi/moshi/models/loaders.py +++ b/moshi/moshi/models/loaders.py @@ -24,10 +24,13 @@ # LICENSE file in the root directory of this source tree. """Retrieves the pretrained models for Moshi and Mimi.""" from pathlib import Path +import logging from safetensors.torch import load_model, load_file import torch +logger = logging.getLogger(__name__) + from .compression import MimiModel from .lm import LMModel from ..modules import SEANetEncoder, SEANetDecoder, transformer @@ -166,13 +169,30 @@ def get_moshi_lm( device: torch.device | str = "cpu", dtype: torch.dtype = torch.bfloat16, delays=None, + cpu_offload: bool = False, ) -> LMModel: + """Return a pretrained Moshi LM model. + + Args: + filename: Path to model weights. + copy_missing_weights: Whether to copy missing weights from existing layers. + device: Target device for the model. + dtype: Data type for model weights. + delays: Optional custom delays configuration. + cpu_offload: If True, offload model layers to CPU when GPU memory is + insufficient. Uses accelerate's device_map="auto". + """ # Copy to avoid mutating a shared/global dict lm_kwargs = dict(_lm_kwargs) lm_kwargs["dep_q"] = 16 if delays is not None: lm_kwargs["delays"] = delays + if cpu_offload and filename is not None: + return _get_moshi_lm_with_offload( + filename, copy_missing_weights, device, dtype, lm_kwargs + ) + model = LMModel(device=device, dtype=dtype, **lm_kwargs).to(device=device, dtype=dtype) if filename is None: @@ -233,3 +253,107 @@ def get_moshi_lm( model.to(device) model.eval() return model + + +def _get_moshi_lm_with_offload( + filename: str | Path, + copy_missing_weights: bool, + device: torch.device | str, + dtype: torch.dtype, + lm_kwargs: dict, +) -> LMModel: + """Load Moshi LM with CPU offloading using accelerate. + + This function distributes model layers across GPU and CPU based on + available GPU memory. Layers that don't fit on GPU are kept on CPU + and moved to GPU only during forward pass. 
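+
+    Example (sketch): this helper is normally reached through the public
+    entry point rather than called directly, e.g.
+    ``get_moshi_lm(weight_path, device="cuda", cpu_offload=True)``.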
+ """ + try: + from accelerate import infer_auto_device_map, dispatch_model + except ImportError: + raise ImportError( + "CPU offloading requires the 'accelerate' package. " + "Install it with: pip install accelerate" + ) + + filename = str(filename) + logger.info("Loading model with CPU offloading enabled") + + # First, create model on CPU to get the architecture + model = LMModel(device="cpu", dtype=dtype, **lm_kwargs) + + # Load state_dict to CPU + if filename.endswith(".safetensors"): + state_dict = load_file(filename, device="cpu") + else: + with open(filename, "rb") as f: + state_dict = torch.load(f, map_location="cpu") + + # Apply weight patches (same as non-offload path) + model_sd = model.state_dict() + for name, tensor in list(state_dict.items()): + if "depformer" in name and "self_attn" in name and name in model_sd: + if tensor.shape != model_sd[name].shape: + logger.info(f"Expanding {name}") + missing = ( + tensor + if copy_missing_weights + else model_sd[name][tensor.shape[0]:] + ) + state_dict[name] = torch.concat([tensor, missing], dim=0) + + if copy_missing_weights: + to_replace = ["gating", "linears", "depformer_in", "depformer_emb"] + for name in model_sd.keys(): + if name in state_dict: + continue + replaced = False + for old, new in zip(range(8), range(8, 16)): + for rep in to_replace: + needle = f"{rep}.{new}." + if needle in name: + src = name.replace(needle, f"{rep}.{old}.") + if src in state_dict: + logger.info(f"Replacing {name} <- {src}") + state_dict[name] = state_dict[src] + replaced = True + break + if replaced: + break + if not replaced: + logger.warning(f"Missing {name}") + + model.load_state_dict(state_dict, strict=False, assign=True) + + # Determine target device + dev = torch.device(device) if isinstance(device, str) else device + + if dev.type != "cuda": + # If not using CUDA, just move to the target device without offloading + logger.info(f"CPU offload requested but device is {dev}, skipping offload") + model.to(dev) + model.eval() + return model + + # Infer device map based on available GPU memory + device_map = infer_auto_device_map( + model, + max_memory=None, # Let accelerate auto-detect available memory + no_split_module_classes=["StreamingTransformerLayer"], + dtype=dtype, + ) + + # Log the device distribution + gpu_layers = sum(1 for v in device_map.values() if v == 0 or v == "cuda:0") + cpu_layers = sum(1 for v in device_map.values() if v == "cpu") + logger.info(f"Device map: {gpu_layers} modules on GPU, {cpu_layers} modules on CPU") + + # Dispatch model across devices + model = dispatch_model( + model, + device_map=device_map, + offload_dir="offload_weights", # Directory for disk offload if needed + ) + + model.eval() + return model diff --git a/moshi/moshi/offline.py b/moshi/moshi/offline.py index f7b89d0..f690620 100644 --- a/moshi/moshi/offline.py +++ b/moshi/moshi/offline.py @@ -168,6 +168,7 @@ def run_inference( topk_text: int, greedy: bool, save_voice_prompt_embeddings: bool, + cpu_offload: bool = False, ): """Run offline inference using an input WAV as the user-side stream. 
@@ -202,7 +203,7 @@ def run_inference( log("info", "loading moshi") if moshi_weight is None: moshi_weight = hf_hub_download(hf_repo, loaders.MOSHI_NAME) # type: ignore - lm = loaders.get_moshi_lm(moshi_weight, device=device) + lm = loaders.get_moshi_lm(moshi_weight, device=device, cpu_offload=cpu_offload) lm.eval() log("info", "moshi loaded") @@ -376,6 +377,9 @@ def main(): parser.add_argument( "--device", type=str, default="cuda", help="Device on which to run, defaults to 'cuda'." ) + parser.add_argument("--cpu-offload", action="store_true", + help="Offload LM model layers to CPU when GPU memory is insufficient. " + "Requires 'accelerate' package.") parser.add_argument("--seed", type=int, default=-1, help="Seed for reproducibility (-1 disables)") args = parser.parse_args() @@ -419,6 +423,7 @@ def main(): topk_text=args.topk_text, greedy=greedy, save_voice_prompt_embeddings=False, + cpu_offload=args.cpu_offload, ) diff --git a/moshi/moshi/server.py b/moshi/moshi/server.py index 0b0d6b7..771f491 100644 --- a/moshi/moshi/server.py +++ b/moshi/moshi/server.py @@ -370,6 +370,9 @@ def main(): help="HF repo to look into, defaults PersonaPlex. " "Use this to select a different pre-trained model.") parser.add_argument("--device", type=str, default="cuda", help="Device on which to run, defaults to 'cuda'.") + parser.add_argument("--cpu-offload", action="store_true", + help="Offload LM model layers to CPU when GPU memory is insufficient. " + "Requires 'accelerate' package.") parser.add_argument( "--voice-prompt-dir", type=str, @@ -439,7 +442,7 @@ def main(): logger.info("loading moshi") if args.moshi_weight is None: args.moshi_weight = hf_hub_download(args.hf_repo, loaders.MOSHI_NAME) - lm = loaders.get_moshi_lm(args.moshi_weight, device=args.device) + lm = loaders.get_moshi_lm(args.moshi_weight, device=args.device, cpu_offload=args.cpu_offload) lm.eval() logger.info("moshi loaded") state = ServerState( From ce22eeb820af4cc96e7de87fcc3b2a0cd3592b3e Mon Sep 17 00:00:00 2001 From: Jake Hall Date: Fri, 23 Jan 2026 14:12:04 +0000 Subject: [PATCH 11/24] docs: add mention of opus dependency --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 800ba4d..885a60e 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,20 @@ PersonaPlex is a real-time, full-duplex speech-to-speech conversational model th ## Usage +### Prerequisites + +Install the [Opus audio codec](https://github.com/xiph/opus) development library: +```bash +# Ubuntu/Debian +sudo apt install libopus-dev + +# Fedora/RHEL +sudo dnf install opus-devel + +# macOS +brew install opus +``` + ### Installation Download this repository and install with: From 0eda307125ced73d9a55d5927bf7c9f3e9454cb0 Mon Sep 17 00:00:00 2001 From: rajarshiroy-nvidia Date: Fri, 23 Jan 2026 07:57:34 -0800 Subject: [PATCH 12/24] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 800ba4d..1ba7841 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ pip install moshi/. 
Extra step for Blackwell-based GPUs, as suggested in https://github.com/NVIDIA/personaplex/issues/2:
```bash
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
```

From f97974bb8041246ca89caf121270918dcaedb68c Mon Sep 17 00:00:00 2001
From: rajarshiroy-nvidia
Date: Fri, 23 Jan 2026 07:58:05 -0800
Subject: [PATCH 13/24] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1ba7841..10a543d 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ pip install moshi/.
```
Extra step for Blackwell-based GPUs, as suggested in https://github.com/NVIDIA/personaplex/issues/2:
```bash
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
```

From c9f6d0aacd67fcbccfe3198a7cd294fe5bbfad96 Mon Sep 17 00:00:00 2001
From: rajarshiroy-nvidia
Date: Fri, 23 Jan 2026 13:13:31 -0800
Subject: [PATCH 14/24] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 117455a..5509a74 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ Access the Web UI directly at https://11.54.401.33:8998

For offline evaluation use the offline script that streams in an input wav file and produces an output wav file from the captured output stream. The output file will be the same duration as the input file.

-Add `--cpu-offload` to any command below if your GPU has insufficient memory (requires `accelerate` package).
+Add `--cpu-offload` to any command below if your GPU has insufficient memory (requires `accelerate` package). Alternatively, install CPU-only PyTorch to run offline evaluation entirely on CPU.
**Assistant example:** ```bash From 0c3dd8b398bad9d0c2a1a1fc590cf8aee6b8d530 Mon Sep 17 00:00:00 2001 From: tsdocode Date: Fri, 23 Jan 2026 22:44:25 +0000 Subject: [PATCH 15/24] fix: reduce memory need during model init --- moshi/moshi/models/loaders.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/moshi/moshi/models/loaders.py b/moshi/moshi/models/loaders.py index 49ffc7f..d38dcc3 100644 --- a/moshi/moshi/models/loaders.py +++ b/moshi/moshi/models/loaders.py @@ -193,9 +193,11 @@ def get_moshi_lm( filename, copy_missing_weights, device, dtype, lm_kwargs ) - model = LMModel(device=device, dtype=dtype, **lm_kwargs).to(device=device, dtype=dtype) - + # Init with meta device to avoid init dummy memory + init_device = "meta" if filename is not None else device + model = LMModel(device=init_device, dtype=dtype, **lm_kwargs) if filename is None: + model.to(device=device, dtype=dtype) model.eval() return model @@ -213,7 +215,6 @@ def get_moshi_lm( # torch checkpoint with open(filename, "rb") as f: state_dict = torch.load(f, map_location="cpu") - # Patch 1: expand depformer self_attn weights if needed model_sd = model.state_dict() for name, tensor in list(state_dict.items()): @@ -249,10 +250,14 @@ def get_moshi_lm( if not replaced: print("Missing %s", name) + # Assign weights to target device + dev = torch.device(device) if isinstance(device, str) else device + for key in state_dict: + state_dict[key] = state_dict[key].to(device=dev, dtype=dtype) + model.load_state_dict(state_dict, strict=False, assign=True) - model.to(device) model.eval() - return model + return model.to(device=device, dtype=dtype) def _get_moshi_lm_with_offload( From 0d0f89869c0f9f572fa9028f35da08eed9b30ba6 Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Mon, 26 Jan 2026 16:03:33 +0900 Subject: [PATCH 16/24] Add .env file support for HF_TOKEN configuration - Add python-dotenv dependency to pyproject.toml - Load environment variables from .env file in server.py and offline.py - Add warning when .env exists but HF_TOKEN is not set - Create .env.example template for users - Update README.md with .env configuration instructions The .env file is optional and all existing workflows continue to work. Users can now configure HF_TOKEN via .env file, environment variable, or huggingface-cli login. --- .env.example | 7 +++++++ README.md | 26 ++++++++++++++++++++++---- moshi/moshi/offline.py | 12 ++++++++++++ moshi/moshi/server.py | 14 ++++++++++++++ moshi/pyproject.toml | 1 + 5 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..c2c8eb5 --- /dev/null +++ b/.env.example @@ -0,0 +1,7 @@ +# Hugging Face API Token +# Get your token from: https://huggingface.co/settings/tokens +# Required to download PersonaPlex models +HF_TOKEN=your_token_here + +# Optional: Custom cache directory for Hugging Face models +# HF_HOME=/path/to/custom/cache diff --git a/README.md b/README.md index 01236cb..0364fba 100644 --- a/README.md +++ b/README.md @@ -61,16 +61,36 @@ See https://github.com/NVIDIA/personaplex/issues/2 for more details on Blackwell ### Accept Model License -Log in to your Huggingface account and accept the PersonaPlex model license [here](https://huggingface.co/nvidia/personaplex-7b-v1).
-Then set up your Huggingface authentication: + +Log in to your Huggingface account and accept the PersonaPlex model license [here](https://huggingface.co/nvidia/personaplex-7b-v1). + +Then set up your Huggingface authentication using one of these methods: + +**Option 1: Using .env file (Recommended)** +```bash +# Copy the template and add your token +cp .env.example .env +# Edit .env and replace 'your_token_here' with your actual token +``` + +**Option 2: Environment variable** ```bash export HF_TOKEN= ``` +**Option 3: Hugging Face CLI** +```bash +pip install huggingface_hub +huggingface-cli login +``` + +**Note:** The .env file is optional. All existing workflows continue to work. + ### Launch Server Launch server for live interaction (temporary SSL certs for https): ```bash +# The server automatically loads your HF_TOKEN from the .env file SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" ``` @@ -92,7 +112,6 @@ Add `--cpu-offload` to any command below if your GPU has insufficient memory (re **Assistant example:** ```bash -HF_TOKEN= \ python -m moshi.offline \ --voice-prompt "NATF2.pt" \ --input-wav "assets/test/input_assistant.wav" \ @@ -103,7 +122,6 @@ python -m moshi.offline \ **Service example:** ```bash -HF_TOKEN= \ python -m moshi.offline \ --voice-prompt "NATM1.pt" \ --text-prompt "$(cat assets/test/prompt_service.txt)" \ diff --git a/moshi/moshi/offline.py b/moshi/moshi/offline.py index 7140d21..904895c 100644 --- a/moshi/moshi/offline.py +++ b/moshi/moshi/offline.py @@ -51,6 +51,10 @@ import sentencepiece import sphn from huggingface_hub import hf_hub_download +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() from .client_utils import make_log from .models import loaders, LMGen, MimiModel @@ -388,6 +392,14 @@ def main(): args = parser.parse_args() + # Warn if .env exists but HF_TOKEN is not set + env_file = Path(__file__).parent.parent.parent / ".env" + if env_file.exists() and not os.getenv("HF_TOKEN"): + log("warning", + "Found .env file but HF_TOKEN is not set. " + "Models requiring authentication may fail to download. " + "See .env.example for configuration details.") + # If --voice-prompt-dir is omitted, voices.tgz is downloaded from HF and extracted. voice_prompt_dir = _get_voice_prompt_dir( args.voice_prompt_dir, diff --git a/moshi/moshi/server.py b/moshi/moshi/server.py index 771f491..22b76e1 100644 --- a/moshi/moshi/server.py +++ b/moshi/moshi/server.py @@ -44,6 +44,10 @@ import sphn import torch import random +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() from .client_utils import make_log, colorize from .models import loaders, MimiModel, LMModel, LMGen @@ -392,6 +396,16 @@ def main(): ) args = parser.parse_args() + + # Warn if .env exists but HF_TOKEN is not set + env_file = Path(__file__).parent.parent.parent / ".env" + if env_file.exists() and not os.getenv("HF_TOKEN"): + logger.warning( + "Found .env file but HF_TOKEN is not set. " + "Models requiring authentication may fail to download. " + "See .env.example for configuration details." 
+ ) + args.voice_prompt_dir = _get_voice_prompt_dir( args.voice_prompt_dir, args.hf_repo, diff --git a/moshi/pyproject.toml b/moshi/pyproject.toml index bd96473..2ff6ca7 100644 --- a/moshi/pyproject.toml +++ b/moshi/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "torch >= 2.2.0", "aiohttp>=3.10.5, <3.11", "pyloudnorm >= 0.1.0", + "python-dotenv >= 1.0.0, < 2.0", ] authors = [{name="Rajarshi Roy", email="rajarshir@nvidia.com"}] maintainers = [{name="Rajarshi Roy", email="rajarshir@nvidia.com"}] From 8743e633e53452595d3bea609b57302269013311 Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Mon, 26 Jan 2026 18:07:44 +0900 Subject: [PATCH 17/24] Add dynamic custom voice system with automatic Web UI discovery Implement dynamic voice discovery system that allows users to add custom voices without modifying code. Voices automatically appear in the Web UI dropdown after generating embeddings and restarting the server. Backend changes: - Add VoiceDiscovery service to scan configured voice directories - Add /api/voices REST endpoint returning voice list with metadata - Support custom voices directory (configurable via CUSTOM_VOICE_DIR) - Only list .pt embedding files (not .wav source audio) Frontend changes: - Add useVoices React hook for dynamic voice fetching - Update Queue and ModelParams components to use dynamic voice loading - Add loading and error states for better UX - Custom voices appear first in dropdown Infrastructure: - Add custom_voices/ directory with comprehensive README - Update .gitignore to exclude voice files but keep directory structure - Add TROUBLESHOOTING.md documenting common issues - Update README.md with installation, server, and custom voice docs Key fixes applied during implementation: - Package must be installed in editable mode (pip install -e .) 
for dev - Server needs --static client/dist flag to serve local frontend builds - API routes must be registered before static routes in aiohttp - Critical: Only .pt files are selectable voices (not .wav source files) --- .env.example | 2 +- .gitignore | 6 + README.md | 74 ++++++++- TROUBLESHOOTING.md | 145 ++++++++++++++++++ client/package-lock.json | 14 ++ client/src/hooks/useVoices.ts | 52 +++++++ .../components/ModelParams/ModelParams.tsx | 40 ++--- client/src/pages/Queue/Queue.tsx | 44 ++++-- custom_voices/README.md | 92 +++++++++++ moshi/moshi/server.py | 30 ++++ moshi/moshi/voice_discovery.py | 105 +++++++++++++ 11 files changed, 561 insertions(+), 43 deletions(-) create mode 100644 TROUBLESHOOTING.md create mode 100644 client/src/hooks/useVoices.ts create mode 100644 custom_voices/README.md create mode 100644 moshi/moshi/voice_discovery.py diff --git a/.env.example b/.env.example index c2c8eb5..d1f98b0 100644 --- a/.env.example +++ b/.env.example @@ -2,6 +2,6 @@ # Get your token from: https://huggingface.co/settings/tokens # Required to download PersonaPlex models HF_TOKEN=your_token_here - +CUSTOM_VOICE_DIR=./custom_voices # Optional: Custom cache directory for Hugging Face models # HF_HOME=/path/to/custom/cache diff --git a/.gitignore b/.gitignore index d5ee5c6..466b7b2 100644 --- a/.gitignore +++ b/.gitignore @@ -193,3 +193,9 @@ Claude.local.md .agent/ .DS_Store Thumbs.db + +# Custom voice files (keep directory structure in git, ignore voice files) +custom_voices/*.pt +custom_voices/*.wav +# But keep the README +!custom_voices/README.md diff --git a/README.md b/README.md index 0364fba..f3b053a 100644 --- a/README.md +++ b/README.md @@ -39,10 +39,14 @@ Download this repository and set up the environment: conda create -n personaplex python=3.10 -y conda activate personaplex -# Install the moshi package -pip install moshi/. +# Install the moshi package in editable mode (for development) +cd moshi +pip install -e . +cd .. ``` +**Note:** Use `pip install -e .` (editable mode) during development so code changes are immediately reflected without reinstalling. + #### Option 2: For Blackwell GPUs (RTX 50 series) Blackwell GPUs require PyTorch with CUDA 12.8. Install PyTorch first, then the moshi package: ```bash @@ -94,6 +98,11 @@ Launch server for live interaction (temporary SSL certs for https): SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" ``` +**For Development:** If you've modified the frontend (client/ directory), use the `--static` flag to serve your local build: +```bash +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist +``` + **CPU Offload:** If your GPU has insufficient memory, use the `--cpu-offload` flag to offload model layers to CPU. This requires the `accelerate` package (`pip install accelerate`): ```bash SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --cpu-offload @@ -143,7 +152,9 @@ Variety(male): VARM0, VARM1, VARM2, VARM3, VARM4 ### Custom Voices -You can create custom voice embeddings from your own audio recordings: +PersonaPlex supports **dynamic custom voice loading** - add new voices and they automatically appear in the Web UI without code changes! + +#### Quick Start **Step 1: Prepare your audio file** @@ -171,15 +182,56 @@ python -m moshi.offline \ --output-text "/tmp/test_output.json" ``` -This creates `my_voice.pt` in the voices directory. You can now use it with the server or offline mode: +This creates `my_voice.pt` in the voices directory. 
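+
+To sanity-check the generated file, it can be loaded like any other PyTorch
+checkpoint (illustrative sketch only; the exact contents of the file are an
+implementation detail, and recent PyTorch versions may require
+`weights_only=False` depending on how the embeddings were serialized):
+```python
+import torch
+
+emb = torch.load("my_voice.pt", map_location="cpu")
+print(type(emb))
+```
+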
+ +**Step 4: Use your custom voice** + +**With the Web UI:** Restart the server and your custom voice automatically appears in the voice dropdown! Custom voices appear first in the list. ```bash -# With the server (select from dropdown in Web UI) -python -m moshi.server --ssl "$SSL_DIR" +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` -# With offline mode +**With offline mode:** Use the `.pt` file directly: +```bash python -m moshi.offline --voice-prompt "my_voice.pt" ... ``` +#### Custom Voices Directory + +For easier management, you can place custom voices in the `custom_voices/` directory: + +```bash +# Create your custom voices directory (if it doesn't exist) +mkdir -p custom_voices + +# Place voice files here +cp my_voice.wav custom_voices/ +# Generate embeddings... +# The generated my_voice.pt will appear in the Web UI! +``` + +**Configure custom location (optional):** +```bash +# In .env file +CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +#### Voice File Formats + +- **`.pt` files**: Voice embeddings - these are the actual selectable voices in the Web UI +- **`.wav` files**: Source audio (24kHz mono) - used only to GENERATE the `.pt` embeddings + +**Important:** Only `.pt` files appear in the voice selector dropdown. The `.wav` files are intermediate source files used during voice generation. + +#### API Access + +List all available voices programmatically: +```bash +curl http://localhost:8998/api/voices +``` + +Returns JSON with all voices, their types, and categories. + ## Prompting Guide The model is trained on synthetic conversations for a fixed assistant role and varying customer service roles. @@ -234,6 +286,14 @@ Personaplex finetunes Moshi and benefits from the generalization capabilities of You enjoy having a good conversation. Have a technical discussion about fixing a reactor core on a spaceship to Mars. You are an astronaut on a Mars mission. Your name is Alex. You are already dealing with a reactor core meltdown on a Mars mission. Several ship systems are failing, and continued instability will lead to catastrophic failure. You explain what is happening and you urgently ask for help thinking through how to stabilize the reactor. ``` +## Troubleshooting + +For common issues and solutions, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md), including: +- Code changes not reflected when running server (editable install issue) +- Custom voices not appearing in Web UI +- Frontend build and development issues +- Environment and dependency problems + ## License The present code is provided under the MIT license. The weights for the models are released under the NVIDIA Open Model license. diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 0000000..c89ccef --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,145 @@ +# Troubleshooting Guide + +## Development Issues + +### Code Changes Not Reflected When Running Server + +**Symptom:** You made changes to backend code (e.g., added new endpoints, modified routes), but when you restart the server with `python -m moshi.server`, the changes don't appear. New API endpoints return 404, and debug logging doesn't show up. + +**Root Cause:** The moshi-personaplex package was installed in regular mode (`pip install .`) instead of editable mode (`pip install -e .`). When installed normally, pip copies the code to site-packages (e.g., `/path/to/envs/personaplex/lib/python3.10/site-packages/moshi`), and Python loads from there instead of your source directory. + +**Solution:** + +1. 
Check if the package is installed: + ```bash + pip list | grep moshi + ``` + +2. Check installation location: + ```bash + pip show moshi-personaplex + ``` + + If `Location` shows `site-packages`, it's not in editable mode. + +3. Uninstall and reinstall in editable mode: + ```bash + pip uninstall -y moshi-personaplex + cd moshi + pip install -e . + ``` + +4. Verify editable install: + ```bash + pip show moshi-personaplex + ``` + + The `Location` should show something like `/path/to/repo/moshi` instead of `site-packages`. + +5. Restart the server: + ```bash + python -m moshi.server + ``` + +**Prevention:** Always use `pip install -e .` (with the `-e` flag) when installing packages for development. + +## Server Issues + +### Server Returns 404 for API Endpoints + +If specific API endpoints return 404: + +1. Check if routes are registered correctly by looking for debug logging at server startup +2. Verify the package is installed in editable mode (see above) +3. Check that static routes are registered AFTER API routes in server.py +4. Clear Python cache and restart: + ```bash + find moshi -type f -name '*.pyc' -delete + find moshi -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null + python -m moshi.server + ``` + +### Build Directory Conflicts + +If you suspect the `moshi/build/` directory contains old code: + +1. Move it out of the way: + ```bash + mv moshi/build moshi/build.bak + ``` + +2. Clear Python cache: + ```bash + find moshi -type f -name '*.pyc' -delete + find moshi -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null + ``` + +3. Restart the server + +## Frontend Issues + +### Frontend Not Showing New Features + +If you modified React components but don't see changes: + +1. Rebuild the frontend: + ```bash + cd client + npm run build + ``` + +2. Restart the server (it serves the static files from client/dist) + +3. Hard refresh your browser (Ctrl+Shift+R or Cmd+Shift+R) + +### Voice Dropdown Shows "Error loading voices" + +1. Check if the server is running: + ```bash + ps aux | grep moshi.server + ``` + +2. Test the API endpoint directly: + ```bash + curl http://localhost:8998/api/voices + ``` + +3. Check server logs for errors + +4. Verify VoiceDiscovery can find voice files: + ```bash + python -c "from moshi.voice_discovery import VoiceDiscovery; print(len(VoiceDiscovery.list_voices()))" + ``` + +## Environment Issues + +### Missing HuggingFace Token + +If models fail to download: + +1. Create a `.env` file in the repository root +2. Add your HuggingFace token: + ``` + HF_TOKEN=your_token_here + ``` +3. See `.env.example` for more details + +### ffmpeg Not Found + +If voice generation fails with "Command 'ffmpeg' not found": + +```bash +sudo apt install ffmpeg # Ubuntu/Debian +brew install ffmpeg # macOS +``` + +## Getting Help + +If you encounter issues not covered here: + +1. Check the README.md for setup instructions +2. Review recent commits for breaking changes +3. 
Open an issue at https://github.com/nvidia/personaplex-7b-v1/issues with: + - Your environment (OS, Python version, conda/venv) + - Steps to reproduce the issue + - Complete error messages and logs diff --git a/client/package-lock.json b/client/package-lock.json index 3997642..b1b8f01 100644 --- a/client/package-lock.json +++ b/client/package-lock.json @@ -1315,6 +1315,7 @@ "integrity": "sha512-cisd7gxkzjBKU2GgdYrTdtQx1SORymWyaAFhaxQPK9bYO9ot3Y5OikQRvY0VYQtvwjeQnizCINJAenh/V7MK2w==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@types/prop-types": "*", "csstype": "^3.2.2" @@ -1769,6 +1770,7 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", + "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -2169,6 +2171,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "baseline-browser-mapping": "^2.9.0", "caniuse-lite": "^1.0.30001759", @@ -2911,6 +2914,7 @@ "deprecated": "This version is no longer supported. Please see https://eslint.org/version-support for other options.", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.6.1", @@ -2967,6 +2971,7 @@ "integrity": "sha512-iI1f+D2ViGn+uvv5HuHVUamg8ll4tN+JRHGc6IJi4TP9Kl976C57fzPXgseXNs8v0iA8aSJpHsTWjDb9QJamGQ==", "dev": true, "license": "MIT", + "peer": true, "bin": { "eslint-config-prettier": "bin/cli.js" }, @@ -4270,6 +4275,7 @@ "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==", "dev": true, "license": "MIT", + "peer": true, "bin": { "jiti": "bin/jiti.js" } @@ -5017,6 +5023,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -5197,6 +5204,7 @@ "integrity": "sha512-v6UNi1+3hSlVvv8fSaoUbggEM5VErKmmpGA7Pl3HF8V6uKY7rvClBOJlH6yNwQtfTueNkGVpOv/mtWL9L4bgRA==", "dev": true, "license": "MIT", + "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -5424,6 +5432,7 @@ "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -5436,6 +5445,7 @@ "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0", "scheduler": "^0.23.2" @@ -6283,6 +6293,7 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, @@ -6450,6 +6461,7 @@ "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "dev": true, "license": "Apache-2.0", + "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -6525,6 +6537,7 @@ "integrity": "sha512-4Z+L8I2OqhZV8qA132M4wNL30ypZGYOQVBfMgxDH/K5UX0PNqTu1c6za9ST5r9+tavvHiTWmBnKzpCJ/GlVFtg==", "dev": true, "license": "BSD-2-Clause", + "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "7.18.0", "@typescript-eslint/types": "7.18.0", @@ -6767,6 +6780,7 @@ "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", "dev": true, "license": "MIT", + 
"peer": true, "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", diff --git a/client/src/hooks/useVoices.ts b/client/src/hooks/useVoices.ts new file mode 100644 index 0000000..2f269c5 --- /dev/null +++ b/client/src/hooks/useVoices.ts @@ -0,0 +1,52 @@ +import { useState, useEffect } from 'react'; + +export interface Voice { + name: string; + type: 'embeddings' | 'audio'; + category: 'custom' | 'natural-female' | 'natural-male' | 'variety-female' | 'variety-male' | 'other'; + path: string; +} + +export interface UseVoicesReturn { + voices: Voice[]; + loading: boolean; + error: string | null; + refresh: () => void; +} + +export function useVoices(): UseVoicesReturn { + const [voices, setVoices] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + + const fetchVoices = async () => { + setLoading(true); + setError(null); + + try { + const response = await fetch('/api/voices'); + if (!response.ok) { + throw new Error(`Failed to fetch voices: ${response.statusText}`); + } + + const data = await response.json(); + setVoices(data.voices || []); + } catch (err) { + setError(err instanceof Error ? err.message : 'Unknown error'); + console.error('Error fetching voices:', err); + } finally { + setLoading(false); + } + }; + + useEffect(() => { + fetchVoices(); + }, []); + + return { + voices, + loading, + error, + refresh: fetchVoices, + }; +} diff --git a/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx b/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx index 07afedb..66f6396 100644 --- a/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx +++ b/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx @@ -1,6 +1,7 @@ import { FC, RefObject, useState } from "react"; import { useModelParams } from "../../hooks/useModelParams"; import { Button } from "../../../../components/Button/Button"; +import { useVoices } from "../../../../hooks/useVoices"; type ModelParamsProps = { isConnected: boolean; @@ -24,6 +25,7 @@ export const ModelParams:FC = ({ }) => { const [modalVoicePrompt, setModalVoicePrompt] = useState(voicePrompt); const [modalTextPrompt, setModalTextPrompt] = useState(textPrompt); + const { voices, loading: voicesLoading, error: voicesError } = useVoices(); return (
@@ -37,25 +39,25 @@ export const ModelParams:FC = ({ diff --git a/client/src/pages/Queue/Queue.tsx b/client/src/pages/Queue/Queue.tsx index 7d0d44b..5acef1d 100644 --- a/client/src/pages/Queue/Queue.tsx +++ b/client/src/pages/Queue/Queue.tsx @@ -7,13 +7,7 @@ import { Button } from "../../components/Button/Button"; import { useModelParams } from "../Conversation/hooks/useModelParams"; import { env } from "../../env"; import { prewarmDecoderWorker } from "../../decoder/decoderWorker"; - -const VOICE_OPTIONS = [ - "NATF0.pt", "NATF1.pt", "NATF2.pt", "NATF3.pt", - "NATM0.pt", "NATM1.pt", "NATM2.pt", "NATM3.pt", - "VARF0.pt", "VARF1.pt", "VARF2.pt", "VARF3.pt", "VARF4.pt", - "VARM0.pt", "VARM1.pt", "VARM2.pt", "VARM3.pt", "VARM4.pt", -]; +import { useVoices } from "../../hooks/useVoices"; const TEXT_PROMPT_PRESETS = [ { @@ -41,6 +35,9 @@ interface HomepageProps { setTextPrompt: (value: string) => void; voicePrompt: string; setVoicePrompt: (value: string) => void; + voicesLoading: boolean; + voicesError: string | null; + voices: Array<{ name: string; type: string; category: string; path: string }>; } const Homepage = ({ @@ -50,6 +47,9 @@ const Homepage = ({ setTextPrompt, voicePrompt, setVoicePrompt, + voicesLoading, + voicesError, + voices, }: HomepageProps) => { return (
@@ -102,16 +102,24 @@ const Homepage = ({ name="voice-prompt" value={voicePrompt} onChange={(e) => setVoicePrompt(e.target.value)} - className="w-full p-3 bg-white text-black border border-gray-300 rounded focus:outline-none focus:ring-2 focus:ring-[#76b900] focus:border-transparent" + disabled={voicesLoading} + className="w-full p-3 bg-white text-black border border-gray-300 rounded focus:outline-none focus:ring-2 focus:ring-[#76b900] focus:border-transparent disabled:bg-gray-100 disabled:cursor-not-allowed" > - {VOICE_OPTIONS.map((voice) => ( - - ))} + {voicesLoading ? ( + + ) : voicesError ? ( + + ) : ( + voices.map((voice) => ( + + )) + )}
@@ -132,6 +140,7 @@ export const Queue:FC = () => { const [hasMicrophoneAccess, setHasMicrophoneAccess] = useState(false); const [showMicrophoneAccessMessage, setShowMicrophoneAccessMessage] = useState(false); const modelParams = useModelParams(); + const { voices, loading: voicesLoading, error: voicesError } = useVoices(); const audioContext = useRef(null); const worklet = useRef(null); @@ -209,6 +218,9 @@ export const Queue:FC = () => { setTextPrompt={modelParams.setTextPrompt} voicePrompt={modelParams.voicePrompt} setVoicePrompt={modelParams.setVoicePrompt} + voicesLoading={voicesLoading} + voicesError={voicesError} + voices={voices} /> )} diff --git a/custom_voices/README.md b/custom_voices/README.md new file mode 100644 index 0000000..189f22f --- /dev/null +++ b/custom_voices/README.md @@ -0,0 +1,92 @@ +# Custom Voices Directory + +This directory is for storing your custom voice files. Any voice files (.pt or .wav) placed here will automatically appear in the PersonaPlex web interface voice selector. + +## Quick Start + +1. **Prepare your audio file** (10+ seconds of clear speech): + ```bash + ffmpeg -i your_recording.mp3 -ac 1 -ar 24000 your_voice.wav + ``` + +2. **Generate voice embeddings**: + ```bash + # Copy the WAV file to the voices directory + VOICES_DIR=$(python -c "from pathlib import Path; from huggingface_hub import snapshot_download; print(Path(snapshot_download(repo_id='nvidia/personaplex-7b-v1', allow_patterns=['voices/*'])) / 'voices')") + cp your_voice.wav "$VOICES_DIR/" + + # Generate embeddings + python -m moshi.offline \ + --voice-prompt "your_voice.wav" \ + --save-voice-embeddings \ + --input-wav "assets/test/input_assistant.wav" \ + --output-wav "/tmp/test_output.wav" \ + --output-text "/tmp/test_output.json" + ``` + +3. **Restart the server** and your voice will appear in the dropdown! + +## File Formats + +- **`.pt` files**: Voice embeddings - these are the actual selectable voices that appear in the UI dropdown +- **`.wav` files**: Source audio recordings (24kHz mono) - used only to GENERATE the .pt embeddings, not selectable as voices + +**Important**: Only `.pt` files appear in the voice selector dropdown. `.wav` files are intermediate source files used during voice generation. + +## Configuration + +By default, PersonaPlex looks for voices in: +1. HuggingFace cache: `~/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/*/voices/` +2. Custom directory: `./custom_voices/` (this directory) + +To use a different custom voices directory, set the `CUSTOM_VOICE_DIR` environment variable in `.env`: +``` +CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +## Voice Naming Convention + +Pre-packaged voices follow this naming: +- `NATF*` = Natural Female +- `NATM*` = Natural Male +- `VARF*` = Variety Female +- `VARM*` = Variety Male + +Custom voices (any other name) will appear first in the dropdown, followed by the categorized pre-packaged voices. + +## API Access + +You can list all available voices programmatically: +```bash +curl http://localhost:8998/api/voices +``` + +Returns: +```json +{ + "voices": [ + {"name": "your_voice.pt", "type": "embeddings", "category": "custom", "path": "..."}, + {"name": "NATF0.pt", "type": "embeddings", "category": "natural-female", "path": "..."}, + ... 
+ ], + "count": 20 +} +``` + +## Tips + +- Use high-quality audio recordings (clear speech, minimal background noise) +- 10-30 seconds of audio is usually sufficient +- The voice will reflect the speaking style and characteristics of the input audio +- Experiment with different recordings to find the best voice for your use case + +## Troubleshooting + +If your custom voice doesn't appear: +1. Verify the file is in the correct directory (`ls custom_voices/`) +2. Check the file extension is `.pt` or `.wav` +3. Restart the PersonaPlex server +4. Test the API endpoint: `curl http://localhost:8998/api/voices` +5. Check server logs for errors + +For more help, see `TROUBLESHOOTING.md` in the repository root. diff --git a/moshi/moshi/server.py b/moshi/moshi/server.py index 22b76e1..a157f83 100644 --- a/moshi/moshi/server.py +++ b/moshi/moshi/server.py @@ -53,6 +53,7 @@ from .models import loaders, MimiModel, LMModel, LMGen from .utils.connection import create_ssl_context, get_lan_ip from .utils.logging import setup_logger, ColorizedLog +from .voice_discovery import VoiceDiscovery logger = setup_logger(__name__) @@ -312,6 +313,20 @@ async def is_alive(): clog.log("info", "done with connection") return ws + async def handle_list_voices(self, request): + """List all available voices from configured directories.""" + try: + voices = VoiceDiscovery.list_voices() + return web.json_response({ + 'voices': voices, + 'count': len(voices) + }) + except Exception as e: + logger.error(f"Error listing voices: {e}") + return web.json_response({ + 'error': str(e) + }, status=500) + def _get_voice_prompt_dir(voice_prompt_dir: Optional[str], hf_repo: str) -> Optional[str]: """ @@ -471,7 +486,19 @@ def main(): logger.info("warming up the model") state.warmup() app = web.Application() + + # Register API routes FIRST before static catch-all + async def test_endpoint(request): + return web.json_response({"status": "ok", "test": True}) + + app.router.add_get("/api/test", test_endpoint) app.router.add_get("/api/chat", state.handle_chat) + app.router.add_get("/api/voices", state.handle_list_voices) + + # Debug: log registered routes + logger.info(f"Registered routes so far: {[r.resource.canonical for r in app.router.routes()]}") + + # Register static routes AFTER API routes if static_path is not None: async def handle_root(_): return web.FileResponse(os.path.join(static_path, "index.html")) @@ -481,6 +508,9 @@ async def handle_root(_): app.router.add_static( "/", path=static_path, follow_symlinks=True, name="static" ) + + # Debug: log all routes after registration + logger.info(f"All registered routes: {[(r.method, r.resource.canonical) for r in app.router.routes()]}") protocol = "http" ssl_context = None if args.ssl is not None: diff --git a/moshi/moshi/voice_discovery.py b/moshi/moshi/voice_discovery.py new file mode 100644 index 0000000..40cb406 --- /dev/null +++ b/moshi/moshi/voice_discovery.py @@ -0,0 +1,105 @@ +# Copyright (c) Kyutai, all rights reserved. +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Voice discovery service for listing available voices.""" +from pathlib import Path +from typing import List, Dict +import os + + +class VoiceDiscovery: + """Discovers and lists available voice files.""" + + @staticmethod + def get_voice_directories() -> List[Path]: + """Get all directories where voices can be stored. + + Returns: + List of Path objects for directories containing voice files + """ + dirs = [] + + # 1. 
HuggingFace cache voices directory + hf_cache = os.environ.get('HF_HOME', str(Path.home() / '.cache/huggingface')) + hf_voices = Path(hf_cache) / 'hub' + + # Find personaplex model snapshot + for model_dir in hf_voices.glob('models--nvidia--personaplex-7b-v1/snapshots/*'): + voices_dir = model_dir / 'voices' + if voices_dir.exists(): + dirs.append(voices_dir) + + # 2. Custom voices directory (from .env or default) + custom_dir = os.environ.get('CUSTOM_VOICE_DIR', './custom_voices') + custom_path = Path(custom_dir) + if custom_path.exists(): + dirs.append(custom_path) + + return dirs + + @staticmethod + def list_voices() -> List[Dict[str, str]]: + """List all available voices. + + Only returns .pt embedding files, not .wav source audio files. + .wav files are used to generate embeddings and should not be listed as voices. + + Returns: + List of voice info dicts with keys: name, type, category, path + Sorted with custom voices first, then by category, then alphabetically + """ + voices = [] + seen_names = set() + + for voice_dir in VoiceDiscovery.get_voice_directories(): + # Find .pt files (voice embeddings only) + for pt_file in voice_dir.glob('*.pt'): + name = pt_file.name + if name not in seen_names: + category = VoiceDiscovery._categorize_voice(name) + voices.append({ + 'name': name, + 'type': 'embeddings', + 'category': category, + 'path': str(pt_file) + }) + seen_names.add(name) + + # Sort: custom first, then by category, then by name + def sort_key(v): + cat_order = { + 'custom': 0, + 'natural-female': 1, + 'natural-male': 2, + 'variety-female': 3, + 'variety-male': 4, + 'other': 5 + } + return (cat_order.get(v['category'], 99), v['name']) + + return sorted(voices, key=sort_key) + + @staticmethod + def _categorize_voice(filename: str) -> str: + """Categorize voice by filename pattern. + + Args: + filename: Voice filename (.pt extension) + + Returns: + Category string: custom, natural-female, natural-male, + variety-female, variety-male, or other + """ + name = filename.replace('.pt', '') + + if name.startswith('NATF'): + return 'natural-female' + elif name.startswith('NATM'): + return 'natural-male' + elif name.startswith('VARF'): + return 'variety-female' + elif name.startswith('VARM'): + return 'variety-male' + else: + return 'custom' From 645dd0b40bf3debe5b3f45f70c6536ae8002e2ec Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Tue, 27 Jan 2026 17:17:23 +0900 Subject: [PATCH 18/24] Add smart auto-detection for custom UI builds Server now automatically detects and serves custom UI from client/dist without requiring --static flag, simplifying development workflow. Falls back to HuggingFace default UI when custom build is unavailable. Adds comprehensive documentation including QUICKSTART.md and FRONTEND_DEVELOPMENT.md guides. --- FRONTEND_DEVELOPMENT.md | 284 ++++++++++++++++++++++++++++++++++++++++ QUICKSTART.md | 119 +++++++++++++++++ README.md | 44 ++++++- TROUBLESHOOTING.md | 97 ++++++++++++++ moshi/moshi/server.py | 43 ++++++ 5 files changed, 583 insertions(+), 4 deletions(-) create mode 100644 FRONTEND_DEVELOPMENT.md create mode 100644 QUICKSTART.md diff --git a/FRONTEND_DEVELOPMENT.md b/FRONTEND_DEVELOPMENT.md new file mode 100644 index 0000000..519fd5e --- /dev/null +++ b/FRONTEND_DEVELOPMENT.md @@ -0,0 +1,284 @@ +# Frontend Development Guide + +This guide explains how to develop and test custom UI changes for PersonaPlex. 
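Under the hood, the auto-detection covered below reduces to a cheap filesystem probe before any HuggingFace download. A minimal sketch of the same logic, mirroring `_is_valid_ui_build` and `_get_static_path` from `moshi/moshi/server.py` in this patch (the project-root fallback path here is simplified):

```python
from pathlib import Path
from typing import Optional

def find_custom_ui() -> Optional[str]:
    """Return a usable client/dist build, or None to fall back to the default UI."""
    candidates = [
        Path.cwd() / "client" / "dist",             # works for all install modes
        Path(__file__).parent / "client" / "dist",  # simplified stand-in for the project-root check
    ]
    for dist in candidates:
        index_html = dist / "index.html"
        # A build counts as valid when index.html exists and is non-empty
        if dist.is_dir() and index_html.exists() and index_html.stat().st_size > 0:
            return str(dist)
    return None

if __name__ == "__main__":
    print(find_custom_ui() or "no custom UI; would download dist.tgz from HuggingFace")
```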
+ +## Understanding Smart Auto-Detection + +**PersonaPlex now automatically detects and serves your custom UI!** You no longer need to use the `--static` flag for development. + +### How Auto-Detection Works + +When you start the server, it checks: +1. Does `client/dist` exist in the project directory? +2. **YES** → Automatically serves your custom UI +3. **NO** → Downloads and serves the default UI from HuggingFace + +### Starting the Server (Auto-Detection) + +```bash +# Just start the server normally - no flags needed! +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +**Log output with custom UI detected:** +``` +Found custom UI at /home/.../personaplex-blackwell/client/dist, using it instead of default +static_path = /home/.../personaplex-blackwell/client/dist +serving static content from /home/.../personaplex-blackwell/client/dist +``` + +**Log output without custom UI:** +``` +retrieving the static content +static_path = /home/.../.cache/huggingface/.../dist +serving static content from /home/.../.cache/huggingface/.../dist +``` + +### Manual Override (Optional) + +You can still manually specify the UI source if needed: + +```bash +# Force specific directory +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static /path/to/custom/dist + +# Disable static serving +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static none +``` + +## Frontend Development Workflow + +### Prerequisites + +1. Install Node.js and npm (if not already installed): + ```bash + # Check if already installed + node --version + npm --version + ``` + +2. Install frontend dependencies: + ```bash + cd client + npm install + ``` + +### Development Steps (Simplified!) + +#### 1. Make Your Changes +Edit files in the `client/src/` directory: +- `client/src/components/` - React components +- `client/src/styles/` - CSS and styling +- `client/src/App.tsx` - Main application component + +#### 2. Build the Frontend +```bash +cd client +npm run build +cd .. +``` + +This creates/updates the `client/dist` directory with your compiled code. + +#### 3. Start Server (Auto-Detection!) +```bash +# From project root - server auto-detects custom UI! +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +#### 4. Verify Custom UI is Loaded +Check the server logs for: +``` +Found custom UI at .../client/dist, using it instead of default +static_path = /home/.../personaplex-blackwell/client/dist +``` + +If you see `retrieving the static content`, the build might not exist. Go back to step 2. + +#### 5. Test Your Changes +1. Open the Web UI: https://localhost:8998 +2. Hard refresh (Ctrl+Shift+R or Cmd+Shift+R) to clear browser cache +3. Test your modifications + +#### 6. Iterate +Repeat steps 1-5 for each change: +```bash +# Make changes to client/src/... +cd client && npm run build && cd .. + +# Restart server (Ctrl+C to stop first) - auto-detects custom UI! +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +**That's it! No `--static` flag needed anymore.** + +## Troubleshooting + +### Changes Not Appearing + +**Problem:** You rebuilt the frontend but don't see changes in the browser. + +**Solutions:** +1. **Verify server is using custom UI:** + - Check logs for `static_path = client/dist` + - If not, restart with `--static client/dist` + +2. 
**Clear browser cache:** + - Hard refresh: Ctrl+Shift+R (Windows/Linux) or Cmd+Shift+R (Mac) + - Or open DevTools (F12) → Network tab → Check "Disable cache" + +3. **Verify build completed successfully:** + ```bash + cd client + npm run build + ls -la dist/ # Should show recent timestamps + ``` + +4. **Check for build errors:** + ```bash + cd client + npm run build 2>&1 | grep -i error + ``` + +### Server Won't Start with --static Flag + +**Problem:** Error when starting server with `--static client/dist` + +**Solutions:** +1. **Verify dist directory exists:** + ```bash + ls -la client/dist/ + ``` + If missing, build the frontend first: `cd client && npm run build` + +2. **Check path is correct:** + - Use relative path: `--static client/dist` + - From project root, not from client/ directory + +### Frontend Build Fails + +**Problem:** `npm run build` fails with errors + +**Solutions:** +1. **Check Node.js version:** + ```bash + node --version + # Should be 16.x or higher + ``` + +2. **Reinstall dependencies:** + ```bash + cd client + rm -rf node_modules package-lock.json + npm install + npm run build + ``` + +3. **Check for TypeScript errors:** + ```bash + cd client + npm run type-check + ``` + +## Development Tips + +### Shell Alias for Quick Development +Add to your `~/.bashrc` or `~/.zshrc`: + +```bash +# Quick start with custom UI +alias moshi-dev='conda activate personaplex && SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist' + +# Quick frontend rebuild +alias moshi-build='cd client && npm run build && cd ..' +``` + +Usage: +```bash +# Make changes to client/src/... +moshi-build # Rebuild frontend +moshi-dev # Start server with custom UI +``` + +### Watch Mode for Live Development + +For faster iteration, use the frontend in development mode: + +```bash +# Terminal 1: Start backend server (without static flag) +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + +# Terminal 2: Start frontend dev server with hot reload +cd client +npm run dev +``` + +Then access the UI at the Vite dev server URL (usually http://localhost:5173). + +**Note:** This requires configuring CORS in the backend. Check `client/vite.config.ts` for proxy settings. + +## Production Deployment + +When ready to deploy your custom UI: + +1. Build the production bundle: + ```bash + cd client + npm run build + ``` + +2. Test the production build: + ```bash + SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist + ``` + +3. Verify everything works correctly + +4. Commit your changes: + ```bash + git add client/src/ client/dist/ + git commit -m "Add custom UI feature: [description]" + ``` + +## File Structure + +``` +personaplex-blackwell/ +├── client/ # Frontend source code +│ ├── src/ # Source files (edit these) +│ │ ├── components/ # React components +│ │ ├── styles/ # CSS files +│ │ ├── App.tsx # Main app +│ │ └── main.tsx # Entry point +│ ├── dist/ # Built files (generated) +│ │ ├── index.html # HTML entry +│ │ ├── assets/ # JS/CSS bundles +│ │ └── ... 
+│ ├── package.json # Dependencies +│ ├── vite.config.ts # Build config +│ └── tsconfig.json # TypeScript config +└── moshi/ # Backend Python code +``` + +## Quick Reference + +| Task | Command | +|------|---------| +| Install dependencies | `cd client && npm install` | +| Build frontend | `cd client && npm run build` | +| Start with custom UI | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist` | +| Start with default UI | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR"` | +| Dev server (hot reload) | `cd client && npm run dev` | +| Type check | `cd client && npm run type-check` | +| Lint code | `cd client && npm run lint` | + +## Getting Help + +If you encounter issues not covered here: +1. Check [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common problems +2. Verify your Node.js and npm versions +3. Check the browser console (F12) for JavaScript errors +4. Review server logs for static file serving errors diff --git a/QUICKSTART.md b/QUICKSTART.md new file mode 100644 index 0000000..87604ab --- /dev/null +++ b/QUICKSTART.md @@ -0,0 +1,119 @@ +# PersonaPlex Quick Start Guide + +This guide provides the essential steps to get PersonaPlex running quickly. + +## Prerequisites + +1. Install [Opus audio codec](https://github.com/xiph/opus) development library: + ```bash + # Ubuntu/Debian + sudo apt install libopus-dev + + # macOS + brew install opus + ``` + +2. Accept the [PersonaPlex model license](https://huggingface.co/nvidia/personaplex-7b-v1) on Hugging Face + +## Installation + +### Step 1: Create Conda Environment + +```bash +# Create and activate conda environment +conda create -n personaplex python=3.10 -y +conda activate personaplex +``` + +### Step 2: Install Moshi Package + +**For most GPUs:** +```bash +cd moshi +pip install -e . +cd .. +``` + +**For Blackwell GPUs (RTX 50 series):** +```bash +# Install PyTorch with CUDA 13.0+ support FIRST (required for Blackwell) +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + +# Then install moshi +cd moshi +pip install -e . +cd .. +``` + +### Step 3: Set Up Hugging Face Token + +```bash +# Copy the template and add your token +cp .env.example .env +# Edit .env and replace 'your_token_here' with your actual Hugging Face token +``` + +## Running the Web UI + +**CRITICAL: Always activate the conda environment first!** + +```bash +# 1. Activate the environment +conda activate personaplex + +# 2. Launch the server (automatically detects custom UI if it exists) +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + +# 3. Access the Web UI at: https://localhost:8998 +``` + +### Smart Auto-Detection + +The server now **automatically detects and uses your custom UI** if you've built it! + +- If `client/dist` exists → Your custom UI is served automatically +- If `client/dist` doesn't exist → Default UI is downloaded from HuggingFace + +**Verify which UI loaded** by checking the server logs: +- Custom UI: `Found custom UI at .../client/dist, using it instead of default` +- Default UI: `retrieving the static content` (downloads from HuggingFace) + +### Building Custom UI (If Modified) + +Only needed if you've changed the frontend code: + +```bash +cd client +npm run build +cd .. 
+ +# Now start the server - it will auto-detect your custom build +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +## Quick Command Reference + +| Task | Command | +|------|---------| +| Activate environment | `conda activate personaplex` | +| Start Web UI | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR"` | +| Start with CPU offload | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --cpu-offload` | +| Start with local frontend | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist` | + +## Troubleshooting + +**Error: "No module named 'moshi'"** +- Solution: Activate the conda environment: `conda activate personaplex` + +**Error: "Access denied" when downloading model** +- Solution: Accept the model license and set up your HF token in `.env` + +For more issues, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md). + +## Next Steps + +- See [README.md](README.md) for detailed documentation +- Explore voice customization options +- Try different persona prompts +- Check out offline evaluation mode diff --git a/README.md b/README.md index f3b053a..2e6e36d 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ [![Demo](https://img.shields.io/badge/🎮-Demo-green)](https://research.nvidia.com/labs/adlr/personaplex/) [![Discord](https://img.shields.io/badge/Discord-Join-purple?logo=discord)](https://discord.gg/5jAXrrbwRb) +**🚀 New to PersonaPlex? See [QUICKSTART.md](QUICKSTART.md) for a fast setup guide!** + +**🎨 Developing custom UI? See [FRONTEND_DEVELOPMENT.md](FRONTEND_DEVELOPMENT.md) for frontend development workflow!** + PersonaPlex is a real-time, full-duplex speech-to-speech conversational model that enables persona control through text-based role prompts and audio-based voice conditioning. Trained on a combination of synthetic and real conversations, it produces natural, low-latency spoken interactions with a consistent persona. PersonaPlex is based on the [Moshi](https://arxiv.org/abs/2410.00037) architecture and weights.
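Before the Blackwell-specific install steps below, it can help to confirm that the running PyTorch build actually ships CUDA 13.0 kernels for your card. A quick sketch (the exact compute-capability value for RTX 50-series cards is an assumption; verify against NVIDIA's documentation):

```python
import torch

print("torch:", torch.__version__)          # expect a +cu130 suffix for the CUDA 13.0 wheel
print("CUDA runtime:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"compute capability: {major}.{minor}")
    # Assumption: RTX 50-series (Blackwell) reports 12.x; a wheel built without
    # matching kernels will import fine but fail once the model actually runs.
```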

@@ -48,13 +52,13 @@ cd .. **Note:** Use `pip install -e .` (editable mode) during development so code changes are immediately reflected without reinstalling. #### Option 2: For Blackwell GPUs (RTX 50 series) -Blackwell GPUs require PyTorch with CUDA 12.8. Install PyTorch first, then the moshi package: +Blackwell GPUs require PyTorch with CUDA 13.0+ support. Install PyTorch first, then the moshi package: ```bash # Create and activate conda environment conda create -n personaplex python=3.10 -y conda activate personaplex -# Install PyTorch with CUDA 13.0 FIRST +# Install PyTorch with CUDA 13.0+ support FIRST (required for Blackwell) pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 # Then install the moshi package (will use existing PyTorch) @@ -92,15 +96,47 @@ huggingface-cli login ### Launch Server -Launch server for live interaction (temporary SSL certs for https): +**IMPORTANT: First activate the conda environment:** +```bash +conda activate personaplex +``` + +#### Smart Auto-Detection (Recommended) + +The server **automatically detects and serves your custom UI** if `client/dist` exists: ```bash # The server automatically loads your HF_TOKEN from the .env file +# If client/dist exists, it will be used automatically! SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" ``` -**For Development:** If you've modified the frontend (client/ directory), use the `--static` flag to serve your local build: +**Auto-detection behavior:** +1. Checks if `client/dist` exists in your project +2. If yes → serves custom UI from `./client/dist` +3. If no → downloads and serves default UI from HuggingFace + +**How to verify which UI is loading:** +Check the server logs: +- **Custom UI (auto-detected)**: + ``` + Found custom UI at .../client/dist, using it instead of default + static_path = /home/.../personaplex-blackwell/client/dist + ``` +- **Default UI (no custom build)**: + ``` + retrieving the static content + static_path = /home/.../.cache/huggingface/.../dist + ``` + +#### Manual Override (Optional) + +You can still explicitly specify which UI to use with the `--static` flag: ```bash +# Force use of custom UI from specific directory SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist + +# Disable static serving entirely +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static none ``` **CPU Offload:** If your GPU has insufficient memory, use the `--cpu-offload` flag to offload model layers to CPU. This requires the `accelerate` package (`pip install accelerate`): diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index c89ccef..92a3fd2 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -1,5 +1,43 @@ # Troubleshooting Guide +## Common Issues + +### Module Not Found: 'moshi' + +**Symptom:** When running `python -m moshi.server`, you get: +``` +ModuleNotFoundError: No module named 'moshi' +``` + +**Root Cause:** The conda environment is not activated, or moshi is not installed in the active environment. + +**Solution:** + +1. Activate the conda environment: + ```bash + conda activate personaplex + ``` + +2. Verify the environment is active (you should see `(personaplex)` in your prompt): + ```bash + conda info --envs + # Should show * next to personaplex + ``` + +3. If moshi is not installed, install it: + ```bash + cd moshi + pip install -e . + cd .. + ``` + +4. 
Try running the server again: + ```bash + SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + ``` + +**Prevention:** Always activate the conda environment before running PersonaPlex commands. Add a reminder to your workflow or shell configuration. + ## Development Issues ### Code Changes Not Reflected When Running Server @@ -45,6 +83,65 @@ ## Server Issues +### Custom UI Not Loading (Server Uses Default UI) + +**Symptom:** You've modified the frontend (client/ directory), rebuilt it, but when you start the server, your changes don't appear. + +**Root Cause:** The `client/dist` directory doesn't exist or is empty. The server auto-detects custom UI by checking if `client/dist` exists. + +**Solution:** + +1. **Verify the build exists:** + ```bash + ls -la client/dist/ + ``` + If this directory doesn't exist or is empty, you need to build the frontend first. + +2. **Build the frontend:** + ```bash + cd client + npm install # If you haven't already + npm run build + cd .. + ``` + +3. **Restart the server** (it will now auto-detect the custom UI): + ```bash + conda activate personaplex + SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + ``` + +4. **Verify auto-detection worked** by checking the logs: + ``` + # SUCCESS - Custom UI detected: + Found custom UI at .../client/dist, using it instead of default + static_path = /home/.../personaplex-blackwell/client/dist + + # FAIL - No custom UI found: + retrieving the static content + static_path = /home/.../.cache/huggingface/.../dist + ``` + +5. Hard refresh your browser (Ctrl+Shift+R or Cmd+Shift+R) to clear cached assets + +**When auto-detection won't work:** +- `client/dist` directory doesn't exist +- `client/dist` exists but is empty +- Permissions prevent reading the directory + +**Manual override (if needed):** +If auto-detection fails but you know the build exists, use the `--static` flag: +```bash +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist +``` + +**Development tip:** +After making frontend changes, rebuild and the server will auto-detect: +```bash +cd client && npm run build && cd .. +# Restart server - custom UI detected automatically! +``` + ### Server Returns 404 for API Endpoints If specific API endpoints return 404: diff --git a/moshi/moshi/server.py b/moshi/moshi/server.py index a157f83..669a2ec 100644 --- a/moshi/moshi/server.py +++ b/moshi/moshi/server.py @@ -357,8 +357,51 @@ def _get_voice_prompt_dir(voice_prompt_dir: Optional[str], hf_repo: str) -> Opti return str(voices_dir) +def _is_valid_ui_build(dist_path: Path) -> bool: + """ + Validate that a directory contains a valid UI build. 
+ + Args: + dist_path: Path to the dist directory + + Returns: + True if the directory contains a valid build (has index.html), False otherwise + """ + if not dist_path.is_dir(): + return False + + # Check for essential file - index.html must exist and be non-empty + index_html = dist_path / "index.html" + try: + return index_html.exists() and index_html.stat().st_size > 0 + except (OSError, PermissionError): + return False + + def _get_static_path(static: Optional[str]) -> Optional[str]: if static is None: + # Auto-detect: prefer local custom UI (client/dist) if it exists + try: + # Priority 1: Check current working directory (works for all install modes) + cwd_dist = Path.cwd() / "client" / "dist" + if _is_valid_ui_build(cwd_dist): + logger.info(f"Found custom UI at {cwd_dist}, using it instead of default") + return str(cwd_dist) + + # Priority 2: Check project root relative to __file__ (works for editable installs) + # server.py is in moshi/moshi/, so project root is 2 levels up + project_root = Path(__file__).parent.parent.parent + local_dist = project_root / "client" / "dist" + + if _is_valid_ui_build(local_dist): + logger.info(f"Found custom UI at {local_dist}, using it instead of default") + return str(local_dist) + + except (OSError, PermissionError) as e: + logger.warning(f"Could not check for custom UI: {e}. Falling back to default.") + # Fall through to HuggingFace download + + # Fall back to HuggingFace default UI logger.info("retrieving the static content") dist_tgz = hf_hub_download("nvidia/personaplex-7b-v1", "dist.tgz") dist_tgz = Path(dist_tgz) From ca0496538e23e563b5ffb73f8d62cad83d821441 Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Mon, 26 Jan 2026 18:07:44 +0900 Subject: [PATCH 19/24] Add dynamic custom voice system with automatic Web UI discovery Implement dynamic voice discovery system that allows users to add custom voices without modifying code. Voices automatically appear in the Web UI dropdown after generating embeddings and restarting the server. Backend changes: - Add VoiceDiscovery service to scan configured voice directories - Add /api/voices REST endpoint returning voice list with metadata - Support custom voices directory (configurable via CUSTOM_VOICE_DIR) - Only list .pt embedding files (not .wav source audio) Frontend changes: - Add useVoices React hook for dynamic voice fetching - Update Queue and ModelParams components to use dynamic voice loading - Add loading and error states for better UX - Custom voices appear first in dropdown Infrastructure: - Add custom_voices/ directory with comprehensive README - Update .gitignore to exclude voice files but keep directory structure - Add TROUBLESHOOTING.md documenting common issues - Update README.md with installation, server, and custom voice docs Key fixes applied during implementation: - Package must be installed in editable mode (pip install -e .) 
for dev - Server needs --static client/dist flag to serve local frontend builds - API routes must be registered before static routes in aiohttp - Critical: Only .pt files are selectable voices (not .wav source files) --- .gitignore | 6 + README.md | 149 +++++++++++++++++- TROUBLESHOOTING.md | 145 +++++++++++++++++ client/package-lock.json | 14 ++ client/src/hooks/useVoices.ts | 52 ++++++ .../components/ModelParams/ModelParams.tsx | 40 ++--- client/src/pages/Queue/Queue.tsx | 44 ++++-- custom_voices/README.md | 92 +++++++++++ moshi/moshi/server.py | 30 ++++ moshi/moshi/voice_discovery.py | 105 ++++++++++++ 10 files changed, 635 insertions(+), 42 deletions(-) create mode 100644 TROUBLESHOOTING.md create mode 100644 client/src/hooks/useVoices.ts create mode 100644 custom_voices/README.md create mode 100644 moshi/moshi/voice_discovery.py diff --git a/.gitignore b/.gitignore index 3278df4..7d1f30d 100644 --- a/.gitignore +++ b/.gitignore @@ -183,3 +183,9 @@ mlx-trace.json # Include everything in assets !assets/ !assets/** + +# Custom voice files (keep directory structure in git, ignore voice files) +custom_voices/*.pt +custom_voices/*.wav +# But keep the README +!custom_voices/README.md diff --git a/README.md b/README.md index 5509a74..f3b053a 100644 --- a/README.md +++ b/README.md @@ -31,31 +31,78 @@ brew install opus ### Installation -Download this repository and install with: +Download this repository and set up the environment: + +#### Option 1: Using Conda (Recommended) ```bash -pip install moshi/. +# Create and activate conda environment +conda create -n personaplex python=3.10 -y +conda activate personaplex + +# Install the moshi package in editable mode (for development) +cd moshi +pip install -e . +cd .. ``` -Extra step for Blackwell based GPUs as suggested in (See https://github.com/NVIDIA/personaplex/issues/2): +**Note:** Use `pip install -e .` (editable mode) during development so code changes are immediately reflected without reinstalling. + +#### Option 2: For Blackwell GPUs (RTX 50 series) +Blackwell GPUs require PyTorch with CUDA 12.8. Install PyTorch first, then the moshi package: ```bash +# Create and activate conda environment +conda create -n personaplex python=3.10 -y +conda activate personaplex + +# Install PyTorch with CUDA 13.0 FIRST pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + +# Then install the moshi package (will use existing PyTorch) +pip install moshi/. ``` +See https://github.com/NVIDIA/personaplex/issues/2 for more details on Blackwell GPU support. + ### Accept Model License -Log in to your Huggingface account and accept the PersonaPlex model license [here](https://huggingface.co/nvidia/personaplex-7b-v1).
-Then set up your Huggingface authentication: + +Log in to your Huggingface account and accept the PersonaPlex model license [here](https://huggingface.co/nvidia/personaplex-7b-v1). + +Then set up your Huggingface authentication using one of these methods: + +**Option 1: Using .env file (Recommended)** +```bash +# Copy the template and add your token +cp .env.example .env +# Edit .env and replace 'your_token_here' with your actual token +``` + +**Option 2: Environment variable** ```bash export HF_TOKEN= ``` +**Option 3: Hugging Face CLI** +```bash +pip install huggingface_hub +huggingface-cli login +``` + +**Note:** The .env file is optional. All existing workflows continue to work. + ### Launch Server Launch server for live interaction (temporary SSL certs for https): ```bash +# The server automatically loads your HF_TOKEN from the .env file SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" ``` +**For Development:** If you've modified the frontend (client/ directory), use the `--static` flag to serve your local build: +```bash +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist +``` + **CPU Offload:** If your GPU has insufficient memory, use the `--cpu-offload` flag to offload model layers to CPU. This requires the `accelerate` package (`pip install accelerate`): ```bash SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --cpu-offload @@ -74,7 +121,6 @@ Add `--cpu-offload` to any command below if your GPU has insufficient memory (re **Assistant example:** ```bash -HF_TOKEN= \ python -m moshi.offline \ --voice-prompt "NATF2.pt" \ --input-wav "assets/test/input_assistant.wav" \ @@ -85,7 +131,6 @@ python -m moshi.offline \ **Service example:** ```bash -HF_TOKEN= \ python -m moshi.offline \ --voice-prompt "NATM1.pt" \ --text-prompt "$(cat assets/test/prompt_service.txt)" \ @@ -105,6 +150,88 @@ Variety(female): VARF0, VARF1, VARF2, VARF3, VARF4 Variety(male): VARM0, VARM1, VARM2, VARM3, VARM4 ``` +### Custom Voices + +PersonaPlex supports **dynamic custom voice loading** - add new voices and they automatically appear in the Web UI without code changes! + +#### Quick Start + +**Step 1: Prepare your audio file** + +Record a ~10 second WAV file of clear speech. Convert it to mono 24kHz format: +```bash +ffmpeg -i your_recording.wav -ac 1 -ar 24000 my_voice.wav +``` + +**Step 2: Copy to voices directory** + +Copy the converted audio to the voices directory: +```bash +cp my_voice.wav ~/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/*/voices/ +``` + +**Step 3: Generate voice embeddings** + +Run the offline script with `--save-voice-embeddings` to generate the `.pt` file: +```bash +python -m moshi.offline \ + --voice-prompt "my_voice.wav" \ + --save-voice-embeddings \ + --input-wav "assets/test/input_assistant.wav" \ + --output-wav "/tmp/test_output.wav" \ + --output-text "/tmp/test_output.json" +``` + +This creates `my_voice.pt` in the voices directory. + +**Step 4: Use your custom voice** + +**With the Web UI:** Restart the server and your custom voice automatically appears in the voice dropdown! Custom voices appear first in the list. +```bash +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +**With offline mode:** Use the `.pt` file directly: +```bash +python -m moshi.offline --voice-prompt "my_voice.pt" ... 
+``` + +#### Custom Voices Directory + +For easier management, you can place custom voices in the `custom_voices/` directory: + +```bash +# Create your custom voices directory (if it doesn't exist) +mkdir -p custom_voices + +# Place voice files here +cp my_voice.wav custom_voices/ +# Generate embeddings... +# The generated my_voice.pt will appear in the Web UI! +``` + +**Configure custom location (optional):** +```bash +# In .env file +CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +#### Voice File Formats + +- **`.pt` files**: Voice embeddings - these are the actual selectable voices in the Web UI +- **`.wav` files**: Source audio (24kHz mono) - used only to GENERATE the `.pt` embeddings + +**Important:** Only `.pt` files appear in the voice selector dropdown. The `.wav` files are intermediate source files used during voice generation. + +#### API Access + +List all available voices programmatically: +```bash +curl http://localhost:8998/api/voices +``` + +Returns JSON with all voices, their types, and categories. + ## Prompting Guide The model is trained on synthetic conversations for a fixed assistant role and varying customer service roles. @@ -159,6 +286,14 @@ Personaplex finetunes Moshi and benefits from the generalization capabilities of You enjoy having a good conversation. Have a technical discussion about fixing a reactor core on a spaceship to Mars. You are an astronaut on a Mars mission. Your name is Alex. You are already dealing with a reactor core meltdown on a Mars mission. Several ship systems are failing, and continued instability will lead to catastrophic failure. You explain what is happening and you urgently ask for help thinking through how to stabilize the reactor. ``` +## Troubleshooting + +For common issues and solutions, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md), including: +- Code changes not reflected when running server (editable install issue) +- Custom voices not appearing in Web UI +- Frontend build and development issues +- Environment and dependency problems + ## License The present code is provided under the MIT license. The weights for the models are released under the NVIDIA Open Model license. diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 0000000..c89ccef --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,145 @@ +# Troubleshooting Guide + +## Development Issues + +### Code Changes Not Reflected When Running Server + +**Symptom:** You made changes to backend code (e.g., added new endpoints, modified routes), but when you restart the server with `python -m moshi.server`, the changes don't appear. New API endpoints return 404, and debug logging doesn't show up. + +**Root Cause:** The moshi-personaplex package was installed in regular mode (`pip install .`) instead of editable mode (`pip install -e .`). When installed normally, pip copies the code to site-packages (e.g., `/path/to/envs/personaplex/lib/python3.10/site-packages/moshi`), and Python loads from there instead of your source directory. + +**Solution:** + +1. Check if the package is installed: + ```bash + pip list | grep moshi + ``` + +2. Check installation location: + ```bash + pip show moshi-personaplex + ``` + + If `Location` shows `site-packages`, it's not in editable mode. + +3. Uninstall and reinstall in editable mode: + ```bash + pip uninstall -y moshi-personaplex + cd moshi + pip install -e . + ``` + +4. 
Verify editable install: + ```bash + pip show moshi-personaplex + ``` + + The `Location` should show something like `/path/to/repo/moshi` instead of `site-packages`. + +5. Restart the server: + ```bash + python -m moshi.server + ``` + +**Prevention:** Always use `pip install -e .` (with the `-e` flag) when installing packages for development. + +## Server Issues + +### Server Returns 404 for API Endpoints + +If specific API endpoints return 404: + +1. Check if routes are registered correctly by looking for debug logging at server startup +2. Verify the package is installed in editable mode (see above) +3. Check that static routes are registered AFTER API routes in server.py +4. Clear Python cache and restart: + ```bash + find moshi -type f -name '*.pyc' -delete + find moshi -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null + python -m moshi.server + ``` + +### Build Directory Conflicts + +If you suspect the `moshi/build/` directory contains old code: + +1. Move it out of the way: + ```bash + mv moshi/build moshi/build.bak + ``` + +2. Clear Python cache: + ```bash + find moshi -type f -name '*.pyc' -delete + find moshi -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null + ``` + +3. Restart the server + +## Frontend Issues + +### Frontend Not Showing New Features + +If you modified React components but don't see changes: + +1. Rebuild the frontend: + ```bash + cd client + npm run build + ``` + +2. Restart the server (it serves the static files from client/dist) + +3. Hard refresh your browser (Ctrl+Shift+R or Cmd+Shift+R) + +### Voice Dropdown Shows "Error loading voices" + +1. Check if the server is running: + ```bash + ps aux | grep moshi.server + ``` + +2. Test the API endpoint directly: + ```bash + curl http://localhost:8998/api/voices + ``` + +3. Check server logs for errors + +4. Verify VoiceDiscovery can find voice files: + ```bash + python -c "from moshi.voice_discovery import VoiceDiscovery; print(len(VoiceDiscovery.list_voices()))" + ``` + +## Environment Issues + +### Missing HuggingFace Token + +If models fail to download: + +1. Create a `.env` file in the repository root +2. Add your HuggingFace token: + ``` + HF_TOKEN=your_token_here + ``` +3. See `.env.example` for more details + +### ffmpeg Not Found + +If voice generation fails with "Command 'ffmpeg' not found": + +```bash +sudo apt install ffmpeg # Ubuntu/Debian +brew install ffmpeg # macOS +``` + +## Getting Help + +If you encounter issues not covered here: + +1. Check the README.md for setup instructions +2. Review recent commits for breaking changes +3. 
Open an issue at https://github.com/nvidia/personaplex-7b-v1/issues with: + - Your environment (OS, Python version, conda/venv) + - Steps to reproduce the issue + - Complete error messages and logs diff --git a/client/package-lock.json b/client/package-lock.json index 3997642..b1b8f01 100644 --- a/client/package-lock.json +++ b/client/package-lock.json @@ -1315,6 +1315,7 @@ "integrity": "sha512-cisd7gxkzjBKU2GgdYrTdtQx1SORymWyaAFhaxQPK9bYO9ot3Y5OikQRvY0VYQtvwjeQnizCINJAenh/V7MK2w==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@types/prop-types": "*", "csstype": "^3.2.2" @@ -1769,6 +1770,7 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", + "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -2169,6 +2171,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "baseline-browser-mapping": "^2.9.0", "caniuse-lite": "^1.0.30001759", @@ -2911,6 +2914,7 @@ "deprecated": "This version is no longer supported. Please see https://eslint.org/version-support for other options.", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.6.1", @@ -2967,6 +2971,7 @@ "integrity": "sha512-iI1f+D2ViGn+uvv5HuHVUamg8ll4tN+JRHGc6IJi4TP9Kl976C57fzPXgseXNs8v0iA8aSJpHsTWjDb9QJamGQ==", "dev": true, "license": "MIT", + "peer": true, "bin": { "eslint-config-prettier": "bin/cli.js" }, @@ -4270,6 +4275,7 @@ "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==", "dev": true, "license": "MIT", + "peer": true, "bin": { "jiti": "bin/jiti.js" } @@ -5017,6 +5023,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -5197,6 +5204,7 @@ "integrity": "sha512-v6UNi1+3hSlVvv8fSaoUbggEM5VErKmmpGA7Pl3HF8V6uKY7rvClBOJlH6yNwQtfTueNkGVpOv/mtWL9L4bgRA==", "dev": true, "license": "MIT", + "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -5424,6 +5432,7 @@ "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -5436,6 +5445,7 @@ "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0", "scheduler": "^0.23.2" @@ -6283,6 +6293,7 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, @@ -6450,6 +6461,7 @@ "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "dev": true, "license": "Apache-2.0", + "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -6525,6 +6537,7 @@ "integrity": "sha512-4Z+L8I2OqhZV8qA132M4wNL30ypZGYOQVBfMgxDH/K5UX0PNqTu1c6za9ST5r9+tavvHiTWmBnKzpCJ/GlVFtg==", "dev": true, "license": "BSD-2-Clause", + "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "7.18.0", "@typescript-eslint/types": "7.18.0", @@ -6767,6 +6780,7 @@ "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", "dev": true, "license": "MIT", + 
"peer": true, "dependencies": { "esbuild": "^0.21.3", "postcss": "^8.4.43", diff --git a/client/src/hooks/useVoices.ts b/client/src/hooks/useVoices.ts new file mode 100644 index 0000000..2f269c5 --- /dev/null +++ b/client/src/hooks/useVoices.ts @@ -0,0 +1,52 @@ +import { useState, useEffect } from 'react'; + +export interface Voice { + name: string; + type: 'embeddings' | 'audio'; + category: 'custom' | 'natural-female' | 'natural-male' | 'variety-female' | 'variety-male' | 'other'; + path: string; +} + +export interface UseVoicesReturn { + voices: Voice[]; + loading: boolean; + error: string | null; + refresh: () => void; +} + +export function useVoices(): UseVoicesReturn { + const [voices, setVoices] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + + const fetchVoices = async () => { + setLoading(true); + setError(null); + + try { + const response = await fetch('/api/voices'); + if (!response.ok) { + throw new Error(`Failed to fetch voices: ${response.statusText}`); + } + + const data = await response.json(); + setVoices(data.voices || []); + } catch (err) { + setError(err instanceof Error ? err.message : 'Unknown error'); + console.error('Error fetching voices:', err); + } finally { + setLoading(false); + } + }; + + useEffect(() => { + fetchVoices(); + }, []); + + return { + voices, + loading, + error, + refresh: fetchVoices, + }; +} diff --git a/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx b/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx index 07afedb..66f6396 100644 --- a/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx +++ b/client/src/pages/Conversation/components/ModelParams/ModelParams.tsx @@ -1,6 +1,7 @@ import { FC, RefObject, useState } from "react"; import { useModelParams } from "../../hooks/useModelParams"; import { Button } from "../../../../components/Button/Button"; +import { useVoices } from "../../../../hooks/useVoices"; type ModelParamsProps = { isConnected: boolean; @@ -24,6 +25,7 @@ export const ModelParams:FC = ({ }) => { const [modalVoicePrompt, setModalVoicePrompt] = useState(voicePrompt); const [modalTextPrompt, setModalTextPrompt] = useState(textPrompt); + const { voices, loading: voicesLoading, error: voicesError } = useVoices(); return (

               Voice Prompt: {modalVoicePrompt}
-              <select value={modalVoicePrompt} onChange={(e) => setModalVoicePrompt(e.target.value)}>
+              <select value={modalVoicePrompt} onChange={(e) => setModalVoicePrompt(e.target.value)}
+              >
+                {voicesLoading ? (
+                  <option>Loading voices...</option>
+                ) : voicesError ? (
+                  <option>Error loading voices</option>
+                ) : (
+                  voices.map((voice) => (
+                    <option key={voice.name} value={voice.name}>{voice.name}</option>
+                  ))
+                )}
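The order voices appear in this modal selector (and in the Homepage dropdown) is decided server-side. For reference, the same sort key `VoiceDiscovery.list_voices()` uses, runnable in isolation:

```python
# Sort order used by VoiceDiscovery.list_voices(): custom voices first,
# then the pre-packaged categories, then alphabetically within a category.
CAT_ORDER = {
    'custom': 0,
    'natural-female': 1,
    'natural-male': 2,
    'variety-female': 3,
    'variety-male': 4,
    'other': 5,
}

voices = [
    {'name': 'NATM1.pt', 'category': 'natural-male'},
    {'name': 'my_voice.pt', 'category': 'custom'},
    {'name': 'NATF0.pt', 'category': 'natural-female'},
]
voices.sort(key=lambda v: (CAT_ORDER.get(v['category'], 99), v['name']))
print([v['name'] for v in voices])  # ['my_voice.pt', 'NATF0.pt', 'NATM1.pt']
```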
@@ -37,25 +39,25 @@ export const ModelParams:FC = ({ diff --git a/client/src/pages/Queue/Queue.tsx b/client/src/pages/Queue/Queue.tsx index 7d0d44b..5acef1d 100644 --- a/client/src/pages/Queue/Queue.tsx +++ b/client/src/pages/Queue/Queue.tsx @@ -7,13 +7,7 @@ import { Button } from "../../components/Button/Button"; import { useModelParams } from "../Conversation/hooks/useModelParams"; import { env } from "../../env"; import { prewarmDecoderWorker } from "../../decoder/decoderWorker"; - -const VOICE_OPTIONS = [ - "NATF0.pt", "NATF1.pt", "NATF2.pt", "NATF3.pt", - "NATM0.pt", "NATM1.pt", "NATM2.pt", "NATM3.pt", - "VARF0.pt", "VARF1.pt", "VARF2.pt", "VARF3.pt", "VARF4.pt", - "VARM0.pt", "VARM1.pt", "VARM2.pt", "VARM3.pt", "VARM4.pt", -]; +import { useVoices } from "../../hooks/useVoices"; const TEXT_PROMPT_PRESETS = [ { @@ -41,6 +35,9 @@ interface HomepageProps { setTextPrompt: (value: string) => void; voicePrompt: string; setVoicePrompt: (value: string) => void; + voicesLoading: boolean; + voicesError: string | null; + voices: Array<{ name: string; type: string; category: string; path: string }>; } const Homepage = ({ @@ -50,6 +47,9 @@ const Homepage = ({ setTextPrompt, voicePrompt, setVoicePrompt, + voicesLoading, + voicesError, + voices, }: HomepageProps) => { return (
@@ -102,16 +102,24 @@ const Homepage = ({
             name="voice-prompt"
             value={voicePrompt}
             onChange={(e) => setVoicePrompt(e.target.value)}
-            className="w-full p-3 bg-white text-black border border-gray-300 rounded focus:outline-none focus:ring-2 focus:ring-[#76b900] focus:border-transparent"
+            disabled={voicesLoading}
+            className="w-full p-3 bg-white text-black border border-gray-300 rounded focus:outline-none focus:ring-2 focus:ring-[#76b900] focus:border-transparent disabled:bg-gray-100 disabled:cursor-not-allowed"
           >
-            {VOICE_OPTIONS.map((voice) => (
-              <option key={voice} value={voice}>{voice}</option>
-            ))}
+            {voicesLoading ? (
+              <option>Loading voices...</option>
+            ) : voicesError ? (
+              <option>Error loading voices</option>
+            ) : (
+              voices.map((voice) => (
+                <option key={voice.name} value={voice.name}>{voice.name}</option>
+              ))
+            )}
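The `category` field behind this dropdown is derived purely from the filename, so the mapping can be checked without a GPU or a model download. A small sketch equivalent to `_categorize_voice` in `voice_discovery.py`, shown in full later in this patch:

```python
PREFIXES = {
    'NATF': 'natural-female',
    'NATM': 'natural-male',
    'VARF': 'variety-female',
    'VARM': 'variety-male',
}

def categorize(filename: str) -> str:
    """Mirror of VoiceDiscovery._categorize_voice: unknown prefixes count as custom."""
    name = filename.removesuffix('.pt')
    for prefix, category in PREFIXES.items():
        if name.startswith(prefix):
            return category
    return 'custom'

assert categorize('NATF2.pt') == 'natural-female'
assert categorize('my_voice.pt') == 'custom'
```

Note that any unrecognized prefix lands in `custom`, so the `other` bucket in the sort table is effectively unreachable with this mapping.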
@@ -132,6 +140,7 @@ export const Queue:FC = () => { const [hasMicrophoneAccess, setHasMicrophoneAccess] = useState(false); const [showMicrophoneAccessMessage, setShowMicrophoneAccessMessage] = useState(false); const modelParams = useModelParams(); + const { voices, loading: voicesLoading, error: voicesError } = useVoices(); const audioContext = useRef(null); const worklet = useRef(null); @@ -209,6 +218,9 @@ export const Queue:FC = () => { setTextPrompt={modelParams.setTextPrompt} voicePrompt={modelParams.voicePrompt} setVoicePrompt={modelParams.setVoicePrompt} + voicesLoading={voicesLoading} + voicesError={voicesError} + voices={voices} /> )} diff --git a/custom_voices/README.md b/custom_voices/README.md new file mode 100644 index 0000000..189f22f --- /dev/null +++ b/custom_voices/README.md @@ -0,0 +1,92 @@ +# Custom Voices Directory + +This directory is for storing your custom voice files. Any voice files (.pt or .wav) placed here will automatically appear in the PersonaPlex web interface voice selector. + +## Quick Start + +1. **Prepare your audio file** (10+ seconds of clear speech): + ```bash + ffmpeg -i your_recording.mp3 -ac 1 -ar 24000 your_voice.wav + ``` + +2. **Generate voice embeddings**: + ```bash + # Copy the WAV file to the voices directory + VOICES_DIR=$(python -c "from pathlib import Path; from huggingface_hub import snapshot_download; print(Path(snapshot_download(repo_id='nvidia/personaplex-7b-v1', allow_patterns=['voices/*'])) / 'voices')") + cp your_voice.wav "$VOICES_DIR/" + + # Generate embeddings + python -m moshi.offline \ + --voice-prompt "your_voice.wav" \ + --save-voice-embeddings \ + --input-wav "assets/test/input_assistant.wav" \ + --output-wav "/tmp/test_output.wav" \ + --output-text "/tmp/test_output.json" + ``` + +3. **Restart the server** and your voice will appear in the dropdown! + +## File Formats + +- **`.pt` files**: Voice embeddings - these are the actual selectable voices that appear in the UI dropdown +- **`.wav` files**: Source audio recordings (24kHz mono) - used only to GENERATE the .pt embeddings, not selectable as voices + +**Important**: Only `.pt` files appear in the voice selector dropdown. `.wav` files are intermediate source files used during voice generation. + +## Configuration + +By default, PersonaPlex looks for voices in: +1. HuggingFace cache: `~/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/*/voices/` +2. Custom directory: `./custom_voices/` (this directory) + +To use a different custom voices directory, set the `CUSTOM_VOICE_DIR` environment variable in `.env`: +``` +CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +## Voice Naming Convention + +Pre-packaged voices follow this naming: +- `NATF*` = Natural Female +- `NATM*` = Natural Male +- `VARF*` = Variety Female +- `VARM*` = Variety Male + +Custom voices (any other name) will appear first in the dropdown, followed by the categorized pre-packaged voices. + +## API Access + +You can list all available voices programmatically: +```bash +curl http://localhost:8998/api/voices +``` + +Returns: +```json +{ + "voices": [ + {"name": "your_voice.pt", "type": "embeddings", "category": "custom", "path": "..."}, + {"name": "NATF0.pt", "type": "embeddings", "category": "natural-female", "path": "..."}, + ... 
+ ], + "count": 20 +} +``` + +## Tips + +- Use high-quality audio recordings (clear speech, minimal background noise) +- 10-30 seconds of audio is usually sufficient +- The voice will reflect the speaking style and characteristics of the input audio +- Experiment with different recordings to find the best voice for your use case + +## Troubleshooting + +If your custom voice doesn't appear: +1. Verify the file is in the correct directory (`ls custom_voices/`) +2. Check the file extension is `.pt` or `.wav` +3. Restart the PersonaPlex server +4. Test the API endpoint: `curl http://localhost:8998/api/voices` +5. Check server logs for errors + +For more help, see `TROUBLESHOOTING.md` in the repository root. diff --git a/moshi/moshi/server.py b/moshi/moshi/server.py index 771f491..e878295 100644 --- a/moshi/moshi/server.py +++ b/moshi/moshi/server.py @@ -49,6 +49,7 @@ from .models import loaders, MimiModel, LMModel, LMGen from .utils.connection import create_ssl_context, get_lan_ip from .utils.logging import setup_logger, ColorizedLog +from .voice_discovery import VoiceDiscovery logger = setup_logger(__name__) @@ -308,6 +309,20 @@ async def is_alive(): clog.log("info", "done with connection") return ws + async def handle_list_voices(self, request): + """List all available voices from configured directories.""" + try: + voices = VoiceDiscovery.list_voices() + return web.json_response({ + 'voices': voices, + 'count': len(voices) + }) + except Exception as e: + logger.error(f"Error listing voices: {e}") + return web.json_response({ + 'error': str(e) + }, status=500) + def _get_voice_prompt_dir(voice_prompt_dir: Optional[str], hf_repo: str) -> Optional[str]: """ @@ -457,7 +472,19 @@ def main(): logger.info("warming up the model") state.warmup() app = web.Application() + + # Register API routes FIRST before static catch-all + async def test_endpoint(request): + return web.json_response({"status": "ok", "test": True}) + + app.router.add_get("/api/test", test_endpoint) app.router.add_get("/api/chat", state.handle_chat) + app.router.add_get("/api/voices", state.handle_list_voices) + + # Debug: log registered routes + logger.info(f"Registered routes so far: {[r.resource.canonical for r in app.router.routes()]}") + + # Register static routes AFTER API routes if static_path is not None: async def handle_root(_): return web.FileResponse(os.path.join(static_path, "index.html")) @@ -467,6 +494,9 @@ async def handle_root(_): app.router.add_static( "/", path=static_path, follow_symlinks=True, name="static" ) + + # Debug: log all routes after registration + logger.info(f"All registered routes: {[(r.method, r.resource.canonical) for r in app.router.routes()]}") protocol = "http" ssl_context = None if args.ssl is not None: diff --git a/moshi/moshi/voice_discovery.py b/moshi/moshi/voice_discovery.py new file mode 100644 index 0000000..40cb406 --- /dev/null +++ b/moshi/moshi/voice_discovery.py @@ -0,0 +1,105 @@ +# Copyright (c) Kyutai, all rights reserved. +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Voice discovery service for listing available voices.""" +from pathlib import Path +from typing import List, Dict +import os + + +class VoiceDiscovery: + """Discovers and lists available voice files.""" + + @staticmethod + def get_voice_directories() -> List[Path]: + """Get all directories where voices can be stored. + + Returns: + List of Path objects for directories containing voice files + """ + dirs = [] + + # 1. 
HuggingFace cache voices directory + hf_cache = os.environ.get('HF_HOME', str(Path.home() / '.cache/huggingface')) + hf_voices = Path(hf_cache) / 'hub' + + # Find personaplex model snapshot + for model_dir in hf_voices.glob('models--nvidia--personaplex-7b-v1/snapshots/*'): + voices_dir = model_dir / 'voices' + if voices_dir.exists(): + dirs.append(voices_dir) + + # 2. Custom voices directory (from .env or default) + custom_dir = os.environ.get('CUSTOM_VOICE_DIR', './custom_voices') + custom_path = Path(custom_dir) + if custom_path.exists(): + dirs.append(custom_path) + + return dirs + + @staticmethod + def list_voices() -> List[Dict[str, str]]: + """List all available voices. + + Only returns .pt embedding files, not .wav source audio files. + .wav files are used to generate embeddings and should not be listed as voices. + + Returns: + List of voice info dicts with keys: name, type, category, path + Sorted with custom voices first, then by category, then alphabetically + """ + voices = [] + seen_names = set() + + for voice_dir in VoiceDiscovery.get_voice_directories(): + # Find .pt files (voice embeddings only) + for pt_file in voice_dir.glob('*.pt'): + name = pt_file.name + if name not in seen_names: + category = VoiceDiscovery._categorize_voice(name) + voices.append({ + 'name': name, + 'type': 'embeddings', + 'category': category, + 'path': str(pt_file) + }) + seen_names.add(name) + + # Sort: custom first, then by category, then by name + def sort_key(v): + cat_order = { + 'custom': 0, + 'natural-female': 1, + 'natural-male': 2, + 'variety-female': 3, + 'variety-male': 4, + 'other': 5 + } + return (cat_order.get(v['category'], 99), v['name']) + + return sorted(voices, key=sort_key) + + @staticmethod + def _categorize_voice(filename: str) -> str: + """Categorize voice by filename pattern. + + Args: + filename: Voice filename (.pt extension) + + Returns: + Category string: custom, natural-female, natural-male, + variety-female, variety-male, or other + """ + name = filename.replace('.pt', '') + + if name.startswith('NATF'): + return 'natural-female' + elif name.startswith('NATM'): + return 'natural-male' + elif name.startswith('VARF'): + return 'variety-female' + elif name.startswith('VARM'): + return 'variety-male' + else: + return 'custom' From b728bfc648cd3e39e608fd549d154a5bcd6a4172 Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Tue, 27 Jan 2026 17:17:23 +0900 Subject: [PATCH 20/24] Add smart auto-detection for custom UI builds Server now automatically detects and serves custom UI from client/dist without requiring --static flag, simplifying development workflow. Falls back to HuggingFace default UI when custom build is unavailable. Adds comprehensive documentation including QUICKSTART.md and FRONTEND_DEVELOPMENT.md guides. --- FRONTEND_DEVELOPMENT.md | 284 ++++++++++++++++++++++++++++++++++++++++ QUICKSTART.md | 119 +++++++++++++++++ README.md | 44 ++++++- TROUBLESHOOTING.md | 97 ++++++++++++++ moshi/moshi/server.py | 43 ++++++ 5 files changed, 583 insertions(+), 4 deletions(-) create mode 100644 FRONTEND_DEVELOPMENT.md create mode 100644 QUICKSTART.md diff --git a/FRONTEND_DEVELOPMENT.md b/FRONTEND_DEVELOPMENT.md new file mode 100644 index 0000000..519fd5e --- /dev/null +++ b/FRONTEND_DEVELOPMENT.md @@ -0,0 +1,284 @@ +# Frontend Development Guide + +This guide explains how to develop and test custom UI changes for PersonaPlex. 
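One server-side detail worth keeping in mind while iterating on a custom build: aiohttp matches routes in registration order, which is why this series registers the `/api/*` handlers before the static catch-all. A toy sketch of the pattern (placeholder handler and paths, not the real ones):

```python
from aiohttp import web

async def api_voices(request: web.Request) -> web.Response:
    # Placeholder payload; the real handler delegates to VoiceDiscovery
    return web.json_response({"voices": [], "count": 0})

app = web.Application()
# API routes first: a static resource mounted at "/" would otherwise
# shadow /api/* and return 404s for the JSON endpoints.
app.router.add_get("/api/voices", api_voices)
app.router.add_static("/", path="client/dist", name="static")

if __name__ == "__main__":
    web.run_app(app, port=8998)
```

With the order reversed, requests to `/api/voices` are swallowed by the static resource and come back as 404s, which is exactly the symptom TROUBLESHOOTING.md describes.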
+ +## Understanding Smart Auto-Detection + +**PersonaPlex now automatically detects and serves your custom UI!** You no longer need to use the `--static` flag for development. + +### How Auto-Detection Works + +When you start the server, it checks: +1. Does `client/dist` exist in the project directory? +2. **YES** → Automatically serves your custom UI +3. **NO** → Downloads and serves the default UI from HuggingFace + +### Starting the Server (Auto-Detection) + +```bash +# Just start the server normally - no flags needed! +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +**Log output with custom UI detected:** +``` +Found custom UI at /home/.../personaplex-blackwell/client/dist, using it instead of default +static_path = /home/.../personaplex-blackwell/client/dist +serving static content from /home/.../personaplex-blackwell/client/dist +``` + +**Log output without custom UI:** +``` +retrieving the static content +static_path = /home/.../.cache/huggingface/.../dist +serving static content from /home/.../.cache/huggingface/.../dist +``` + +### Manual Override (Optional) + +You can still manually specify the UI source if needed: + +```bash +# Force specific directory +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static /path/to/custom/dist + +# Disable static serving +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static none +``` + +## Frontend Development Workflow + +### Prerequisites + +1. Install Node.js and npm (if not already installed): + ```bash + # Check if already installed + node --version + npm --version + ``` + +2. Install frontend dependencies: + ```bash + cd client + npm install + ``` + +### Development Steps (Simplified!) + +#### 1. Make Your Changes +Edit files in the `client/src/` directory: +- `client/src/components/` - React components +- `client/src/styles/` - CSS and styling +- `client/src/App.tsx` - Main application component + +#### 2. Build the Frontend +```bash +cd client +npm run build +cd .. +``` + +This creates/updates the `client/dist` directory with your compiled code. + +#### 3. Start Server (Auto-Detection!) +```bash +# From project root - server auto-detects custom UI! +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +#### 4. Verify Custom UI is Loaded +Check the server logs for: +``` +Found custom UI at .../client/dist, using it instead of default +static_path = /home/.../personaplex-blackwell/client/dist +``` + +If you see `retrieving the static content`, the build might not exist. Go back to step 2. + +#### 5. Test Your Changes +1. Open the Web UI: https://localhost:8998 +2. Hard refresh (Ctrl+Shift+R or Cmd+Shift+R) to clear browser cache +3. Test your modifications + +#### 6. Iterate +Repeat steps 1-5 for each change: +```bash +# Make changes to client/src/... +cd client && npm run build && cd .. + +# Restart server (Ctrl+C to stop first) - auto-detects custom UI! +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +**That's it! No `--static` flag needed anymore.** + +## Troubleshooting + +### Changes Not Appearing + +**Problem:** You rebuilt the frontend but don't see changes in the browser. + +**Solutions:** +1. **Verify server is using custom UI:** + - Check logs for `static_path = client/dist` + - If not, restart with `--static client/dist` + +2. 
**Clear browser cache:** + - Hard refresh: Ctrl+Shift+R (Windows/Linux) or Cmd+Shift+R (Mac) + - Or open DevTools (F12) → Network tab → Check "Disable cache" + +3. **Verify build completed successfully:** + ```bash + cd client + npm run build + ls -la dist/ # Should show recent timestamps + ``` + +4. **Check for build errors:** + ```bash + cd client + npm run build 2>&1 | grep -i error + ``` + +### Server Won't Start with --static Flag + +**Problem:** Error when starting server with `--static client/dist` + +**Solutions:** +1. **Verify dist directory exists:** + ```bash + ls -la client/dist/ + ``` + If missing, build the frontend first: `cd client && npm run build` + +2. **Check path is correct:** + - Use relative path: `--static client/dist` + - From project root, not from client/ directory + +### Frontend Build Fails + +**Problem:** `npm run build` fails with errors + +**Solutions:** +1. **Check Node.js version:** + ```bash + node --version + # Should be 16.x or higher + ``` + +2. **Reinstall dependencies:** + ```bash + cd client + rm -rf node_modules package-lock.json + npm install + npm run build + ``` + +3. **Check for TypeScript errors:** + ```bash + cd client + npm run type-check + ``` + +## Development Tips + +### Shell Alias for Quick Development +Add to your `~/.bashrc` or `~/.zshrc`: + +```bash +# Quick start with custom UI +alias moshi-dev='conda activate personaplex && SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist' + +# Quick frontend rebuild +alias moshi-build='cd client && npm run build && cd ..' +``` + +Usage: +```bash +# Make changes to client/src/... +moshi-build # Rebuild frontend +moshi-dev # Start server with custom UI +``` + +### Watch Mode for Live Development + +For faster iteration, use the frontend in development mode: + +```bash +# Terminal 1: Start backend server (without static flag) +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + +# Terminal 2: Start frontend dev server with hot reload +cd client +npm run dev +``` + +Then access the UI at the Vite dev server URL (usually http://localhost:5173). + +**Note:** This requires configuring CORS in the backend. Check `client/vite.config.ts` for proxy settings. + +## Production Deployment + +When ready to deploy your custom UI: + +1. Build the production bundle: + ```bash + cd client + npm run build + ``` + +2. Test the production build: + ```bash + SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist + ``` + +3. Verify everything works correctly + +4. Commit your changes: + ```bash + git add client/src/ client/dist/ + git commit -m "Add custom UI feature: [description]" + ``` + +## File Structure + +``` +personaplex-blackwell/ +├── client/ # Frontend source code +│ ├── src/ # Source files (edit these) +│ │ ├── components/ # React components +│ │ ├── styles/ # CSS files +│ │ ├── App.tsx # Main app +│ │ └── main.tsx # Entry point +│ ├── dist/ # Built files (generated) +│ │ ├── index.html # HTML entry +│ │ ├── assets/ # JS/CSS bundles +│ │ └── ... 
+│ ├── package.json # Dependencies +│ ├── vite.config.ts # Build config +│ └── tsconfig.json # TypeScript config +└── moshi/ # Backend Python code +``` + +## Quick Reference + +| Task | Command | +|------|---------| +| Install dependencies | `cd client && npm install` | +| Build frontend | `cd client && npm run build` | +| Start with custom UI | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist` | +| Start with default UI | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR"` | +| Dev server (hot reload) | `cd client && npm run dev` | +| Type check | `cd client && npm run type-check` | +| Lint code | `cd client && npm run lint` | + +## Getting Help + +If you encounter issues not covered here: +1. Check [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common problems +2. Verify your Node.js and npm versions +3. Check the browser console (F12) for JavaScript errors +4. Review server logs for static file serving errors diff --git a/QUICKSTART.md b/QUICKSTART.md new file mode 100644 index 0000000..87604ab --- /dev/null +++ b/QUICKSTART.md @@ -0,0 +1,119 @@ +# PersonaPlex Quick Start Guide + +This guide provides the essential steps to get PersonaPlex running quickly. + +## Prerequisites + +1. Install [Opus audio codec](https://github.com/xiph/opus) development library: + ```bash + # Ubuntu/Debian + sudo apt install libopus-dev + + # macOS + brew install opus + ``` + +2. Accept the [PersonaPlex model license](https://huggingface.co/nvidia/personaplex-7b-v1) on Hugging Face + +## Installation + +### Step 1: Create Conda Environment + +```bash +# Create and activate conda environment +conda create -n personaplex python=3.10 -y +conda activate personaplex +``` + +### Step 2: Install Moshi Package + +**For most GPUs:** +```bash +cd moshi +pip install -e . +cd .. +``` + +**For Blackwell GPUs (RTX 50 series):** +```bash +# Install PyTorch with CUDA 13.0+ support FIRST (required for Blackwell) +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 + +# Then install moshi +cd moshi +pip install -e . +cd .. +``` + +### Step 3: Set Up Hugging Face Token + +```bash +# Copy the template and add your token +cp .env.example .env +# Edit .env and replace 'your_token_here' with your actual Hugging Face token +``` + +## Running the Web UI + +**CRITICAL: Always activate the conda environment first!** + +```bash +# 1. Activate the environment +conda activate personaplex + +# 2. Launch the server (automatically detects custom UI if it exists) +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + +# 3. Access the Web UI at: https://localhost:8998 +``` + +### Smart Auto-Detection + +The server now **automatically detects and uses your custom UI** if you've built it! + +- If `client/dist` exists → Your custom UI is served automatically +- If `client/dist` doesn't exist → Default UI is downloaded from HuggingFace + +**Verify which UI loaded** by checking the server logs: +- Custom UI: `Found custom UI at .../client/dist, using it instead of default` +- Default UI: `retrieving the static content` (downloads from HuggingFace) + +### Building Custom UI (If Modified) + +Only needed if you've changed the frontend code: + +```bash +cd client +npm run build +cd .. 
+ +# Now start the server - it will auto-detect your custom build +conda activate personaplex +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +``` + +## Quick Command Reference + +| Task | Command | +|------|---------| +| Activate environment | `conda activate personaplex` | +| Start Web UI | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR"` | +| Start with CPU offload | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --cpu-offload` | +| Start with local frontend | `SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist` | + +## Troubleshooting + +**Error: "No module named 'moshi'"** +- Solution: Activate the conda environment: `conda activate personaplex` + +**Error: "Access denied" when downloading model** +- Solution: Accept the model license and set up your HF token in `.env` + +For more issues, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md). + +## Next Steps + +- See [README.md](README.md) for detailed documentation +- Explore voice customization options +- Try different persona prompts +- Check out offline evaluation mode diff --git a/README.md b/README.md index f3b053a..2e6e36d 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ [![Demo](https://img.shields.io/badge/🎮-Demo-green)](https://research.nvidia.com/labs/adlr/personaplex/) [![Discord](https://img.shields.io/badge/Discord-Join-purple?logo=discord)](https://discord.gg/5jAXrrbwRb) +**🚀 New to PersonaPlex? See [QUICKSTART.md](QUICKSTART.md) for a fast setup guide!** + +**🎨 Developing custom UI? See [FRONTEND_DEVELOPMENT.md](FRONTEND_DEVELOPMENT.md) for frontend development workflow!** + PersonaPlex is a real-time, full-duplex speech-to-speech conversational model that enables persona control through text-based role prompts and audio-based voice conditioning. Trained on a combination of synthetic and real conversations, it produces natural, low-latency spoken interactions with a consistent persona. PersonaPlex is based on the [Moshi](https://arxiv.org/abs/2410.00037) architecture and weights.
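
A recurring failure mode in the quick-start flow above is launching the server without the `personaplex` environment active. A small script can confirm the environment before launching; this is a hypothetical helper for illustration, not part of the repository:

```python
# env_check.py: hypothetical helper (not in the repo) that verifies the
# active environment can import the packages the server needs.
import importlib.util
import sys

for mod in ("moshi", "torch"):
    if importlib.util.find_spec(mod) is None:
        sys.exit(f"'{mod}' not importable; did you run 'conda activate personaplex' "
                 "and 'pip install -e moshi/'?")

import torch  # safe now that find_spec succeeded

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
```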

@@ -48,13 +52,13 @@ cd .. **Note:** Use `pip install -e .` (editable mode) during development so code changes are immediately reflected without reinstalling. #### Option 2: For Blackwell GPUs (RTX 50 series) -Blackwell GPUs require PyTorch with CUDA 12.8. Install PyTorch first, then the moshi package: +Blackwell GPUs require PyTorch with CUDA 13.0+ support. Install PyTorch first, then the moshi package: ```bash # Create and activate conda environment conda create -n personaplex python=3.10 -y conda activate personaplex -# Install PyTorch with CUDA 13.0 FIRST +# Install PyTorch with CUDA 13.0+ support FIRST (required for Blackwell) pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 # Then install the moshi package (will use existing PyTorch) @@ -92,15 +96,47 @@ huggingface-cli login ### Launch Server -Launch server for live interaction (temporary SSL certs for https): +**IMPORTANT: First activate the conda environment:** +```bash +conda activate personaplex +``` + +#### Smart Auto-Detection (Recommended) + +The server **automatically detects and serves your custom UI** if `client/dist` exists: ```bash # The server automatically loads your HF_TOKEN from the .env file +# If client/dist exists, it will be used automatically! SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" ``` -**For Development:** If you've modified the frontend (client/ directory), use the `--static` flag to serve your local build: +**Auto-detection behavior:** +1. Checks if `client/dist` exists in your project +2. If yes → serves custom UI from `./client/dist` +3. If no → downloads and serves default UI from HuggingFace + +**How to verify which UI is loading:** +Check the server logs: +- **Custom UI (auto-detected)**: + ``` + Found custom UI at .../client/dist, using it instead of default + static_path = /home/.../personaplex-blackwell/client/dist + ``` +- **Default UI (no custom build)**: + ``` + retrieving the static content + static_path = /home/.../.cache/huggingface/.../dist + ``` + +#### Manual Override (Optional) + +You can still explicitly specify which UI to use with the `--static` flag: ```bash +# Force use of custom UI from specific directory SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist + +# Disable static serving entirely +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static none ``` **CPU Offload:** If your GPU has insufficient memory, use the `--cpu-offload` flag to offload model layers to CPU. This requires the `accelerate` package (`pip install accelerate`): diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index c89ccef..92a3fd2 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -1,5 +1,43 @@ # Troubleshooting Guide +## Common Issues + +### Module Not Found: 'moshi' + +**Symptom:** When running `python -m moshi.server`, you get: +``` +ModuleNotFoundError: No module named 'moshi' +``` + +**Root Cause:** The conda environment is not activated, or moshi is not installed in the active environment. + +**Solution:** + +1. Activate the conda environment: + ```bash + conda activate personaplex + ``` + +2. Verify the environment is active (you should see `(personaplex)` in your prompt): + ```bash + conda info --envs + # Should show * next to personaplex + ``` + +3. If moshi is not installed, install it: + ```bash + cd moshi + pip install -e . + cd .. + ``` + +4. 
Try running the server again: + ```bash + SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + ``` + +**Prevention:** Always activate the conda environment before running PersonaPlex commands. Add a reminder to your workflow or shell configuration. + ## Development Issues ### Code Changes Not Reflected When Running Server @@ -45,6 +83,65 @@ ## Server Issues +### Custom UI Not Loading (Server Uses Default UI) + +**Symptom:** You've modified the frontend (client/ directory), rebuilt it, but when you start the server, your changes don't appear. + +**Root Cause:** The `client/dist` directory doesn't exist or is empty. The server auto-detects custom UI by checking if `client/dist` exists. + +**Solution:** + +1. **Verify the build exists:** + ```bash + ls -la client/dist/ + ``` + If this directory doesn't exist or is empty, you need to build the frontend first. + +2. **Build the frontend:** + ```bash + cd client + npm install # If you haven't already + npm run build + cd .. + ``` + +3. **Restart the server** (it will now auto-detect the custom UI): + ```bash + conda activate personaplex + SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" + ``` + +4. **Verify auto-detection worked** by checking the logs: + ``` + # SUCCESS - Custom UI detected: + Found custom UI at .../client/dist, using it instead of default + static_path = /home/.../personaplex-blackwell/client/dist + + # FAIL - No custom UI found: + retrieving the static content + static_path = /home/.../.cache/huggingface/.../dist + ``` + +5. Hard refresh your browser (Ctrl+Shift+R or Cmd+Shift+R) to clear cached assets + +**When auto-detection won't work:** +- `client/dist` directory doesn't exist +- `client/dist` exists but is empty +- Permissions prevent reading the directory + +**Manual override (if needed):** +If auto-detection fails but you know the build exists, use the `--static` flag: +```bash +SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" --static client/dist +``` + +**Development tip:** +After making frontend changes, rebuild and the server will auto-detect: +```bash +cd client && npm run build && cd .. +# Restart server - custom UI detected automatically! +``` + ### Server Returns 404 for API Endpoints If specific API endpoints return 404: diff --git a/moshi/moshi/server.py b/moshi/moshi/server.py index e878295..15f92e8 100644 --- a/moshi/moshi/server.py +++ b/moshi/moshi/server.py @@ -353,8 +353,51 @@ def _get_voice_prompt_dir(voice_prompt_dir: Optional[str], hf_repo: str) -> Opti return str(voices_dir) +def _is_valid_ui_build(dist_path: Path) -> bool: + """ + Validate that a directory contains a valid UI build. 
+ + Args: + dist_path: Path to the dist directory + + Returns: + True if the directory contains a valid build (has index.html), False otherwise + """ + if not dist_path.is_dir(): + return False + + # Check for essential file - index.html must exist and be non-empty + index_html = dist_path / "index.html" + try: + return index_html.exists() and index_html.stat().st_size > 0 + except (OSError, PermissionError): + return False + + def _get_static_path(static: Optional[str]) -> Optional[str]: if static is None: + # Auto-detect: prefer local custom UI (client/dist) if it exists + try: + # Priority 1: Check current working directory (works for all install modes) + cwd_dist = Path.cwd() / "client" / "dist" + if _is_valid_ui_build(cwd_dist): + logger.info(f"Found custom UI at {cwd_dist}, using it instead of default") + return str(cwd_dist) + + # Priority 2: Check project root relative to __file__ (works for editable installs) + # server.py is in moshi/moshi/, so project root is 2 levels up + project_root = Path(__file__).parent.parent.parent + local_dist = project_root / "client" / "dist" + + if _is_valid_ui_build(local_dist): + logger.info(f"Found custom UI at {local_dist}, using it instead of default") + return str(local_dist) + + except (OSError, PermissionError) as e: + logger.warning(f"Could not check for custom UI: {e}. Falling back to default.") + # Fall through to HuggingFace download + + # Fall back to HuggingFace default UI logger.info("retrieving the static content") dist_tgz = hf_hub_download("nvidia/personaplex-7b-v1", "dist.tgz") dist_tgz = Path(dist_tgz) From a2d59cb4c08a8ccd407860f9166935d3ac0c55ce Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Tue, 27 Jan 2026 17:48:36 +0900 Subject: [PATCH 21/24] Remove .env documentation from PR - Remove .env file configuration instructions - Update to use export HF_TOKEN instead - Change CUSTOM_VOICE_DIR docs to use export - Keep only environment variable and huggingface-cli login methods --- QUICKSTART.md | 7 +++---- README.md | 18 ++++-------------- TROUBLESHOOTING.md | 11 +++++++---- custom_voices/README.md | 6 +++--- 4 files changed, 17 insertions(+), 25 deletions(-) diff --git a/QUICKSTART.md b/QUICKSTART.md index 87604ab..9852f2c 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -48,9 +48,8 @@ cd .. ### Step 3: Set Up Hugging Face Token ```bash -# Copy the template and add your token -cp .env.example .env -# Edit .env and replace 'your_token_here' with your actual Hugging Face token +# Set your Hugging Face token as an environment variable +export HF_TOKEN=your_token_here ``` ## Running the Web UI @@ -107,7 +106,7 @@ SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" - Solution: Activate the conda environment: `conda activate personaplex` **Error: "Access denied" when downloading model** -- Solution: Accept the model license and set up your HF token in `.env` +- Solution: Accept the model license and set your HF token: `export HF_TOKEN=your_token_here` For more issues, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md). 
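
The `_is_valid_ui_build` helper introduced in `moshi/moshi/server.py` above can be exercised in isolation. A minimal sketch follows; it assumes an editable install, and note that importing `moshi.server` pulls in the full server dependencies:

```python
# Exercises the build-validation rule from the patch above: a directory
# counts as a UI build only if it contains a non-empty index.html.
import tempfile
from pathlib import Path

from moshi.server import _is_valid_ui_build

with tempfile.TemporaryDirectory() as tmp:
    dist = Path(tmp) / "client" / "dist"
    dist.mkdir(parents=True)
    assert not _is_valid_ui_build(dist)        # no index.html yet
    (dist / "index.html").write_text("<html></html>")
    assert _is_valid_ui_build(dist)            # non-empty index.html is accepted
    (dist / "index.html").write_text("")
    assert not _is_valid_ui_build(dist)        # empty index.html is rejected
print("auto-detection checks passed")
```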
diff --git a/README.md b/README.md index 2e6e36d..77133d8 100644 --- a/README.md +++ b/README.md @@ -74,26 +74,17 @@ Log in to your Huggingface account and accept the PersonaPlex model license [her Then set up your Huggingface authentication using one of these methods: -**Option 1: Using .env file (Recommended)** -```bash -# Copy the template and add your token -cp .env.example .env -# Edit .env and replace 'your_token_here' with your actual token -``` - -**Option 2: Environment variable** +**Option 1: Environment variable** ```bash export HF_TOKEN= ``` -**Option 3: Hugging Face CLI** +**Option 2: Hugging Face CLI** ```bash pip install huggingface_hub huggingface-cli login ``` -**Note:** The .env file is optional. All existing workflows continue to work. - ### Launch Server **IMPORTANT: First activate the conda environment:** @@ -105,7 +96,6 @@ conda activate personaplex The server **automatically detects and serves your custom UI** if `client/dist` exists: ```bash -# The server automatically loads your HF_TOKEN from the .env file # If client/dist exists, it will be used automatically! SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" ``` @@ -248,8 +238,8 @@ cp my_voice.wav custom_voices/ **Configure custom location (optional):** ```bash -# In .env file -CUSTOM_VOICE_DIR=/path/to/my/voices +# Set environment variable +export CUSTOM_VOICE_DIR=/path/to/my/voices ``` #### Voice File Formats diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index 92a3fd2..05d43e6 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -214,12 +214,15 @@ If you modified React components but don't see changes: If models fail to download: -1. Create a `.env` file in the repository root -2. Add your HuggingFace token: +1. Set your HuggingFace token as an environment variable: + ```bash + export HF_TOKEN=your_token_here ``` - HF_TOKEN=your_token_here +2. Or use the Hugging Face CLI: + ```bash + pip install huggingface_hub + huggingface-cli login ``` -3. See `.env.example` for more details ### ffmpeg Not Found diff --git a/custom_voices/README.md b/custom_voices/README.md index 189f22f..52d06c5 100644 --- a/custom_voices/README.md +++ b/custom_voices/README.md @@ -39,9 +39,9 @@ By default, PersonaPlex looks for voices in: 1. HuggingFace cache: `~/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/*/voices/` 2. Custom directory: `./custom_voices/` (this directory) -To use a different custom voices directory, set the `CUSTOM_VOICE_DIR` environment variable in `.env`: -``` -CUSTOM_VOICE_DIR=/path/to/my/voices +To use a different custom voices directory, set the `CUSTOM_VOICE_DIR` environment variable: +```bash +export CUSTOM_VOICE_DIR=/path/to/my/voices ``` ## Voice Naming Convention From d68aff1e1eb9183463669fad44f1aab7664e15ea Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Tue, 27 Jan 2026 17:51:29 +0900 Subject: [PATCH 22/24] Add Example Usage section to README - Add practical examples for UI auto-detection - Add example for custom voice workflow - Show expected log output for verification --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 77133d8..6ce21b5 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,30 @@ curl http://localhost:8998/api/voices Returns JSON with all voices, their types, and categories. +## Example Usage + +### Auto-Detection +```bash +# Build frontend +cd client && npm run build && cd .. + +# Server auto-detects - no flag needed! 
+SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" +# Logs: "Found custom UI at .../client/dist, using it instead of default" +``` + +### Custom Voices +```bash +# Add voice file +cp my_voice.wav custom_voices/ + +# Generate embeddings +python -m moshi.offline --voice-prompt "my_voice.wav" \ + --save-voice-embeddings --input-wav "assets/test/input_assistant.wav" --output-wav "/tmp/out.wav" + +# Restart server - voice appears in UI automatically! +``` + ## Prompting Guide The model is trained on synthetic conversations for a fixed assistant role and varying customer service roles. From 9c06c41f28ccfc4983716dc1d657e14719343377 Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Tue, 27 Jan 2026 17:59:04 +0900 Subject: [PATCH 23/24] Add .env file support for easier token management - Add .env.example template with HF_TOKEN and CUSTOM_VOICE_DIR - Update documentation to recommend .env file as primary method - .env files are automatically loaded by server and offline scripts - Maintains backward compatibility with export and huggingface-cli methods Benefits of .env over export will be explained in PR description. --- .env.example | 12 ++++++++++-- QUICKSTART.md | 11 +++++++++-- README.md | 19 ++++++++++++++++--- TROUBLESHOOTING.md | 9 +++++++-- custom_voices/README.md | 9 ++++++++- 5 files changed, 50 insertions(+), 10 deletions(-) diff --git a/.env.example b/.env.example index d1f98b0..1000218 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,15 @@ -# Hugging Face API Token +# PersonaPlex Environment Configuration +# Copy this file to .env and fill in your values + +# Hugging Face API Token (Required) # Get your token from: https://huggingface.co/settings/tokens # Required to download PersonaPlex models HF_TOKEN=your_token_here -CUSTOM_VOICE_DIR=./custom_voices + +# Optional: Custom voices directory +# Specify a custom location for your voice embeddings +# CUSTOM_VOICE_DIR=/path/to/my/voices + # Optional: Custom cache directory for Hugging Face models +# By default, models are cached in ~/.cache/huggingface/ # HF_HOME=/path/to/custom/cache diff --git a/QUICKSTART.md b/QUICKSTART.md index 9852f2c..2e1f2d8 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -47,8 +47,15 @@ cd .. ### Step 3: Set Up Hugging Face Token +**Recommended: Use .env file (persists across sessions)** +```bash +# Copy the template and add your token +cp .env.example .env +# Edit .env and replace 'your_token_here' with your actual Hugging Face token +``` + +**Alternative: Use export (temporary, only for current session)** ```bash -# Set your Hugging Face token as an environment variable export HF_TOKEN=your_token_here ``` @@ -106,7 +113,7 @@ SSL_DIR=$(mktemp -d); python -m moshi.server --ssl "$SSL_DIR" - Solution: Activate the conda environment: `conda activate personaplex` **Error: "Access denied" when downloading model** -- Solution: Accept the model license and set your HF token: `export HF_TOKEN=your_token_here` +- Solution: Accept the model license and set up your HF token in `.env` file (see Step 3) For more issues, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md). 
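
Patch 23 states that `.env` files are loaded automatically by the server and offline scripts, but the loading code itself is not shown in these diffs. The sketch below illustrates the expected `KEY=VALUE` semantics; it is not the repo's actual implementation, and the `python-dotenv` package is a common off-the-shelf equivalent:

```python
# dotenv_sketch.py: illustrative .env loader matching the format documented
# in .env.example; not the repository's actual loading mechanism.
import os
from pathlib import Path

def load_env_file(path: str = ".env") -> None:
    env_file = Path(path)
    if not env_file.exists():
        return
    for raw in env_file.read_text().splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        # values already exported in the shell take precedence
        os.environ.setdefault(key.strip(), value.strip())

load_env_file()
print("HF_TOKEN configured:", bool(os.environ.get("HF_TOKEN")))
```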
diff --git a/README.md b/README.md index 6ce21b5..e576df7 100644 --- a/README.md +++ b/README.md @@ -74,12 +74,19 @@ Log in to your Huggingface account and accept the PersonaPlex model license [her Then set up your Huggingface authentication using one of these methods: -**Option 1: Environment variable** +**Option 1: .env file (Recommended)** +```bash +# Copy the template and add your token +cp .env.example .env +# Edit .env and replace 'your_token_here' with your actual token +``` + +**Option 2: Environment variable** ```bash export HF_TOKEN= ``` -**Option 2: Hugging Face CLI** +**Option 3: Hugging Face CLI** ```bash pip install huggingface_hub huggingface-cli login @@ -237,8 +244,14 @@ cp my_voice.wav custom_voices/ ``` **Configure custom location (optional):** + +Preferred method - add to your `.env` file: +```bash +CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +Or use environment variable (temporary): ```bash -# Set environment variable export CUSTOM_VOICE_DIR=/path/to/my/voices ``` diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index 05d43e6..d1af55a 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -214,11 +214,16 @@ If you modified React components but don't see changes: If models fail to download: -1. Set your HuggingFace token as an environment variable: +1. Create a `.env` file in the repository root: + ```bash + cp .env.example .env + # Edit .env and add your token: HF_TOKEN=your_token_here + ``` +2. Or set environment variable: ```bash export HF_TOKEN=your_token_here ``` -2. Or use the Hugging Face CLI: +3. Or use the Hugging Face CLI: ```bash pip install huggingface_hub huggingface-cli login diff --git a/custom_voices/README.md b/custom_voices/README.md index 52d06c5..6f12f93 100644 --- a/custom_voices/README.md +++ b/custom_voices/README.md @@ -39,7 +39,14 @@ By default, PersonaPlex looks for voices in: 1. HuggingFace cache: `~/.cache/huggingface/hub/models--nvidia--personaplex-7b-v1/snapshots/*/voices/` 2. Custom directory: `./custom_voices/` (this directory) -To use a different custom voices directory, set the `CUSTOM_VOICE_DIR` environment variable: +To use a different custom voices directory: + +Add to your `.env` file (recommended): +```bash +CUSTOM_VOICE_DIR=/path/to/my/voices +``` + +Or use environment variable: ```bash export CUSTOM_VOICE_DIR=/path/to/my/voices ``` From e38d377c8d5ae0b8f1b96f6d0c4ee6c0e8a1fdce Mon Sep 17 00:00:00 2001 From: Kiral Poon Date: Tue, 27 Jan 2026 18:08:20 +0900 Subject: [PATCH 24/24] Add .gitignore rules for Claude Code tooling files - Ignore .agent/ directory - Ignore Agents.md - Ignore Claude.local.md These are personal tooling files that should never be in the repository. --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 7d1f30d..9d432a4 100644 --- a/.gitignore +++ b/.gitignore @@ -189,3 +189,8 @@ custom_voices/*.pt custom_voices/*.wav # But keep the README !custom_voices/README.md + +# Claude Code and personal tooling files (should never be committed) +.agent/ +Agents.md +Claude.local.md
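
For reference, the voice naming convention above is what drives the categories returned by the voice discovery service (and grouped by UI elements such as the dropdown fragment below). A quick demo against the `VoiceDiscovery` class added earlier in this series; the filenames are made up for illustration:

```python
# Maps example filenames to the categories used for sorting in /api/voices.
from moshi.voice_discovery import VoiceDiscovery

for name in ("NATF_warm.pt", "NATM_calm.pt", "VARF_bright.pt", "my_voice.pt"):
    print(f"{name} -> {VoiceDiscovery._categorize_voice(name)}")
# NATF_warm.pt -> natural-female
# NATM_calm.pt -> natural-male
# VARF_bright.pt -> variety-female
# my_voice.pt -> custom
```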

       Voice Prompt: {modalVoicePrompt}
-      <select onChange={(e) => setModalVoicePrompt(e.target.value)}>
+      <select
+        onChange={(e) => setModalVoicePrompt(e.target.value)}
+      >
+        {voicesLoading ? (
+          <option disabled>{/* loading placeholder (markup lost in source) */}</option>
+        ) : voicesError ? (
+          <option disabled>{/* error placeholder (markup lost in source) */}</option>
+        ) : (
+          voices.map((voice) => (
+            <option key={voice.name} value={voice.name}>{voice.name}</option>
+          ))
+        )}
+      </select>