-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.example.yaml
More file actions
59 lines (48 loc) · 1.96 KB
/
config.example.yaml
File metadata and controls
59 lines (48 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# vTTS Configuration
# Copy this file to config.yaml and adjust to your setup.
# Model selection: which checkpoint to load, on which device, at what precision.
model:
  # HuggingFace model ID (downloaded automatically on first run).
  # Options:
  #   - Qwen/Qwen3-TTS-12Hz-0.6B-Base         (voice cloning, 1.2GB VRAM)
  #   - Qwen/Qwen3-TTS-12Hz-1.7B-Base         (voice cloning, 3.4GB VRAM)
  #   - Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice  (built-in speakers, 1.2GB VRAM)
  #   - Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice  (built-in speakers, 3.4GB VRAM)
  name: "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"

  # GPU device.
  device: "cuda:0"

  # Model precision (bfloat16 recommended for Ampere+ GPUs).
  dtype: "bfloat16"

  # Optional: local path if you already have the model downloaded.
  # If set, vTTS uses this path instead of downloading from HuggingFace.
  # local_path: "/path/to/your/local/model"
# Generation worker: concurrency and streaming/chunking parameters.
worker:
  # Maximum concurrent audio generations.
  max_slots: 16

  # Frames between audio chunk emissions.
  # Lower = lower latency (faster first byte), more vocoder overhead.
  #    3 = ~250ms chunks, ~200ms TTFB (best for voice agents)
  #    6 = ~500ms chunks, ~400ms TTFB (good balance)
  #   20 = ~1.7s chunks,  ~1.4s TTFB  (best for batch dubbing)
  emit_every_frames: 6

  # Vocoder context window (more = better quality, more compute).
  decode_window_frames: 80

  # Crossfade overlap between chunks (samples at 24kHz).
  overlap_samples: 1024

  # Maximum generation length in frames (12 frames = 1 second of audio).
  # Safety limit to prevent infinite generation.
  max_frames: 3600  # 3600 / 12 = 300 s = 5 minutes
# Network endpoint the server binds to.
server:
  # Bind address. "0.0.0.0" listens on every network interface, which makes
  # the service reachable from other machines; use "127.0.0.1" to accept
  # local connections only.
  host: "0.0.0.0"

  # Port to listen on.
  port: 8080
# Voice cloning configuration (only for Base models).
# Each voice needs:
#   - ref_audio: path to a WAV file (~5-10 seconds of clean speech)
#   - ref_text: exact transcript of what is said in the audio
#
# To enable, uncomment the block below and keep the indentation as shown:
#
# voices:
#   my_voice:
#     ref_audio: "./voices/my_voice.wav"
#     ref_text: "The exact words spoken in the reference audio."
#   another_voice:
#     ref_audio: "./voices/another.wav"
#     ref_text: "Another reference transcript."