Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions config/config.yml.template
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ models:
api_family: http
model_url: http://${oc.env:PARAKEET_ASR_URL,172.17.0.1:8767}
api_key: ''
# Capabilities: what this provider can produce
# - word_timestamps: Word-level timing data for alignment
# - segments: Speaker segments (generic labels like Speaker 0)
capabilities:
- word_timestamps
- segments
operations:
stt_transcribe:
method: POST
Expand All @@ -119,6 +125,14 @@ models:
api_family: http
model_url: https://api.deepgram.com/v1
api_key: ${oc.env:DEEPGRAM_API_KEY,''}
# Capabilities: what this provider can produce
# - word_timestamps: Word-level timing data
# - segments: Speaker segments with paragraphs
# - diarization: Native speaker diarization (Speaker 0, Speaker 1, etc.)
capabilities:
- word_timestamps
- segments
- diarization
operations:
stt_transcribe:
method: POST
Expand All @@ -141,6 +155,30 @@ models:
text: results.channels[0].alternatives[0].transcript
words: results.channels[0].alternatives[0].words
segments: results.channels[0].alternatives[0].paragraphs.paragraphs
- name: stt-vibevoice
description: Microsoft VibeVoice ASR with speaker diarization
model_type: stt
model_provider: vibevoice
api_family: http
model_url: http://${oc.env:VIBEVOICE_ASR_URL,host.docker.internal:8767}
api_key: ''
# Capabilities: what this provider can produce
# - segments: Speaker segments with diarization labels
# - diarization: Built-in speaker diarization (no word timestamps)
# Note: VibeVoice does NOT provide word_timestamps
capabilities:
- segments
- diarization
operations:
stt_transcribe:
method: POST
path: /transcribe
content_type: multipart/form-data
response:
type: json
extract:
text: text
segments: segments
- name: tts-http
description: Generic JSON TTS endpoint
model_type: tts
Expand Down
4 changes: 4 additions & 0 deletions extras/asr-services/.dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@
!pyproject*.toml
!README.md
!uv.lock
!common/
!common/**
!providers/
!providers/**
81 changes: 61 additions & 20 deletions extras/asr-services/.env.template
Original file line number Diff line number Diff line change
@@ -1,32 +1,73 @@
# ASR Services Configuration
# Copy this file to .env and configure as needed

# PyTorch CUDA version for Docker build
# =============================================================================
# Provider Selection
# =============================================================================
# Choose one of: faster-whisper, transformers, nemo
ASR_PROVIDER=nemo

# =============================================================================
# Model Configuration
# =============================================================================
# Model identifier (Hugging Face repo ID or local path)
#
# Faster-Whisper models:
# - Systran/faster-whisper-large-v3 (Best quality)
# - Systran/faster-whisper-small (Lightweight)
# - deepdml/faster-whisper-large-v3-turbo-ct2 (Speed optimized)
#
# Transformers models:
# - microsoft/VibeVoice-ASR (7B, speaker diarization)
# - Oriserve/Whisper-Hindi2Hinglish-Prime (Hindi/Hinglish)
# - openai/whisper-large-v3 (Original Whisper)
#
# NeMo models:
# - nvidia/parakeet-tdt-0.6b-v3 (Default)
# - nvidia/canary-1b (Multilingual)
#
ASR_MODEL=nvidia/parakeet-tdt-0.6b-v3

# =============================================================================
# Service Port Configuration
# =============================================================================
ASR_PORT=8767

# =============================================================================
# PyTorch/CUDA Configuration
# =============================================================================
# Options: cu121 (CUDA 12.1), cu126 (CUDA 12.6), cu128 (CUDA 12.8)
# Should not be newer than the CUDA version your NVIDIA driver supports
# (nvidia-smi reports the highest supported version as "CUDA Version")
PYTORCH_CUDA_VERSION=cu126

# Parakeet ASR Model Selection
PARAKEET_MODEL=nvidia/parakeet-tdt-0.6b-v3
# =============================================================================
# Faster-Whisper Provider Settings
# =============================================================================
# Compute (quantization) type: float16, int8, float32
COMPUTE_TYPE=float16

# Service Port Configuration
PARAKEET_HOST_PORT=8767
PARAKEET_CONTAINER_PORT=8765
# Device: cuda, cpu
DEVICE=cuda

# GPU device index (for multi-GPU systems)
DEVICE_INDEX=0

# Enhanced Chunking Configuration for Long Audio
# Enable/disable chunking for long audio processing
CHUNKING_ENABLED=true
# Enable Voice Activity Detection filtering
VAD_FILTER=true

# Duration of each audio chunk in seconds (recommended: 20-40s)
CHUNK_DURATION_SECONDS=120.0
# Force language (empty for auto-detect)
# LANGUAGE=en

# Overlap duration between chunks in seconds (recommended: 3-7s)
OVERLAP_DURATION_SECONDS=10.0
# =============================================================================
# Transformers Provider Settings
# =============================================================================
# PyTorch data type: float16, float32, bfloat16
TORCH_DTYPE=float16

# Minimum audio duration to trigger chunking (in seconds)
# Audio shorter than this will use single-pass processing
MIN_AUDIO_FOR_CHUNKING=120.0
# Enable Flash Attention 2 (requires compatible GPU)
USE_FLASH_ATTENTION=false

# Confidence threshold for overlap reconciliation (0.0-1.0)
# Higher values prefer higher confidence words during overlap resolution
CONFIDENCE_THRESHOLD=0.8
# =============================================================================
# NeMo Provider Settings
# =============================================================================
# NeMo's transcribe() handles long audio natively with timestamps=True.
# No additional configuration needed.
Loading
Loading