SimpleOpenSoftware · AnkushMalaker · Feb 2, 2026
diff --git a/config/config.yml.template b/config/config.yml.template
@@ -101,6 +101,12 @@ models:
   api_family: http
   model_url: http://${oc.env:PARAKEET_ASR_URL,172.17.0.1:8767}
   api_key: ''
+  # Capabilities: what this provider can produce
+  # - word_timestamps: Word-level timing data for alignment
+  # - segments: Speaker segments (generic labels like Speaker 0)
+  capabilities:
+    - word_timestamps
+    - segments
   operations:
     stt_transcribe:
       method: POST
@@ -119,6 +125,14 @@ models:
   api_family: http
   model_url: https://api.deepgram.com/v1
   api_key: ${oc.env:DEEPGRAM_API_KEY,''}
+  # Capabilities: what this provider can produce
+  # - word_timestamps: Word-level timing data
+  # - segments: Speaker segments (taken from Deepgram paragraphs)
+  # - diarization: Native speaker diarization (Speaker 0, Speaker 1, etc.)
+  capabilities:
+    - word_timestamps
+    - segments
+    - diarization
   operations:
     stt_transcribe:
       method: POST
@@ -141,6 +155,30 @@ models:
           text: results.channels[0].alternatives[0].transcript
           words: results.channels[0].alternatives[0].words
           segments: results.channels[0].alternatives[0].paragraphs.paragraphs
+- name: stt-vibevoice
+  description: Microsoft VibeVoice ASR with speaker diarization
+  model_type: stt
+  model_provider: vibevoice
+  api_family: http
+  model_url: http://${oc.env:VIBEVOICE_ASR_URL,host.docker.internal:8767}
+  api_key: ''
+  # Capabilities: what this provider can produce
+  # - segments: Speaker segments with diarization labels
+  # - diarization: Built-in speaker diarization
+  # Note: VibeVoice does NOT provide word_timestamps, so it is not listed below
+  capabilities:
+    - segments
+    - diarization
+  operations:
+    stt_transcribe:
+      method: POST
+      path: /transcribe
+      content_type: multipart/form-data
+      response:
+        type: json
+        extract:
+          text: text
+          segments: segments
 - name: tts-http
   description: Generic JSON TTS endpoint
   model_type: tts

diff --git a/extras/asr-services/.dockerignore b/extras/asr-services/.dockerignore
@@ -5,3 +5,7 @@
 !pyproject*.toml
 !README.md
 !uv.lock
+!common/
+!common/**
+!providers/
+!providers/**
diff --git a/extras/asr-services/.env.template b/extras/asr-services/.env.template
@@ -1,32 +1,73 @@
 # ASR Services Configuration
 # Copy this file to .env and configure as needed
 
-# PyTorch CUDA version for Docker build
+# =============================================================================
+# Provider Selection
+# =============================================================================
+# Choose one of: faster-whisper, transformers, nemo
+ASR_PROVIDER=nemo
+
+# =============================================================================
+# Model Configuration
+# =============================================================================
+# Model identifier (Hugging Face repo ID or local path) for the selected ASR_PROVIDER
+#
+# Faster-Whisper models:
+#   - Systran/faster-whisper-large-v3 (Best quality)
+#   - Systran/faster-whisper-small (Lightweight)
+#   - deepdml/faster-whisper-large-v3-turbo-ct2 (Speed optimized)
+#
+# Transformers models:
+#   - microsoft/VibeVoice-ASR (7B, speaker diarization)
+#   - Oriserve/Whisper-Hindi2Hinglish-Prime (Hindi/Hinglish)
+#   - openai/whisper-large-v3 (Original Whisper)
+#
+# NeMo models:
+#   - nvidia/parakeet-tdt-0.6b-v3 (Default)
+#   - nvidia/canary-1b (Multilingual)
+#
+ASR_MODEL=nvidia/parakeet-tdt-0.6b-v3
+
+# =============================================================================
+# Service Port Configuration
+# =============================================================================
+ASR_PORT=8767
+
+# =============================================================================
+# PyTorch/CUDA Configuration
+# =============================================================================
 # Options: cu121 (CUDA 12.1), cu126 (CUDA 12.6), cu128 (CUDA 12.8)
-# Should match your system's CUDA version (check with: nvidia-smi)
 PYTORCH_CUDA_VERSION=cu126
 
-# Parakeet ASR Model Selection
-PARAKEET_MODEL=nvidia/parakeet-tdt-0.6b-v3
+# =============================================================================
+# Faster-Whisper Provider Settings
+# =============================================================================
+# Compute precision / quantization type: float16, int8, float32
+COMPUTE_TYPE=float16
 
-# Service Port Configuration
-PARAKEET_HOST_PORT=8767
-PARAKEET_CONTAINER_PORT=8765
+# Device: cuda, cpu
+DEVICE=cuda
+
+# GPU device index (for multi-GPU systems)
+DEVICE_INDEX=0
 
-# Enhanced Chunking Configuration for Long Audio
-# Enable/disable chunking for long audio processing
-CHUNKING_ENABLED=true
+# Enable Voice Activity Detection filtering
+VAD_FILTER=true
 
-# Duration of each audio chunk in seconds (recommended: 20-40s)
-CHUNK_DURATION_SECONDS=120.0
+# Force language (leave unset or empty for auto-detect)
+# LANGUAGE=en
 
-# Overlap duration between chunks in seconds (recommended: 3-7s)
-OVERLAP_DURATION_SECONDS=10.0
+# =============================================================================
+# Transformers Provider Settings
+# =============================================================================
+# PyTorch data type: float16, float32, bfloat16
+TORCH_DTYPE=float16
 
-# Minimum audio duration to trigger chunking (in seconds)
-# Audio shorter than this will use single-pass processing
-MIN_AUDIO_FOR_CHUNKING=120.0
+# Enable Flash Attention 2 (requires compatible GPU)
+USE_FLASH_ATTENTION=false
 
-# Confidence threshold for overlap reconciliation (0.0-1.0)
-# Higher values prefer higher confidence words during overlap resolution
-CONFIDENCE_THRESHOLD=0.8
+# =============================================================================
+# NeMo Provider Settings
+# =============================================================================
+# NeMo's transcribe() handles long audio natively with timestamps=True.
+# No additional configuration needed.