SimpleOpenSoftware · AnkushMalaker · Dec 6, 2025 · Dec 14, 2025 · Dec 23, 2025 · Dec 23, 2025
diff --git a/extras/speaker-recognition/.env.template b/extras/speaker-recognition/.env.template
@@ -39,8 +39,35 @@ REACT_UI_HOST=0.0.0.0
 REACT_UI_PORT=5174
 REACT_UI_HTTPS=false
 
-# Optional: External Services
+# ===================================================================
+# Transcription Provider Configuration
+# ===================================================================
+
+# Choose transcription provider: 'deepgram' or 'parakeet'
+# If unset, the service auto-detects from available credentials (note: docker-compose.yml defaults this to 'deepgram')
+TRANSCRIPTION_PROVIDER=deepgram
+
+# Option 1: Deepgram (cloud-based, requires API key)
 DEEPGRAM_API_KEY=your_deepgram_api_key_here
+
+# Option 2: Parakeet ASR (local/offline transcription)
+# Point to the Parakeet service from extras/asr-services or backends/advanced
+# PARAKEET_ASR_URL=http://parakeet-asr:8767
+
+# ===================================================================
+# Diarization Configuration
+# ===================================================================
+
+# Speaker diarization mode: auto, native, pyannote, or none
+# - auto: Use provider's native diarization if available, otherwise Pyannote (default)
+# - native: Use only the provider's native diarization (Deepgram supports it; Parakeet does not)
+# - pyannote: Always use standalone Pyannote diarization
+# - none: Skip diarization entirely (transcription only)
+DIARIZATION_MODE=auto
+
+# ===================================================================
+# Other External Services
+# ===================================================================
 GROQ_API_KEY=your_groq_api_key_here
 
 # Test Configuration (for docker-compose-test.yml)

diff --git a/extras/speaker-recognition/Dockerfile b/extras/speaker-recognition/Dockerfile
@@ -42,5 +42,5 @@ ENV PYTHONPATH=/app
 EXPOSE 8085
 
 # Run the service
-# Use shell form to allow environment variable expansion
-CMD uv run --extra ${PYTORCH_CUDA_VERSION} --no-dev simple-speaker-service 
+# Use exec (JSON array) form with an explicit `sh -c` wrapper so ${PYTORCH_CUDA_VERSION} is expanded when the container starts
+CMD ["sh", "-c", "uv run --extra ${PYTORCH_CUDA_VERSION} --no-dev simple-speaker-service"] 
diff --git a/extras/speaker-recognition/README.md b/extras/speaker-recognition/README.md
@@ -15,8 +15,8 @@ cp .env.template .env
 # Edit .env and add your Hugging Face token
 ```
 Get your HF token from https://huggingface.co/settings/tokens
-Accept the terms and conditions for 
-https://huggingface.co/pyannote/speaker-diarization-3.1
+Accept the terms and conditions for
+https://huggingface.co/pyannote/speaker-diarization-community-1
-Accept the terms and conditions for
-https://huggingface.co/pyannote/speaker-diarization-community-1
+Accept the terms and conditions for:
+[pyannote/speaker-diarization-community-1](https://huggingface.co/pyannote/speaker-diarization-community-1)
-Accept the terms and conditions for
-https://huggingface.co/pyannote/speaker-diarization-community-1
+Accept the terms and conditions for:
+[pyannote/speaker-diarization-community-1](https://huggingface.co/pyannote/speaker-diarization-community-1)
 https://huggingface.co/pyannote/segmentation-3.0
 
 
@@ -829,7 +829,7 @@ The advanced backend communicates with this service through the `client.py` modu
 
 ## Laptop Client
 
-A command-line client (`laptop_client.py`) that can record from your microphone and interact with the speaker recognition service.
+A command-line client (`scripts/laptop_client.py`) that can record from your microphone and interact with the speaker recognition service.
 
 ### Setup for Laptop Client
 
@@ -854,22 +854,22 @@ pip install pyaudio
 docker compose --profile cpu up -d
 
 # Enroll a new speaker (records 10 seconds)
-python laptop_client.py enroll --speaker-id "john" --speaker-name "John Doe" --duration 10
+python scripts/laptop_client.py enroll --speaker-id "john" --speaker-name "John Doe" --duration 10
 
 # Identify a speaker (records 5 seconds)
-python laptop_client.py identify --duration 5
+python scripts/laptop_client.py identify --duration 5
 
 # Verify against a specific speaker (records 3 seconds)
-python laptop_client.py verify --speaker-id "john" --duration 3
+python scripts/laptop_client.py verify --speaker-id "john" --duration 3
 
 # List all enrolled speakers
-python laptop_client.py list
+python scripts/laptop_client.py list
 
 # Remove a speaker
-python laptop_client.py remove --speaker-id "john"
+python scripts/laptop_client.py remove --speaker-id "john"
 
 # Use different service URL
-python laptop_client.py --service-url "http://192.168.1.100:8001" identify
+python scripts/laptop_client.py --service-url "http://192.168.1.100:8001" identify
 ```
 
 ### Laptop Client Features

diff --git a/extras/speaker-recognition/docker-compose-test.yml b/extras/speaker-recognition/docker-compose-test.yml
@@ -5,7 +5,7 @@ services:
       context: .
       dockerfile: Dockerfile
       args:
-        PYTORCH_CUDA_VERSION: ${COMPUTE_MODE:-cpu}
+        PYTORCH_CUDA_VERSION: ${PYTORCH_CUDA_VERSION:-cpu}
     image: speaker-recognition:test
     ports:
       # Map host test port (default 8086) to container port 8085

diff --git a/extras/speaker-recognition/docker-compose.yml b/extras/speaker-recognition/docker-compose.yml
@@ -26,7 +26,10 @@ services:
       - SIMILARITY_THRESHOLD=${SIMILARITY_THRESHOLD:-0.15}
       - SPEAKER_SERVICE_HOST=${SPEAKER_SERVICE_HOST:-0.0.0.0}
       - SPEAKER_SERVICE_PORT=${SPEAKER_SERVICE_PORT:-8085}
+      # Transcription provider configuration
+      - TRANSCRIPTION_PROVIDER=${TRANSCRIPTION_PROVIDER:-deepgram}
       - DEEPGRAM_API_KEY=${DEEPGRAM_API_KEY}
+      - PARAKEET_ASR_URL=${PARAKEET_ASR_URL:-http://parakeet-asr:8767}
     restart: unless-stopped
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8085/health"]

diff --git a/extras/speaker-recognition/phone-hangout-1-response.json b/extras/speaker-recognition/phone-hangout-1-response.json
diff --git a/extras/speaker-recognition/pyproject.toml b/extras/speaker-recognition/pyproject.toml
@@ -8,7 +8,7 @@ dependencies = [
     "fastapi>=0.115.12",
     "uvicorn>=0.34.2",
     "scipy>=1.10.0",
-    "pyannote.audio>=3.3.2",
+    "pyannote.audio>=4.0.0",
     "aiohttp>=3.8.0",
     "python-multipart>=0.0.6",
     "pydantic>=2.0.0",
@@ -43,26 +43,20 @@ cpu = [
     "torchaudio>=2.0.0",
 ]
 
-cu121 = [
-    "torch>=2.0.0",
-    "torchaudio>=2.0.0",
-]
-
 cu126 = [
-    "torch>=2.0.0",
-    "torchaudio>=2.0.0",
+    "torch>=2.8.0",
+    "torchaudio>=2.8.0",
 ]
 
 cu128 = [
-    "torch>=2.0.0",
-    "torchaudio>=2.0.0",
+    "torch>=2.8.0",
+    "torchaudio>=2.8.0",
 ]
 
 [tool.uv]
 conflicts = [
     [
         { extra = "cpu" },
-        { extra = "cu121" },
         { extra = "cu126" },
         { extra = "cu128" },
     ],
@@ -71,13 +65,11 @@ conflicts = [
 [tool.uv.sources]
 torch = [
     { index = "pytorch-cpu", extra = "cpu" },
-    { index = "pytorch-cu121", extra = "cu121" },
     { index = "pytorch-cu126", extra = "cu126" },
     { index = "pytorch-cu128", extra = "cu128" },
 ]
 torchaudio = [
     { index = "pytorch-cpu", extra = "cpu" },
-    { index = "pytorch-cu121", extra = "cu121" },
     { index = "pytorch-cu126", extra = "cu126" },
     { index = "pytorch-cu128", extra = "cu128" },
 ]
@@ -87,11 +79,6 @@ name = "pytorch-cpu"
 url = "https://download.pytorch.org/whl/cpu"
 explicit = true
 
-[[tool.uv.index]]
-name = "pytorch-cu121"
-url = "https://download.pytorch.org/whl/cu121"
-explicit = true
-
 [[tool.uv.index]]
 name = "pytorch-cu126"
 url = "https://download.pytorch.org/whl/cu126"

diff --git a/extras/speaker-recognition/run-test.sh b/extras/speaker-recognition/run-test.sh
@@ -55,6 +55,9 @@ fi
 
 print_info "Speaker Recognition Integration Test Runner"
 print_info "=========================================="
+
+# Load environment variables (CI or local)
+if [ -f ".env" ]; then
 print_info ".env file exists: $([ -f .env ] && echo 'yes' || echo 'no')"
 
 # Load environment variables (CI or local)
@@ -109,8 +112,10 @@ if [ -z "$DEEPGRAM_API_KEY" ]; then
     exit 1
 fi
 
-print_info "HF_TOKEN length: ${#HF_TOKEN}"
-print_info "DEEPGRAM_API_KEY length: ${#DEEPGRAM_API_KEY}"
+# Past the validation above: report only the lengths of the secrets, never their values
+print_info "Environment configuration:"
+print_info "  HF_TOKEN length: ${#HF_TOKEN}"
+print_info "  DEEPGRAM_API_KEY length: ${#DEEPGRAM_API_KEY}"
 
 # Export variables early so docker compose can use them
 export HF_TOKEN

diff --git a/extras/speaker-recognition/laptop_client.py → ...aker-recognition/scripts/laptop_client.py b/extras/speaker-recognition/laptop_client.py → ...aker-recognition/scripts/laptop_client.py