Update ASR service capabilities and improve speaker identification handling

AnkushMalaker · AnkushMalaker · commit 7c098c71eecf · 2026-02-06T19:41:20.000Z
- Modified the capabilities of the VibeVoice ASR provider: added 'speaker_identification' and 'long_form', and removed 'segments'.
- Adjusted speaker handling in the VibeVoiceTranscriber to prevent double-prefixing: _parse_vibevoice_output now returns the raw speaker value, and _map_to_result adds the "Speaker " prefix only when the value does not already start with it.
- Updated protocol tests to reflect the expanded list of known ASR capabilities, ensuring comprehensive validation of reported features.
diff --git a/extras/asr-services/init.py b/extras/asr-services/init.py
@@ -37,7 +37,7 @@
         "default_model": "microsoft/VibeVoice-ASR",
         "service": "vibevoice-asr",
         # Note: VibeVoice provides diarization but NOT word_timestamps
-        "capabilities": ["segments", "diarization", "timestamps"],
+        "capabilities": ["timestamps", "diarization", "speaker_identification", "long_form"],
     },
     "faster-whisper": {
         "name": "Faster-Whisper",
diff --git a/extras/asr-services/providers/vibevoice/transcriber.py b/extras/asr-services/providers/vibevoice/transcriber.py
@@ -288,7 +288,7 @@ def _parse_vibevoice_output(self, raw_output: str) -> dict:
                     "text": seg.get("Content", ""),
                     "start": float(seg.get("Start", 0.0)),
                     "end": float(seg.get("End", 0.0)),
-                    "speaker": f"Speaker {seg.get('Speaker', 0)}",
+                    "speaker": seg.get("Speaker", 0),
                 })
 
             return {"raw_text": raw_output, "segments": segments}
@@ -317,8 +317,13 @@ def _map_to_result(self, processed: dict, raw_output: str) -> TranscriptionResul
             start = seg_data.get("start_time", seg_data.get("start", 0.0))
             end = seg_data.get("end_time", seg_data.get("end", 0.0))
             speaker_raw = seg_data.get("speaker_id", seg_data.get("speaker"))
-            # Convert speaker to string (VibeVoice returns int)
-            speaker_id = f"Speaker {speaker_raw}" if speaker_raw is not None else None
+            # Convert speaker to string, avoiding double-prefix from fallback parser
+            if speaker_raw is None:
+                speaker_id = None
+            elif isinstance(speaker_raw, str) and speaker_raw.startswith("Speaker "):
+                speaker_id = speaker_raw
+            else:
+                speaker_id = f"Speaker {speaker_raw}"
 
             if text:
                 text_parts.append(text)
diff --git a/tests/asr/protocol_tests.robot b/tests/asr/protocol_tests.robot
@@ -158,13 +158,17 @@ ASR Capabilities Format Is Valid List
 
 ASR Capabilities Are From Known Set
     [Documentation]    Verify reported capabilities are valid known capabilities
-    ...                Known capabilities: timestamps, word_timestamps, segments, diarization
+    ...                Known capabilities: timestamps, word_timestamps, segments, diarization,
+    ...                speaker_identification, long_form, language_detection, vad_filter,
+    ...                translation, chunked_processing
     [Tags]    infra
 
     ${info}=    Get ASR Service Info    ${ASR_URL}
 
-    # Define known capabilities
-    @{known_caps}=    Create List    timestamps    word_timestamps    segments    diarization
+    # Define known capabilities (union of all provider capabilities + mock server)
+    @{known_caps}=    Create List    timestamps    word_timestamps    diarization
+    ...    segments    speaker_identification    long_form    language_detection
+    ...    vad_filter    translation    chunked_processing
 
     # All reported capabilities should be known
     FOR    ${cap}    IN    @{info}[capabilities]