Skip to content

Commit 7c098c7

Browse files
committed
Update ASR service capabilities and improve speaker identification handling
- Modified the capabilities of the VibeVoice ASR provider to include 'speaker_identification' and 'long_form', enhancing its feature set.
- Adjusted the speaker identification logic in the VibeVoiceTranscriber to prevent double-prefixing and ensure accurate speaker representation.
- Updated protocol tests to reflect the expanded list of known ASR capabilities, ensuring comprehensive validation of reported features.
1 parent 150dd51 commit 7c098c7

File tree

3 files changed

+16
-7
lines changed

3 files changed

+16
-7
lines changed

extras/asr-services/init.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
"default_model": "microsoft/VibeVoice-ASR",
3838
"service": "vibevoice-asr",
3939
# Note: VibeVoice provides diarization but NOT word_timestamps
40-
"capabilities": ["segments", "diarization", "timestamps"],
40+
"capabilities": ["timestamps", "diarization", "speaker_identification", "long_form"],
4141
},
4242
"faster-whisper": {
4343
"name": "Faster-Whisper",

extras/asr-services/providers/vibevoice/transcriber.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def _parse_vibevoice_output(self, raw_output: str) -> dict:
288288
"text": seg.get("Content", ""),
289289
"start": float(seg.get("Start", 0.0)),
290290
"end": float(seg.get("End", 0.0)),
291-
"speaker": f"Speaker {seg.get('Speaker', 0)}",
291+
"speaker": seg.get("Speaker", 0),
292292
})
293293

294294
return {"raw_text": raw_output, "segments": segments}
@@ -317,8 +317,13 @@ def _map_to_result(self, processed: dict, raw_output: str) -> TranscriptionResul
317317
start = seg_data.get("start_time", seg_data.get("start", 0.0))
318318
end = seg_data.get("end_time", seg_data.get("end", 0.0))
319319
speaker_raw = seg_data.get("speaker_id", seg_data.get("speaker"))
320-
# Convert speaker to string (VibeVoice returns int)
321-
speaker_id = f"Speaker {speaker_raw}" if speaker_raw is not None else None
320+
# Convert speaker to string, avoiding double-prefix from fallback parser
321+
if speaker_raw is None:
322+
speaker_id = None
323+
elif isinstance(speaker_raw, str) and speaker_raw.startswith("Speaker "):
324+
speaker_id = speaker_raw
325+
else:
326+
speaker_id = f"Speaker {speaker_raw}"
322327

323328
if text:
324329
text_parts.append(text)

tests/asr/protocol_tests.robot

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,13 +158,17 @@ ASR Capabilities Format Is Valid List
158158

159159
ASR Capabilities Are From Known Set
160160
[Documentation] Verify reported capabilities are valid known capabilities
161-
... Known capabilities: timestamps, word_timestamps, segments, diarization
161+
... Known capabilities: timestamps, word_timestamps, diarization,
162+
... speaker_identification, long_form, language_detection, vad_filter,
163+
... translation, chunked_processing
162164
[Tags] infra
163165
164166
${info}= Get ASR Service Info ${ASR_URL}
165167

166-
# Define known capabilities
167-
@{known_caps}= Create List timestamps word_timestamps segments diarization
168+
# Define known capabilities (union of all provider capabilities + mock server)
169+
@{known_caps}= Create List timestamps word_timestamps diarization
170+
... segments speaker_identification long_form language_detection
171+
... vad_filter translation chunked_processing
168172

169173
# All reported capabilities should be known
170174
FOR ${cap} IN @{info}[capabilities]

0 commit comments

Comments
 (0)