From 2dc7a31e4d865e36cadeb721fe0aecb269fbdcf5 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 20 Feb 2026 02:37:52 +0000 Subject: [PATCH 1/2] feat: add crates/transcript with TranscriptAccumulator Co-Authored-By: yujonglee --- Cargo.toml | 1 + crates/transcript/Cargo.toml | 15 + crates/transcript/src/accumulator/channel.rs | 52 ++ crates/transcript/src/accumulator/mod.rs | 507 ++++++++++++++++++ crates/transcript/src/accumulator/words.rs | 511 +++++++++++++++++++ crates/transcript/src/lib.rs | 1 + 6 files changed, 1087 insertions(+) create mode 100644 crates/transcript/Cargo.toml create mode 100644 crates/transcript/src/accumulator/channel.rs create mode 100644 crates/transcript/src/accumulator/mod.rs create mode 100644 crates/transcript/src/accumulator/words.rs create mode 100644 crates/transcript/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index e7ef294d5b..8efe59a387 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -108,6 +108,7 @@ hypr-transcribe-deepgram = { path = "crates/transcribe-deepgram", package = "tra hypr-transcribe-openai = { path = "crates/transcribe-openai", package = "transcribe-openai" } hypr-transcribe-proxy = { path = "crates/transcribe-proxy", package = "transcribe-proxy" } hypr-transcribe-whisper-local = { path = "crates/transcribe-whisper-local", package = "transcribe-whisper-local" } +hypr-transcript = { path = "crates/transcript", package = "transcript" } hypr-turso = { path = "crates/turso", package = "turso" } hypr-vad = { path = "crates/vad", package = "vad" } hypr-vad-ext = { path = "crates/vad-ext", package = "vad-ext" } diff --git a/crates/transcript/Cargo.toml b/crates/transcript/Cargo.toml new file mode 100644 index 0000000000..9fd1d63589 --- /dev/null +++ b/crates/transcript/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "transcript" +version = "0.1.0" +edition = "2024" + +[dev-dependencies] +hypr-data = { workspace = true } +serde_json = { workspace = true } + +[dependencies] +owhisper-interface = { workspace = true } + +serde = { workspace = true } +specta = { workspace = true } +uuid = { workspace = true, features = ["v4"] } diff --git a/crates/transcript/src/accumulator/channel.rs b/crates/transcript/src/accumulator/channel.rs new file mode 100644 index 0000000000..10fc6785ad --- /dev/null +++ b/crates/transcript/src/accumulator/channel.rs @@ -0,0 +1,52 @@ +use super::words::{ + RawWord, SpeakerHint, TranscriptWord, dedup, finalize_words, splice, stitch, strip_overlap, +}; + +pub(super) struct ChannelState { + watermark: i64, + held: Option, + partials: Vec, +} + +impl ChannelState { + pub(super) fn new() -> Self { + Self { + watermark: 0, + held: None, + partials: Vec::new(), + } + } + + pub(super) fn partials(&self) -> &[RawWord] { + &self.partials + } + + pub(super) fn apply_final( + &mut self, + words: Vec, + ) -> (Vec, Vec) { + let response_end = words.last().map_or(0, |w| w.end_ms); + let new_words: Vec<_> = dedup(words, self.watermark); + + if new_words.is_empty() { + return (vec![], vec![]); + } + + self.watermark = response_end; + self.partials = strip_overlap(std::mem::take(&mut self.partials), response_end); + + let (emitted, held) = stitch(self.held.take(), new_words); + self.held = held; + finalize_words(emitted) + } + + pub(super) fn apply_partial(&mut self, words: Vec) { + self.partials = splice(&self.partials, words); + } + + pub(super) fn drain(&mut self) -> (Vec, Vec) { + let mut raw: Vec<_> = self.held.take().into_iter().collect(); + raw.extend(std::mem::take(&mut self.partials)); + finalize_words(raw) + } +} diff --git a/crates/transcript/src/accumulator/mod.rs b/crates/transcript/src/accumulator/mod.rs new file mode 100644 index 0000000000..847da5c244 --- /dev/null +++ b/crates/transcript/src/accumulator/mod.rs @@ -0,0 +1,507 @@ +//! # Transcript-as-Oracle Accumulator +//! +//! The transcript string in each ASR response is the **sole source of truth** +//! for word boundaries. Tokens are sub-word fragments with timing metadata; +//! the transcript tells us which fragments belong to the same word. +//! +//! ## Two-level design +//! +//! **Within a response** — `assemble` aligns tokens to the transcript via +//! `spacing_from_transcript`. A space in the transcript means "new word"; +//! no space means "same word." No timing heuristics. +//! +//! **Across responses** — `stitch` handles the one case where no transcript +//! spans both sides: when a provider splits a word across two final responses +//! (e.g. Korean particles like "시스템" + "을" → "시스템을"). This uses a +//! timing-based heuristic because no cross-response transcript exists. + +mod channel; +mod words; + +use std::collections::BTreeMap; + +use owhisper_interface::stream::StreamResponse; + +pub use words::{PartialWord, SpeakerHint, TranscriptUpdate, TranscriptWord}; + +use channel::ChannelState; +use words::{assemble, ensure_space_prefix_partial}; + +/// Accumulates streaming ASR responses into clean, deduplicated transcript data. +/// +/// Each `process` call returns a `TranscriptUpdate` with: +/// - `new_final_words`: words that became final since the last update (ready to persist) +/// - `speaker_hints`: speaker associations for the newly finalized words +/// - `partial_words`: current in-progress words across all channels (for live display) +/// +/// Call `flush` at session end to drain any held/partial words that were never finalized. +pub struct TranscriptAccumulator { + channels: BTreeMap, +} + +impl TranscriptAccumulator { + pub fn new() -> Self { + Self { + channels: BTreeMap::new(), + } + } + + pub fn process(&mut self, response: &StreamResponse) -> Option { + let (is_final, channel, channel_index) = match response { + StreamResponse::TranscriptResponse { + is_final, + channel, + channel_index, + .. + } => (*is_final, channel, channel_index), + _ => return None, + }; + + let alt = channel.alternatives.first()?; + if alt.words.is_empty() && alt.transcript.is_empty() { + return None; + } + + let ch = channel_index.first().copied().unwrap_or(0); + let words = assemble(&alt.words, &alt.transcript, ch); + if words.is_empty() { + return None; + } + + let state = self.channels.entry(ch).or_insert_with(ChannelState::new); + + let (new_final_words, speaker_hints) = if is_final { + state.apply_final(words) + } else { + state.apply_partial(words); + (vec![], vec![]) + }; + + Some(TranscriptUpdate { + new_final_words, + speaker_hints, + partial_words: self.all_partials(), + }) + } + + pub fn flush(&mut self) -> TranscriptUpdate { + let mut new_final_words = Vec::new(); + let mut speaker_hints = Vec::new(); + + for state in self.channels.values_mut() { + let (words, hints) = state.drain(); + new_final_words.extend(words); + speaker_hints.extend(hints); + } + + TranscriptUpdate { + new_final_words, + speaker_hints, + partial_words: vec![], + } + } + + fn all_partials(&self) -> Vec { + let mut partials: Vec = self + .channels + .values() + .flat_map(|state| state.partials().iter().map(|w| w.to_partial())) + .collect(); + + if let Some(first) = partials.first_mut() { + ensure_space_prefix_partial(first); + } + + partials + } +} + +impl Default for TranscriptAccumulator { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use owhisper_interface::stream::{Alternatives, Channel, Metadata, ModelInfo}; + + fn raw_word( + text: &str, + start: f64, + end: f64, + speaker: Option, + ) -> owhisper_interface::stream::Word { + owhisper_interface::stream::Word { + word: text.to_string(), + start, + end, + confidence: 1.0, + speaker, + punctuated_word: Some(text.to_string()), + language: None, + } + } + + fn response( + words: &[(&str, f64, f64, Option)], + transcript: &str, + is_final: bool, + channel_idx: i32, + ) -> StreamResponse { + StreamResponse::TranscriptResponse { + start: 0.0, + duration: 0.0, + is_final, + speech_final: is_final, + from_finalize: false, + channel: Channel { + alternatives: vec![Alternatives { + transcript: transcript.to_string(), + words: words + .iter() + .map(|&(t, s, e, sp)| raw_word(t, s, e, sp)) + .collect(), + confidence: 1.0, + languages: vec![], + }], + }, + metadata: Metadata { + request_id: String::new(), + model_info: ModelInfo { + name: String::new(), + version: String::new(), + arch: String::new(), + }, + model_uuid: String::new(), + extra: None, + }, + channel_index: vec![channel_idx], + } + } + + fn partial(words: &[(&str, f64, f64)], transcript: &str) -> StreamResponse { + let ws: Vec<_> = words.iter().map(|&(t, s, e)| (t, s, e, None)).collect(); + response(&ws, transcript, false, 0) + } + + fn finalize(words: &[(&str, f64, f64)], transcript: &str) -> StreamResponse { + let ws: Vec<_> = words.iter().map(|&(t, s, e)| (t, s, e, None)).collect(); + response(&ws, transcript, true, 0) + } + + fn finalize_with_speakers( + words: &[(&str, f64, f64, Option)], + transcript: &str, + ) -> StreamResponse { + response(words, transcript, true, 0) + } + + fn replay(responses: &[StreamResponse]) -> Vec { + let mut acc = TranscriptAccumulator::new(); + let mut words = Vec::new(); + + for r in responses { + if let Some(update) = acc.process(r) { + words.extend(update.new_final_words); + } + } + + words.extend(acc.flush().new_final_words); + words + } + + fn assert_valid_output(words: &[TranscriptWord]) { + assert!(!words.is_empty(), "must produce words"); + + assert!( + words.iter().all(|w| !w.id.is_empty()), + "all words must have IDs" + ); + + let ids: std::collections::HashSet<_> = words.iter().map(|w| &w.id).collect(); + assert_eq!(ids.len(), words.len(), "IDs must be unique"); + + for w in words { + assert!( + !w.text.trim().is_empty(), + "word text must not be blank: {w:?}" + ); + assert!( + w.text.starts_with(' '), + "word must start with space: {:?}", + w.text + ); + } + + for ch in words + .iter() + .map(|w| w.channel) + .collect::>() + { + let cw: Vec<_> = words.iter().filter(|w| w.channel == ch).collect(); + assert!( + cw.windows(2).all(|w| w[0].start_ms <= w[1].start_ms), + "channel {ch} must be chronological" + ); + } + } + + #[test] + fn partial_update_exposes_current_words() { + let mut acc = TranscriptAccumulator::new(); + + let update = acc + .process(&partial( + &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)], + " Hello world", + )) + .unwrap(); + + assert!(update.new_final_words.is_empty()); + assert_eq!(update.partial_words.len(), 2); + assert_eq!( + update + .partial_words + .iter() + .map(|w| &w.text) + .collect::>(), + [" Hello", " world"] + ); + } + + #[test] + fn partial_splices_into_existing_window() { + let mut acc = TranscriptAccumulator::new(); + + acc.process(&partial( + &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)], + " Hello world", + )); + + let update = acc + .process(&partial( + &[ + (" Hello", 0.1, 0.5), + (" world", 0.6, 0.9), + (" today", 1.0, 1.3), + ], + " Hello world today", + )) + .unwrap(); + + assert_eq!(update.partial_words.len(), 3); + assert_eq!( + update + .partial_words + .iter() + .map(|w| &w.text) + .collect::>(), + [" Hello", " world", " today"] + ); + } + + #[test] + fn final_emits_prefix_and_holds_last() { + let mut acc = TranscriptAccumulator::new(); + + acc.process(&partial( + &[(" Hello", 0.1, 0.5), (" world", 0.55, 0.9)], + " Hello world", + )); + + let update = acc + .process(&finalize( + &[(" Hello", 0.1, 0.5), (" world", 0.55, 0.9)], + " Hello world", + )) + .unwrap(); + + assert_eq!(update.new_final_words.len(), 1); + assert_eq!(update.new_final_words[0].text, " Hello"); + assert!(update.partial_words.is_empty()); + + let flushed = acc.flush(); + assert_eq!(flushed.new_final_words.len(), 1); + assert_eq!(flushed.new_final_words[0].text, " world"); + } + + #[test] + fn final_deduplicates_repeated_response() { + let mut acc = TranscriptAccumulator::new(); + + let r = finalize( + &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)], + " Hello world", + ); + + let first = acc.process(&r).unwrap(); + let second = acc.process(&r).unwrap(); + + assert!(!first.new_final_words.is_empty()); + assert!(second.new_final_words.is_empty()); + } + + #[test] + fn final_clears_overlapping_partials() { + let mut acc = TranscriptAccumulator::new(); + + acc.process(&partial( + &[ + (" Hello", 0.1, 0.5), + (" world", 0.6, 1.0), + (" test", 1.1, 1.5), + ], + " Hello world test", + )); + + let update = acc + .process(&finalize( + &[(" Hello", 0.1, 0.5), (" world", 0.6, 1.0)], + " Hello world", + )) + .unwrap(); + + assert_eq!(update.partial_words.len(), 1); + assert_eq!(update.partial_words[0].text, " test"); + } + + #[test] + fn all_final_words_have_ids() { + let mut acc = TranscriptAccumulator::new(); + + let update = acc + .process(&finalize( + &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)], + " Hello world", + )) + .unwrap(); + + assert!(update.new_final_words.iter().all(|w| !w.id.is_empty())); + + let flushed = acc.flush(); + assert!(flushed.new_final_words.iter().all(|w| !w.id.is_empty())); + } + + #[test] + fn flush_drains_held_word() { + let mut acc = TranscriptAccumulator::new(); + + acc.process(&finalize( + &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)], + " Hello world", + )); + + let flushed = acc.flush(); + + assert_eq!(flushed.new_final_words.len(), 1); + assert_eq!(flushed.new_final_words[0].text, " world"); + assert!(!flushed.new_final_words[0].id.is_empty()); + } + + #[test] + fn flush_drains_partials_beyond_final_range() { + let mut acc = TranscriptAccumulator::new(); + + acc.process(&partial(&[(" later", 5.0, 5.5)], " later")); + + acc.process(&finalize( + &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)], + " Hello world", + )); + + let flushed = acc.flush(); + + let texts: Vec<_> = flushed.new_final_words.iter().map(|w| &w.text).collect(); + assert!( + texts.contains(&&" world".to_string()) || texts.contains(&&" later".to_string()), + "flush must drain held + partials: {texts:?}" + ); + assert!(flushed.new_final_words.iter().all(|w| !w.id.is_empty())); + } + + #[test] + fn flush_on_empty_accumulator_is_empty() { + let mut acc = TranscriptAccumulator::new(); + let flushed = acc.flush(); + assert!(flushed.new_final_words.is_empty()); + assert!(flushed.partial_words.is_empty()); + assert!(flushed.speaker_hints.is_empty()); + } + + #[test] + fn non_transcript_responses_produce_no_update() { + let mut acc = TranscriptAccumulator::new(); + let ignored = StreamResponse::TerminalResponse { + request_id: "r".into(), + created: "now".into(), + duration: 1.0, + channels: 1, + }; + assert!(acc.process(&ignored).is_none()); + } + + #[test] + fn speaker_hints_extracted_from_final_words() { + let mut acc = TranscriptAccumulator::new(); + + let update = acc + .process(&finalize_with_speakers( + &[(" Hello", 0.1, 0.5, Some(0)), (" world", 0.6, 0.9, Some(1))], + " Hello world", + )) + .unwrap(); + + assert_eq!(update.new_final_words.len(), 1); + assert_eq!(update.speaker_hints.len(), 1); + assert_eq!(update.speaker_hints[0].speaker_index, 0); + assert_eq!( + update.speaker_hints[0].word_id, + update.new_final_words[0].id + ); + + let flushed = acc.flush(); + assert_eq!(flushed.new_final_words.len(), 1); + assert_eq!(flushed.speaker_hints.len(), 1); + assert_eq!(flushed.speaker_hints[0].speaker_index, 1); + } + + #[test] + fn no_speaker_hints_when_speaker_is_none() { + let mut acc = TranscriptAccumulator::new(); + + let update = acc + .process(&finalize( + &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)], + " Hello world", + )) + .unwrap(); + + assert!(update.speaker_hints.is_empty()); + } + + macro_rules! fixture_test { + ($test_name:ident, $json:expr) => { + #[test] + fn $test_name() { + let responses: Vec = + serde_json::from_str($json).expect("fixture must parse as StreamResponse[]"); + assert_valid_output(&replay(&responses)); + } + }; + } + + fixture_test!( + deepgram_fixture_produces_valid_output, + hypr_data::english_1::DEEPGRAM_JSON + ); + fixture_test!( + soniox_fixture_produces_valid_output, + hypr_data::english_1::SONIOX_JSON + ); + fixture_test!( + soniox_korean_fixture_produces_valid_output, + hypr_data::korean_1::SONIOX_JSON + ); +} diff --git a/crates/transcript/src/accumulator/words.rs b/crates/transcript/src/accumulator/words.rs new file mode 100644 index 0000000000..49d9238c8f --- /dev/null +++ b/crates/transcript/src/accumulator/words.rs @@ -0,0 +1,511 @@ +use owhisper_interface::stream::Word; +use uuid::Uuid; + +// ── Public output types ───────────────────────────────────────────────────── + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, specta::Type)] +pub struct TranscriptWord { + pub id: String, + pub text: String, + pub start_ms: i64, + pub end_ms: i64, + pub channel: i32, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, specta::Type)] +pub struct PartialWord { + pub text: String, + pub start_ms: i64, + pub end_ms: i64, + pub channel: i32, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, specta::Type)] +pub struct SpeakerHint { + pub word_id: String, + pub speaker_index: i32, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, specta::Type)] +pub struct TranscriptUpdate { + pub new_final_words: Vec, + pub speaker_hints: Vec, + pub partial_words: Vec, +} + +// ── Internal pipeline type ────────────────────────────────────────────────── + +#[derive(Debug, Clone)] +pub(super) struct RawWord { + pub(super) text: String, + pub(super) start_ms: i64, + pub(super) end_ms: i64, + pub(super) channel: i32, + pub(super) speaker: Option, +} + +impl RawWord { + pub(super) fn to_final(self, id: String) -> (TranscriptWord, Option) { + let hint = self.speaker.map(|speaker_index| SpeakerHint { + word_id: id.clone(), + speaker_index, + }); + let word = TranscriptWord { + id, + text: self.text, + start_ms: self.start_ms, + end_ms: self.end_ms, + channel: self.channel, + }; + (word, hint) + } + + pub(super) fn to_partial(&self) -> PartialWord { + PartialWord { + text: self.text.clone(), + start_ms: self.start_ms, + end_ms: self.end_ms, + channel: self.channel, + } + } +} + +// ── Assembly ───────────────────────────────────────────────────────────────── + +/// Assemble raw ASR tokens into merged `RawWord`s. +/// +/// The transcript string is the **sole oracle** for word boundaries within a +/// single response. `spacing_from_transcript` aligns each token to the +/// transcript; a space prefix means "new word", no space means "same word." +/// Adjacent tokens without a space prefix are unconditionally merged — +/// no timing heuristics. +pub(super) fn assemble(raw: &[Word], transcript: &str, channel: i32) -> Vec { + let spaced = spacing_from_transcript(raw, transcript); + let mut result: Vec = Vec::new(); + + for (w, text) in raw.iter().zip(&spaced) { + let start_ms = (w.start * 1000.0).round() as i64; + let end_ms = (w.end * 1000.0).round() as i64; + + let should_merge = !text.starts_with(' ') && result.last().is_some(); + + if should_merge { + let last = result.last_mut().unwrap(); + last.text.push_str(text); + last.end_ms = end_ms; + if last.speaker.is_none() { + last.speaker = w.speaker; + } + } else { + result.push(RawWord { + text: text.clone(), + start_ms, + end_ms, + channel, + speaker: w.speaker, + }); + } + } + + result +} + +/// Align each token to the transcript string and recover its spacing. +/// +/// The transcript is the oracle: if a token is found in the transcript, the +/// whitespace between the previous match and this one is prepended verbatim. +/// If a token cannot be found (ASR/transcript mismatch), a space is forced +/// so it becomes a separate word — "unknown = word boundary." +fn spacing_from_transcript(raw: &[Word], transcript: &str) -> Vec { + let mut result = Vec::with_capacity(raw.len()); + let mut pos = 0; + + for w in raw { + let text = w.punctuated_word.as_deref().unwrap_or(&w.word); + let trimmed = text.trim(); + + if trimmed.is_empty() { + result.push(text.to_string()); + continue; + } + + match transcript[pos..].find(trimmed) { + Some(found) => { + let abs = pos + found; + result.push(format!("{}{trimmed}", &transcript[pos..abs])); + pos = abs + trimmed.len(); + } + None => { + let mut fallback = text.to_string(); + if !fallback.starts_with(' ') { + fallback.insert(0, ' '); + } + result.push(fallback); + } + } + } + + result +} + +// ── Pipeline stages ────────────────────────────────────────────────────────── + +/// Drop words already covered by the watermark (deduplication). +pub(super) fn dedup(words: Vec, watermark: i64) -> Vec { + words + .into_iter() + .skip_while(|w| w.end_ms <= watermark) + .collect() +} + +/// Cross-response word boundary handling — the one place where a timing +/// heuristic is unavoidable, because no transcript spans both responses. +/// +/// Holds back the last word of each finalized batch so it can be merged +/// with the first word of the next batch if the provider split a word +/// across responses (common with Korean particles, contractions, etc.). +pub(super) fn stitch( + held: Option, + mut words: Vec, +) -> (Vec, Option) { + if words.is_empty() { + return (held.into_iter().collect(), None); + } + + if let Some(h) = held { + if should_stitch(&h, &words[0]) { + words[0] = merge_words(h, words[0].clone()); + } else { + words.insert(0, h); + } + } + + let new_held = words.pop(); + (words, new_held) +} + +/// Replace the time range covered by `incoming` within `existing`. +pub(super) fn splice(existing: &[RawWord], incoming: Vec) -> Vec { + let first_start = incoming.first().map_or(0, |w| w.start_ms); + let last_end = incoming.last().map_or(0, |w| w.end_ms); + + existing + .iter() + .filter(|w| w.end_ms <= first_start) + .cloned() + .chain(incoming) + .chain(existing.iter().filter(|w| w.start_ms >= last_end).cloned()) + .collect() +} + +/// Remove partials that overlap with the finalized time range. +pub(super) fn strip_overlap(partials: Vec, final_end: i64) -> Vec { + partials + .into_iter() + .filter(|w| w.start_ms > final_end) + .collect() +} + +// ── Word-level transforms ──────────────────────────────────────────────────── + +pub(super) fn ensure_space_prefix_raw(w: &mut RawWord) { + if !w.text.starts_with(' ') { + w.text.insert(0, ' '); + } +} + +pub(super) fn ensure_space_prefix_partial(w: &mut PartialWord) { + if !w.text.starts_with(' ') { + w.text.insert(0, ' '); + } +} + +fn should_stitch(tail: &RawWord, head: &RawWord) -> bool { + !head.text.starts_with(' ') && (head.start_ms - tail.end_ms) <= 300 +} + +fn merge_words(mut left: RawWord, right: RawWord) -> RawWord { + left.text.push_str(&right.text); + left.end_ms = right.end_ms; + if left.speaker.is_none() { + left.speaker = right.speaker; + } + left +} + +/// Convert a list of RawWords into finalized TranscriptWords + SpeakerHints. +/// Assigns UUIDs, ensures space prefixes, and extracts speaker data. +pub(super) fn finalize_words(mut words: Vec) -> (Vec, Vec) { + words.iter_mut().for_each(ensure_space_prefix_raw); + + let mut final_words = Vec::with_capacity(words.len()); + let mut hints = Vec::new(); + + for w in words { + let id = Uuid::new_v4().to_string(); + let (word, hint) = w.to_final(id); + final_words.push(word); + if let Some(h) = hint { + hints.push(h); + } + } + + (final_words, hints) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn raw_word(text: &str, start: f64, end: f64) -> Word { + Word { + word: text.to_string(), + start, + end, + confidence: 1.0, + speaker: None, + punctuated_word: Some(text.to_string()), + language: None, + } + } + + fn word(text: &str, start_ms: i64, end_ms: i64) -> RawWord { + RawWord { + text: text.to_string(), + start_ms, + end_ms, + channel: 0, + speaker: None, + } + } + + // ── spacing_from_transcript ────────────────────────────────────────── + + #[test] + fn spacing_recovered_from_transcript() { + let raw = vec![raw_word("Hello", 0.0, 0.5), raw_word("world", 0.6, 1.0)]; + let spaced = spacing_from_transcript(&raw, " Hello world"); + assert_eq!(spaced, [" Hello", " world"]); + } + + #[test] + fn spacing_forces_word_boundary_on_unfound_token() { + let raw = vec![raw_word("Hello", 0.0, 0.5)]; + let spaced = spacing_from_transcript(&raw, "completely different"); + assert_eq!(spaced, [" Hello"]); + } + + #[test] + fn spacing_preserves_no_space_at_transcript_start() { + let raw = vec![raw_word("기", 0.0, 0.1), raw_word("간", 0.2, 0.3)]; + let spaced = spacing_from_transcript(&raw, "기간"); + assert_eq!(spaced, ["기", "간"]); + } + + // ── assemble ───────────────────────────────────────────────────────── + + #[test] + fn assemble_merges_attached_punctuation() { + let raw = vec![raw_word(" Hello", 0.0, 0.5), raw_word("'s", 0.51, 0.6)]; + let words = assemble(&raw, " Hello's", 0); + assert_eq!(words.len(), 1); + assert_eq!(words[0].text, " Hello's"); + assert_eq!(words[0].end_ms, 600); + } + + #[test] + fn assemble_does_not_merge_spaced_tokens() { + let raw = vec![raw_word(" Hello", 0.0, 0.5), raw_word(" world", 0.51, 1.0)]; + let words = assemble(&raw, " Hello world", 0); + assert_eq!(words.len(), 2); + } + + #[test] + fn assemble_separates_unfound_tokens() { + let raw = vec![raw_word("Hello", 0.0, 0.5), raw_word("world", 0.51, 0.6)]; + let words = assemble(&raw, "completely different text", 0); + assert_eq!(words.len(), 2); + assert!(words[0].text.starts_with(' ')); + assert!(words[1].text.starts_with(' ')); + } + + #[test] + fn assemble_merges_cjk_syllables_with_large_gap() { + let raw = vec![ + raw_word("있는", 0.0, 0.3), + raw_word("데", 0.54, 0.66), + raw_word(",", 0.84, 0.9), + ]; + let words = assemble(&raw, " 있는데,", 0); + assert_eq!( + words.len(), + 1, + "syllables in same CJK word must merge: {words:?}" + ); + assert_eq!(words[0].text, " 있는데,"); + assert_eq!(words[0].end_ms, 900); + } + + #[test] + fn assemble_splits_cjk_words_at_transcript_space_boundary() { + let raw = vec![ + raw_word("있는", 0.0, 0.3), + raw_word("데", 0.54, 0.66), + raw_word("학습", 1.0, 1.3), + raw_word("과", 1.54, 1.66), + ]; + let words = assemble(&raw, " 있는데 학습과", 0); + assert_eq!( + words.len(), + 2, + "space in transcript must split words: {words:?}" + ); + assert_eq!(words[0].text, " 있는데"); + assert_eq!(words[1].text, " 학습과"); + } + + // ── dedup ──────────────────────────────────────────────────────────── + + #[test] + fn dedup_drops_words_at_or_before_watermark() { + let words = vec![ + word(" a", 0, 100), + word(" b", 100, 200), + word(" c", 200, 300), + ]; + let result = dedup(words, 200); + assert_eq!(result.len(), 1); + assert_eq!(result[0].text, " c"); + } + + #[test] + fn dedup_keeps_all_when_watermark_is_zero() { + let words = vec![word(" a", 0, 100), word(" b", 100, 200)]; + let result = dedup(words, 0); + assert_eq!(result.len(), 2); + } + + #[test] + fn dedup_returns_empty_when_all_covered() { + let words = vec![word(" a", 0, 100), word(" b", 100, 200)]; + let result = dedup(words, 200); + assert!(result.is_empty()); + } + + // ── stitch ─────────────────────────────────────────────────────────── + + #[test] + fn stitch_no_held_holds_last() { + let ws = vec![word(" Hello", 0, 500), word(" world", 600, 1000)]; + let (emitted, held) = stitch(None, ws); + assert_eq!(emitted.len(), 1); + assert_eq!(emitted[0].text, " Hello"); + assert_eq!(held.unwrap().text, " world"); + } + + #[test] + fn stitch_merges_spaceless_adjacent_head() { + let held = word(" Hello", 0, 500); + let ws = vec![word("'s", 550, 700)]; + let (emitted, held) = stitch(Some(held), ws); + assert!(emitted.is_empty()); + assert_eq!(held.unwrap().text, " Hello's"); + } + + #[test] + fn stitch_separates_spaced_head() { + let held = word(" Hello", 0, 500); + let ws = vec![word(" world", 600, 1000)]; + let (emitted, held) = stitch(Some(held), ws); + assert_eq!(emitted.len(), 1); + assert_eq!(emitted[0].text, " Hello"); + assert_eq!(held.unwrap().text, " world"); + } + + #[test] + fn stitch_separates_distant_spaceless_head() { + let held = word(" Hello", 0, 500); + let ws = vec![word("world", 1000, 1500)]; + let (emitted, held) = stitch(Some(held), ws); + assert_eq!(emitted.len(), 1); + assert_eq!(emitted[0].text, " Hello"); + assert_eq!(held.unwrap().text, "world"); + } + + #[test] + fn stitch_empty_batch_releases_held() { + let held = word(" Hello", 0, 500); + let (emitted, held) = stitch(Some(held), vec![]); + assert_eq!(emitted.len(), 1); + assert!(held.is_none()); + } + + #[test] + fn stitch_single_word_batch_yields_no_emission() { + let ws = vec![word(" Hello", 0, 500)]; + let (emitted, held) = stitch(None, ws); + assert!(emitted.is_empty()); + assert_eq!(held.unwrap().text, " Hello"); + } + + // ── splice ─────────────────────────────────────────────────────────── + + #[test] + fn splice_replaces_overlapping_range() { + let existing = vec![ + word(" a", 0, 100), + word(" b", 100, 200), + word(" c", 300, 400), + ]; + let incoming = vec![word(" B", 100, 200), word(" new", 200, 300)]; + let result = splice(&existing, incoming); + assert_eq!( + result.iter().map(|w| &w.text[..]).collect::>(), + [" a", " B", " new", " c"] + ); + } + + #[test] + fn splice_appends_when_no_overlap() { + let existing = vec![word(" a", 0, 100)]; + let incoming = vec![word(" b", 200, 300)]; + let result = splice(&existing, incoming); + assert_eq!(result.len(), 2); + } + + #[test] + fn splice_full_replacement() { + let existing = vec![word(" a", 0, 100), word(" b", 100, 200)]; + let incoming = vec![ + word(" x", 0, 100), + word(" y", 100, 200), + word(" z", 200, 300), + ]; + let result = splice(&existing, incoming); + assert_eq!( + result.iter().map(|w| &w.text[..]).collect::>(), + [" x", " y", " z"] + ); + } + + // ── strip_overlap ──────────────────────────────────────────────────── + + #[test] + fn strip_overlap_removes_covered_partials() { + let partials = vec![ + word(" a", 0, 100), + word(" b", 100, 200), + word(" c", 300, 400), + ]; + let result = strip_overlap(partials, 200); + assert_eq!(result.len(), 1); + assert_eq!(result[0].text, " c"); + } + + #[test] + fn strip_overlap_keeps_all_beyond_range() { + let partials = vec![word(" a", 300, 400), word(" b", 400, 500)]; + let result = strip_overlap(partials, 200); + assert_eq!(result.len(), 2); + } +} diff --git a/crates/transcript/src/lib.rs b/crates/transcript/src/lib.rs new file mode 100644 index 0000000000..2d5b405aa4 --- /dev/null +++ b/crates/transcript/src/lib.rs @@ -0,0 +1 @@ +pub mod accumulator; From e7eb1ce85cd9d246f9da3d3e660a2bbf62bd8ca8 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 20 Feb 2026 02:37:57 +0000 Subject: [PATCH 2/2] chore: update Cargo.lock for transcript crate Co-Authored-By: yujonglee --- Cargo.lock | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index f6c4e39d80..9d0c8880df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19816,6 +19816,18 @@ dependencies = [ "ws-utils", ] +[[package]] +name = "transcript" +version = "0.1.0" +dependencies = [ + "data", + "owhisper-interface", + "serde", + "serde_json", + "specta", + "uuid", +] + [[package]] name = "transpose" version = "0.2.3"