From 2dc7a31e4d865e36cadeb721fe0aecb269fbdcf5 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 20 Feb 2026 02:37:52 +0000
Subject: [PATCH 1/2] feat: add crates/transcript with TranscriptAccumulator

Co-Authored-By: yujonglee <yujonglee.dev@gmail.com>
---
 Cargo.toml                                   |   1 +
 crates/transcript/Cargo.toml                 |  15 +
 crates/transcript/src/accumulator/channel.rs |  52 ++
 crates/transcript/src/accumulator/mod.rs     | 507 ++++++++++++++++++
 crates/transcript/src/accumulator/words.rs   | 511 +++++++++++++++++++
 crates/transcript/src/lib.rs                 |   1 +
 6 files changed, 1087 insertions(+)
 create mode 100644 crates/transcript/Cargo.toml
 create mode 100644 crates/transcript/src/accumulator/channel.rs
 create mode 100644 crates/transcript/src/accumulator/mod.rs
 create mode 100644 crates/transcript/src/accumulator/words.rs
 create mode 100644 crates/transcript/src/lib.rs
diff --git a/Cargo.toml b/Cargo.toml
index e7ef294d5b..8efe59a387 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -108,6 +108,7 @@ hypr-transcribe-deepgram = { path = "crates/transcribe-deepgram", package = "tra
 hypr-transcribe-openai = { path = "crates/transcribe-openai", package = "transcribe-openai" }
 hypr-transcribe-proxy = { path = "crates/transcribe-proxy", package = "transcribe-proxy" }
 hypr-transcribe-whisper-local = { path = "crates/transcribe-whisper-local", package = "transcribe-whisper-local" }
+hypr-transcript = { path = "crates/transcript", package = "transcript" }
 hypr-turso = { path = "crates/turso", package = "turso" }
 hypr-vad = { path = "crates/vad", package = "vad" }
 hypr-vad-ext = { path = "crates/vad-ext", package = "vad-ext" }
diff --git a/crates/transcript/Cargo.toml b/crates/transcript/Cargo.toml
new file mode 100644
index 0000000000..9fd1d63589
--- /dev/null
+++ b/crates/transcript/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "transcript"
+version = "0.1.0"
+edition = "2024"
+
+[dev-dependencies]
+hypr-data = { workspace = true }
+serde_json = { workspace = true }
+
+[dependencies]
+owhisper-interface = { workspace = true }
+
+serde = { workspace = true }
+specta = { workspace = true }
+uuid = { workspace = true, features = ["v4"] }
diff --git a/crates/transcript/src/accumulator/channel.rs b/crates/transcript/src/accumulator/channel.rs
new file mode 100644
index 0000000000..10fc6785ad
--- /dev/null
+++ b/crates/transcript/src/accumulator/channel.rs
@@ -0,0 +1,52 @@
+use super::words::{
+    RawWord, SpeakerHint, TranscriptWord, dedup, finalize_words, splice, stitch, strip_overlap,
+};
+
+pub(super) struct ChannelState {
+    watermark: i64,
+    held: Option<RawWord>,
+    partials: Vec<RawWord>,
+}
+
+impl ChannelState {
+    pub(super) fn new() -> Self {
+        Self {
+            watermark: 0,
+            held: None,
+            partials: Vec::new(),
+        }
+    }
+
+    pub(super) fn partials(&self) -> &[RawWord] {
+        &self.partials
+    }
+
+    pub(super) fn apply_final(
+        &mut self,
+        words: Vec<RawWord>,
+    ) -> (Vec<TranscriptWord>, Vec<SpeakerHint>) {
+        let response_end = words.last().map_or(0, |w| w.end_ms);
+        let new_words: Vec<_> = dedup(words, self.watermark);
+
+        if new_words.is_empty() {
+            return (vec![], vec![]);
+        }
+
+        self.watermark = response_end;
+        self.partials = strip_overlap(std::mem::take(&mut self.partials), response_end);
+
+        let (emitted, held) = stitch(self.held.take(), new_words);
+        self.held = held;
+        finalize_words(emitted)
+    }
+
+    pub(super) fn apply_partial(&mut self, words: Vec<RawWord>) {
+        self.partials = splice(&self.partials, words);
+    }
+
+    pub(super) fn drain(&mut self) -> (Vec<TranscriptWord>, Vec<SpeakerHint>) {
+        let mut raw: Vec<_> = self.held.take().into_iter().collect();
+        raw.extend(std::mem::take(&mut self.partials));
+        finalize_words(raw)
+    }
+}
diff --git a/crates/transcript/src/accumulator/mod.rs b/crates/transcript/src/accumulator/mod.rs
new file mode 100644
index 0000000000..847da5c244
--- /dev/null
+++ b/crates/transcript/src/accumulator/mod.rs
@@ -0,0 +1,507 @@
+//! # Transcript-as-Oracle Accumulator
+//!
+//! The transcript string in each ASR response is the **sole source of truth**
+//! for word boundaries. Tokens are sub-word fragments with timing metadata;
+//! the transcript tells us which fragments belong to the same word.
+//!
+//! ## Two-level design
+//!
+//! **Within a response** — `assemble` aligns tokens to the transcript via
+//! `spacing_from_transcript`. A space in the transcript means "new word";
+//! no space means "same word." No timing heuristics.
+//!
+//! **Across responses** — `stitch` handles the one case where no transcript
+//! spans both sides: when a provider splits a word across two final responses
+//! (e.g. Korean particles like "시스템" + "을" → "시스템을"). This uses a
+//! timing-based heuristic because no cross-response transcript exists.
+
+mod channel;
+mod words;
+
+use std::collections::BTreeMap;
+
+use owhisper_interface::stream::StreamResponse;
+
+pub use words::{PartialWord, SpeakerHint, TranscriptUpdate, TranscriptWord};
+
+use channel::ChannelState;
+use words::{assemble, ensure_space_prefix_partial};
+
+/// Accumulates streaming ASR responses into clean, deduplicated transcript data.
+///
+/// Each `process` call returns a `TranscriptUpdate` with:
+/// - `new_final_words`: words that became final since the last update (ready to persist)
+/// - `speaker_hints`: speaker associations for the newly finalized words
+/// - `partial_words`: current in-progress words across all channels (for live display)
+///
+/// Call `flush` at session end to drain any held/partial words that were never finalized.
+pub struct TranscriptAccumulator {
+    channels: BTreeMap<i32, ChannelState>,
+}
+
+impl TranscriptAccumulator {
+    pub fn new() -> Self {
+        Self {
+            channels: BTreeMap::new(),
+        }
+    }
+
+    pub fn process(&mut self, response: &StreamResponse) -> Option<TranscriptUpdate> {
+        let (is_final, channel, channel_index) = match response {
+            StreamResponse::TranscriptResponse {
+                is_final,
+                channel,
+                channel_index,
+                ..
+            } => (*is_final, channel, channel_index),
+            _ => return None,
+        };
+
+        let alt = channel.alternatives.first()?;
+        if alt.words.is_empty() && alt.transcript.is_empty() {
+            return None;
+        }
+
+        let ch = channel_index.first().copied().unwrap_or(0);
+        let words = assemble(&alt.words, &alt.transcript, ch);
+        if words.is_empty() {
+            return None;
+        }
+
+        let state = self.channels.entry(ch).or_insert_with(ChannelState::new);
+
+        let (new_final_words, speaker_hints) = if is_final {
+            state.apply_final(words)
+        } else {
+            state.apply_partial(words);
+            (vec![], vec![])
+        };
+
+        Some(TranscriptUpdate {
+            new_final_words,
+            speaker_hints,
+            partial_words: self.all_partials(),
+        })
+    }
+
+    pub fn flush(&mut self) -> TranscriptUpdate {
+        let mut new_final_words = Vec::new();
+        let mut speaker_hints = Vec::new();
+
+        for state in self.channels.values_mut() {
+            let (words, hints) = state.drain();
+            new_final_words.extend(words);
+            speaker_hints.extend(hints);
+        }
+
+        TranscriptUpdate {
+            new_final_words,
+            speaker_hints,
+            partial_words: vec![],
+        }
+    }
+
+    fn all_partials(&self) -> Vec<PartialWord> {
+        let mut partials: Vec<PartialWord> = self
+            .channels
+            .values()
+            .flat_map(|state| state.partials().iter().map(|w| w.to_partial()))
+            .collect();
+
+        if let Some(first) = partials.first_mut() {
+            ensure_space_prefix_partial(first);
+        }
+
+        partials
+    }
+}
+
+impl Default for TranscriptAccumulator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use owhisper_interface::stream::{Alternatives, Channel, Metadata, ModelInfo};
+
+    fn raw_word(
+        text: &str,
+        start: f64,
+        end: f64,
+        speaker: Option<i32>,
+    ) -> owhisper_interface::stream::Word {
+        owhisper_interface::stream::Word {
+            word: text.to_string(),
+            start,
+            end,
+            confidence: 1.0,
+            speaker,
+            punctuated_word: Some(text.to_string()),
+            language: None,
+        }
+    }
+
+    fn response(
+        words: &[(&str, f64, f64, Option<i32>)],
+        transcript: &str,
+        is_final: bool,
+        channel_idx: i32,
+    ) -> StreamResponse {
+        StreamResponse::TranscriptResponse {
+            start: 0.0,
+            duration: 0.0,
+            is_final,
+            speech_final: is_final,
+            from_finalize: false,
+            channel: Channel {
+                alternatives: vec![Alternatives {
+                    transcript: transcript.to_string(),
+                    words: words
+                        .iter()
+                        .map(|&(t, s, e, sp)| raw_word(t, s, e, sp))
+                        .collect(),
+                    confidence: 1.0,
+                    languages: vec![],
+                }],
+            },
+            metadata: Metadata {
+                request_id: String::new(),
+                model_info: ModelInfo {
+                    name: String::new(),
+                    version: String::new(),
+                    arch: String::new(),
+                },
+                model_uuid: String::new(),
+                extra: None,
+            },
+            channel_index: vec![channel_idx],
+        }
+    }
+
+    fn partial(words: &[(&str, f64, f64)], transcript: &str) -> StreamResponse {
+        let ws: Vec<_> = words.iter().map(|&(t, s, e)| (t, s, e, None)).collect();
+        response(&ws, transcript, false, 0)
+    }
+
+    fn finalize(words: &[(&str, f64, f64)], transcript: &str) -> StreamResponse {
+        let ws: Vec<_> = words.iter().map(|&(t, s, e)| (t, s, e, None)).collect();
+        response(&ws, transcript, true, 0)
+    }
+
+    fn finalize_with_speakers(
+        words: &[(&str, f64, f64, Option<i32>)],
+        transcript: &str,
+    ) -> StreamResponse {
+        response(words, transcript, true, 0)
+    }
+
+    fn replay(responses: &[StreamResponse]) -> Vec<TranscriptWord> {
+        let mut acc = TranscriptAccumulator::new();
+        let mut words = Vec::new();
+
+        for r in responses {
+            if let Some(update) = acc.process(r) {
+                words.extend(update.new_final_words);
+            }
+        }
+
+        words.extend(acc.flush().new_final_words);
+        words
+    }
+
+    fn assert_valid_output(words: &[TranscriptWord]) {
+        assert!(!words.is_empty(), "must produce words");
+
+        assert!(
+            words.iter().all(|w| !w.id.is_empty()),
+            "all words must have IDs"
+        );
+
+        let ids: std::collections::HashSet<_> = words.iter().map(|w| &w.id).collect();
+        assert_eq!(ids.len(), words.len(), "IDs must be unique");
+
+        for w in words {
+            assert!(
+                !w.text.trim().is_empty(),
+                "word text must not be blank: {w:?}"
+            );
+            assert!(
+                w.text.starts_with(' '),
+                "word must start with space: {:?}",
+                w.text
+            );
+        }
+
+        for ch in words
+            .iter()
+            .map(|w| w.channel)
+            .collect::<std::collections::BTreeSet<_>>()
+        {
+            let cw: Vec<_> = words.iter().filter(|w| w.channel == ch).collect();
+            assert!(
+                cw.windows(2).all(|w| w[0].start_ms <= w[1].start_ms),
+                "channel {ch} must be chronological"
+            );
+        }
+    }
+
+    #[test]
+    fn partial_update_exposes_current_words() {
+        let mut acc = TranscriptAccumulator::new();
+
+        let update = acc
+            .process(&partial(
+                &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)],
+                " Hello world",
+            ))
+            .unwrap();
+
+        assert!(update.new_final_words.is_empty());
+        assert_eq!(update.partial_words.len(), 2);
+        assert_eq!(
+            update
+                .partial_words
+                .iter()
+                .map(|w| &w.text)
+                .collect::<Vec<_>>(),
+            [" Hello", " world"]
+        );
+    }
+
+    #[test]
+    fn partial_splices_into_existing_window() {
+        let mut acc = TranscriptAccumulator::new();
+
+        acc.process(&partial(
+            &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)],
+            " Hello world",
+        ));
+
+        let update = acc
+            .process(&partial(
+                &[
+                    (" Hello", 0.1, 0.5),
+                    (" world", 0.6, 0.9),
+                    (" today", 1.0, 1.3),
+                ],
+                " Hello world today",
+            ))
+            .unwrap();
+
+        assert_eq!(update.partial_words.len(), 3);
+        assert_eq!(
+            update
+                .partial_words
+                .iter()
+                .map(|w| &w.text)
+                .collect::<Vec<_>>(),
+            [" Hello", " world", " today"]
+        );
+    }
+
+    #[test]
+    fn final_emits_prefix_and_holds_last() {
+        let mut acc = TranscriptAccumulator::new();
+
+        acc.process(&partial(
+            &[(" Hello", 0.1, 0.5), (" world", 0.55, 0.9)],
+            " Hello world",
+        ));
+
+        let update = acc
+            .process(&finalize(
+                &[(" Hello", 0.1, 0.5), (" world", 0.55, 0.9)],
+                " Hello world",
+            ))
+            .unwrap();
+
+        assert_eq!(update.new_final_words.len(), 1);
+        assert_eq!(update.new_final_words[0].text, " Hello");
+        assert!(update.partial_words.is_empty());
+
+        let flushed = acc.flush();
+        assert_eq!(flushed.new_final_words.len(), 1);
+        assert_eq!(flushed.new_final_words[0].text, " world");
+    }
+
+    #[test]
+    fn final_deduplicates_repeated_response() {
+        let mut acc = TranscriptAccumulator::new();
+
+        let r = finalize(
+            &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)],
+            " Hello world",
+        );
+
+        let first = acc.process(&r).unwrap();
+        let second = acc.process(&r).unwrap();
+
+        assert!(!first.new_final_words.is_empty());
+        assert!(second.new_final_words.is_empty());
+    }
+
+    #[test]
+    fn final_clears_overlapping_partials() {
+        let mut acc = TranscriptAccumulator::new();
+
+        acc.process(&partial(
+            &[
+                (" Hello", 0.1, 0.5),
+                (" world", 0.6, 1.0),
+                (" test", 1.1, 1.5),
+            ],
+            " Hello world test",
+        ));
+
+        let update = acc
+            .process(&finalize(
+                &[(" Hello", 0.1, 0.5), (" world", 0.6, 1.0)],
+                " Hello world",
+            ))
+            .unwrap();
+
+        assert_eq!(update.partial_words.len(), 1);
+        assert_eq!(update.partial_words[0].text, " test");
+    }
+
+    #[test]
+    fn all_final_words_have_ids() {
+        let mut acc = TranscriptAccumulator::new();
+
+        let update = acc
+            .process(&finalize(
+                &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)],
+                " Hello world",
+            ))
+            .unwrap();
+
+        assert!(update.new_final_words.iter().all(|w| !w.id.is_empty()));
+
+        let flushed = acc.flush();
+        assert!(flushed.new_final_words.iter().all(|w| !w.id.is_empty()));
+    }
+
+    #[test]
+    fn flush_drains_held_word() {
+        let mut acc = TranscriptAccumulator::new();
+
+        acc.process(&finalize(
+            &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)],
+            " Hello world",
+        ));
+
+        let flushed = acc.flush();
+
+        assert_eq!(flushed.new_final_words.len(), 1);
+        assert_eq!(flushed.new_final_words[0].text, " world");
+        assert!(!flushed.new_final_words[0].id.is_empty());
+    }
+
+    #[test]
+    fn flush_drains_partials_beyond_final_range() {
+        let mut acc = TranscriptAccumulator::new();
+
+        acc.process(&partial(&[(" later", 5.0, 5.5)], " later"));
+
+        acc.process(&finalize(
+            &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)],
+            " Hello world",
+        ));
+
+        let flushed = acc.flush();
+
+        let texts: Vec<_> = flushed.new_final_words.iter().map(|w| &w.text).collect();
+        assert!(
+            texts.contains(&&" world".to_string()) || texts.contains(&&" later".to_string()),
+            "flush must drain held + partials: {texts:?}"
+        );
+        assert!(flushed.new_final_words.iter().all(|w| !w.id.is_empty()));
+    }
+
+    #[test]
+    fn flush_on_empty_accumulator_is_empty() {
+        let mut acc = TranscriptAccumulator::new();
+        let flushed = acc.flush();
+        assert!(flushed.new_final_words.is_empty());
+        assert!(flushed.partial_words.is_empty());
+        assert!(flushed.speaker_hints.is_empty());
+    }
+
+    #[test]
+    fn non_transcript_responses_produce_no_update() {
+        let mut acc = TranscriptAccumulator::new();
+        let ignored = StreamResponse::TerminalResponse {
+            request_id: "r".into(),
+            created: "now".into(),
+            duration: 1.0,
+            channels: 1,
+        };
+        assert!(acc.process(&ignored).is_none());
+    }
+
+    #[test]
+    fn speaker_hints_extracted_from_final_words() {
+        let mut acc = TranscriptAccumulator::new();
+
+        let update = acc
+            .process(&finalize_with_speakers(
+                &[(" Hello", 0.1, 0.5, Some(0)), (" world", 0.6, 0.9, Some(1))],
+                " Hello world",
+            ))
+            .unwrap();
+
+        assert_eq!(update.new_final_words.len(), 1);
+        assert_eq!(update.speaker_hints.len(), 1);
+        assert_eq!(update.speaker_hints[0].speaker_index, 0);
+        assert_eq!(
+            update.speaker_hints[0].word_id,
+            update.new_final_words[0].id
+        );
+
+        let flushed = acc.flush();
+        assert_eq!(flushed.new_final_words.len(), 1);
+        assert_eq!(flushed.speaker_hints.len(), 1);
+        assert_eq!(flushed.speaker_hints[0].speaker_index, 1);
+    }
+
+    #[test]
+    fn no_speaker_hints_when_speaker_is_none() {
+        let mut acc = TranscriptAccumulator::new();
+
+        let update = acc
+            .process(&finalize(
+                &[(" Hello", 0.1, 0.5), (" world", 0.6, 0.9)],
+                " Hello world",
+            ))
+            .unwrap();
+
+        assert!(update.speaker_hints.is_empty());
+    }
+
+    macro_rules! fixture_test {
+        ($test_name:ident, $json:expr) => {
+            #[test]
+            fn $test_name() {
+                let responses: Vec<StreamResponse> =
+                    serde_json::from_str($json).expect("fixture must parse as StreamResponse[]");
+                assert_valid_output(&replay(&responses));
+            }
+        };
+    }
+
+    fixture_test!(
+        deepgram_fixture_produces_valid_output,
+        hypr_data::english_1::DEEPGRAM_JSON
+    );
+    fixture_test!(
+        soniox_fixture_produces_valid_output,
+        hypr_data::english_1::SONIOX_JSON
+    );
+    fixture_test!(
+        soniox_korean_fixture_produces_valid_output,
+        hypr_data::korean_1::SONIOX_JSON
+    );
+}
diff --git a/crates/transcript/src/accumulator/words.rs b/crates/transcript/src/accumulator/words.rs
new file mode 100644
index 0000000000..49d9238c8f
--- /dev/null
+++ b/crates/transcript/src/accumulator/words.rs
@@ -0,0 +1,511 @@
+use owhisper_interface::stream::Word;
+use uuid::Uuid;
+
+// ── Public output types ─────────────────────────────────────────────────────
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, specta::Type)]
+pub struct TranscriptWord {
+    pub id: String,
+    pub text: String,
+    pub start_ms: i64,
+    pub end_ms: i64,
+    pub channel: i32,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, specta::Type)]
+pub struct PartialWord {
+    pub text: String,
+    pub start_ms: i64,
+    pub end_ms: i64,
+    pub channel: i32,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, specta::Type)]
+pub struct SpeakerHint {
+    pub word_id: String,
+    pub speaker_index: i32,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, specta::Type)]
+pub struct TranscriptUpdate {
+    pub new_final_words: Vec<TranscriptWord>,
+    pub speaker_hints: Vec<SpeakerHint>,
+    pub partial_words: Vec<PartialWord>,
+}
+
+// ── Internal pipeline type ──────────────────────────────────────────────────
+
+#[derive(Debug, Clone)]
+pub(super) struct RawWord {
+    pub(super) text: String,
+    pub(super) start_ms: i64,
+    pub(super) end_ms: i64,
+    pub(super) channel: i32,
+    pub(super) speaker: Option<i32>,
+}
+
+impl RawWord {
+    pub(super) fn to_final(self, id: String) -> (TranscriptWord, Option<SpeakerHint>) {
+        let hint = self.speaker.map(|speaker_index| SpeakerHint {
+            word_id: id.clone(),
+            speaker_index,
+        });
+        let word = TranscriptWord {
+            id,
+            text: self.text,
+            start_ms: self.start_ms,
+            end_ms: self.end_ms,
+            channel: self.channel,
+        };
+        (word, hint)
+    }
+
+    pub(super) fn to_partial(&self) -> PartialWord {
+        PartialWord {
+            text: self.text.clone(),
+            start_ms: self.start_ms,
+            end_ms: self.end_ms,
+            channel: self.channel,
+        }
+    }
+}
+
+// ── Assembly ─────────────────────────────────────────────────────────────────
+
+/// Assemble raw ASR tokens into merged `RawWord`s.
+///
+/// The transcript string is the **sole oracle** for word boundaries within a
+/// single response. `spacing_from_transcript` aligns each token to the
+/// transcript; a space prefix means "new word", no space means "same word."
+/// Adjacent tokens without a space prefix are unconditionally merged —
+/// no timing heuristics.
+pub(super) fn assemble(raw: &[Word], transcript: &str, channel: i32) -> Vec<RawWord> {
+    let spaced = spacing_from_transcript(raw, transcript);
+    let mut result: Vec<RawWord> = Vec::new();
+
+    for (w, text) in raw.iter().zip(&spaced) {
+        let start_ms = (w.start * 1000.0).round() as i64;
+        let end_ms = (w.end * 1000.0).round() as i64;
+
+        let should_merge = !text.starts_with(' ') && result.last().is_some();
+
+        if should_merge {
+            let last = result.last_mut().unwrap();
+            last.text.push_str(text);
+            last.end_ms = end_ms;
+            if last.speaker.is_none() {
+                last.speaker = w.speaker;
+            }
+        } else {
+            result.push(RawWord {
+                text: text.clone(),
+                start_ms,
+                end_ms,
+                channel,
+                speaker: w.speaker,
+            });
+        }
+    }
+
+    result
+}
+
+/// Align each token to the transcript string and recover its spacing.
+///
+/// The transcript is the oracle: if a token is found in the transcript, the
+/// whitespace between the previous match and this one is prepended verbatim.
+/// If a token cannot be found (ASR/transcript mismatch), a space is forced
+/// so it becomes a separate word — "unknown = word boundary."
+fn spacing_from_transcript(raw: &[Word], transcript: &str) -> Vec<String> {
+    let mut result = Vec::with_capacity(raw.len());
+    let mut pos = 0;
+
+    for w in raw {
+        let text = w.punctuated_word.as_deref().unwrap_or(&w.word);
+        let trimmed = text.trim();
+
+        if trimmed.is_empty() {
+            result.push(text.to_string());
+            continue;
+        }
+
+        match transcript[pos..].find(trimmed) {
+            Some(found) => {
+                let abs = pos + found;
+                result.push(format!("{}{trimmed}", &transcript[pos..abs]));
+                pos = abs + trimmed.len();
+            }
+            None => {
+                let mut fallback = text.to_string();
+                if !fallback.starts_with(' ') {
+                    fallback.insert(0, ' ');
+                }
+                result.push(fallback);
+            }
+        }
+    }
+
+    result
+}
+
+// ── Pipeline stages ──────────────────────────────────────────────────────────
+
+/// Drop words already covered by the watermark (deduplication).
+pub(super) fn dedup(words: Vec<RawWord>, watermark: i64) -> Vec<RawWord> {
+    words
+        .into_iter()
+        .skip_while(|w| w.end_ms <= watermark)
+        .collect()
+}
+
+/// Cross-response word boundary handling — the one place where a timing
+/// heuristic is unavoidable, because no transcript spans both responses.
+///
+/// Holds back the last word of each finalized batch so it can be merged
+/// with the first word of the next batch if the provider split a word
+/// across responses (common with Korean particles, contractions, etc.).
+pub(super) fn stitch(
+    held: Option<RawWord>,
+    mut words: Vec<RawWord>,
+) -> (Vec<RawWord>, Option<RawWord>) {
+    if words.is_empty() {
+        return (held.into_iter().collect(), None);
+    }
+
+    if let Some(h) = held {
+        if should_stitch(&h, &words[0]) {
+            words[0] = merge_words(h, words[0].clone());
+        } else {
+            words.insert(0, h);
+        }
+    }
+
+    let new_held = words.pop();
+    (words, new_held)
+}
+
+/// Replace the time range covered by `incoming` within `existing`.
+pub(super) fn splice(existing: &[RawWord], incoming: Vec<RawWord>) -> Vec<RawWord> {
+    let first_start = incoming.first().map_or(0, |w| w.start_ms);
+    let last_end = incoming.last().map_or(0, |w| w.end_ms);
+
+    existing
+        .iter()
+        .filter(|w| w.end_ms <= first_start)
+        .cloned()
+        .chain(incoming)
+        .chain(existing.iter().filter(|w| w.start_ms >= last_end).cloned())
+        .collect()
+}
+
+/// Remove partials that overlap with the finalized time range.
+pub(super) fn strip_overlap(partials: Vec<RawWord>, final_end: i64) -> Vec<RawWord> {
+    partials
+        .into_iter()
+        .filter(|w| w.start_ms > final_end)
+        .collect()
+}
+
+// ── Word-level transforms ────────────────────────────────────────────────────
+
+pub(super) fn ensure_space_prefix_raw(w: &mut RawWord) {
+    if !w.text.starts_with(' ') {
+        w.text.insert(0, ' ');
+    }
+}
+
+pub(super) fn ensure_space_prefix_partial(w: &mut PartialWord) {
+    if !w.text.starts_with(' ') {
+        w.text.insert(0, ' ');
+    }
+}
+
+fn should_stitch(tail: &RawWord, head: &RawWord) -> bool {
+    !head.text.starts_with(' ') && (head.start_ms - tail.end_ms) <= 300
+}
+
+fn merge_words(mut left: RawWord, right: RawWord) -> RawWord {
+    left.text.push_str(&right.text);
+    left.end_ms = right.end_ms;
+    if left.speaker.is_none() {
+        left.speaker = right.speaker;
+    }
+    left
+}
+
+/// Convert a list of RawWords into finalized TranscriptWords + SpeakerHints.
+/// Assigns UUIDs, ensures space prefixes, and extracts speaker data.
+pub(super) fn finalize_words(mut words: Vec<RawWord>) -> (Vec<TranscriptWord>, Vec<SpeakerHint>) {
+    words.iter_mut().for_each(ensure_space_prefix_raw);
+
+    let mut final_words = Vec::with_capacity(words.len());
+    let mut hints = Vec::new();
+
+    for w in words {
+        let id = Uuid::new_v4().to_string();
+        let (word, hint) = w.to_final(id);
+        final_words.push(word);
+        if let Some(h) = hint {
+            hints.push(h);
+        }
+    }
+
+    (final_words, hints)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn raw_word(text: &str, start: f64, end: f64) -> Word {
+        Word {
+            word: text.to_string(),
+            start,
+            end,
+            confidence: 1.0,
+            speaker: None,
+            punctuated_word: Some(text.to_string()),
+            language: None,
+        }
+    }
+
+    fn word(text: &str, start_ms: i64, end_ms: i64) -> RawWord {
+        RawWord {
+            text: text.to_string(),
+            start_ms,
+            end_ms,
+            channel: 0,
+            speaker: None,
+        }
+    }
+
+    // ── spacing_from_transcript ──────────────────────────────────────────
+
+    #[test]
+    fn spacing_recovered_from_transcript() {
+        let raw = vec![raw_word("Hello", 0.0, 0.5), raw_word("world", 0.6, 1.0)];
+        let spaced = spacing_from_transcript(&raw, " Hello world");
+        assert_eq!(spaced, [" Hello", " world"]);
+    }
+
+    #[test]
+    fn spacing_forces_word_boundary_on_unfound_token() {
+        let raw = vec![raw_word("Hello", 0.0, 0.5)];
+        let spaced = spacing_from_transcript(&raw, "completely different");
+        assert_eq!(spaced, [" Hello"]);
+    }
+
+    #[test]
+    fn spacing_preserves_no_space_at_transcript_start() {
+        let raw = vec![raw_word("기", 0.0, 0.1), raw_word("간", 0.2, 0.3)];
+        let spaced = spacing_from_transcript(&raw, "기간");
+        assert_eq!(spaced, ["기", "간"]);
+    }
+
+    // ── assemble ─────────────────────────────────────────────────────────
+
+    #[test]
+    fn assemble_merges_attached_punctuation() {
+        let raw = vec![raw_word(" Hello", 0.0, 0.5), raw_word("'s", 0.51, 0.6)];
+        let words = assemble(&raw, " Hello's", 0);
+        assert_eq!(words.len(), 1);
+        assert_eq!(words[0].text, " Hello's");
+        assert_eq!(words[0].end_ms, 600);
+    }
+
+    #[test]
+    fn assemble_does_not_merge_spaced_tokens() {
+        let raw = vec![raw_word(" Hello", 0.0, 0.5), raw_word(" world", 0.51, 1.0)];
+        let words = assemble(&raw, " Hello world", 0);
+        assert_eq!(words.len(), 2);
+    }
+
+    #[test]
+    fn assemble_separates_unfound_tokens() {
+        let raw = vec![raw_word("Hello", 0.0, 0.5), raw_word("world", 0.51, 0.6)];
+        let words = assemble(&raw, "completely different text", 0);
+        assert_eq!(words.len(), 2);
+        assert!(words[0].text.starts_with(' '));
+        assert!(words[1].text.starts_with(' '));
+    }
+
+    #[test]
+    fn assemble_merges_cjk_syllables_with_large_gap() {
+        let raw = vec![
+            raw_word("있는", 0.0, 0.3),
+            raw_word("데", 0.54, 0.66),
+            raw_word(",", 0.84, 0.9),
+        ];
+        let words = assemble(&raw, " 있는데,", 0);
+        assert_eq!(
+            words.len(),
+            1,
+            "syllables in same CJK word must merge: {words:?}"
+        );
+        assert_eq!(words[0].text, " 있는데,");
+        assert_eq!(words[0].end_ms, 900);
+    }
+
+    #[test]
+    fn assemble_splits_cjk_words_at_transcript_space_boundary() {
+        let raw = vec![
+            raw_word("있는", 0.0, 0.3),
+            raw_word("데", 0.54, 0.66),
+            raw_word("학습", 1.0, 1.3),
+            raw_word("과", 1.54, 1.66),
+        ];
+        let words = assemble(&raw, " 있는데 학습과", 0);
+        assert_eq!(
+            words.len(),
+            2,
+            "space in transcript must split words: {words:?}"
+        );
+        assert_eq!(words[0].text, " 있는데");
+        assert_eq!(words[1].text, " 학습과");
+    }
+
+    // ── dedup ────────────────────────────────────────────────────────────
+
+    #[test]
+    fn dedup_drops_words_at_or_before_watermark() {
+        let words = vec![
+            word(" a", 0, 100),
+            word(" b", 100, 200),
+            word(" c", 200, 300),
+        ];
+        let result = dedup(words, 200);
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].text, " c");
+    }
+
+    #[test]
+    fn dedup_keeps_all_when_watermark_is_zero() {
+        let words = vec![word(" a", 0, 100), word(" b", 100, 200)];
+        let result = dedup(words, 0);
+        assert_eq!(result.len(), 2);
+    }
+
+    #[test]
+    fn dedup_returns_empty_when_all_covered() {
+        let words = vec![word(" a", 0, 100), word(" b", 100, 200)];
+        let result = dedup(words, 200);
+        assert!(result.is_empty());
+    }
+
+    // ── stitch ───────────────────────────────────────────────────────────
+
+    #[test]
+    fn stitch_no_held_holds_last() {
+        let ws = vec![word(" Hello", 0, 500), word(" world", 600, 1000)];
+        let (emitted, held) = stitch(None, ws);
+        assert_eq!(emitted.len(), 1);
+        assert_eq!(emitted[0].text, " Hello");
+        assert_eq!(held.unwrap().text, " world");
+    }
+
+    #[test]
+    fn stitch_merges_spaceless_adjacent_head() {
+        let held = word(" Hello", 0, 500);
+        let ws = vec![word("'s", 550, 700)];
+        let (emitted, held) = stitch(Some(held), ws);
+        assert!(emitted.is_empty());
+        assert_eq!(held.unwrap().text, " Hello's");
+    }
+
+    #[test]
+    fn stitch_separates_spaced_head() {
+        let held = word(" Hello", 0, 500);
+        let ws = vec![word(" world", 600, 1000)];
+        let (emitted, held) = stitch(Some(held), ws);
+        assert_eq!(emitted.len(), 1);
+        assert_eq!(emitted[0].text, " Hello");
+        assert_eq!(held.unwrap().text, " world");
+    }
+
+    #[test]
+    fn stitch_separates_distant_spaceless_head() {
+        let held = word(" Hello", 0, 500);
+        let ws = vec![word("world", 1000, 1500)];
+        let (emitted, held) = stitch(Some(held), ws);
+        assert_eq!(emitted.len(), 1);
+        assert_eq!(emitted[0].text, " Hello");
+        assert_eq!(held.unwrap().text, "world");
+    }
+
+    #[test]
+    fn stitch_empty_batch_releases_held() {
+        let held = word(" Hello", 0, 500);
+        let (emitted, held) = stitch(Some(held), vec![]);
+        assert_eq!(emitted.len(), 1);
+        assert!(held.is_none());
+    }
+
+    #[test]
+    fn stitch_single_word_batch_yields_no_emission() {
+        let ws = vec![word(" Hello", 0, 500)];
+        let (emitted, held) = stitch(None, ws);
+        assert!(emitted.is_empty());
+        assert_eq!(held.unwrap().text, " Hello");
+    }
+
+    // ── splice ───────────────────────────────────────────────────────────
+
+    #[test]
+    fn splice_replaces_overlapping_range() {
+        let existing = vec![
+            word(" a", 0, 100),
+            word(" b", 100, 200),
+            word(" c", 300, 400),
+        ];
+        let incoming = vec![word(" B", 100, 200), word(" new", 200, 300)];
+        let result = splice(&existing, incoming);
+        assert_eq!(
+            result.iter().map(|w| &w.text[..]).collect::<Vec<_>>(),
+            [" a", " B", " new", " c"]
+        );
+    }
+
+    #[test]
+    fn splice_appends_when_no_overlap() {
+        let existing = vec![word(" a", 0, 100)];
+        let incoming = vec![word(" b", 200, 300)];
+        let result = splice(&existing, incoming);
+        assert_eq!(result.len(), 2);
+    }
+
+    #[test]
+    fn splice_full_replacement() {
+        let existing = vec![word(" a", 0, 100), word(" b", 100, 200)];
+        let incoming = vec![
+            word(" x", 0, 100),
+            word(" y", 100, 200),
+            word(" z", 200, 300),
+        ];
+        let result = splice(&existing, incoming);
+        assert_eq!(
+            result.iter().map(|w| &w.text[..]).collect::<Vec<_>>(),
+            [" x", " y", " z"]
+        );
+    }
+
+    // ── strip_overlap ────────────────────────────────────────────────────
+
+    #[test]
+    fn strip_overlap_removes_covered_partials() {
+        let partials = vec![
+            word(" a", 0, 100),
+            word(" b", 100, 200),
+            word(" c", 300, 400),
+        ];
+        let result = strip_overlap(partials, 200);
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].text, " c");
+    }
+
+    #[test]
+    fn strip_overlap_keeps_all_beyond_range() {
+        let partials = vec![word(" a", 300, 400), word(" b", 400, 500)];
+        let result = strip_overlap(partials, 200);
+        assert_eq!(result.len(), 2);
+    }
+}
diff --git a/crates/transcript/src/lib.rs b/crates/transcript/src/lib.rs
new file mode 100644
index 0000000000..2d5b405aa4
--- /dev/null
+++ b/crates/transcript/src/lib.rs
@@ -0,0 +1 @@
+pub mod accumulator;

From e7eb1ce85cd9d246f9da3d3e660a2bbf62bd8ca8 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 20 Feb 2026 02:37:57 +0000
Subject: [PATCH 2/2] chore: update Cargo.lock for transcript crate

Co-Authored-By: yujonglee <yujonglee.dev@gmail.com>
---
 Cargo.lock | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index f6c4e39d80..9d0c8880df 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -19816,6 +19816,18 @@ dependencies = [
  "ws-utils",
 ]
 
+[[package]]
+name = "transcript"
+version = "0.1.0"
+dependencies = [
+ "data",
+ "owhisper-interface",
+ "serde",
+ "serde_json",
+ "specta",
+ "uuid",
+]
+
 [[package]]
 name = "transpose"
 version = "0.2.3"