Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions .github/workflows/local_stt_e2e.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# End-to-end test of the local speech-to-text path (transcribe-cactus).
# Runs on manual dispatch, and on pushes/PRs that touch the cactus crates.
on:
  workflow_dispatch:
  push:
    branches: [main]
    paths:
      - crates/transcribe-cactus/**
      - crates/cactus/**
      - crates/cactus-sys/**
  pull_request:
    paths:
      - crates/transcribe-cactus/**
      - crates/cactus/**
      - crates/cactus-sys/**

jobs:
  local-stt-e2e:
    # ARM runner: matches the on-device target the cactus backend is built for.
    runs-on: depot-ubuntu-24.04-arm-8
    strategy:
      matrix:
        model:
          - name: whisper-small
            repo: openai/whisper-small
    defaults:
      run:
        shell: bash
    steps:
      # Submodules are required: vendor/cactus is consumed below.
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: ./.github/actions/rust_install
        with:
          platform: linux
      # Native build prerequisites for the cactus C++ bindings.
      - run: |
          sudo apt-get update
          sudo apt-get install -y cmake build-essential libcurl4-openssl-dev libclang-dev
      # Install the vendored cactus CLI (provides `cactus download`).
      - run: |
          pip3 install --break-system-packages huggingface-hub
          pip3 install --break-system-packages -e vendor/cactus/python/ --no-deps
      # Cache downloaded model weights keyed by model name; bump `-v1` to invalidate.
      - uses: actions/cache@v4
        with:
          path: vendor/cactus/weights/
          key: cactus-models-${{ matrix.model.name }}-arm-v1
      # NOTE(review): runs even on a cache hit — presumably a no-op when weights
      # already exist; confirm, or guard with the cache step's `cache-hit` output.
      - run: cactus download ${{ matrix.model.repo }}
      # The e2e tests are `#[ignore]`d by default; enable them explicitly here.
      - run: cargo test -p transcribe-cactus -- --ignored --nocapture
        env:
          CACTUS_STT_MODEL: ${{ github.workspace }}/vendor/cactus/weights/${{ matrix.model.name }}
          CACTUS_CLOUD_API_KEY: ${{ secrets.CACTUS_CLOUD_API_KEY }}
          E2E_AUDIO_SECS: 20
17 changes: 17 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions crates/cactus/src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use crate::error::{Error, Result};
pub struct Model {
handle: NonNull<std::ffi::c_void>,
inference_lock: Mutex<()>,
is_moonshine: bool,
}

unsafe impl Send for Model {}
Expand Down Expand Up @@ -39,9 +40,16 @@ impl ModelBuilder {
let handle =
NonNull::new(raw).ok_or_else(|| Error::Init("cactus_init returned null".into()))?;

let is_moonshine = self
.model_path
.to_string_lossy()
.to_lowercase()
.contains("moonshine");

Ok(Model {
handle,
inference_lock: Mutex::new(()),
is_moonshine,
})
}
}
Expand All @@ -57,6 +65,10 @@ impl Model {
Self::builder(model_path).build()
}

pub fn is_moonshine(&self) -> bool {
self.is_moonshine
}

/// Cancel an in-progress inference. Safe to call concurrently — only sets an
/// atomic flag on the C++ side.
pub fn stop(&self) {
Expand Down
7 changes: 6 additions & 1 deletion crates/cactus/src/stt/batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@ impl Model {
options: &TranscribeOptions,
) -> Result<TranscriptionResult> {
let guard = self.lock_inference();
let prompt_c = CString::new(build_whisper_prompt(options))?;
let prompt = if self.is_moonshine() {
String::new()
} else {
build_whisper_prompt(options)
};
let prompt_c = CString::new(prompt)?;
let options_c = CString::new(serde_json::to_string(options)?)?;
let mut buf = vec![0u8; RESPONSE_BUF_SIZE];

Expand Down
9 changes: 9 additions & 0 deletions crates/transcribe-cactus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,14 @@ tower = { workspace = true }
tracing = { workspace = true }

[dev-dependencies]
# Workspace crates used only by the integration / e2e tests.
hypr-audio-utils = { workspace = true }
hypr-cactus = { workspace = true }
hypr-data = { workspace = true }

# Test infrastructure: in-process websocket server (axum) plus a
# tungstenite client to exercise the streaming endpoint end to end.
axum = { workspace = true, features = ["ws"] }
futures-util = { workspace = true }
reqwest = { workspace = true, features = ["json"] }
sequential-test = "0.2"
serde_json = { workspace = true }
tokio = { workspace = true }
tokio-tungstenite = { workspace = true }
4 changes: 2 additions & 2 deletions crates/transcribe-cactus/src/config.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#[derive(Clone, Debug)]
pub struct CactusConfig {
pub cloud_handoff: bool,
pub cloud: hypr_cactus::CloudConfig,
pub min_chunk_sec: f32,
}

impl Default for CactusConfig {
fn default() -> Self {
Self {
cloud_handoff: true,
cloud: hypr_cactus::CloudConfig::default(),
min_chunk_sec: 2.5,
}
}
Expand Down
9 changes: 7 additions & 2 deletions crates/transcribe-cactus/src/service/batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,9 +320,14 @@ mod tests {
println!("{}", alternative.transcript.trim());
println!("--- END (confidence={:.2}) ---\n", alternative.confidence);

let transcript = alternative.transcript.trim().to_lowercase();
assert!(!transcript.is_empty(), "expected non-empty transcript");
assert!(
!alternative.transcript.trim().is_empty(),
"expected non-empty transcript"
transcript.contains("maybe")
|| transcript.contains("this")
|| transcript.contains("talking"),
"transcript looks like a hallucination (got: {:?})",
transcript
);
assert!(
alternative.confidence.is_finite(),
Expand Down
121 changes: 121 additions & 0 deletions crates/transcribe-cactus/src/service/streaming/message.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,124 @@ pub(super) fn process_incoming_message(msg: &Message, channels: u8) -> IncomingM
_ => IncomingMessage::Audio(AudioExtract::Empty),
}
}

#[cfg(test)]
mod tests {
    use axum::extract::ws::Message;
    use owhisper_interface::ControlMessage;

    use super::*;

    #[test]
    fn control_message_finalize_parsed() {
        let msg = Message::Text(r#"{"type":"Finalize"}"#.into());
        let parsed = process_incoming_message(&msg, 1);
        match parsed {
            IncomingMessage::Control(ControlMessage::Finalize) => {}
            unexpected => panic!(
                "expected Finalize, got {:?}",
                std::mem::discriminant(&unexpected)
            ),
        }
    }

    #[test]
    fn control_message_keep_alive_parsed() {
        let msg = Message::Text(r#"{"type":"KeepAlive"}"#.into());
        let parsed = process_incoming_message(&msg, 1);
        match parsed {
            IncomingMessage::Control(ControlMessage::KeepAlive) => {}
            unexpected => panic!(
                "expected KeepAlive, got {:?}",
                std::mem::discriminant(&unexpected)
            ),
        }
    }

    #[test]
    fn control_message_close_stream_parsed() {
        let msg = Message::Text(r#"{"type":"CloseStream"}"#.into());
        let parsed = process_incoming_message(&msg, 1);
        match parsed {
            IncomingMessage::Control(ControlMessage::CloseStream) => {}
            unexpected => panic!(
                "expected CloseStream, got {:?}",
                std::mem::discriminant(&unexpected)
            ),
        }
    }

    #[test]
    fn audio_chunk_parsed_over_control() {
        // A serialized ListenInputChunk must win over control-message parsing.
        let chunk = owhisper_interface::ListenInputChunk::End;
        let payload = serde_json::to_string(&chunk).unwrap();
        let msg = Message::Text(payload.into());
        match process_incoming_message(&msg, 1) {
            IncomingMessage::Audio(AudioExtract::End) => {}
            unexpected => panic!(
                "expected Audio(End), got {:?}",
                std::mem::discriminant(&unexpected)
            ),
        }
    }

    #[test]
    fn close_frame_yields_end() {
        let msg = Message::Close(None);
        match process_incoming_message(&msg, 1) {
            IncomingMessage::Audio(AudioExtract::End) => {}
            unexpected => panic!(
                "expected Audio(End), got {:?}",
                std::mem::discriminant(&unexpected)
            ),
        }
    }

    #[test]
    fn binary_single_channel_yields_mono() {
        // Three little-endian i16 samples packed into a binary frame.
        let mut data = Vec::new();
        for sample in [1000i16, 2000, 3000] {
            data.extend_from_slice(&sample.to_le_bytes());
        }
        let msg = Message::Binary(data.into());
        match process_incoming_message(&msg, 1) {
            IncomingMessage::Audio(AudioExtract::Mono(decoded)) => assert!(!decoded.is_empty()),
            unexpected => panic!(
                "expected Audio(Mono), got {:?}",
                std::mem::discriminant(&unexpected)
            ),
        }
    }

    #[test]
    fn binary_dual_channel_yields_dual() {
        // Two interleaved frames: ch0 samples positive, ch1 samples negative.
        let mut data = Vec::new();
        for sample in [1000i16, -1000, 2000, -2000] {
            data.extend_from_slice(&sample.to_le_bytes());
        }
        let msg = Message::Binary(data.into());
        match process_incoming_message(&msg, 2) {
            IncomingMessage::Audio(AudioExtract::Dual { ch0, ch1 }) => {
                assert_eq!(ch0.len(), 2);
                assert_eq!(ch1.len(), 2);
                assert!(ch0[0] > 0.0);
                assert!(ch1[0] < 0.0);
            }
            unexpected => panic!(
                "expected Audio(Dual), got {:?}",
                std::mem::discriminant(&unexpected)
            ),
        }
    }

    #[test]
    fn dual_audio_json_yields_dual() {
        // DualAudio JSON carries mic/speaker byte buffers; both become channels.
        let chunk = owhisper_interface::ListenInputChunk::DualAudio {
            mic: vec![0x00, 0x10],
            speaker: vec![0x00, 0x20],
        };
        let payload = serde_json::to_string(&chunk).unwrap();
        let msg = Message::Text(payload.into());
        match process_incoming_message(&msg, 1) {
            IncomingMessage::Audio(AudioExtract::Dual { .. }) => {}
            unexpected => panic!(
                "expected Audio(Dual), got {:?}",
                std::mem::discriminant(&unexpected)
            ),
        }
    }
}
3 changes: 0 additions & 3 deletions crates/transcribe-cactus/src/service/streaming/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@ mod message;
mod response;
mod session;

#[cfg(test)]
mod tests;

use std::{
future::Future,
path::PathBuf,
Expand Down
Loading
Loading