From 8f3df0aeafddf1e97f8e3f8ad7e0ae4ce4b9c2ed Mon Sep 17 00:00:00 2001 From: fredericbirke Date: Sat, 1 Nov 2025 22:45:16 +0100 Subject: [PATCH 1/4] Remove hard coded python versions and use > instead --- packaging/voxd.wrapper | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/packaging/voxd.wrapper b/packaging/voxd.wrapper index d1b0e111..50ddee3a 100644 --- a/packaging/voxd.wrapper +++ b/packaging/voxd.wrapper @@ -44,10 +44,12 @@ print(f"{sys.version_info.major}.{sys.version_info.minor}") PY )" log "System Python version: $ver" - case "$ver" in - 3.9|3.10|3.11|3.12|3.13) : ;; - *) - # Attempt to create a user-local venv with any newer Python found + # Check if version is >= 3.9 + IFS='.' read -r major minor <<< "$ver" + if [[ "$major" -gt 3 ]] || [[ "$major" -eq 3 && "$minor" -ge 9 ]]; then + : # version is acceptable + else + # Attempt to create a user-local venv with any newer Python found pick_python() { for c in python3.12 python3.11 python3.10 python3.9 python3; do if command -v "$c" >/dev/null 2>&1; then @@ -56,10 +58,12 @@ import sys print(f"{sys.version_info.major}.{sys.version_info.minor}") PY )" - case "$v" in - 3.9|3.10|3.11|3.12|3.13) echo "$c"; return 0 ;; - *) : ;; - esac + # Check if version is >= 3.9 + IFS='.' read -r v_major v_minor <<< "$v" + if [[ "$v_major" -gt 3 ]] || [[ "$v_major" -eq 3 && "$v_minor" -ge 9 ]]; then + echo "$c" + return 0 + fi fi done echo "" @@ -86,8 +90,7 @@ PY echo "[voxd] System Python $ver is unsupported and no newer Python was found. Use 'bash packaging/install_voxd.sh ' to provision a newer Python, or create $APPDIR/.venv with Python >= 3.9." 
>&2 exit 1 fi - ;; - esac + fi fi # Ensure Python can import the embedded source tree From cb4a89df69a05a022f456c213687c5cfc43897fd Mon Sep 17 00:00:00 2001 From: 4ellendger <4ellendger@gmail.com> Date: Tue, 30 Dec 2025 13:42:36 +0400 Subject: [PATCH 2/4] Add streaming input functionality --- README.md | 32 +- pyproject.toml | 2 +- src/voxd/__main__.py | 13 + src/voxd/cli/cli_main.py | 332 ++++++++++++++++----- src/voxd/core/config.py | 7 + src/voxd/core/recorder.py | 89 +++++- src/voxd/core/streaming_core.py | 217 ++++++++++++++ src/voxd/core/streaming_transcriber.py | 395 +++++++++++++++++++++++++ src/voxd/core/transcriber.py | 4 - src/voxd/core/typer.py | 84 ++++++ src/voxd/gui/gui_main.py | 8 +- src/voxd/tray/tray_main.py | 6 +- src/voxd/utils/libw.py | 7 +- tests/conftest.py | 29 ++ 14 files changed, 1120 insertions(+), 105 deletions(-) create mode 100644 src/voxd/core/streaming_core.py create mode 100644 src/voxd/core/streaming_transcriber.py diff --git a/README.md b/README.md index 8291ea08..89f985ec 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,8 @@ Hit your **hotkey shortcut** -> speak -> hotk | Feature | Notes | | -------------------------------- | ----------------------------------------------------------------------- | | **Whisper.cpp** backend | Local, offline, fast ASR. | -| **Simulated typing** | instantly types straight into any currently focused input window. Even on Wayland! (*ydotool*). | +| **Streaming transcription** | Real-time incremental typing as you speak. Text appears word-by-word, not after recording stops. | +| **Simulated typing** | Instantly types straight into any currently focused input window. Even on Wayland! (*ydotool*). | | **Clipboard** | Auto-copies into clipboard - ready for pasting, if desired | | **Languages** | 99+ languages. Provides default language config and session language override | | **AIPP**, AI Post-Processing | AI-rewriting via local or cloud LLMs. GUI prompt editor. 
| @@ -138,9 +139,25 @@ Leave VOXD running in the background -> go to any app where you want to voice-ty | Press hotkey … | VOXD does … | | ---------------- | ----------------------------------------------------------- | | **First press** | start recording | -| **Second press** | stop ⇢ [transcribe ⇢ copy to clipboard] ⇢ types the output into any focused app | +| **Second press** | stop ⇢ [finalize transcription ⇢ copy to clipboard] ⇢ types any remaining output into any focused app | -Otherwise, if in --flux (beta), **just speak**. +### 🎙️ Streaming Mode (Default) + +VOXD uses **streaming transcription** by default, which means: + +- **Real-time typing**: Text appears incrementally as you speak, not after you stop recording +- **Chunk-based processing**: Audio is processed in overlapping chunks (default: 3 seconds) for continuous transcription +- **Incremental updates**: Text is typed word-by-word or phrase-by-phrase as it's transcribed (typically every 2 seconds or 3 words) +- **Seamless experience**: You see your words appear in real-time, making it feel like natural voice-typing + +**How it works:** +1. Press hotkey to start → VOXD begins recording and transcribing +2. As you speak → Text appears incrementally in your focused application +3. Press hotkey again → Finalizes any remaining transcription and copies to clipboard + +This streaming behavior is enabled by default in CLI (`voxd`), GUI (`voxd --gui`), and Tray (`voxd --tray`) modes. Setting `streaming_enabled: false` in the config restores the previous "record-then-transcribe" behavior in the CLI. + +**Note:** If in `--flux` mode (beta), **just speak** - no hotkey needed, voice activity detection triggers recording automatically. 
### Autostart For practical reasons (always ready to type & low system footprint), it is advised to enable voxd user daemon: @@ -307,6 +324,15 @@ llamacpp_server_timeout: 30 # Selected models per provider (automatically updated by VOXD) aipp_selected_models: llamacpp_server: "qwen2.5-3b-instruct-q4_k_m" + +# Streaming transcription settings (default: enabled) +streaming_enabled: true # Enable/disable streaming mode +streaming_chunk_seconds: 3.0 # Audio chunk size in seconds (default: 3.0) +streaming_overlap_seconds: 0.5 # Overlap between chunks in seconds (default: 0.5) +streaming_emit_interval_seconds: 2.0 # Minimum time between text updates (default: 2.0) +streaming_emit_word_count: 3 # Minimum words before emitting text (default: 3) +streaming_typing_delay: 0.01 # Delay between typed characters in streaming mode (default: 0.01) +streaming_min_chars_to_type: 3 # Minimum characters before typing incremental text (default: 3) ``` --- diff --git a/pyproject.toml b/pyproject.toml index 9132d542..756ecc65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "voxd" -version = "mr.batman" # bump manually on releases +version = "1.7.0" description = "Voice-typing helper powered by whisper.cpp" authors = [{ name = "Jakov", email = "jakov.iv@proton.me" }] requires-python = ">=3.9" diff --git a/src/voxd/__main__.py b/src/voxd/__main__.py index b7c445f0..9b8e4611 100644 --- a/src/voxd/__main__.py +++ b/src/voxd/__main__.py @@ -352,6 +352,12 @@ def main(): dest="lang", help="Transcription language (ISO 639-1, e.g. 
'en', 'sv', or 'auto' for detection)" ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose logging (shows detailed debug output)" + ) args, unknown = parser.parse_known_args() if args.version: @@ -419,6 +425,13 @@ def main(): sys.exit(0) cfg = AppConfig() + # Session-only override for verbosity + if args.verbose: + cfg.data["verbosity"] = True + setattr(cfg, "verbosity", True) + import os + os.environ["VOXD_VERBOSE"] = "1" + # Session-only override for language if args.lang: try: diff --git a/src/voxd/cli/cli_main.py b/src/voxd/cli/cli_main.py index e1fc9ff1..5a60c5db 100644 --- a/src/voxd/cli/cli_main.py +++ b/src/voxd/cli/cli_main.py @@ -13,6 +13,7 @@ from voxd.core.logger import SessionLogger from voxd.core.transcriber import WhisperTranscriber # type: ignore from voxd.core.aipp import get_final_text +from voxd.core.streaming_transcriber import StreamingWhisperTranscriber from voxd.utils.core_runner import AudioRecorder, ClipboardManager, SimulatedTyper from voxd.utils.ipc_server import start_ipc_server from voxd.utils.libw import verbo, verr, YELLOW, RED, RESET, ORANGE @@ -111,56 +112,134 @@ def on_ipc_trigger(): print(f"{ORANGE}Continuous mode | hotkey to rec/stop | Ctrl+C to exit\n*** You can now go to ANY other app to VOICE-TYPE - leave this active in the background ***{RESET}") else: print("Continuous mode | hotkey to rec/stop | Ctrl+C to exit\n*** You can now go to ANY other app to VOICE-TYPE - leave this active in the background ***") - # Create reusable instances outside the loop - recorder = AudioRecorder( - record_chunked=getattr(cfg, "record_chunked", True), - chunk_seconds=int(getattr(cfg, "record_chunk_seconds", 300)) - ) + preserve = bool(args.save_audio) or bool(getattr(cfg, "save_recordings", False)) - transcriber = WhisperTranscriber( - cfg.whisper_model_path, - cfg.whisper_binary, - delete_input=not preserve, - language=cfg.data.get("language", "en"), - ) clipboard = ClipboardManager() - typer = 
SimulatedTyper(delay=cfg.typing_delay, start_delay=cfg.typing_start_delay) - - try: - while True: - verbo("\n[cli] Awaiting hotkey to start recording...") - hotkey_event.clear() - hotkey_event.wait() - - recorder.start_recording() - print("Recording...") - hotkey_event.clear() - hotkey_event.wait() - verbo("[cli] Hotkey received: stopping recording.") - - rec_path = recorder.stop_recording(preserve=preserve) - verbo("[recorder] Stopping recording...") - - tscript, orig_tscript = transcriber.transcribe(rec_path) - if not tscript: - print("[core_runner] No transcript returned.") - continue - - final_text = get_final_text(tscript, cfg) # type: ignore[arg-type] - clipboard.copy(final_text) - print(f"\n📝 ---> ") + typer = SimulatedTyper( + delay=cfg.data.get("streaming_typing_delay", 0.01) if cfg.data.get("streaming_enabled", True) else cfg.typing_delay, + start_delay=cfg.typing_start_delay + ) + + use_streaming = cfg.data.get("streaming_enabled", True) + + if use_streaming: + recorder = AudioRecorder() + transcriber = StreamingWhisperTranscriber( + model_path=cfg.whisper_model_path, + binary_path=cfg.whisper_binary, + language=cfg.data.get("language", "en"), + chunk_seconds=cfg.data.get("streaming_chunk_seconds", 3.0), + overlap_seconds=cfg.data.get("streaming_overlap_seconds", 0.5), + ) + accumulated_text = "" + last_typed_text = "" + + def on_partial_text(text: str): + nonlocal accumulated_text, last_typed_text + if not text or not text.strip(): + return + min_chars = cfg.data.get("streaming_min_chars_to_type", 3) + if len(text) < min_chars: + return + new_accumulated = accumulated_text + " " + text if accumulated_text else text if cfg.typing: - typer.type(final_text) - print() - if cfg.aipp_enabled: - logger.log_entry(f"[original] {tscript}") - if final_text != tscript: - logger.log_entry(f"[aipp] {final_text}") - else: - logger.log_entry(final_text) - - except KeyboardInterrupt: - print("\n[cli] Exiting continuous recording mode...") + try: + 
typer.type_incremental(last_typed_text, new_accumulated) + last_typed_text = new_accumulated + except Exception as e: + verr(f"[cli] Incremental typing failed: {e}") + accumulated_text = new_accumulated + + transcriber.on_partial_text = on_partial_text + + try: + while True: + verbo("\n[cli] Awaiting hotkey to start recording...") + hotkey_event.clear() + hotkey_event.wait() + + print("Recording...") + accumulated_text = "" + last_typed_text = "" + + def on_audio_chunk(audio_data): + transcriber.add_audio_chunk(audio_data) + + recorder.start_streaming_recording(on_audio_chunk, chunk_seconds=cfg.data.get("streaming_chunk_seconds", 2.0)) + transcriber.start(samplerate=recorder.fs, channels=recorder.channels) + + hotkey_event.clear() + hotkey_event.wait() + verbo("[cli] Hotkey received: stopping recording.") + + recorder.stop_recording(preserve=preserve) + transcriber.stop() + final_text = transcriber.finalize() + + if not final_text: + print("[cli] No transcript returned.") + continue + + processed_text = get_final_text(final_text, cfg) + clipboard.copy(processed_text) + print(f"\n📝 ---> {processed_text}") + if cfg.aipp_enabled: + logger.log_entry(f"[original] {final_text}") + if processed_text != final_text: + logger.log_entry(f"[aipp] {processed_text}") + else: + logger.log_entry(processed_text) + except KeyboardInterrupt: + print("\n[cli] Exiting continuous recording mode...") + if recorder.is_recording: + recorder.stop_recording(preserve=preserve) + transcriber.stop() + else: + recorder = AudioRecorder( + record_chunked=getattr(cfg, "record_chunked", True), + chunk_seconds=int(getattr(cfg, "record_chunk_seconds", 300)) + ) + transcriber = WhisperTranscriber( + cfg.whisper_model_path, + cfg.whisper_binary, + delete_input=not preserve, + language=cfg.data.get("language", "en"), + ) + + try: + while True: + verbo("\n[cli] Awaiting hotkey to start recording...") + hotkey_event.clear() + hotkey_event.wait() + + recorder.start_recording() + print("Recording...") + 
hotkey_event.clear() + hotkey_event.wait() + verbo("[cli] Hotkey received: stopping recording.") + + rec_path = recorder.stop_recording(preserve=preserve) + verbo("[recorder] Stopping recording...") + + tscript, orig_tscript = transcriber.transcribe(rec_path) + if not tscript: + print("[core_runner] No transcript returned.") + continue + + final_text = get_final_text(tscript, cfg) + clipboard.copy(final_text) + print(f"\n📝 ---> ") + if cfg.typing: + typer.type(final_text) + print() + if cfg.aipp_enabled: + logger.log_entry(f"[original] {tscript}") + if final_text != tscript: + logger.log_entry(f"[aipp] {final_text}") + else: + logger.log_entry(final_text) + except KeyboardInterrupt: + print("\n[cli] Exiting continuous recording mode...") elif cmd == "l": logger.show() @@ -187,6 +266,7 @@ def on_ipc_trigger(): def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(prog="voxd", description="VOXD CLI Mode") parser.add_argument("--save-audio", action="store_true", help="Preserve audio recordings. 
If used alone, sets it persistently.") + parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging (shows detailed debug output)") # --- Quick action flags (non-interactive shortcuts) --- qa = parser.add_argument_group("Quick actions") qa.add_argument("--record", action="store_true", help="Record to ~/.local/share/voxd/recordings and exit (no transcription)") @@ -207,6 +287,14 @@ def main(): args = parser.parse_args() cfg = cast(Any, AppConfig()) + + # Session-only override for verbosity + if args.verbose: + cfg.data["verbosity"] = True + setattr(cfg, "verbosity", True) + import os + os.environ["VOXD_VERBOSE"] = "1" + # Ensure whisper-cli exists (auto-build if missing) ensure_whisper_cli("cli") logger = SessionLogger(cfg.log_enabled, cfg.log_location) @@ -270,44 +358,124 @@ def on_ipc_trigger(): print(f"{ORANGE}Continuous mode | hotkey to rec/stop | Ctrl+C to exit{RESET}") else: print("Continuous mode | hotkey to rec/stop | Ctrl+C to exit") - recorder = AudioRecorder() + preserve = bool(args.save_audio) or bool(getattr(cfg, "save_recordings", False)) - transcriber = WhisperTranscriber( - cfg.whisper_model_path, - cfg.whisper_binary, - delete_input=not preserve, - language=cfg.data.get("language", "en"), - ) clipboard = ClipboardManager() - typer = SimulatedTyper(delay=cfg.typing_delay, start_delay=cfg.typing_start_delay) - try: - while True: - verbo("\n[cli] Awaiting hotkey to start recording...") - hotkey_event.clear() - hotkey_event.wait() - recorder.start_recording() - print("Recording...") - hotkey_event.clear() - hotkey_event.wait() - verbo("[cli] Hotkey received: stopping recording.") - rec_path = recorder.stop_recording(preserve=preserve) - tscript, _ = transcriber.transcribe(rec_path) - if not tscript: - print("[cli] No transcript returned.") - continue - final_text = get_final_text(tscript, cfg) # type: ignore[arg-type] - clipboard.copy(final_text) + typer = SimulatedTyper( + delay=cfg.data.get("streaming_typing_delay", 0.01) 
if cfg.data.get("streaming_enabled", True) else cfg.typing_delay, + start_delay=cfg.typing_start_delay + ) + + use_streaming = cfg.data.get("streaming_enabled", True) + + if use_streaming: + recorder = AudioRecorder() + transcriber = StreamingWhisperTranscriber( + model_path=cfg.whisper_model_path, + binary_path=cfg.whisper_binary, + language=cfg.data.get("language", "en"), + chunk_seconds=cfg.data.get("streaming_chunk_seconds", 3.0), + overlap_seconds=cfg.data.get("streaming_overlap_seconds", 0.5), + ) + accumulated_text = "" + last_typed_text = "" + + def on_partial_text(text: str): + nonlocal accumulated_text, last_typed_text + if not text or not text.strip(): + return + min_chars = cfg.data.get("streaming_min_chars_to_type", 3) + if len(text) < min_chars: + return + new_accumulated = accumulated_text + " " + text if accumulated_text else text if cfg.typing: - typer.type(final_text) - print(f"\n📝 ---> {final_text}") - if cfg.aipp_enabled: - logger.log_entry(f"[original] {tscript}") - if final_text != tscript: - logger.log_entry(f"[aipp] {final_text}") - else: - logger.log_entry(final_text) - except KeyboardInterrupt: - print("\n[cli] Exiting continuous recording mode...") + try: + typer.type_incremental(last_typed_text, new_accumulated) + last_typed_text = new_accumulated + except Exception as e: + verr(f"[cli] Incremental typing failed: {e}") + accumulated_text = new_accumulated + + transcriber.on_partial_text = on_partial_text + + try: + while True: + verbo("\n[cli] Awaiting hotkey to start recording...") + hotkey_event.clear() + hotkey_event.wait() + + print("Recording...") + accumulated_text = "" + last_typed_text = "" + + def on_audio_chunk(audio_data): + transcriber.add_audio_chunk(audio_data) + + recorder.start_streaming_recording(on_audio_chunk, chunk_seconds=cfg.data.get("streaming_chunk_seconds", 2.0)) + transcriber.start(samplerate=recorder.fs, channels=recorder.channels) + + hotkey_event.clear() + hotkey_event.wait() + verbo("[cli] Hotkey received: 
stopping recording.") + + recorder.stop_recording(preserve=preserve) + transcriber.stop() + final_text = transcriber.finalize() + + if not final_text: + print("[cli] No transcript returned.") + continue + + processed_text = get_final_text(final_text, cfg) + clipboard.copy(processed_text) + print(f"\n📝 ---> {processed_text}") + if cfg.aipp_enabled: + logger.log_entry(f"[original] {final_text}") + if processed_text != final_text: + logger.log_entry(f"[aipp] {processed_text}") + else: + logger.log_entry(processed_text) + except KeyboardInterrupt: + print("\n[cli] Exiting continuous recording mode...") + if recorder.is_recording: + recorder.stop_recording(preserve=preserve) + transcriber.stop() + else: + recorder = AudioRecorder() + transcriber = WhisperTranscriber( + cfg.whisper_model_path, + cfg.whisper_binary, + delete_input=not preserve, + language=cfg.data.get("language", "en"), + ) + try: + while True: + verbo("\n[cli] Awaiting hotkey to start recording...") + hotkey_event.clear() + hotkey_event.wait() + recorder.start_recording() + print("Recording...") + hotkey_event.clear() + hotkey_event.wait() + verbo("[cli] Hotkey received: stopping recording.") + rec_path = recorder.stop_recording(preserve=preserve) + tscript, _ = transcriber.transcribe(rec_path) + if not tscript: + print("[cli] No transcript returned.") + continue + final_text = get_final_text(tscript, cfg) + clipboard.copy(final_text) + if cfg.typing: + typer.type(final_text) + print(f"\n📝 ---> {final_text}") + if cfg.aipp_enabled: + logger.log_entry(f"[original] {tscript}") + if final_text != tscript: + logger.log_entry(f"[aipp] {final_text}") + else: + logger.log_entry(final_text) + except KeyboardInterrupt: + print("\n[cli] Exiting continuous recording mode...") return if args.transcribe: diff --git a/src/voxd/core/config.py b/src/voxd/core/config.py index da326111..fadd61c3 100644 --- a/src/voxd/core/config.py +++ b/src/voxd/core/config.py @@ -21,6 +21,13 @@ "typing_start_delay": 0.15, 
"ctrl_v_paste": False, # Use Ctrl+V instead of default Ctrl+Shift+V "append_trailing_space": True, + # Streaming transcription mode + "streaming_enabled": True, + "streaming_chunk_seconds": 3.0, + "streaming_overlap_seconds": 0.5, + "streaming_emit_interval_seconds": 2.0, + "streaming_emit_word_count": 3, + "streaming_typing_delay": 0.01, "verbosity": False, "autostart": False, "save_recordings": False, diff --git a/src/voxd/core/recorder.py b/src/voxd/core/recorder.py index e81887f9..a494fc12 100644 --- a/src/voxd/core/recorder.py +++ b/src/voxd/core/recorder.py @@ -5,11 +5,12 @@ from pathlib import Path import tempfile from voxd.utils.libw import verbo, verr +from voxd.core.config import AppConfig +from voxd.paths import RECORDINGS_DIR class AudioRecorder: def __init__(self, samplerate=16000, channels=1, *, record_chunked: bool | None = None, chunk_seconds: int | None = None): - from voxd.core.config import AppConfig cfg = AppConfig() self.fs = samplerate self.channels = channels @@ -43,7 +44,6 @@ def start_recording(self): # Prefer configured device or PulseAudio on Linux try: - from voxd.core.config import AppConfig cfg = AppConfig() dev_pref = cfg.data.get("audio_input_device") or ("pulse" if cfg.data.get("audio_prefer_pulse", True) else None) except Exception: @@ -77,7 +77,6 @@ def _open(device, fs): return sd.InputStream(**kw) # Try preferred sample rate on preferred device; then robust fallbacks - tried_pulse = False try: self.stream = _open(dev_pref, self.fs) self.stream.start() @@ -106,10 +105,8 @@ def _open(device, fs): try: self.stream = _open("pulse", self.fs) self.stream.start() - tried_pulse = True return except Exception: - tried_pulse = True pass # Last resort: open without device hint self.stream = _open(None, self.fs) @@ -120,10 +117,14 @@ def stop_recording(self, preserve=False): return None verbo("[recorder] Stopping recording...") - self.stream.stop() - self.stream.close() + if hasattr(self, 'stream') and self.stream is not None: + 
self.stream.stop() + self.stream.close() self.is_recording = False + if hasattr(self, 'streaming_buffer'): + self.streaming_buffer = [] + if self.record_chunked and self._chunk_wave is not None: try: self._chunk_wave.close() @@ -132,8 +133,6 @@ def stop_recording(self, preserve=False): self._chunk_wave = None audio_data = None if self.record_chunked else np.concatenate(self.recording, axis=0) - - from voxd.paths import RECORDINGS_DIR if preserve: rec_dir = RECORDINGS_DIR rec_dir.mkdir(exist_ok=True) @@ -192,7 +191,7 @@ def _stitch_chunks(self, output_path: Path): except Exception as e: verr(f"[recorder] Failed to stitch chunks: {e}") raise - + def get_last_temp_file(self): return self.last_temp_file @@ -200,3 +199,73 @@ def cleanup_temp(self): if self.last_temp_file and self.last_temp_file.exists(): verbo(f"[recorder] Cleaning up temporary file {self.last_temp_file}") self.last_temp_file.unlink() + + def start_streaming_recording(self, callback, chunk_seconds: float = 3.0): + """Start streaming recording that emits audio chunks via callback. + + Args: + callback: Callable[[np.ndarray], None] - called with each audio chunk + chunk_seconds: Size of each chunk in seconds + """ + verbo("[recorder] Starting streaming recording...") + self.is_recording = True + self.streaming_callback = callback + self.streaming_chunk_frames = int(chunk_seconds * self.fs) + self.streaming_buffer = [] + + try: + cfg = AppConfig() + dev_pref = cfg.data.get("audio_input_device") or ("pulse" if cfg.data.get("audio_prefer_pulse", True) else None) + except Exception: + dev_pref = "pulse" + + # FYI: `frames` and `time` parameters are required by sounddevice. 
+ def streaming_callback(indata, frames, time, status): + if status: + verbo(f"[recorder] Warning: {status}") + if not self.is_recording: + return + + self.streaming_buffer.append(indata.copy()) + total_frames = sum(len(chunk) for chunk in self.streaming_buffer) + + if total_frames >= self.streaming_chunk_frames: + chunk = np.concatenate(self.streaming_buffer, axis=0) + self.streaming_buffer = [] + try: + verbo(f"[recorder] Emitting chunk of {len(chunk)} frames") + self.streaming_callback(chunk) + except Exception as e: + verr(f"[recorder] Streaming callback error: {e}") + + def _open(device, fs): + kw = {"samplerate": fs, "channels": self.channels, "callback": streaming_callback} + if device: + kw["device"] = device + return sd.InputStream(**kw) + + try: + self.stream = _open(dev_pref, self.fs) + self.stream.start() + except Exception as e: + verr(f"[recorder] Opening stream at {self.fs} Hz failed ({e}); trying device default rate") + try: + indev = dev_pref if dev_pref else (sd.default.device[0] if sd.default.device else None) + except Exception: + indev = None + try: + info = sd.query_devices(indev, 'input') if indev is not None else sd.query_devices(kind='input') + fallback_fs = int(info.get('default_samplerate') or 48000) + except Exception: + fallback_fs = 48000 + self.fs = fallback_fs + self.streaming_chunk_frames = int(chunk_seconds * self.fs) + if dev_pref != "pulse": + try: + self.stream = _open("pulse", self.fs) + self.stream.start() + return + except Exception: + pass + self.stream = _open(None, self.fs) + self.stream.start() diff --git a/src/voxd/core/streaming_core.py b/src/voxd/core/streaming_core.py new file mode 100644 index 00000000..99ad689a --- /dev/null +++ b/src/voxd/core/streaming_core.py @@ -0,0 +1,217 @@ +# pyright: reportMissingImports=false +from PyQt6.QtCore import QThread, pyqtSignal # type: ignore +from voxd.core.streaming_transcriber import StreamingWhisperTranscriber +from voxd.core.recorder import AudioRecorder +from voxd.core.typer 
import SimulatedTyper +from voxd.core.aipp import get_final_text +from voxd.utils.libw import verbo, verr +from voxd.utils.whisper_auto import ensure_whisper_cli +from datetime import datetime +from time import time +from pathlib import Path +import numpy as np +import psutil + + +class StreamingCoreProcessThread(QThread): + """Core process thread that orchestrates streaming recording, transcription, and typing.""" + + finished = pyqtSignal(str) + status_changed = pyqtSignal(str) + + def __init__(self, cfg, logger): + super().__init__() + self.cfg = cfg + self.logger = logger + self.should_stop = False + + self.recorder: AudioRecorder | None = None + self.transcriber: StreamingWhisperTranscriber | None = None + self.typer: SimulatedTyper | None = None + + self.accumulated_text = "" + self.last_typed_text = "" + self.last_typed_length = 0 + + def stop_recording(self): + """Stop the streaming recording.""" + self.should_stop = True + + def run(self): + """Main streaming process loop.""" + try: + transcriber = StreamingWhisperTranscriber( + model_path=self.cfg.whisper_model_path, + binary_path=self.cfg.whisper_binary, + language=getattr(self.cfg, "language", "en"), + chunk_seconds=self.cfg.data.get("streaming_chunk_seconds", 3.0), + overlap_seconds=self.cfg.data.get("streaming_overlap_seconds", 0.5), + emit_interval_seconds=self.cfg.data.get("streaming_emit_interval_seconds", 2.0), + emit_word_count=self.cfg.data.get("streaming_emit_word_count", 3), + on_partial_text=self._on_partial_text, + on_final_text=self._on_final_text, + ) + except FileNotFoundError: + if ensure_whisper_cli("gui") is None: + self.status_changed.emit("VOXD") + self.finished.emit("") + return + transcriber = StreamingWhisperTranscriber( + model_path=self.cfg.whisper_model_path, + binary_path=self.cfg.whisper_binary, + language=getattr(self.cfg, "language", "en"), + chunk_seconds=self.cfg.data.get("streaming_chunk_seconds", 3.0), + overlap_seconds=self.cfg.data.get("streaming_overlap_seconds", 
0.5), + emit_interval_seconds=self.cfg.data.get("streaming_emit_interval_seconds", 2.0), + emit_word_count=self.cfg.data.get("streaming_emit_word_count", 3), + on_partial_text=self._on_partial_text, + on_final_text=self._on_final_text, + ) + + self.transcriber = transcriber + self.typer = SimulatedTyper( + delay=self.cfg.data.get("streaming_typing_delay", 0.01), + start_delay=self.cfg.typing_start_delay, + cfg=self.cfg + ) + + recorder = AudioRecorder() + self.recorder = recorder + + rec_start_dt = datetime.now() + chunk_seconds = self.cfg.data.get("streaming_chunk_seconds", 3.0) + + self.status_changed.emit("Recording") + + def on_audio_chunk(audio_data: np.ndarray): + """Callback for audio chunks from recorder.""" + if not self.should_stop: + verbo(f"[streaming_core] Received audio chunk: {len(audio_data)} frames, {len(audio_data) / recorder.fs:.2f}s") + transcriber.add_audio_chunk(audio_data) + + recorder.start_streaming_recording(on_audio_chunk, chunk_seconds=chunk_seconds) + transcriber.start(samplerate=recorder.fs, channels=recorder.channels) + + verbo("[streaming_core] Started streaming recording and transcription") + while not self.should_stop: + self.msleep(100) + + rec_end_dt = datetime.now() + + self.status_changed.emit("Transcribing") + recorder.stop_recording(preserve=False) + transcriber.stop() + + final_text = transcriber.finalize() + + if not final_text: + self.finished.emit("") + return + + trans_start_ts = time() + trans_end_ts = time() + + aipp_start_ts = aipp_end_ts = None + processed_text = get_final_text(final_text, self.cfg) + if self.cfg.aipp_enabled and processed_text and processed_text != final_text: + aipp_start_ts = time() + aipp_end_ts = time() + + try: + if self.cfg.aipp_enabled: + self.logger.log_entry(f"[original] {final_text}") + if processed_text and processed_text != final_text: + self.logger.log_entry(f"[aipp] {processed_text}") + else: + self.logger.log_entry(processed_text) + except Exception: + pass + + # In streaming mode, 
text is already typed incrementally during recording + # Only type final text if AIPP changed it and there's a difference + if self.cfg.typing and processed_text: + # Check if AIPP modified the text + if self.cfg.aipp_enabled and processed_text != final_text: + # AIPP changed the text - type the corrected version + # But only the difference to avoid retyping everything + if processed_text.startswith(self.last_typed_text): + # Only type the new suffix + suffix = processed_text[len(self.last_typed_text):] + if suffix: + self.status_changed.emit("Typing") + try: + self.typer.type_incremental(self.last_typed_text, processed_text) + self.last_typed_text = processed_text + self.last_typed_length = len(processed_text) + except Exception as e: + print(f"[streaming_core] Final typing failed: {e}") + else: + # Text changed significantly - type the full corrected version + self.status_changed.emit("Typing") + try: + self.typer.type(processed_text) + except Exception as e: + print(f"[streaming_core] Typing failed: {e}") + # If AIPP didn't change text, it's already been typed incrementally - do nothing + print() + + if self.cfg.perf_collect: + from voxd.utils.performance import write_perf_entry + + perf_entry = { + "date": rec_start_dt.strftime("%Y-%m-%d"), + "rec_start_time": rec_start_dt.strftime("%H:%M:%S"), + "rec_end_time": rec_end_dt.strftime("%H:%M:%S"), + "rec_dur": (rec_end_dt - rec_start_dt).total_seconds(), + "trans_start_time": datetime.fromtimestamp(trans_start_ts).strftime("%H:%M:%S"), + "trans_end_time": datetime.fromtimestamp(trans_end_ts).strftime("%H:%M:%S"), + "trans_dur": trans_end_ts - trans_start_ts, + "trans_eff": (trans_end_ts - trans_start_ts) / max(len(final_text), 1), + "transcript": final_text, + "usr_trans_acc": None, + "trans_model": Path(self.cfg.whisper_model_path).name, + "aipp_start_time": datetime.fromtimestamp(aipp_start_ts).strftime("%H:%M:%S") if aipp_start_ts else None, + "aipp_end_time": 
datetime.fromtimestamp(aipp_end_ts).strftime("%H:%M:%S") if aipp_end_ts else None, + "aipp_dur": (aipp_end_ts - aipp_start_ts) if aipp_start_ts and aipp_end_ts else None, + "ai_model": self.cfg.aipp_model if self.cfg.aipp_enabled else None, + "ai_provider": self.cfg.aipp_provider if self.cfg.aipp_enabled else None, + "ai_prompt": self.cfg.aipp_active_prompt if self.cfg.aipp_enabled else None, + "ai_transcript": processed_text if self.cfg.aipp_enabled else None, + "aipp_eff": ((aipp_end_ts - aipp_start_ts) / max(len(processed_text), 1)) if self.cfg.aipp_enabled and aipp_start_ts and aipp_end_ts and processed_text else None, + "sys_mem": psutil.virtual_memory().total, + "sys_cpu": psutil.cpu_freq().max if psutil.cpu_freq() else None, + "total_dur": (trans_end_ts - trans_start_ts) + (rec_end_dt - rec_start_dt).total_seconds() + } + write_perf_entry(perf_entry) + + self.finished.emit(processed_text) + + def _on_partial_text(self, text: str): + """Handle partial text updates from transcriber.""" + if not text or not text.strip(): + return + + verbo(f"[streaming_core] Partial text received: '{text[:50]}...'") + + # The transcriber already handles spacing, so just concatenate + # Preserve any leading space that was intentionally added + if self.accumulated_text: + new_accumulated = self.accumulated_text + text + else: + new_accumulated = text + + if self.cfg.typing and self.typer: + try: + self.typer.type_incremental(self.last_typed_text, new_accumulated) + self.last_typed_text = new_accumulated + self.last_typed_length = len(new_accumulated) + verbo(f"[streaming_core] Typed incremental text, total length: {len(new_accumulated)}") + except Exception as e: + verr(f"[streaming_core] Incremental typing failed: {e}") + + self.accumulated_text = new_accumulated + + def _on_final_text(self, text: str): + """Handle final text from transcriber.""" + self.accumulated_text = text + diff --git a/src/voxd/core/streaming_transcriber.py b/src/voxd/core/streaming_transcriber.py new file 
class StreamingWhisperTranscriber:
    """Transcribe audio chunk-by-chunk and emit incremental, append-only text.

    Audio blocks are buffered by :meth:`add_audio_chunk`. Once enough audio is
    available a chunk is queued and transcribed on a background worker thread.
    Text that is new relative to the previous chunk is appended to the running
    transcript and delivered through ``on_partial_text`` as a delta to append.

    Deltas are emitted when either
    - ``emit_interval_seconds`` have passed since the last emission, or
    - at least ``emit_word_count`` words were recognised since the last emission.

    Text recognised while emission is throttled is held back and sent with the
    next emission (or by :meth:`finalize`); it is never dropped. Previously
    emitted text is never rewritten.
    """

    def __init__(
        self,
        model_path: str,
        binary_path: str,
        language: Optional[str] = None,
        chunk_seconds: float = 3.0,
        overlap_seconds: float = 0.5,
        emit_interval_seconds: float = 2.0,
        emit_word_count: int = 3,
        on_partial_text: Optional[Callable[[str], None]] = None,
        on_final_text: Optional[Callable[[str], None]] = None,
    ):
        """Create the transcriber; call :meth:`start` before feeding audio.

        Args:
            model_path: Passed through to ``WhisperTranscriber``.
            binary_path: Passed through to ``WhisperTranscriber``.
            language: Passed through to ``WhisperTranscriber``.
            chunk_seconds: Target length of each transcribed chunk.
            overlap_seconds: Audio kept from the end of a chunk and prepended
                to the next one.
            emit_interval_seconds: Time-based emission threshold.
            emit_word_count: Word-based emission threshold.
            on_partial_text: Called with each text delta to append.
            on_final_text: Called by :meth:`finalize` with the full transcript.
        """
        self.transcriber = WhisperTranscriber(
            model_path=model_path,
            binary_path=binary_path,
            delete_input=True,
            language=language,
        )
        self.chunk_seconds = chunk_seconds
        self.overlap_seconds = overlap_seconds
        self.emit_interval_seconds = emit_interval_seconds
        self.emit_word_count = emit_word_count
        self.on_partial_text = on_partial_text
        self.on_final_text = on_final_text

        self.transcription_queue = queue.Queue()
        self.worker_thread: Optional[threading.Thread] = None
        self.is_running = False
        self.audio_buffer: list[np.ndarray] = []
        self.samplerate = 16000
        self.channels = 1

        # Full transcript recognised so far (concatenation of all deltas).
        self.accumulated_text = ""
        # Transcript of the most recent chunk; new chunks are diffed against it.
        self._last_chunk_text = ""
        # Deltas recognised but not yet handed to on_partial_text (throttled).
        self._pending_text = ""
        self.last_emitted_text = ""
        self.last_emitted_time = 0.0
        self.last_emitted_word_count = 0
        self.chunk_timestamps: dict[str, float] = {}  # chunk id -> transcription start time
        self.chunk_texts: dict[str, str] = {}  # chunk id -> transcript of that chunk

        # Minimum time between chunks: 70% of chunk_seconds, never below 0.8 s.
        self.min_time_between_chunks = max(0.8, self.chunk_seconds * 0.7)

        # Fraction of chunk_frames that must be buffered before a chunk may be
        # queued because the time threshold was met (allows smaller chunks when
        # enough time has passed).
        self.min_frames_for_time_based_queue = 0.6

        # Fraction of chunk_frames below which a chunk is never queued.
        self.min_frames_to_queue = 0.5

        # Frame-based values are derived in start(), once the samplerate is known.
        self.chunk_frames = 0
        self.overlap_frames = 0

    def start(self, samplerate: int = 16000, channels: int = 1):
        """Reset all state and start the background transcription worker."""
        self.samplerate = samplerate
        self.channels = channels
        self.is_running = True
        self.audio_buffer = []
        self.accumulated_text = ""
        self._last_chunk_text = ""
        self._pending_text = ""
        self.last_emitted_text = ""
        self.last_emitted_time = time.time()
        self.last_emitted_word_count = 0
        self.chunk_timestamps.clear()
        self.chunk_texts.clear()

        self.chunk_frames = int(self.chunk_seconds * self.samplerate)
        self.overlap_frames = int(self.overlap_seconds * self.samplerate)

        self.worker_thread = threading.Thread(target=self._transcription_worker, daemon=True)
        self.worker_thread.start()
        verbo("[streaming_transcriber] Started")

    def stop(self):
        """Stop the worker thread, waiting up to 5 seconds for it to exit."""
        self.is_running = False
        if self.worker_thread and self.worker_thread.is_alive():
            # None is the sentinel that makes the worker leave its loop.
            self.transcription_queue.put(None)
            self.worker_thread.join(timeout=5.0)
        verbo("[streaming_transcriber] Stopped")

    def add_audio_chunk(self, audio_data: np.ndarray):
        """Buffer an audio block and queue a chunk once enough has accumulated.

        A chunk is queued when a full ``chunk_frames`` worth of audio is
        buffered, or when ``min_time_between_chunks`` has elapsed since the
        last emission and at least ``min_frames_for_time_based_queue`` of a
        chunk is buffered. Ignored while the transcriber is not running.
        """
        if not self.is_running:
            return

        self.audio_buffer.append(audio_data.copy())
        # Count frames without concatenating; the arrays are only joined when a
        # chunk is actually queued.
        total_frames = sum(len(block) for block in self.audio_buffer)

        now = time.time()
        time_since_last = now - self.last_emitted_time if self.last_emitted_time > 0 else float("inf")

        full_chunk = total_frames >= self.chunk_frames
        timed_partial = (
            time_since_last >= self.min_time_between_chunks
            and total_frames >= self.chunk_frames * self.min_frames_for_time_based_queue
        )
        if not (full_chunk or timed_partial):
            return

        concatenated = np.concatenate(self.audio_buffer, axis=0)
        chunk = concatenated[-self.chunk_frames:] if full_chunk else concatenated
        if len(chunk) < self.chunk_frames * self.min_frames_to_queue:
            # Too short to be worth transcribing; keep buffering.
            return

        chunk_seconds = len(chunk) / self.samplerate
        self.transcription_queue.put(chunk)
        verbo(f"[streaming_transcriber] Queued chunk for transcription ({len(chunk)} frames, {chunk_seconds:.2f}s, queue size: {self.transcription_queue.qsize()})")

        self.audio_buffer = self._retain_overlap(concatenated)

    def _retain_overlap(self, samples: np.ndarray) -> list:
        """Return the buffer content to carry over after a chunk was queued."""
        keep = self.overlap_frames
        if keep <= 0:
            # samples[-0:] would be the WHOLE array, re-queuing the same audio.
            return []
        if len(samples) < keep and len(samples) >= self.chunk_frames:
            # Degenerate configuration (overlap longer than the chunk): drop
            # the buffer instead of carrying the entire chunk over again.
            return []
        return [samples[-keep:]]

    def _transcription_worker(self):
        """Worker loop: transcribe queued chunks until stopped or sent ``None``."""
        while self.is_running:
            try:
                audio_chunk = self.transcription_queue.get(timeout=0.1)
                if audio_chunk is None:
                    break

                trans_start = time.time()
                chunk_seconds = len(audio_chunk) / self.samplerate
                verbo(f"[streaming_transcriber] Starting transcription of chunk ({len(audio_chunk)} frames, {chunk_seconds:.2f}s)")
                self._transcribe_chunk(audio_chunk)
                trans_duration = time.time() - trans_start
                verbo(f"[streaming_transcriber] Transcription completed in {trans_duration:.2f}s (chunk: {chunk_seconds:.2f}s)")
            except queue.Empty:
                continue
            except Exception as e:
                verr(f"[streaming_transcriber] Transcription worker error: {e}")

    def _transcribe_chunk(self, audio_chunk: np.ndarray):
        """Write one chunk to a temp WAV, transcribe it and process the text.

        Errors are logged, not raised, so one bad chunk cannot stop streaming.
        """
        temp_file = None
        try:
            chunk_start_time = time.time()
            chunk_seconds = len(audio_chunk) / self.samplerate
            chunk_id = f"{id(audio_chunk)}_{len(audio_chunk)}"

            temp_file = self._save_chunk_to_file(audio_chunk)
            if temp_file is None:
                return

            tscript, _ = self.transcriber.transcribe(temp_file)

            if tscript:
                tscript = self._filter_blank_audio(tscript)
                if tscript:
                    self.chunk_timestamps[chunk_id] = chunk_start_time
                    self.chunk_texts[chunk_id] = tscript

                    timestamp_str = time.strftime("%H:%M:%S", time.localtime(chunk_start_time))
                    verbo(f"[streaming_transcriber] Got transcript at {timestamp_str} (chunk: {chunk_seconds:.2f}s): '{tscript[:50]}...'")
                    self._process_transcript(tscript, chunk_id, chunk_start_time)
        except Exception as e:
            verr(f"[streaming_transcriber] Failed to transcribe chunk: {e}")
        finally:
            # The transcriber is created with delete_input=True, but if it
            # failed before cleaning up the WAV would be left behind.
            if temp_file is not None:
                try:
                    temp_file.unlink(missing_ok=True)
                except OSError:
                    pass  # best-effort cleanup of a temp file

    def _filter_blank_audio(self, text: str) -> str:
        """Remove ``[BLANK_AUDIO]`` markers and strip surrounding whitespace."""
        if not text:
            return text
        text = text.replace("[BLANK_AUDIO]", "")
        text = text.replace("BLANK_AUDIO", "")
        return text.strip()

    def _save_chunk_to_file(self, audio_data: np.ndarray) -> Optional[Path]:
        """Write ``audio_data`` as a 16-bit PCM WAV in the temp dir.

        Returns the file path, or ``None`` if writing failed (error is logged).
        """
        try:
            temp_dir = Path(tempfile.gettempdir()) / "voxd_temp"
            temp_dir.mkdir(parents=True, exist_ok=True)

            temp_file = temp_dir / f"stream_chunk_{threading.get_ident()}_{id(audio_data)}.wav"

            with wave.open(str(temp_file), 'w') as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(2)
                wf.setframerate(self.samplerate)
                # NOTE(review): assumes float samples in [-1.0, 1.0]; confirm
                # against the recorder's output dtype.
                x = np.clip(audio_data, -1.0, 1.0)
                wf.writeframes((x * 32767.0).astype(np.int16).tobytes())

            return temp_file
        except Exception as e:
            verr(f"[streaming_transcriber] Failed to save chunk: {e}")
            return None

    def _should_emit_text(self, new_text: str) -> bool:
        """Return True if the time or word-count emission threshold is met.

        ``new_text`` is the text whose word count is compared with the word
        count recorded at the previous emission.
        """
        current_time = time.time()
        time_since_last = current_time - self.last_emitted_time

        current_word_count = len(new_text.split())
        words_since_last = current_word_count - self.last_emitted_word_count

        should_emit = False
        if time_since_last >= self.emit_interval_seconds:
            should_emit = True
            verbo(f"[streaming_transcriber] Time-based emission trigger ({time_since_last:.2f}s >= {self.emit_interval_seconds}s)")
        elif words_since_last >= self.emit_word_count:
            should_emit = True
            verbo(f"[streaming_transcriber] Word-based emission trigger ({words_since_last} words >= {self.emit_word_count} words)")

        return should_emit

    def _integrate_transcript(self, new_text: str) -> str:
        """Merge one chunk transcript into the running transcript.

        The chunk is compared with the previous chunk's transcript: text fully
        contained in it is a duplicate, a pure extension contributes only its
        suffix, otherwise everything after the common leading words is new.
        The resulting delta (with a leading space when needed) is appended to
        ``accumulated_text`` and to the pending, not-yet-emitted text.

        Args:
            new_text: Stripped, non-empty transcript of the latest chunk.

        Returns:
            The delta that was appended, or ``""`` if nothing was new.
        """
        previous = self._last_chunk_text
        if previous and new_text in previous:
            return ""

        if not previous:
            delta = new_text
        elif new_text.startswith(previous):
            delta = new_text[len(previous):].strip()
        else:
            old_words = previous.split()
            new_words = new_text.split()
            shared = 0
            for old_word, new_word in zip(old_words, new_words):
                if old_word != new_word:
                    break
                shared += 1
            # Empty when the words are identical (whitespace-only difference),
            # which must not be emitted a second time.
            delta = " ".join(new_words[shared:])

        self._last_chunk_text = new_text
        if not delta:
            return ""

        delta = self._ensure_space_before(self.accumulated_text, delta)
        self.accumulated_text += delta
        self._pending_text += delta
        return delta

    def _flush_pending(self):
        """Hand all pending text to ``on_partial_text`` and record the emission."""
        text = self._pending_text
        if not text:
            return
        self._pending_text = ""
        if self.on_partial_text:
            self.on_partial_text(text)
        self._update_emission_state(self.accumulated_text)

    def _process_transcript(self, new_text: str, chunk_id: str = "", chunk_time: float = 0.0):
        """Process a chunk transcript and emit incremental updates.

        Args:
            new_text: The transcribed text of the chunk.
            chunk_id: Unique identifier for this chunk.
            chunk_time: Timestamp when chunk transcription started; when
                positive, chunk metadata older than the cutoff is pruned.
        """
        if not new_text or not new_text.strip():
            return

        new_text = new_text.strip()

        delta = self._integrate_transcript(new_text)
        if delta:
            verbo(f"[streaming_transcriber] New text from chunk: '{delta[:50]}...'")
        else:
            verbo(f"[streaming_transcriber] Skipping duplicate transcript (already in accumulated): '{new_text[:50]}...'")

        # Pending text may stem from earlier, throttled chunks, so the check is
        # made even when this chunk added nothing.
        if self._pending_text and self._should_emit_text(self.accumulated_text):
            verbo(f"[streaming_transcriber] Emitting: '{self._pending_text[:50]}...'")
            self._flush_pending()

        if chunk_time > 0:
            self._cleanup_old_chunks(chunk_time)

    def _ensure_space_before(self, previous: str, new: str) -> str:
        """Return ``new`` with a leading space if it is needed after ``previous``.

        A space is added when ``new`` starts with an alphanumeric character
        and ``previous`` ends with an alphanumeric character or one of
        ``.,!?;:``. Otherwise ``new`` is returned unchanged, so existing
        leading spacing is preserved.
        """
        if not previous or not new:
            return new

        previous = previous.rstrip()
        stripped = new.lstrip()
        if not previous or not stripped:
            return new

        prev_last = previous[-1]
        new_first = stripped[0]

        needs_space = (
            (new_first.isalnum() and (prev_last.isalnum() or prev_last in ".,!?;:"))
            or (prev_last == "." and new_first.isupper())
        )
        if needs_space and not new.startswith(" "):
            return " " + stripped
        return new

    def _cleanup_old_chunks(self, current_time: float):
        """Drop chunk metadata older than twice ``chunk_seconds``."""
        cutoff_time = current_time - (self.chunk_seconds * 2)
        stale = [
            chunk_id for chunk_id, chunk_time in self.chunk_timestamps.items()
            if chunk_time < cutoff_time
        ]
        for chunk_id in stale:
            self.chunk_timestamps.pop(chunk_id, None)
            self.chunk_texts.pop(chunk_id, None)

    def _update_emission_state(self, text: str):
        """Record ``text``, the current time and its word count as last emission."""
        self.last_emitted_text = text
        self.last_emitted_time = time.time()
        self.last_emitted_word_count = len(text.split())

    def get_accumulated_text(self) -> str:
        """Return the full transcript recognised so far."""
        return self.accumulated_text

    def finalize(self) -> str:
        """Transcribe what is left, flush pending text and return the transcript.

        Remaining queued chunks and any buffered audio of at least 0.5 seconds
        are transcribed synchronously. Text still held back by throttling is
        sent to ``on_partial_text``; ``on_final_text`` then receives the full
        transcript if it is non-empty.
        """
        while True:
            try:
                audio_chunk = self.transcription_queue.get_nowait()
            except queue.Empty:
                break
            if audio_chunk is not None:
                self._transcribe_chunk(audio_chunk)

        if self.audio_buffer:
            concatenated = np.concatenate(self.audio_buffer, axis=0)
            min_final_frames = int(0.5 * self.samplerate)
            if len(concatenated) >= min_final_frames:
                verbo(f"[streaming_transcriber] Finalizing: processing remaining buffer ({len(concatenated)} frames, {len(concatenated)/self.samplerate:.2f}s)")
                self._transcribe_chunk(concatenated)

        # Nothing else will trigger an emission after this point.
        self._flush_pending()

        final_text = self.accumulated_text
        if self.on_final_text and final_text:
            self.on_final_text(final_text)

        return final_text
def type_rewrite(self, text: str, previous_length: int):
    """Erase previously typed characters, then type ``text``.

    Args:
        text: Replacement text to type. Nothing happens if it is empty.
        previous_length: How many backspaces to send first (values <= 0
            send none).

    Does nothing when typing is disabled, when ``delay_ms`` is not positive,
    or when no typing tool is configured.
    """
    if not self.enabled:
        print("[typer] ⚠️ Typing disabled - required tool not available.")
        return

    if not text or self.delay_ms <= 0 or not self.tool:
        return

    verbo(f"[typer] Rewriting text: deleting {previous_length} chars, typing '{text[:20]}...' using {self.tool}...")

    # Per tool: (arguments that send one backspace, flag that sets the key delay).
    recipes = {
        "ydotool": (["key", "14:1", "14:0"], "-d"),
        "xdotool": (["key", "BackSpace"], "--delay"),
    }
    recipe = recipes.get(os.path.basename(self.tool))
    if recipe is None:
        print("[typer] ⚠️ No valid typing tool found for rewrite.")
        return

    backspace_args, delay_flag = recipe
    # range() is empty for non-positive counts, so no explicit guard is needed.
    for _ in range(previous_length):
        self._run_tool([self.tool, *backspace_args])
    self._run_tool([self.tool, "type", delay_flag, self.delay_str, text])
CoreProcessThread(self.cfg, self.logger) + if self.cfg.data.get("streaming_enabled", True): + self.runner_thread = StreamingCoreProcessThread(self.cfg, self.logger) + else: + self.runner_thread = CoreProcessThread(self.cfg, self.logger) self.runner_thread.status_changed.connect(self.set_status) self.runner_thread.finished.connect(self.on_transcript_ready) self.runner_thread.start() diff --git a/src/voxd/tray/tray_main.py b/src/voxd/tray/tray_main.py index a1198456..677be4e3 100644 --- a/src/voxd/tray/tray_main.py +++ b/src/voxd/tray/tray_main.py @@ -18,6 +18,7 @@ session_log_dialog, show_performance_dialog, ) +from voxd.core.streaming_core import StreamingCoreProcessThread from voxd.core.model_manager import show_model_manager # NEW from voxd.utils.performance import update_last_perf_entry from voxd.gui.settings_dialog import SettingsDialog @@ -132,7 +133,10 @@ def toggle_recording(self): if self.status in ("Transcribing", "Typing"): return self.set_status("Recording") - self.thread = CoreProcessThread(self.cfg, self.logger) + if self.cfg.data.get("streaming_enabled", True): + self.thread = StreamingCoreProcessThread(self.cfg, self.logger) + else: + self.thread = CoreProcessThread(self.cfg, self.logger) self.thread.status_changed.connect(self.set_status, Qt.ConnectionType.QueuedConnection) self.thread.finished.connect(self.on_transcript_ready) self.thread.start() diff --git a/src/voxd/utils/libw.py b/src/voxd/utils/libw.py index a7026142..aa413b02 100644 --- a/src/voxd/utils/libw.py +++ b/src/voxd/utils/libw.py @@ -57,17 +57,20 @@ def verbo(what_string: str, *args, **kwargs): The string is formatted with ``str.format(*args, **kwargs)`` exactly like ``print`` would do. + + Checks both config verbosity flag and VOXD_VERBOSE environment variable. 
@pytest.fixture(autouse=True)
def stub_psutil(monkeypatch):
    """Install a minimal fake ``psutil`` module for the duration of each test.

    The fake exposes only ``virtual_memory()`` (object with ``total``) and
    ``cpu_freq()`` (object with ``max``). Any already-imported psutil modules
    are evicted from ``sys.modules`` first.
    """
    import types

    # Evicted directly (not through monkeypatch), so these entries are not
    # restored after the test.
    for cached_name in ("psutil", "_psutil_linux", "_pslinux"):
        if cached_name in sys.modules:
            del sys.modules[cached_name]

    class _VirtualMemory:
        def __init__(self):
            self.total = 8 * 1024 * 1024 * 1024

    class _CpuFreq:
        def __init__(self):
            self.max = 3000.0

    def virtual_memory():
        return _VirtualMemory()

    def cpu_freq():
        return _CpuFreq()

    fake_psutil = types.ModuleType("psutil")
    fake_psutil.virtual_memory = virtual_memory
    fake_psutil.cpu_freq = cpu_freq

    # monkeypatch undoes this registration when the test finishes.
    monkeypatch.setitem(sys.modules, "psutil", fake_psutil)
    yield
--- README.md | 2 ++ packaging/postinstall.sh | 51 +++++++++++++++++++++++++++++++++------- packaging/voxd.wrapper | 24 ++++++++++++++----- setup.sh | 45 ++++++++++++++++++++++++++++++++--- 4 files changed, 104 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 89f985ec..e288ba7d 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,8 @@ cd voxd && ./setup.sh Setup is non-interactive with minimal console output; a detailed setup log is saved in the repo directory (e.g. `2025-09-18-setup-log.txt`). +**Note:** The setup script automatically detects and uses the highest available Python 3.x version (3.9 or newer) on your system. If you have Python 3.14, 3.13, or any newer version installed, it will be used automatically. + **Reboot** the system! (unless on an X11 system; on most modern systems there is Wayland, so **ydotool** is required for typing and needs rebooting for user setup). diff --git a/packaging/postinstall.sh b/packaging/postinstall.sh index 636200d6..0d6c413a 100644 --- a/packaging/postinstall.sh +++ b/packaging/postinstall.sh @@ -41,29 +41,62 @@ echo "voxd installed. Each user should run: voxd --setup" # We inherit system site-packages to avoid duplicating distro Python libs APPDIR="/opt/voxd" -# Pick a Python >= 3.9 if available; attempt RPM install on openSUSE if too old +# Pick the highest available Python 3.x version (>=3.9) +# Attempt RPM install on openSUSE if too old pick_python() { - for c in python3.12 python3.11 python3.10 python3.9 python3 python; do + best_cmd="" + best_ver="" + # Check common python3.x commands (3.9 to 3.20) and python3/python + # Build candidate list + i=20 + while [ "$i" -ge 9 ]; do + if command -v "python3.$i" >/dev/null 2>&1; then + ver="$(python3.$i - <<'PY' +import sys +print(f"{sys.version_info.major}.{sys.version_info.minor}") +PY +)" 2>/dev/null || { i=$((i - 1)); continue; } + # Check if version is >= 3.9 + IFS='.' 
read -r major minor </dev/null 2>&1; then ver="$("$c" - <<'PY' import sys print(f"{sys.version_info.major}.{sys.version_info.minor}") PY -)" - case "$ver" in - 3.9|3.10|3.11|3.12|3.13) echo "$c"; return 0 ;; - *) ;; - esac +)" 2>/dev/null || continue + IFS='.' read -r major minor </dev/null 2>&1; then - for pkg in python311 python3.11 python310 python3.10 python39 python3.9; do + for pkg in python314 python3.14 python313 python3.13 python311 python3.11 python310 python3.10 python39 python3.9; do if zypper --non-interactive --no-gpg-checks install -y "$pkg" >/dev/null 2>&1; then break fi diff --git a/packaging/voxd.wrapper b/packaging/voxd.wrapper index 50ddee3a..399fb5bd 100644 --- a/packaging/voxd.wrapper +++ b/packaging/voxd.wrapper @@ -50,23 +50,35 @@ PY : # version is acceptable else # Attempt to create a user-local venv with any newer Python found + # Pick the highest available Python 3.x version (>=3.9) pick_python() { - for c in python3.12 python3.11 python3.10 python3.9 python3; do + local best_cmd="" best_ver="" + # Check common python3.x commands (3.9 to 3.20) and python3/python + local candidates=() + for i in {20..9}; do + candidates+=("python3.$i") + done + candidates+=("python3" "python") + + for c in "${candidates[@]}"; do if command -v "$c" >/dev/null 2>&1; then - v="$($c - <<'PY' + v="$("$c" - <<'PY' import sys print(f"{sys.version_info.major}.{sys.version_info.minor}") PY -)" +)" 2>/dev/null || continue # Check if version is >= 3.9 IFS='.' 
# Print the command name of the newest usable Python (>= 3.9) found on PATH.
#
# Candidates are python3.20 … python3.9, then python3 and python. Each one is
# asked for its real version, so an old "python3" is rejected and a Python 2
# "python" is ignored. When two candidates report the same version the later
# candidate wins.
#
# Output: the chosen command name on stdout, or nothing if none qualifies.
# Returns: always 0, so `VAR="$(pick_python)"` cannot abort a script that runs
#          with errexit (NOTE(review): confirm whether this script enables it);
#          callers detect failure by testing for an empty result.
pick_python() {
  local best_cmd="" best_major=0 best_minor=0
  local cand ver major minor i
  local candidates=()

  for i in {20..9}; do
    candidates+=("python3.$i")
  done
  candidates+=("python3" "python")

  for cand in "${candidates[@]}"; do
    command -v "$cand" >/dev/null 2>&1 || continue

    # %-formatting also runs on Python 2, which is then rejected by the
    # version check instead of failing with a syntax error.
    ver="$("$cand" -c 'import sys; print("%d.%d" % sys.version_info[:2])' 2>/dev/null)" || continue
    IFS='.' read -r major minor <<< "$ver"
    [[ "$major" =~ ^[0-9]+$ && "$minor" =~ ^[0-9]+$ ]] || continue

    # Require >= 3.9.
    if (( major > 3 || (major == 3 && minor >= 9) )); then
      # Numeric comparison; avoids depending on `sort -V`.
      if [[ -z "$best_cmd" ]] || (( major > best_major || (major == best_major && minor >= best_minor) )); then
        best_cmd="$cand"
        best_major="$major"
        best_minor="$minor"
      fi
    fi
  done

  if [[ -n "$best_cmd" ]]; then
    echo "$best_cmd"
  fi
  return 0
}
-d .venv ]]; then - msg "Creating virtualenv (.venv)…" - python3 -m venv .venv + msg "Creating virtualenv (.venv) with $PYTHON_CMD…" + "$PYTHON_CMD" -m venv .venv else msg "Using existing virtualenv (.venv)" fi @@ -556,7 +593,9 @@ fi pip install -e . # Fix editable install .pth file if it's empty (hatchling bug workaround) -PTH_FILE=".venv/lib/python3.12/site-packages/_voxd.pth" +# Dynamically determine Python version for PTH file path +PY_VERSION="$($PY -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')" +PTH_FILE=".venv/lib/python${PY_VERSION}/site-packages/_voxd.pth" if [[ -f "$PTH_FILE" && ! -s "$PTH_FILE" ]]; then echo "$PWD/src" > "$PTH_FILE" msg "Fixed editable install .pth file" From 08b123e9846712db3fabdaaf5b7c1f1b03399d1d Mon Sep 17 00:00:00 2001 From: 4ellendger <4ellendger@gmail.com> Date: Sun, 4 Jan 2026 10:38:57 +0400 Subject: [PATCH 4/4] Add function to find voxd executable and update service unit accordingly. --- src/voxd/__main__.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/voxd/__main__.py b/src/voxd/__main__.py index 9b8e4611..2ce6772d 100644 --- a/src/voxd/__main__.py +++ b/src/voxd/__main__.py @@ -172,6 +172,31 @@ def _systemd_user_available() -> bool: except Exception: return False +def _find_voxd_executable() -> str: + """Find the voxd executable path, checking PATH and common locations. + + Returns the path to voxd executable, or 'voxd' if not found (relying on PATH). 
+ """ + # First, try to find voxd in PATH + voxd_path = shutil.which("voxd") + if voxd_path: + return voxd_path + + # Fall back to common installation locations + home = Path.home() + candidates = [ + home / ".local/bin/voxd", + Path("/usr/local/bin/voxd"), + Path("/usr/bin/voxd"), + ] + + for candidate in candidates: + if candidate.exists() and os.access(candidate, os.X_OK): + return str(candidate) + + # If nothing found, return 'voxd' and let systemd use PATH + return "voxd" + def _ensure_voxd_tray_unit() -> None: """Ensure a voxd-tray.service user unit exists (packaged or per-user fallback).""" try: @@ -184,6 +209,8 @@ def _ensure_voxd_tray_unit() -> None: except Exception: pass unit_path = user_dir / "voxd-tray.service" + voxd_exec = _find_voxd_executable() + if not unit_path.exists(): unit_path.write_text( "[Unit]\n" @@ -191,13 +218,19 @@ def _ensure_voxd_tray_unit() -> None: "After=default.target\n\n" "[Service]\n" "Type=simple\n" - "ExecStart=/usr/bin/voxd --tray\n" + f"ExecStart={voxd_exec} --tray\n" "Restart=on-failure\n" "RestartSec=2s\n" "Environment=YDOTOOL_SOCKET=%h/.ydotool_socket\n\n" "[Install]\n" "WantedBy=default.target\n" ) + else: + # Update existing file if it has the hardcoded /usr/bin/voxd path + content = unit_path.read_text() + if "ExecStart=/usr/bin/voxd --tray" in content and voxd_exec != "/usr/bin/voxd": + content = content.replace("ExecStart=/usr/bin/voxd --tray", f"ExecStart={voxd_exec} --tray") + unit_path.write_text(content) except Exception: pass