From 182a0d0b6b7d2e02d17d3442b8d96b29ce5ea42f Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 18 Apr 2024 00:41:23 -0400
Subject: [PATCH 1/6] Bump whisper, clblast, add buffered output

---
 .github/workflows/build-project.yaml     |   2 +-
 .github/workflows/push.yaml              |   1 +
 CMakeLists.txt                           |   3 +-
 cmake/BuildCTranslate2.cmake             |   4 +-
 cmake/BuildWhispercpp.cmake              |  28 ++-
 src/captions-thread.h                    | 127 ++++++++++
 src/model-utils/model-infos.cpp          |  16 +-
 src/transcription-filter-data.h          |   4 +
 src/transcription-filter.cpp             | 281 +++++++++++++++--------
 src/utils.cpp                            |  21 ++
 src/utils.h                              |   9 +
 src/whisper-utils/whisper-language.h     | 200 ++++++++--------
 src/whisper-utils/whisper-processing.cpp |  26 ++-
 src/whisper-utils/whisper-processing.h   |   2 +-
 14 files changed, 495 insertions(+), 229 deletions(-)
 create mode 100644 src/captions-thread.h
 create mode 100644 src/utils.cpp
 create mode 100644 src/utils.h

diff --git a/.github/workflows/build-project.yaml b/.github/workflows/build-project.yaml
index 21df812..addbd69 100644
--- a/.github/workflows/build-project.yaml
+++ b/.github/workflows/build-project.yaml
@@ -246,7 +246,7 @@ jobs:
     needs: check-event
     strategy:
       matrix:
-        cublas: [cpu, 12.2.0, 11.8.0]
+        cublas: [cpu, clblast, 12.2.0, 11.8.0]
     defaults:
       run:
         shell: pwsh
diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml
index e71de58..27a98e6 100644
--- a/.github/workflows/push.yaml
+++ b/.github/workflows/push.yaml
@@ -76,6 +76,7 @@ jobs:
 
           variants=(
             'windows-x64-cpu;zip|exe'
+            'windows-x64-clblast;zip|exe'
             'windows-x64-11.8.0;zip|exe'
             'windows-x64-12.2.0;zip|exe'
             'macos-arm64;tar.xz|pkg'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c4f6946..c253488 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,6 +92,7 @@ target_sources(
           src/whisper-utils/whisper-processing.cpp
           src/whisper-utils/whisper-utils.cpp
           src/whisper-utils/silero-vad-onnx.cpp
-          src/translation/translation.cpp)
+          src/translation/translation.cpp
+          src/utils.cpp)
 
 set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})
diff --git a/cmake/BuildCTranslate2.cmake b/cmake/BuildCTranslate2.cmake
index df48fdc..41dcc1c 100644
--- a/cmake/BuildCTranslate2.cmake
+++ b/cmake/BuildCTranslate2.cmake
@@ -21,10 +21,10 @@ elseif(WIN32)
 
   # check CPU_OR_CUDA environment variable
   if(NOT DEFINED ENV{CPU_OR_CUDA})
-    message(FATAL_ERROR "Please set the CPU_OR_CUDA environment variable to either CPU or CUDA")
+    message(FATAL_ERROR "Please set the CPU_OR_CUDA environment variable to either `cpu`, `clblast`, `12.2.0` or `11.8.0`")
   endif()
 
-  if($ENV{CPU_OR_CUDA} STREQUAL "cpu")
+  if($ENV{CPU_OR_CUDA} STREQUAL "cpu" OR $ENV{CPU_OR_CUDA} STREQUAL "clblast")
     FetchContent_Declare(
       ctranslate2_fetch
       URL https://github.com/occ-ai/obs-ai-ctranslate2-dep/releases/download/1.2.0/libctranslate2-windows-4.1.1-Release-cpu.zip
diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake
index 6c78e61..50ad450 100644
--- a/cmake/BuildWhispercpp.cmake
+++ b/cmake/BuildWhispercpp.cmake
@@ -14,12 +14,12 @@ if(APPLE)
   endif(NOT DEFINED ENV{MACOS_ARCH})
 
   set(WHISPER_CPP_URL
-      "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/0.0.1/whispercpp-macos-$ENV{MACOS_ARCH}-0.0.1.tar.gz"
+      "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/0.0.2/whispercpp-macos-$ENV{MACOS_ARCH}-0.0.2.tar.gz"
   )
   if($ENV{MACOS_ARCH} STREQUAL "x86_64")
-    set(WHISPER_CPP_HASH "36F39F02F999AAF157EAD3460DD00C8BDAA3D6C4A769A9E4F64E327871B4B11F")
+    set(WHISPER_CPP_HASH "00C308AF0BFFF7619934403A8080CC9AFC4EDAA328D7587E617150A2C6A33313")
   elseif($ENV{MACOS_ARCH} STREQUAL "arm64")
-    set(WHISPER_CPP_HASH "6AF7BB904B03B6208B4281D44465B727FB608A32CABD1394B727937C5F4828A1")
+    set(WHISPER_CPP_HASH "0478E2079E07FA81BEE77506101003F4A4C8F0DF9E23757BD7E1D25DCBD1DB30")
   else()
     message(
       FATAL_ERROR
@@ -45,24 +45,30 @@ elseif(WIN32)
   if(NOT DEFINED ENV{CPU_OR_CUDA})
     message(
       FATAL_ERROR
-        "The CPU_OR_CUDA environment variable is not set. Please set it to either `cpu` or `11.8.0` or `12.2.0`")
+        "The CPU_OR_CUDA environment variable is not set. Please set it to either `cpu`, `clblast` or `11.8.0` or `12.2.0`")
   endif(NOT DEFINED ENV{CPU_OR_CUDA})
 
-  set(CUDA_PREFIX $ENV{CPU_OR_CUDA})
-  if(NOT $ENV{CPU_OR_CUDA} STREQUAL "cpu")
-    set(CUDA_PREFIX "cuda$ENV{CPU_OR_CUDA}")
+  set(ARCH_PREFIX $ENV{CPU_OR_CUDA})
+  if(NOT $ENV{CPU_OR_CUDA} STREQUAL "cpu" AND NOT $ENV{CPU_OR_CUDA} STREQUAL "clblast")
+    set(ARCH_PREFIX "cuda$ENV{CPU_OR_CUDA}")
     add_compile_definitions("LOCALVOCAL_WITH_CUDA")
+  elseif($ENV{CPU_OR_CUDA} STREQUAL "cpu")
+    add_compile_definitions("LOCALVOCAL_WITH_CPU")
+  else()
+    add_compile_definitions("LOCALVOCAL_WITH_CLBLAST")
   endif()
 
   set(WHISPER_CPP_URL
-      "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/0.0.1/whispercpp-windows-${CUDA_PREFIX}-0.0.1.zip"
+      "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/0.0.2/whispercpp-windows-${ARCH_PREFIX}-0.0.2.zip"
   )
   if($ENV{CPU_OR_CUDA} STREQUAL "cpu")
-    set(WHISPER_CPP_HASH "5261FCCD18BA52AE7ECD37617452F0514238FAB4B12713F1FCA491F4ABA170AA")
+    set(WHISPER_CPP_HASH "6DE628A51B9352624A1EC397231591FA3370E6BB42D9364F4F91F11DD18F77D2")
+  elseif($ENV{CPU_OR_CUDA} STREQUAL "clblast")
+    set(WHISPER_CPP_HASH "97BF58520F1818B7C9F4E996197F3097934E5E0BBA92B0B016C6B28BE9FF1642")
   elseif($ENV{CPU_OR_CUDA} STREQUAL "12.2.0")
-    set(WHISPER_CPP_HASH "1966A6C7347FCB9529140F8097AED306F31C6DDE328836FD6498A980E20B8E6C")
+    set(WHISPER_CPP_HASH "48C059A3364E0AAD9FB0D4194BA554865928D22A27ECE5E3C116DC672D5D6EDE")
   elseif($ENV{CPU_OR_CUDA} STREQUAL "11.8.0")
-    set(WHISPER_CPP_HASH "172F4021E888A89A694373AE0888C04DB99BC11F3A2633270248E03AF5AC762E")
+    set(WHISPER_CPP_HASH "29A5530E83896DE207F0199535CBBB24DF0D63B1373BA66139AD240BA67120EB")
   else()
     message(
       FATAL_ERROR
diff --git a/src/captions-thread.h b/src/captions-thread.h
new file mode 100644
index 0000000..5ebf17b
--- /dev/null
+++ b/src/captions-thread.h
@@ -0,0 +1,127 @@
+#ifndef CAPTIONS_THREAD_H
+#define CAPTIONS_THREAD_H
+
+#include <queue>
+#include <vector>
+#include <chrono>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <functional>
+#include <string>
+
+#include <obs.h>
+
+#include "plugin-support.h"
+
+// Buffers transcribed words and, on a worker thread, passes the first maxSize
+// buffered words to a callback as one caption whenever new words arrive. The callback
+// runs on the worker thread while queueMutex is held. Call initialize() once.
+class CaptionMonitor {
+public:
+	// default constructor; no thread is started until initialize() is called
+	CaptionMonitor() = default;
+
+	~CaptionMonitor()
+	{
+		{
+			std::lock_guard<std::mutex> lock(queueMutex);
+			stop = true;
+		}
+		condVar.notify_all();
+		// join() throws std::system_error if initialize() never started the thread
+		if (workerThread.joinable()) {
+			workerThread.join();
+		}
+	}
+
+	// Store the callback (receives each caption, every word followed by a space), the
+	// per-caption word limit and the flush interval, then start the worker thread.
+	void initialize(std::function<void(const std::string &)> callback, size_t maxSize,
+			std::chrono::seconds maxTime)
+	{
+		obs_log(LOG_INFO, "CaptionMonitor::initialize");
+		this->callback = callback;
+		this->maxSize = maxSize;
+		this->maxTime = maxTime;
+		this->initialized = true;
+		this->workerThread = std::thread(&CaptionMonitor::monitor, this);
+	}
+
+	// Append words to the buffer and wake the worker thread.
+	void addWords(const std::vector<std::string> &words)
+	{
+		size_t queued = 0;
+		{
+			std::lock_guard<std::mutex> lock(queueMutex);
+			wordQueue.insert(wordQueue.end(), words.begin(), words.end());
+			this->newDataAvailable = true;
+			// read the size under the lock: the worker thread pops concurrently
+			queued = wordQueue.size();
+		}
+		obs_log(LOG_INFO, "CaptionMonitor::addWords: number of words in queue: %zu",
+			queued);
+		condVar.notify_all();
+	}
+
+private:
+	// Worker loop: wait for new words, emit the first maxSize of them, and drop
+	// those once the buffer is full or maxTime has passed since the last flush.
+	void monitor()
+	{
+		obs_log(LOG_INFO, "CaptionMonitor::monitor");
+		auto startTime = std::chrono::steady_clock::now();
+		while (true) {
+			std::unique_lock<std::mutex> lock(this->queueMutex);
+			// wait for new data or stop signal
+			this->condVar.wait(lock,
+					   [this] { return this->newDataAvailable || this->stop; });
+			if (this->stop) {
+				obs_log(LOG_INFO, "CaptionMonitor::monitor: stopping");
+				break;
+			}
+			// consume the wake-up now; leaving the flag set on the empty-queue path
+			// below would make the next wait() return at once and spin the thread
+			this->newDataAvailable = false;
+			const size_t queued = this->wordQueue.size();
+			if (queued == 0) {
+				continue;
+			}
+			obs_log(LOG_INFO, "CaptionMonitor::monitor: wordQueue size: %zu", queued);
+
+			// emit up to maxSize words; they stay queued until flushed below
+			const size_t emitCount = queued < this->maxSize ? queued : this->maxSize;
+			std::string output;
+			for (size_t i = 0; i < emitCount; ++i) {
+				output += this->wordQueue[i];
+				output += ' ';
+			}
+			this->callback(output);
+
+			if (queued >= this->maxSize ||
+			    std::chrono::steady_clock::now() - startTime >= this->maxTime) {
+				// flush emitted words: the queue is full or maxTime has elapsed
+				obs_log(LOG_INFO, "CaptionMonitor::monitor: flushing %zu words",
+					emitCount);
+				for (size_t i = 0; i < emitCount; ++i) {
+					this->wordQueue.pop_front();
+				}
+				startTime = std::chrono::steady_clock::now();
+			}
+		}
+		obs_log(LOG_INFO, "CaptionMonitor::monitor: done");
+	}
+
+	std::deque<std::string> wordQueue;
+	std::thread workerThread;
+	std::mutex queueMutex;
+	std::condition_variable condVar;
+	std::function<void(const std::string &)> callback;
+	size_t maxSize = 0;
+	std::chrono::seconds maxTime{0};
+	bool stop = false;
+	bool initialized = false;
+	bool newDataAvailable = false;
+};
+
+#endif // CAPTIONS_THREAD_H
diff --git a/src/model-utils/model-infos.cpp b/src/model-utils/model-infos.cpp
index cd00814..b7cb6b1 100644
--- a/src/model-utils/model-infos.cpp
+++ b/src/model-utils/model-infos.cpp
@@ -29,7 +29,7 @@ std::map<std::string, ModelInfo> models_info = {{
 	  MODEL_TYPE_TRANSCRIPTION,
 	  {{"https://ggml.ggerganov.com/ggml-model-whisper-base-q5_1.bin",
 	    "422F1AE452ADE6F30A004D7E5C6A43195E4433BC370BF23FAC9CC591F01A8898"}}}},
-	{"Whisper Base En q5 (57Mb)",
+	{"Whisper Base English q5 (57Mb)",
 	 {"Whisper Base En q5",
 	  "ggml-model-whisper-base-en-q5_1",
 	  MODEL_TYPE_TRANSCRIPTION,
@@ -41,7 +41,7 @@ std::map<std::string, ModelInfo> models_info = {{
 	  MODEL_TYPE_TRANSCRIPTION,
 	  {{"https://ggml.ggerganov.com/ggml-model-whisper-base.bin",
 	    "60ED5BC3DD14EEA856493D334349B405782DDCAF0028D4B5DF4088345FBA2EFE"}}}},
-	{"Whisper Base En (141Mb)",
+	{"Whisper Base English (141Mb)",
 	 {"Whisper Base En",
 	  "ggml-model-whisper-base-en",
 	  MODEL_TYPE_TRANSCRIPTION,
@@ -59,7 +59,7 @@ std::map<std::string, ModelInfo> models_info = {{
 	  MODEL_TYPE_TRANSCRIPTION,
 	  {{"https://ggml.ggerganov.com/ggml-model-whisper-medium-q5_0.bin",
 	    "19FEA4B380C3A618EC4723C3EEF2EB785FFBA0D0538CF43F8F235E7B3B34220F"}}}},
-	{"Whisper Medium En q5 (514Mb)",
+	{"Whisper Medium English q5 (514Mb)",
 	 {"Whisper Medium En q5",
 	  "ggml-model-whisper-medium-en-q5_0",
 	  MODEL_TYPE_TRANSCRIPTION,
@@ -71,7 +71,7 @@ std::map<std::string, ModelInfo> models_info = {{
 	  MODEL_TYPE_TRANSCRIPTION,
 	  {{"https://ggml.ggerganov.com/ggml-model-whisper-small-q5_1.bin",
 	    "AE85E4A935D7A567BD102FE55AFC16BB595BDB618E11B2FC7591BC08120411BB"}}}},
-	{"Whisper Small En q5 (181Mb)",
+	{"Whisper Small English q5 (181Mb)",
 	 {"Whisper Small En q5",
 	  "ggml-model-whisper-small-en-q5_1",
 	  MODEL_TYPE_TRANSCRIPTION,
@@ -83,7 +83,7 @@ std::map<std::string, ModelInfo> models_info = {{
 	  MODEL_TYPE_TRANSCRIPTION,
 	  {{"https://ggml.ggerganov.com/ggml-model-whisper-small.bin",
 	    "1BE3A9B2063867B937E64E2EC7483364A79917E157FA98C5D94B5C1FFFEA987B"}}}},
-	{"Whisper Small En (465Mb)",
+	{"Whisper Small English (465Mb)",
 	 {"Whisper Small En",
 	  "ggml-model-whisper-small-en",
 	  MODEL_TYPE_TRANSCRIPTION,
@@ -101,19 +101,19 @@ std::map<std::string, ModelInfo> models_info = {{
 	  MODEL_TYPE_TRANSCRIPTION,
 	  {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin",
 	    "818710568DA3CA15689E31A743197B520007872FF9576237BDA97BD1B469C3D7"}}}},
-	{"Whisper Tiny En q5 (31Mb)",
+	{"Whisper Tiny English q5 (31Mb)",
 	 {"Whisper Tiny En q5",
 	  "ggml-model-whisper-tiny-en-q5_1",
 	  MODEL_TYPE_TRANSCRIPTION,
 	  {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin",
 	    "C77C5766F1CEF09B6B7D47F21B546CBDDD4157886B3B5D6D4F709E91E66C7C2B"}}}},
-	{"Whisper Tiny En q8 (42Mb)",
+	{"Whisper Tiny English q8 (42Mb)",
 	 {"Whisper Tiny En q8",
 	  "ggml-model-whisper-tiny-en-q8_0",
 	  MODEL_TYPE_TRANSCRIPTION,
 	  {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q8_0.bin",
 	    "5BC2B3860AA151A4C6E7BB095E1FCCE7CF12C7B020CA08DCEC0C6D018BB7DD94"}}}},
-	{"Whisper Tiny En (74Mb)",
+	{"Whisper Tiny English (74Mb)",
 	 {"Whisper Tiny En",
 	  "ggml-model-whisper-tiny-en",
 	  MODEL_TYPE_TRANSCRIPTION,
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index 0b34f5d..374ec2a 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -17,6 +17,7 @@
 
 #include "translation/translation.h"
 #include "whisper-utils/silero-vad-onnx.h"
+#include "captions-thread.h"
 
 #define MAX_PREPROC_CHANNELS 10
 
@@ -89,6 +90,7 @@ struct transcription_filter_data {
 	bool translate = false;
 	std::string source_lang;
 	std::string target_lang;
+	bool buffered_output = false;
 
 	// Last transcription result
 	std::string last_text;
@@ -113,6 +115,8 @@ struct transcription_filter_data {
 	// translation context
 	struct translation_context translation_ctx;
 
+	CaptionMonitor captions_monitor;
+
 	// ctor
 	transcription_filter_data()
 	{
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index f62fb2a..e7c28bf 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -10,6 +10,7 @@
 #include "whisper-utils/whisper-utils.h"
 #include "translation/language_codes.h"
 #include "translation/translation.h"
+#include "utils.h"
 
 #include <algorithm>
 #include <fstream>
@@ -220,6 +221,35 @@ inline bool is_valid_lead_byte(const uint8_t *c)
 	return false;
 }
 
+// Set the "text" setting of the OBS source referenced by gf->text_source to str_copy.
+void send_caption_to_source(const std::string &str_copy, struct transcription_filter_data *gf)
+{
+	if (!gf->text_source_mutex) {
+		obs_log(LOG_ERROR, "text_source_mutex is null");
+		return;
+	}
+	if (!gf->text_source) {
+		// attempt to acquire a weak ref to the text source if it's yet available
+		acquire_weak_text_source_ref(gf);
+	}
+	std::lock_guard<std::mutex> lock(*gf->text_source_mutex);
+	if (!gf->text_source) {
+		obs_log(gf->log_level, "text_source is null");
+		return;
+	}
+	auto target = obs_weak_source_get_source(gf->text_source);
+	if (!target) {
+		obs_log(gf->log_level, "text_source target is null");
+		return;
+	}
+	auto text_settings = obs_source_get_settings(target);
+	obs_data_set_string(text_settings, "text", str_copy.c_str());
+	obs_source_update(target, text_settings);
+	// obs_source_get_settings() returned a new reference; release it or it leaks
+	obs_data_release(text_settings);
+	obs_source_release(target);
+}
+
 void set_text_callback(struct transcription_filter_data *gf,
 		       const DetectionResultWithText &resultIn)
 {
@@ -302,6 +332,8 @@ void set_text_callback(struct transcription_filter_data *gf,
 
 	gf->last_text = str_copy;
 
+	gf->captions_monitor.addWords(split_words(str_copy));
+
 	if (gf->caption_to_stream) {
 		obs_output_t *streaming_output = obs_frontend_get_streaming_output();
 		if (streaming_output) {
@@ -360,31 +392,10 @@ void set_text_callback(struct transcription_filter_data *gf,
 			gf->sentence_number++;
 		}
 	} else {
-		if (!gf->text_source_mutex) {
-			obs_log(LOG_ERROR, "text_source_mutex is null");
-			return;
+		if (!gf->buffered_output) {
+			// Send the caption to the text source
+			send_caption_to_source(str_copy, gf);
 		}
-
-		if (!gf->text_source) {
-			// attempt to acquire a weak ref to the text source if it's yet available
-			acquire_weak_text_source_ref(gf);
-		}
-
-		std::lock_guard<std::mutex> lock(*gf->text_source_mutex);
-
-		if (!gf->text_source) {
-			obs_log(gf->log_level, "text_source is null");
-			return;
-		}
-		auto target = obs_weak_source_get_source(gf->text_source);
-		if (!target) {
-			obs_log(gf->log_level, "text_source target is null");
-			return;
-		}
-		auto text_settings = obs_source_get_settings(target);
-		obs_data_set_string(text_settings, "text", str_copy.c_str());
-		obs_source_update(target, text_settings);
-		obs_source_release(target);
 	}
 };
 
@@ -414,6 +425,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
 	gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration");
 	gf->last_sub_render_time = 0;
+	gf->buffered_output = obs_data_get_bool(s, "buffered_output");
 
 	bool new_translate = obs_data_get_bool(s, "translate");
 	gf->source_lang = obs_data_get_string(s, "translate_source_language");
@@ -552,6 +564,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	gf->rename_file_to_match_recording =
 		obs_data_get_bool(settings, "rename_file_to_match_recording");
 	gf->process_while_muted = obs_data_get_bool(settings, "process_while_muted");
+	gf->buffered_output = obs_data_get_bool(settings, "buffered_output");
 
 	for (size_t i = 0; i < gf->channels; i++) {
 		circlebuf_init(&gf->input_buffers[i]);
@@ -592,10 +605,62 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	obs_log(gf->log_level, "clear text source data");
 	gf->text_source = nullptr;
 	const char *subtitle_sources = obs_data_get_string(settings, "subtitle_sources");
-	if (subtitle_sources != nullptr) {
-		gf->text_source_name = bstrdup(subtitle_sources);
+	if (subtitle_sources == nullptr || strcmp(subtitle_sources, "none") == 0 ||
+	    strcmp(subtitle_sources, "(null)") == 0 || strlen(subtitle_sources) == 0) {
+		obs_log(LOG_INFO, "create text source");
+		// check if a source called "LocalVocal Subtitles" exists
+		obs_source_t *source = obs_get_source_by_name("LocalVocal Subtitles");
+		if (source) {
+			// source exists, release it
+			obs_source_release(source);
+		} else {
+			// create a new OBS text source called "LocalVocal Subtitles"
+			obs_source_t *scene_as_source = obs_frontend_get_current_scene();
+			obs_scene_t *scene = obs_scene_from_source(scene_as_source);
+			source = obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles",
+						   nullptr, nullptr);
+			if (source) {
+				// add source to the current scene
+				obs_scene_add(scene, source);
+				// set source settings
+				obs_data_t *source_settings = obs_source_get_settings(source);
+				obs_data_set_bool(source_settings, "word_wrap", true);
+				obs_data_set_int(source_settings, "custom_width", 1760);
+				obs_data_t *font_data = obs_data_create();
+				obs_data_set_string(font_data, "face", "Arial");
+				obs_data_set_string(font_data, "style", "Regular");
+				obs_data_set_int(font_data, "size", 72);
+				obs_data_set_int(font_data, "flags", 0);
+				obs_data_set_obj(source_settings, "font", font_data);
+				obs_data_release(font_data);
+				obs_source_update(source, source_settings);
+				obs_data_release(source_settings);
+
+				// set transform settings
+				obs_transform_info transform_info;
+				transform_info.pos.x = 1852.0;
+				transform_info.pos.y = 1034.0;
+				transform_info.bounds.x = 1769.0;
+				transform_info.bounds.y = 145.0;
+				transform_info.bounds_type =
+					obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
+				transform_info.scale.x = 1.0;
+				transform_info.scale.y = 1.0;
+				transform_info.rot = 0.0;
+				obs_sceneitem_t *source_sceneitem =
+					obs_scene_sceneitem_from_source(scene, source);
+				obs_sceneitem_set_info(source_sceneitem, &transform_info);
+				obs_sceneitem_release(source_sceneitem);
+
+				obs_source_release(source);
+			}
+			obs_source_release(scene_as_source);
+		}
+		gf->text_source_name = bstrdup("LocalVocal Subtitles");
+		obs_data_set_string(settings, "subtitle_sources", "LocalVocal Subtitles");
 	} else {
-		gf->text_source_name = nullptr;
+		// set the text source name
+		gf->text_source_name = bstrdup(subtitle_sources);
 	}
 	obs_log(gf->log_level, "clear paths and whisper context");
 	gf->whisper_model_file_currently_loaded = "";
@@ -603,6 +668,15 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 	gf->whisper_model_path = std::string(""); // The update function will set the model path
 	gf->whisper_context = nullptr;
 
+	gf->captions_monitor.initialize(
+		[gf](const std::string &text) {
+			obs_log(LOG_INFO, "Captions: %s", text.c_str());
+			if (gf->buffered_output) {
+				send_caption_to_source(text, gf);
+			}
+		},
+		20, std::chrono::seconds(10));
+
 	obs_log(gf->log_level, "run update");
 	// get the settings updated on the filter data struct
 	transcription_filter_update(gf, settings);
@@ -662,12 +736,11 @@ bool subs_output_select_changed(obs_properties_t *props, obs_property_t *propert
 	// Show or hide the output filename selection input
 	const char *new_output = obs_data_get_string(settings, "subtitle_sources");
 	const bool show_hide = (strcmp(new_output, "text_file") == 0);
-	obs_property_set_visible(obs_properties_get(props, "subtitle_output_filename"), show_hide);
-	obs_property_set_visible(obs_properties_get(props, "subtitle_save_srt"), show_hide);
-	obs_property_set_visible(obs_properties_get(props, "truncate_output_file"), show_hide);
-	obs_property_set_visible(obs_properties_get(props, "only_while_recording"), show_hide);
-	obs_property_set_visible(obs_properties_get(props, "rename_file_to_match_recording"),
-				 show_hide);
+	for (const std::string &prop_name :
+	     {"subtitle_output_filename", "subtitle_save_srt", "truncate_output_file",
+	      "only_while_recording", "rename_file_to_match_recording"}) {
+		obs_property_set_visible(obs_properties_get(props, prop_name.c_str()), show_hide);
+	}
 	return true;
 }
 
@@ -691,9 +764,10 @@ void transcription_filter_defaults(obs_data_t *s)
 {
 	obs_log(LOG_INFO, "filter defaults");
 
+	obs_data_set_default_bool(s, "buffered_output", false);
 	obs_data_set_default_bool(s, "vad_enabled", true);
 	obs_data_set_default_int(s, "log_level", LOG_DEBUG);
-	obs_data_set_default_bool(s, "log_words", true);
+	obs_data_set_default_bool(s, "log_words", false);
 	obs_data_set_default_bool(s, "caption_to_stream", false);
 	obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny En (74Mb)");
 	obs_data_set_default_string(s, "whisper_language_select", "en");
@@ -742,76 +816,11 @@ void transcription_filter_defaults(obs_data_t *s)
 
 obs_properties_t *transcription_filter_properties(void *data)
 {
-	obs_log(LOG_DEBUG, "Add filter properties");
-
 	struct transcription_filter_data *gf =
 		static_cast<struct transcription_filter_data *>(data);
 
 	obs_properties_t *ppts = obs_properties_create();
 
-	obs_properties_add_bool(ppts, "log_words", MT_("log_words"));
-	obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream"));
-
-	obs_properties_add_int_slider(ppts, "buffer_size_msec", MT_("buffer_size_msec"), 1000,
-				      DEFAULT_BUFFER_SIZE_MSEC, 50);
-	obs_properties_add_int_slider(ppts, "overlap_size_msec", MT_("overlap_size_msec"), 50, 300,
-				      50);
-
-	obs_property_t *step_by_step_processing = obs_properties_add_bool(
-		ppts, "step_by_step_processing", MT_("step_by_step_processing"));
-	obs_properties_add_int_slider(ppts, "step_size_msec", MT_("step_size_msec"), 1000,
-				      DEFAULT_BUFFER_SIZE_MSEC, 50);
-	obs_properties_add_int_slider(ppts, "min_sub_duration", MT_("min_sub_duration"), 1000, 5000,
-				      50);
-
-	obs_property_set_modified_callback(step_by_step_processing, [](obs_properties_t *props,
-								       obs_property_t *property,
-								       obs_data_t *settings) {
-		UNUSED_PARAMETER(property);
-		// Show/Hide the step size input
-		obs_property_set_visible(obs_properties_get(props, "step_size_msec"),
-					 obs_data_get_bool(settings, "step_by_step_processing"));
-		return true;
-	});
-
-	// add translation option group
-	obs_properties_t *translation_group = obs_properties_create();
-	obs_property_t *translation_group_prop = obs_properties_add_group(
-		ppts, "translate", MT_("translate"), OBS_GROUP_CHECKABLE, translation_group);
-	// add target language selection
-	obs_property_t *prop_tgt = obs_properties_add_list(
-		translation_group, "translate_target_language", MT_("target_language"),
-		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
-	obs_property_t *prop_src = obs_properties_add_list(
-		translation_group, "translate_source_language", MT_("source_language"),
-		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
-	obs_properties_add_bool(translation_group, "translate_add_context",
-				MT_("translate_add_context"));
-
-	// Populate the dropdown with the language codes
-	for (const auto &language : language_codes) {
-		obs_property_list_add_string(prop_tgt, language.second.c_str(),
-					     language.first.c_str());
-		obs_property_list_add_string(prop_src, language.second.c_str(),
-					     language.first.c_str());
-	}
-
-	// add callback to enable/disable translation group
-	obs_property_set_modified_callback(translation_group_prop, [](obs_properties_t *props,
-								      obs_property_t *property,
-								      obs_data_t *settings) {
-		UNUSED_PARAMETER(property);
-		// Show/Hide the translation group
-		const bool translate_enabled = obs_data_get_bool(settings, "translate");
-		for (const auto &prop : {"translate_target_language", "translate_source_language",
-					 "translate_add_context"}) {
-			obs_property_set_visible(obs_properties_get(props, prop),
-						 translate_enabled);
-		}
-		return true;
-	});
-
-	obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted"));
 	obs_property_t *subs_output =
 		obs_properties_add_list(ppts, "subtitle_sources", MT_("subtitle_sources"),
 					OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
@@ -885,6 +894,43 @@ obs_properties_t *transcription_filter_properties(void *data)
 		return true;
 	});
 
+	// add translation option group
+	obs_properties_t *translation_group = obs_properties_create();
+	obs_property_t *translation_group_prop = obs_properties_add_group(
+		ppts, "translate", MT_("translate"), OBS_GROUP_CHECKABLE, translation_group);
+	// add target language selection
+	obs_property_t *prop_tgt = obs_properties_add_list(
+		translation_group, "translate_target_language", MT_("target_language"),
+		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
+	obs_property_t *prop_src = obs_properties_add_list(
+		translation_group, "translate_source_language", MT_("source_language"),
+		OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
+	obs_properties_add_bool(translation_group, "translate_add_context",
+				MT_("translate_add_context"));
+
+	// Populate the dropdown with the language codes
+	for (const auto &language : language_codes) {
+		obs_property_list_add_string(prop_tgt, language.second.c_str(),
+					     language.first.c_str());
+		obs_property_list_add_string(prop_src, language.second.c_str(),
+					     language.first.c_str());
+	}
+
+	// add callback to enable/disable translation group
+	obs_property_set_modified_callback(translation_group_prop, [](obs_properties_t *props,
+								      obs_property_t *property,
+								      obs_data_t *settings) {
+		UNUSED_PARAMETER(property);
+		// Show/Hide the translation group
+		const bool translate_enabled = obs_data_get_bool(settings, "translate");
+		for (const auto &prop : {"translate_target_language", "translate_source_language",
+					 "translate_add_context"}) {
+			obs_property_set_visible(obs_properties_get(props, prop),
+						 translate_enabled);
+		}
+		return true;
+	});
+
 	obs_property_t *advanced_settings_prop =
 		obs_properties_add_bool(ppts, "advanced_settings", MT_("advanced_settings"));
 	obs_property_set_modified_callback(advanced_settings_prop, [](obs_properties_t *props,
@@ -893,11 +939,44 @@ obs_properties_t *transcription_filter_properties(void *data)
 		UNUSED_PARAMETER(property);
 		// If advanced settings is enabled, show the advanced settings group
 		const bool show_hide = obs_data_get_bool(settings, "advanced_settings");
-		obs_property_set_visible(obs_properties_get(props, "whisper_params_group"),
-					 show_hide);
+		for (const std::string &prop_name :
+		     {"whisper_params_group", "log_words", "caption_to_stream", "buffer_size_msec",
+		      "overlap_size_msec", "step_by_step_processing", "min_sub_duration",
+		      "process_while_muted", "buffered_output"}) {
+			obs_property_set_visible(obs_properties_get(props, prop_name.c_str()),
+						 show_hide);
+		}
 		return true;
 	});
 
+	obs_properties_add_bool(ppts, "buffered_output", MT_("buffered_output"));
+	obs_properties_add_bool(ppts, "log_words", MT_("log_words"));
+	obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream"));
+
+	obs_properties_add_int_slider(ppts, "buffer_size_msec", MT_("buffer_size_msec"), 1000,
+				      DEFAULT_BUFFER_SIZE_MSEC, 250);
+	obs_properties_add_int_slider(ppts, "overlap_size_msec", MT_("overlap_size_msec"), 50, 300,
+				      50);
+
+	obs_property_t *step_by_step_processing = obs_properties_add_bool(
+		ppts, "step_by_step_processing", MT_("step_by_step_processing"));
+	obs_properties_add_int_slider(ppts, "step_size_msec", MT_("step_size_msec"), 1000,
+				      DEFAULT_BUFFER_SIZE_MSEC, 50);
+	obs_properties_add_int_slider(ppts, "min_sub_duration", MT_("min_sub_duration"), 1000, 5000,
+				      50);
+
+	obs_property_set_modified_callback(step_by_step_processing, [](obs_properties_t *props,
+								       obs_property_t *property,
+								       obs_data_t *settings) {
+		UNUSED_PARAMETER(property);
+		// Show/Hide the step size input
+		obs_property_set_visible(obs_properties_get(props, "step_size_msec"),
+					 obs_data_get_bool(settings, "step_by_step_processing"));
+		return true;
+	});
+
+	obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted"));
+
 	obs_properties_t *whisper_params_group = obs_properties_create();
 	obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
 				 OBS_GROUP_NORMAL, whisper_params_group);
diff --git a/src/utils.cpp b/src/utils.cpp
new file mode 100644
index 0000000..6639ae7
--- /dev/null
+++ b/src/utils.cpp
@@ -0,0 +1,21 @@
+#include "utils.h"
+
+// Split str_copy on runs of whitespace and return the non-empty words in order.
+std::vector<std::string> split_words(const std::string &str_copy)
+{
+	std::vector<std::string> words;
+	std::string word;
+	for (const char c : str_copy) {
+		// cast first: std::isspace is undefined for negative char values (UTF-8 bytes)
+		if (!std::isspace(static_cast<unsigned char>(c))) {
+			word += c;
+		} else if (!word.empty()) {
+			words.push_back(word);
+			word.clear();
+		}
+	}
+	if (!word.empty()) {
+		words.push_back(word);
+	}
+	return words;
+}
diff --git a/src/utils.h b/src/utils.h
new file mode 100644
index 0000000..9348417
--- /dev/null
+++ b/src/utils.h
@@ -0,0 +1,9 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <string>
+#include <vector>
+// Split str_copy into its whitespace-separated words (definition in utils.cpp).
+std::vector<std::string> split_words(const std::string &str_copy);
+
+#endif // UTILS_H
diff --git a/src/whisper-utils/whisper-language.h b/src/whisper-utils/whisper-language.h
index 611ab39..f1bfb99 100644
--- a/src/whisper-utils/whisper-language.h
+++ b/src/whisper-utils/whisper-language.h
@@ -7,403 +7,403 @@
 static const std::map<std::string, std::string> whisper_available_lang = {
 	{
 		"auto",
-		"auto",
+		"Auto detect",
 	},
 	{
 		"en",
-		"english",
+		"English",
 	},
 	{
 		"zh",
-		"chinese",
+		"Chinese",
 	},
 	{
 		"de",
-		"german",
+		"German",
 	},
 	{
 		"es",
-		"spanish",
+		"Spanish",
 	},
 	{
 		"ru",
-		"russian",
+		"Russian",
 	},
 	{
 		"ko",
-		"korean",
+		"Korean",
 	},
 	{
 		"fr",
-		"french",
+		"French",
 	},
 	{
 		"ja",
-		"japanese",
+		"Japanese",
 	},
 	{
 		"pt",
-		"portuguese",
+		"Portuguese",
 	},
 	{
 		"tr",
-		"turkish",
+		"Turkish",
 	},
 	{
 		"pl",
-		"polish",
+		"Polish",
 	},
 	{
 		"ca",
-		"catalan",
+		"Catalan",
 	},
 	{
 		"nl",
-		"dutch",
+		"Dutch",
 	},
 	{
 		"ar",
-		"arabic",
+		"Arabic",
 	},
 	{
 		"sv",
-		"swedish",
+		"Swedish",
 	},
 	{
 		"it",
-		"italian",
+		"Italian",
 	},
 	{
 		"id",
-		"indonesian",
+		"Indonesian",
 	},
 	{
 		"hi",
-		"hindi",
+		"Hindi",
 	},
 	{
 		"fi",
-		"finnish",
+		"Finnish",
 	},
 	{
 		"vi",
-		"vietnamese",
+		"Vietnamese",
 	},
 	{
 		"he",
-		"hebrew",
+		"Hebrew",
 	},
 	{
 		"uk",
-		"ukrainian",
+		"Ukrainian",
 	},
 	{
 		"el",
-		"greek",
+		"Greek",
 	},
 	{
 		"ms",
-		"malay",
+		"Malay",
 	},
 	{
 		"cs",
-		"czech",
+		"Czech",
 	},
 	{
 		"ro",
-		"romanian",
+		"Romanian",
 	},
 	{
 		"da",
-		"danish",
+		"Danish",
 	},
 	{
 		"hu",
-		"hungarian",
+		"Hungarian",
 	},
 	{
 		"ta",
-		"tamil",
+		"Tamil",
 	},
 	{
 		"no",
-		"norwegian",
+		"Norwegian",
 	},
 	{
 		"th",
-		"thai",
+		"Thai",
 	},
 	{
 		"ur",
-		"urdu",
+		"Urdu",
 	},
 	{
 		"hr",
-		"croatian",
+		"Croatian",
 	},
 	{
 		"bg",
-		"bulgarian",
+		"Bulgarian",
 	},
 	{
 		"lt",
-		"lithuanian",
+		"Lithuanian",
 	},
 	{
 		"la",
-		"latin",
+		"Latin",
 	},
 	{
 		"mi",
-		"maori",
+		"Maori",
 	},
 	{
 		"ml",
-		"malayalam",
+		"Malayalam",
 	},
 	{
 		"cy",
-		"welsh",
+		"Welsh",
 	},
 	{
 		"sk",
-		"slovak",
+		"Slovak",
 	},
 	{
 		"te",
-		"telugu",
+		"Telugu",
 	},
 	{
 		"fa",
-		"persian",
+		"Persian",
 	},
 	{
 		"lv",
-		"latvian",
+		"Latvian",
 	},
 	{
 		"bn",
-		"bengali",
+		"Bengali",
 	},
 	{
 		"sr",
-		"serbian",
+		"Serbian",
 	},
 	{
 		"az",
-		"azerbaijani",
+		"Azerbaijani",
 	},
 	{
 		"sl",
-		"slovenian",
+		"Slovenian",
 	},
 	{
 		"kn",
-		"kannada",
+		"Kannada",
 	},
 	{
 		"et",
-		"estonian",
+		"Estonian",
 	},
 	{
 		"mk",
-		"macedonian",
+		"Macedonian",
 	},
 	{
 		"br",
-		"breton",
+		"Breton",
 	},
 	{
 		"eu",
-		"basque",
+		"Basque",
 	},
 	{
 		"is",
-		"icelandic",
+		"Icelandic",
 	},
 	{
 		"hy",
-		"armenian",
+		"Armenian",
 	},
 	{
 		"ne",
-		"nepali",
+		"Nepali",
 	},
 	{
 		"mn",
-		"mongolian",
+		"Mongolian",
 	},
 	{
 		"bs",
-		"bosnian",
+		"Bosnian",
 	},
 	{
 		"kk",
-		"kazakh",
+		"Kazakh",
 	},
 	{
 		"sq",
-		"albanian",
+		"Albanian",
 	},
 	{
 		"sw",
-		"swahili",
+		"Swahili",
 	},
 	{
 		"gl",
-		"galician",
+		"Galician",
 	},
 	{
 		"mr",
-		"marathi",
+		"Marathi",
 	},
 	{
 		"pa",
-		"punjabi",
+		"Punjabi",
 	},
 	{
 		"si",
-		"sinhala",
+		"Sinhala",
 	},
 	{
 		"km",
-		"khmer",
+		"Khmer",
 	},
 	{
 		"sn",
-		"shona",
+		"Shona",
 	},
 	{
 		"yo",
-		"yoruba",
+		"Yoruba",
 	},
 	{
 		"so",
-		"somali",
+		"Somali",
 	},
 	{
 		"af",
-		"afrikaans",
+		"Afrikaans",
 	},
 	{
 		"oc",
-		"occitan",
+		"Occitan",
 	},
 	{
 		"ka",
-		"georgian",
+		"Georgian",
 	},
 	{
 		"be",
-		"belarusian",
+		"Belarusian",
 	},
 	{
 		"tg",
-		"tajik",
+		"Tajik",
 	},
 	{
 		"sd",
-		"sindhi",
+		"Sindhi",
 	},
 	{
 		"gu",
-		"gujarati",
+		"Gujarati",
 	},
 	{
 		"am",
-		"amharic",
+		"Amharic",
 	},
 	{
 		"yi",
-		"yiddish",
+		"Yiddish",
 	},
 	{
 		"lo",
-		"lao",
+		"Lao",
 	},
 	{
 		"uz",
-		"uzbek",
+		"Uzbek",
 	},
 	{
 		"fo",
-		"faroese",
+		"Faroese",
 	},
 	{
 		"ht",
-		"haitian",
+		"Haitian",
 	},
 	{
 		"ps",
-		"pashto",
+		"Pashto",
 	},
 	{
 		"tk",
-		"turkmen",
+		"Turkmen",
 	},
 	{
 		"nn",
-		"nynorsk",
+		"Nynorsk",
 	},
 	{
 		"mt",
-		"maltese",
+		"Maltese",
 	},
 	{
 		"sa",
-		"sanskrit",
+		"Sanskrit",
 	},
 	{
 		"lb",
-		"luxembourgish",
+		"Luxembourgish",
 	},
 	{
 		"my",
-		"myanmar",
+		"Myanmar",
 	},
 	{
 		"bo",
-		"tibetan",
+		"Tibetan",
 	},
 	{
 		"tl",
-		"tagalog",
+		"Tagalog",
 	},
 	{
 		"mg",
-		"malagasy",
+		"Malagasy",
 	},
 	{
 		"as",
-		"assamese",
+		"Assamese",
 	},
 	{
 		"tt",
-		"tatar",
+		"Tatar",
 	},
 	{
 		"haw",
-		"hawaiian",
+		"Hawaiian",
 	},
 	{
 		"ln",
-		"lingala",
+		"Lingala",
 	},
 	{
 		"ha",
-		"hausa",
+		"Hausa",
 	},
 	{
 		"ba",
-		"bashkir",
+		"Bashkir",
 	},
 	{
 		"jw",
-		"javanese",
+		"Javanese",
 	},
 	{
 		"su",
-		"sundanese",
+		"Sundanese",
 	},
 };
 
diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
index 3fb8944..807cb54 100644
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@@ -132,14 +132,21 @@ struct whisper_context *init_whisper_context(const std::string &model_path_in)
 		[](enum ggml_log_level level, const char *text, void *user_data) {
 			UNUSED_PARAMETER(level);
 			UNUSED_PARAMETER(user_data);
-			obs_log(LOG_INFO, "Whisper: %s", text);
+			// remove trailing newline
+			char *text_copy = bstrdup(text);
+			text_copy[strcspn(text_copy, "\n")] = 0;
+			obs_log(LOG_INFO, "Whisper: %s", text_copy);
+			bfree(text_copy);
 		},
 		nullptr);
 
 	struct whisper_context_params cparams = whisper_context_default_params();
 #ifdef LOCALVOCAL_WITH_CUDA
 	cparams.use_gpu = true;
-	obs_log(LOG_INFO, "Using GPU for inference, device %d", cparams.gpu_device);
+	obs_log(LOG_INFO, "Using CUDA GPU for inference, device %d", cparams.gpu_device);
+#elif defined(LOCALVOCAL_WITH_CLBLAST)
+	cparams.use_gpu = true;
+	obs_log(LOG_INFO, "Using OpenCL for inference");
 #else
 	cparams.use_gpu = false;
 	obs_log(LOG_INFO, "Using CPU for inference");
@@ -191,6 +198,16 @@ struct whisper_context *init_whisper_context(const std::string &model_path_in)
 struct DetectionResultWithText run_whisper_inference(struct transcription_filter_data *gf,
 						     const float *pcm32f_data, size_t pcm32f_size)
 {
+	if (gf == nullptr) {
+		obs_log(LOG_ERROR, "run_whisper_inference: gf is null");
+		return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
+	}
+
+	if (pcm32f_data == nullptr || pcm32f_size == 0) {
+		obs_log(LOG_ERROR, "run_whisper_inference: pcm32f_data is null or size is 0");
+		return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
+	}
+
 	obs_log(gf->log_level, "%s: processing %d samples, %.3f sec, %d threads", __func__,
 		int(pcm32f_size), float(pcm32f_size) / WHISPER_SAMPLE_RATE,
 		gf->whisper_params.n_threads);
@@ -201,8 +218,6 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
 		return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
 	}
 
-	// set duration in ms
-	const uint64_t duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE);
 	// Get the duration in ms since the beginning of the stream (gf->start_timestamp_ms)
 	const uint64_t offset_ms =
 		(uint64_t)(std::chrono::duration_cast<std::chrono::milliseconds>(
@@ -226,6 +241,9 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter
 		obs_log(LOG_WARNING, "failed to process audio, error %d", whisper_full_result);
 		return {DETECTION_RESULT_UNKNOWN, "", 0, 0};
 	} else {
+		// duration in ms
+		const uint64_t duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE);
+
 		const int n_segment = 0;
 		const char *text = whisper_full_get_segment_text(gf->whisper_context, n_segment);
 		const int64_t t0 = offset_ms;
diff --git a/src/whisper-utils/whisper-processing.h b/src/whisper-utils/whisper-processing.h
index edc8a66..3e189fe 100644
--- a/src/whisper-utils/whisper-processing.h
+++ b/src/whisper-utils/whisper-processing.h
@@ -2,7 +2,7 @@
 #define WHISPER_PROCESSING_H
 
 // buffer size in msec
-#define DEFAULT_BUFFER_SIZE_MSEC 3000
+#define DEFAULT_BUFFER_SIZE_MSEC 2000
 // overlap in msec
 #define DEFAULT_OVERLAP_SIZE_MSEC 100
 

From be6f027c54e116d5aa3ffb6df68282c9765edcf9 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 18 Apr 2024 00:42:51 -0400
Subject: [PATCH 2/6] Update CPU_OR_CUDA environment variable error messages

---
 cmake/BuildCTranslate2.cmake | 3 ++-
 cmake/BuildWhispercpp.cmake  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cmake/BuildCTranslate2.cmake b/cmake/BuildCTranslate2.cmake
index 41dcc1c..c0a4eac 100644
--- a/cmake/BuildCTranslate2.cmake
+++ b/cmake/BuildCTranslate2.cmake
@@ -21,7 +21,8 @@ elseif(WIN32)
 
   # check CPU_OR_CUDA environment variable
   if(NOT DEFINED ENV{CPU_OR_CUDA})
-    message(FATAL_ERROR "Please set the CPU_OR_CUDA environment variable to either `cpu`, `clblast`, `12.2.0` or `11.8.0`")
+    message(
+      FATAL_ERROR "Please set the CPU_OR_CUDA environment variable to either `cpu`, `clblast`, `12.2.0` or `11.8.0`")
   endif()
 
   if($ENV{CPU_OR_CUDA} STREQUAL "cpu" OR $ENV{CPU_OR_CUDA} STREQUAL "clblast")
diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake
index 50ad450..d9b8d96 100644
--- a/cmake/BuildWhispercpp.cmake
+++ b/cmake/BuildWhispercpp.cmake
@@ -45,7 +45,8 @@ elseif(WIN32)
   if(NOT DEFINED ENV{CPU_OR_CUDA})
     message(
       FATAL_ERROR
-        "The CPU_OR_CUDA environment variable is not set. Please set it to either `cpu`, `clblast` or `11.8.0` or `12.2.0`")
+        "The CPU_OR_CUDA environment variable is not set. Please set it to either `cpu`, `clblast` or `11.8.0` or `12.2.0`"
+    )
   endif(NOT DEFINED ENV{CPU_OR_CUDA})
 
   set(ARCH_PREFIX $ENV{CPU_OR_CUDA})

From 501a217923e5021f38b1e99573e2de44e7fb2f52 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 18 Apr 2024 00:55:13 -0400
Subject: [PATCH 3/6] Update Cublas validation in Package-Windows.ps1 and
 initialize function in captions-thread.h

---
 .github/scripts/Package-Windows.ps1 |  4 +++-
 src/captions-thread.h               | 10 +++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/.github/scripts/Package-Windows.ps1 b/.github/scripts/Package-Windows.ps1
index cd12f90..e2c86e2 100644
--- a/.github/scripts/Package-Windows.ps1
+++ b/.github/scripts/Package-Windows.ps1
@@ -4,7 +4,7 @@ param(
     [string] $Target = 'x64',
     [ValidateSet('Debug', 'RelWithDebInfo', 'Release', 'MinSizeRel')]
     [string] $Configuration = 'RelWithDebInfo',
-    [ValidateSet('cpu', '12.2.0', '11.8.0')]
+    [ValidateSet('cpu', 'cblast', '12.2.0', '11.8.0')]
     [string] $Cublas = 'cpu',
     [switch] $BuildInstaller,
     [switch] $SkipDeps
@@ -52,6 +52,8 @@ function Package {
     # Check if $cublas is cpu or cuda
     if ( $Cublas -eq 'cpu' ) {
         $CudaName = 'cpu'
+    } elseif ( $Cublas -eq 'clblast' ) {
+        $CudaName = 'clblast'
     } else {
         $CudaName = "cuda${Cublas}"
     }
diff --git a/src/captions-thread.h b/src/captions-thread.h
index 5ebf17b..5b15ca5 100644
--- a/src/captions-thread.h
+++ b/src/captions-thread.h
@@ -29,13 +29,13 @@ class CaptionMonitor {
 		workerThread.join();
 	}
 
-	void initialize(std::function<void(const std::string &)> callback, size_t maxSize,
-			std::chrono::seconds maxTime)
+	void initialize(std::function<void(const std::string &)> callback_, size_t maxSize_,
+			std::chrono::seconds maxTime_)
 	{
 		obs_log(LOG_INFO, "CaptionMonitor::initialize");
-		this->callback = callback;
-		this->maxSize = maxSize;
-		this->maxTime = maxTime;
+		this->callback = callback_;
+		this->maxSize = maxSize_;
+		this->maxTime = maxTime_;
 		this->initialized = true;
 		this->workerThread = std::thread(&CaptionMonitor::monitor, this);
 	}

From 9385a0c987c8b6f7af0e46bdc0de3d6755ca9efc Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 18 Apr 2024 00:59:55 -0400
Subject: [PATCH 4/6] Update Cublas validation and fix typo in
 Package-Windows.ps1

---
 .github/scripts/Package-Windows.ps1 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/Package-Windows.ps1 b/.github/scripts/Package-Windows.ps1
index e2c86e2..1721272 100644
--- a/.github/scripts/Package-Windows.ps1
+++ b/.github/scripts/Package-Windows.ps1
@@ -4,7 +4,7 @@ param(
     [string] $Target = 'x64',
     [ValidateSet('Debug', 'RelWithDebInfo', 'Release', 'MinSizeRel')]
     [string] $Configuration = 'RelWithDebInfo',
-    [ValidateSet('cpu', 'cblast', '12.2.0', '11.8.0')]
+    [ValidateSet('cpu', 'clblast', '12.2.0', '11.8.0')]
     [string] $Cublas = 'cpu',
     [switch] $BuildInstaller,
     [switch] $SkipDeps

From 66f1afc54a55442a8df9325e607e3d32d4f45897 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 18 Apr 2024 08:57:39 -0400
Subject: [PATCH 5/6] Update default whisper model path to Whisper Tiny English
 (74Mb)

---
 src/transcription-filter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index e7c28bf..8c05743 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -769,7 +769,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_int(s, "log_level", LOG_DEBUG);
 	obs_data_set_default_bool(s, "log_words", false);
 	obs_data_set_default_bool(s, "caption_to_stream", false);
-	obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny En (74Mb)");
+	obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny English (74Mb)");
 	obs_data_set_default_string(s, "whisper_language_select", "en");
 	obs_data_set_default_string(s, "subtitle_sources", "none");
 	obs_data_set_default_bool(s, "step_by_step_processing", false);

From 7873def295294c92a241991af6829c35492c805b Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 18 Apr 2024 09:21:45 -0400
Subject: [PATCH 6/6] Update translation strings for multiple locales

---
 data/locale/ar-SA.ini        |  2 ++
 data/locale/de-DE.ini        |  2 ++
 data/locale/en-US.ini        |  4 +++-
 data/locale/es-ES.ini        |  2 ++
 data/locale/fr-FR.ini        |  2 ++
 data/locale/hi-IN.ini        |  2 ++
 data/locale/ja-JP.ini        |  2 ++
 data/locale/ko-KR.ini        |  2 ++
 data/locale/pl-PL.ini        |  2 ++
 data/locale/pt-BR.ini        |  2 ++
 data/locale/ru-RU.ini        |  2 ++
 data/locale/zh-CN.ini        |  2 ++
 src/captions-thread.h        |  9 ---------
 src/transcription-filter.cpp | 26 +++++++++++++++-----------
 14 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/data/locale/ar-SA.ini b/data/locale/ar-SA.ini
index bdd2fe8..5c08e38 100644
--- a/data/locale/ar-SA.ini
+++ b/data/locale/ar-SA.ini
@@ -49,3 +49,5 @@ source_language="لغة المصدر"
 translate="ترجمة (⚠️ زيادة المعالجة)"
 translate_add_context="الترجمة مع السياق"
 whisper_translate="ترجمة إلى الإنجليزية (Whisper)"
+buffer_size_msec="حجم الذاكرة المؤقتة (ملي ثانية)"
+overlap_size_msec="حجم التداخل (ملي ثانية)"
diff --git a/data/locale/de-DE.ini b/data/locale/de-DE.ini
index e83f8b9..df2e450 100644
--- a/data/locale/de-DE.ini
+++ b/data/locale/de-DE.ini
@@ -49,3 +49,5 @@ source_language="Quellsprache"
 translate="Übersetzen (⚠️ erhöhte Verarbeitung)"
 translate_add_context="Mit Kontext übersetzen"
 whisper_translate="Ins Englische übersetzen (Flüstern)"
+buffer_size_msec="Puffergröße (ms)"
+overlap_size_msec="Überlappungsgröße (ms)"
diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
index 97dba52..6f7c1e9 100644
--- a/data/locale/en-US.ini
+++ b/data/locale/en-US.ini
@@ -12,7 +12,7 @@ text_file_output="Text File output"
 output_filename="Output filename"
 whisper_model="Whisper Model"
 external_model_file="External model file"
-whisper_parameters="Advanced Settings"
+whisper_parameters="Whisper Model Parameters"
 language="Language"
 whisper_sampling_method="Whisper Sampling Method"
 n_threads="Number of threads"
@@ -49,3 +49,5 @@ source_language="Source language"
 translate="Translate (⚠️ increased processing)"
 translate_add_context="Translate with context"
 whisper_translate="Translate to English (Whisper)"
+buffer_size_msec="Buffer size (ms)"
+overlap_size_msec="Overlap size (ms)"
diff --git a/data/locale/es-ES.ini b/data/locale/es-ES.ini
index 8634c3f..59dc350 100644
--- a/data/locale/es-ES.ini
+++ b/data/locale/es-ES.ini
@@ -49,3 +49,5 @@ source_language="Idioma fuente"
 translate="Traducir (⚠️ procesamiento aumentado)"
 translate_add_context="Traducir con contexto"
 whisper_translate="Traducir al inglés (Whisper)"
+buffer_size_msec="Tamaño del búfer (ms)"
+overlap_size_msec="Tamaño de superposición (ms)"
diff --git a/data/locale/fr-FR.ini b/data/locale/fr-FR.ini
index 2a8759a..0b6cd0c 100644
--- a/data/locale/fr-FR.ini
+++ b/data/locale/fr-FR.ini
@@ -49,3 +49,5 @@ source_language="Langue source"
 translate="Traduire (⚠️ traitement accru)"
 translate_add_context="Traduire avec contexte"
 whisper_translate="Traduire en anglais (Whisper)"
+buffer_size_msec="Taille du tampon (ms)"
+overlap_size_msec="Taille de chevauchement (ms)"
diff --git a/data/locale/hi-IN.ini b/data/locale/hi-IN.ini
index 6e61512..a7b2d26 100644
--- a/data/locale/hi-IN.ini
+++ b/data/locale/hi-IN.ini
@@ -49,3 +49,5 @@ source_language="स्रोत भाषा"
 translate="अनुवाद करें (⚠️ बढ़ी प्रसंस्करण)"
 translate_add_context="संदर्भ के साथ अनुवाद करें"
 whisper_translate="अंग्रेजी में अनुवाद करें (व्हिस्पर)"
+buffer_size_msec="बफ़र आकार (ms)"
+overlap_size_msec="ओवरलैप आकार (ms)"
diff --git a/data/locale/ja-JP.ini b/data/locale/ja-JP.ini
index 8cce9b5..18445bc 100644
--- a/data/locale/ja-JP.ini
+++ b/data/locale/ja-JP.ini
@@ -49,3 +49,5 @@ source_language="ソース言語"
 translate="翻訳 (⚠️処理増加)"
 translate_add_context="コンテキスト付きで翻訳"
 whisper_translate="英語に翻訳（ウィスパー）"
+buffer_size_msec="バッファサイズ（ms）"
+overlap_size_msec="オーバーラップサイズ（ms）"
diff --git a/data/locale/ko-KR.ini b/data/locale/ko-KR.ini
index 4014456..8dbe564 100644
--- a/data/locale/ko-KR.ini
+++ b/data/locale/ko-KR.ini
@@ -49,3 +49,5 @@ source_language="원본 언어"
 translate="번역 (⚠️ 처리 시간 증가)"
 translate_add_context="컨텍스트와 함께 번역"
 whisper_translate="영어로 번역 (속삭임)"
+buffer_size_msec="버퍼 크기 (ms)"
+overlap_size_msec="오버랩 크기 (ms)"
diff --git a/data/locale/pl-PL.ini b/data/locale/pl-PL.ini
index 1025bb1..64ee55b 100644
--- a/data/locale/pl-PL.ini
+++ b/data/locale/pl-PL.ini
@@ -49,3 +49,5 @@ source_language="Język źródłowy"
 translate="Tłumacz (⚠️ zwiększone przetwarzanie)"
 translate_add_context="Tłumacz z kontekstem"
 whisper_translate="Tłumacz na angielski (Whisper)"
+buffer_size_msec="Rozmiar bufora (ms)"
+overlap_size_msec="Rozmiar nakładki (ms)"
diff --git a/data/locale/pt-BR.ini b/data/locale/pt-BR.ini
index 73ab770..2f0a27e 100644
--- a/data/locale/pt-BR.ini
+++ b/data/locale/pt-BR.ini
@@ -49,3 +49,5 @@ source_language="Língua de origem"
 translate="Traduzir (⚠️ o processamento aumentará)"
 translate_add_context="Traduzir com contexto"
 whisper_translate="Traduzir para inglês (Whisper)"
+buffer_size_msec="Tamanho do buffer (ms)"
+overlap_size_msec="Tamanho da sobreposição (ms)"
diff --git a/data/locale/ru-RU.ini b/data/locale/ru-RU.ini
index 8428761..23090b6 100644
--- a/data/locale/ru-RU.ini
+++ b/data/locale/ru-RU.ini
@@ -48,3 +48,5 @@ source_language="Исходный язык"
 translate="Перевести (⚠️ обработка будет увеличена)"
 translate_add_context="Перевести с контекстом"
 whisper_translate="Перевести на английский (Whisper)"
+buffer_size_msec="Размер буфера (мс)"
+overlap_size_msec="Размер перекрытия (мс)"
diff --git a/data/locale/zh-CN.ini b/data/locale/zh-CN.ini
index a9e382e..a561f35 100644
--- a/data/locale/zh-CN.ini
+++ b/data/locale/zh-CN.ini
@@ -49,3 +49,5 @@ source_language="源语言"
 translate="翻译 （⚠️ 增加处理）"
 translate_add_context="带上下文翻译"
 whisper_translate="翻译为英语（Whisper）"
+buffer_size_msec="缓冲区大小（毫秒）"
+overlap_size_msec="重叠大小（毫秒）"
diff --git a/src/captions-thread.h b/src/captions-thread.h
index 5b15ca5..1cdb079 100644
--- a/src/captions-thread.h
+++ b/src/captions-thread.h
@@ -32,7 +32,6 @@ class CaptionMonitor {
 	void initialize(std::function<void(const std::string &)> callback_, size_t maxSize_,
 			std::chrono::seconds maxTime_)
 	{
-		obs_log(LOG_INFO, "CaptionMonitor::initialize");
 		this->callback = callback_;
 		this->maxSize = maxSize_;
 		this->maxTime = maxTime_;
@@ -49,8 +48,6 @@ class CaptionMonitor {
 			}
 			this->newDataAvailable = true;
 		}
-		obs_log(LOG_INFO, "CaptionMonitor::addWords: number of words in queue: %d",
-			wordQueue.size());
 		condVar.notify_all();
 	}
 
@@ -66,7 +63,6 @@ class CaptionMonitor {
 					   [this] { return this->newDataAvailable || this->stop; });
 
 			if (this->stop) {
-				obs_log(LOG_INFO, "CaptionMonitor::monitor: stopping");
 				break;
 			}
 
@@ -74,9 +70,6 @@ class CaptionMonitor {
 				continue;
 			}
 
-			obs_log(LOG_INFO, "CaptionMonitor::monitor: wordQueue size: %d",
-				this->wordQueue.size());
-
 			// emit up to maxSize words from the wordQueue
 			std::vector<std::string> emitted;
 			while (!this->wordQueue.empty() && emitted.size() <= this->maxSize) {
@@ -99,8 +92,6 @@ class CaptionMonitor {
 				// flush the queue if it's full or we've reached the max time
 				size_t words_to_flush =
 					std::min(this->wordQueue.size(), this->maxSize);
-				obs_log(LOG_INFO, "CaptionMonitor::monitor: flushing %d words",
-					words_to_flush);
 				for (size_t i = 0; i < words_to_flush; ++i) {
 					wordQueue.pop_front();
 				}
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 8c05743..c12b2a7 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -332,7 +332,9 @@ void set_text_callback(struct transcription_filter_data *gf,
 
 	gf->last_text = str_copy;
 
-	gf->captions_monitor.addWords(split_words(str_copy));
+	if (gf->buffered_output) {
+		gf->captions_monitor.addWords(split_words(str_copy));
+	}
 
 	if (gf->caption_to_stream) {
 		obs_output_t *streaming_output = obs_frontend_get_streaming_output();
@@ -638,12 +640,14 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 
 				// set transform settings
 				obs_transform_info transform_info;
-				transform_info.pos.x = 1852.0;
-				transform_info.pos.y = 1034.0;
+				transform_info.pos.x = 962.0;
+				transform_info.pos.y = 959.0;
 				transform_info.bounds.x = 1769.0;
 				transform_info.bounds.y = 145.0;
 				transform_info.bounds_type =
 					obs_bounds_type::OBS_BOUNDS_SCALE_INNER;
+				transform_info.bounds_alignment = OBS_ALIGN_CENTER;
+				transform_info.alignment = OBS_ALIGN_CENTER;
 				transform_info.scale.x = 1.0;
 				transform_info.scale.y = 1.0;
 				transform_info.rot = 0.0;
@@ -942,7 +946,7 @@ obs_properties_t *transcription_filter_properties(void *data)
 		for (const std::string &prop_name :
 		     {"whisper_params_group", "log_words", "caption_to_stream", "buffer_size_msec",
 		      "overlap_size_msec", "step_by_step_processing", "min_sub_duration",
-		      "process_while_muted", "buffered_output"}) {
+		      "process_while_muted", "buffered_output", "vad_enabled", "log_level"}) {
 			obs_property_set_visible(obs_properties_get(props, prop_name.c_str()),
 						 show_hide);
 		}
@@ -977,18 +981,18 @@ obs_properties_t *transcription_filter_properties(void *data)
 
 	obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted"));
 
-	obs_properties_t *whisper_params_group = obs_properties_create();
-	obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
-				 OBS_GROUP_NORMAL, whisper_params_group);
+	obs_properties_add_bool(ppts, "vad_enabled", MT_("vad_enabled"));
 
-	obs_properties_add_bool(whisper_params_group, "vad_enabled", MT_("vad_enabled"));
-	obs_property_t *list = obs_properties_add_list(whisper_params_group, "log_level",
-						       MT_("log_level"), OBS_COMBO_TYPE_LIST,
-						       OBS_COMBO_FORMAT_INT);
+	obs_property_t *list = obs_properties_add_list(ppts, "log_level", MT_("log_level"),
+						       OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT);
 	obs_property_list_add_int(list, "DEBUG", LOG_DEBUG);
 	obs_property_list_add_int(list, "INFO", LOG_INFO);
 	obs_property_list_add_int(list, "WARNING", LOG_WARNING);
 
+	obs_properties_t *whisper_params_group = obs_properties_create();
+	obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"),
+				 OBS_GROUP_NORMAL, whisper_params_group);
+
 	// Add language selector
 	obs_property_t *whisper_language_select_list = obs_properties_add_list(
 		whisper_params_group, "whisper_language_select", MT_("language"),