From f4d2cfc71c7fff6788a36bda110e043d4d31fe50 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Wed, 21 Aug 2024 23:08:59 -0400 Subject: [PATCH 01/18] Fix hangups and VAD segmentation --- CMakeLists.txt | 2 + src/transcription-filter-data.h | 1 + src/transcription-filter-properties.cpp | 73 +++++++ src/transcription-filter.cpp | 70 +------ src/whisper-utils/vad-processing.cpp | 237 +++++++++++++++++++++ src/whisper-utils/vad-processing.h | 15 ++ src/whisper-utils/whisper-processing.cpp | 252 ++--------------------- src/whisper-utils/whisper-processing.h | 4 +- 8 files changed, 343 insertions(+), 311 deletions(-) create mode 100644 src/whisper-utils/vad-processing.cpp create mode 100644 src/whisper-utils/vad-processing.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7eff873..e5fa82a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -114,6 +114,7 @@ target_sources( src/whisper-utils/whisper-model-utils.cpp src/whisper-utils/silero-vad-onnx.cpp src/whisper-utils/token-buffer-thread.cpp + src/whisper-utils/vad-processing.cpp src/translation/language_codes.cpp src/translation/translation.cpp src/translation/translation-utils.cpp @@ -137,6 +138,7 @@ if(ENABLE_TESTS) src/whisper-utils/whisper-utils.cpp src/whisper-utils/silero-vad-onnx.cpp src/whisper-utils/token-buffer-thread.cpp + src/whisper-utils/vad-processing.cpp src/translation/language_codes.cpp src/translation/translation.cpp) diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 4b16d13..9cefd13 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -84,6 +84,7 @@ struct transcription_filter_data { bool initial_creation = true; bool partial_transcription = false; int partial_latency = 1000; + float duration_filter_threshold = 2.25f; // Last transcription result std::string last_text; diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index 523bbf8..acecf4f 100644 --- 
a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -320,6 +320,9 @@ void add_advanced_group_properties(obs_properties_t *ppts, struct transcription_ // add vad threshold slider obs_properties_add_float_slider(advanced_config_group, "vad_threshold", MT_("vad_threshold"), 0.0, 1.0, 0.05); + // add duration filter threshold slider + obs_properties_add_float_slider(advanced_config_group, "duration_filter_threshold", + MT_("duration_filter_threshold"), 0.1, 3.0, 0.05); // add button to open filter and replace UI dialog obs_properties_add_button2( @@ -507,3 +510,73 @@ obs_properties_t *transcription_filter_properties(void *data) UNUSED_PARAMETER(data); return ppts; } + +void transcription_filter_defaults(obs_data_t *s) +{ + obs_log(LOG_DEBUG, "filter defaults"); + + obs_data_set_default_bool(s, "buffered_output", false); + obs_data_set_default_int(s, "buffer_num_lines", 2); + obs_data_set_default_int(s, "buffer_num_chars_per_line", 30); + obs_data_set_default_int(s, "buffer_output_type", + (int)TokenBufferSegmentation::SEGMENTATION_TOKEN); + + obs_data_set_default_bool(s, "vad_enabled", true); + obs_data_set_default_double(s, "vad_threshold", 0.65); + obs_data_set_default_double(s, "duration_filter_threshold", 2.25); + obs_data_set_default_int(s, "log_level", LOG_DEBUG); + obs_data_set_default_bool(s, "log_words", false); + obs_data_set_default_bool(s, "caption_to_stream", false); + obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny English (74Mb)"); + obs_data_set_default_string(s, "whisper_language_select", "en"); + obs_data_set_default_string(s, "subtitle_sources", "none"); + obs_data_set_default_bool(s, "process_while_muted", false); + obs_data_set_default_bool(s, "subtitle_save_srt", false); + obs_data_set_default_bool(s, "truncate_output_file", false); + obs_data_set_default_bool(s, "only_while_recording", false); + obs_data_set_default_bool(s, "rename_file_to_match_recording", true); + 
obs_data_set_default_int(s, "min_sub_duration", 3000); + obs_data_set_default_bool(s, "advanced_settings", false); + obs_data_set_default_bool(s, "translate", false); + obs_data_set_default_string(s, "translate_target_language", "__es__"); + obs_data_set_default_bool(s, "translate_add_context", true); + obs_data_set_default_string(s, "translate_model", "whisper-based-translation"); + obs_data_set_default_string(s, "translation_model_path_external", ""); + obs_data_set_default_int(s, "translate_input_tokenization_style", INPUT_TOKENIZAION_M2M100); + obs_data_set_default_double(s, "sentence_psum_accept_thresh", 0.4); + obs_data_set_default_bool(s, "partial_group", false); + obs_data_set_default_int(s, "partial_latency", 1100); + + // translation options + obs_data_set_default_double(s, "translation_sampling_temperature", 0.1); + obs_data_set_default_double(s, "translation_repetition_penalty", 2.0); + obs_data_set_default_int(s, "translation_beam_size", 1); + obs_data_set_default_int(s, "translation_max_decoding_length", 65); + obs_data_set_default_int(s, "translation_no_repeat_ngram_size", 1); + obs_data_set_default_int(s, "translation_max_input_length", 65); + + // Whisper parameters + obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH); + obs_data_set_default_string(s, "initial_prompt", ""); + obs_data_set_default_int(s, "n_threads", 4); + obs_data_set_default_int(s, "n_max_text_ctx", 16384); + obs_data_set_default_bool(s, "whisper_translate", false); + obs_data_set_default_bool(s, "no_context", true); + obs_data_set_default_bool(s, "single_segment", true); + obs_data_set_default_bool(s, "print_special", false); + obs_data_set_default_bool(s, "print_progress", false); + obs_data_set_default_bool(s, "print_realtime", false); + obs_data_set_default_bool(s, "print_timestamps", false); + obs_data_set_default_bool(s, "token_timestamps", false); + obs_data_set_default_bool(s, "dtw_token_timestamps", false); + 
obs_data_set_default_double(s, "thold_pt", 0.01); + obs_data_set_default_double(s, "thold_ptsum", 0.01); + obs_data_set_default_int(s, "max_len", 0); + obs_data_set_default_bool(s, "split_on_word", true); + obs_data_set_default_int(s, "max_tokens", 0); + obs_data_set_default_bool(s, "suppress_blank", false); + obs_data_set_default_bool(s, "suppress_non_speech_tokens", true); + obs_data_set_default_double(s, "temperature", 0.1); + obs_data_set_default_double(s, "max_initial_ts", 1.0); + obs_data_set_default_double(s, "length_penalty", -1.0); +} diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 3683c18..d47876a 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -188,6 +188,7 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->process_while_muted = obs_data_get_bool(s, "process_while_muted"); gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration"); gf->last_sub_render_time = now_ms(); + gf->duration_filter_threshold = (float)obs_data_get_double(s, "duration_filter_threshold"); gf->partial_transcription = obs_data_get_bool(s, "partial_group"); gf->partial_latency = (int)obs_data_get_int(s, "partial_latency"); bool new_buffered_output = obs_data_get_bool(s, "buffered_output"); @@ -551,72 +552,3 @@ void transcription_filter_hide(void *data) static_cast(data); obs_log(gf->log_level, "filter hide"); } - -void transcription_filter_defaults(obs_data_t *s) -{ - obs_log(LOG_DEBUG, "filter defaults"); - - obs_data_set_default_bool(s, "buffered_output", false); - obs_data_set_default_int(s, "buffer_num_lines", 2); - obs_data_set_default_int(s, "buffer_num_chars_per_line", 30); - obs_data_set_default_int(s, "buffer_output_type", - (int)TokenBufferSegmentation::SEGMENTATION_TOKEN); - - obs_data_set_default_bool(s, "vad_enabled", true); - obs_data_set_default_double(s, "vad_threshold", 0.65); - obs_data_set_default_int(s, "log_level", LOG_DEBUG); - obs_data_set_default_bool(s, "log_words", false); 
- obs_data_set_default_bool(s, "caption_to_stream", false); - obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny English (74Mb)"); - obs_data_set_default_string(s, "whisper_language_select", "en"); - obs_data_set_default_string(s, "subtitle_sources", "none"); - obs_data_set_default_bool(s, "process_while_muted", false); - obs_data_set_default_bool(s, "subtitle_save_srt", false); - obs_data_set_default_bool(s, "truncate_output_file", false); - obs_data_set_default_bool(s, "only_while_recording", false); - obs_data_set_default_bool(s, "rename_file_to_match_recording", true); - obs_data_set_default_int(s, "min_sub_duration", 3000); - obs_data_set_default_bool(s, "advanced_settings", false); - obs_data_set_default_bool(s, "translate", false); - obs_data_set_default_string(s, "translate_target_language", "__es__"); - obs_data_set_default_bool(s, "translate_add_context", true); - obs_data_set_default_string(s, "translate_model", "whisper-based-translation"); - obs_data_set_default_string(s, "translation_model_path_external", ""); - obs_data_set_default_int(s, "translate_input_tokenization_style", INPUT_TOKENIZAION_M2M100); - obs_data_set_default_double(s, "sentence_psum_accept_thresh", 0.4); - obs_data_set_default_bool(s, "partial_group", false); - obs_data_set_default_int(s, "partial_latency", 1100); - - // translation options - obs_data_set_default_double(s, "translation_sampling_temperature", 0.1); - obs_data_set_default_double(s, "translation_repetition_penalty", 2.0); - obs_data_set_default_int(s, "translation_beam_size", 1); - obs_data_set_default_int(s, "translation_max_decoding_length", 65); - obs_data_set_default_int(s, "translation_no_repeat_ngram_size", 1); - obs_data_set_default_int(s, "translation_max_input_length", 65); - - // Whisper parameters - obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH); - obs_data_set_default_string(s, "initial_prompt", ""); - obs_data_set_default_int(s, "n_threads", 4); - 
obs_data_set_default_int(s, "n_max_text_ctx", 16384); - obs_data_set_default_bool(s, "whisper_translate", false); - obs_data_set_default_bool(s, "no_context", true); - obs_data_set_default_bool(s, "single_segment", true); - obs_data_set_default_bool(s, "print_special", false); - obs_data_set_default_bool(s, "print_progress", false); - obs_data_set_default_bool(s, "print_realtime", false); - obs_data_set_default_bool(s, "print_timestamps", false); - obs_data_set_default_bool(s, "token_timestamps", false); - obs_data_set_default_bool(s, "dtw_token_timestamps", false); - obs_data_set_default_double(s, "thold_pt", 0.01); - obs_data_set_default_double(s, "thold_ptsum", 0.01); - obs_data_set_default_int(s, "max_len", 0); - obs_data_set_default_bool(s, "split_on_word", true); - obs_data_set_default_int(s, "max_tokens", 0); - obs_data_set_default_bool(s, "suppress_blank", false); - obs_data_set_default_bool(s, "suppress_non_speech_tokens", true); - obs_data_set_default_double(s, "temperature", 0.1); - obs_data_set_default_double(s, "max_initial_ts", 1.0); - obs_data_set_default_double(s, "length_penalty", -1.0); -} diff --git a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp new file mode 100644 index 0000000..2a1f8d4 --- /dev/null +++ b/src/whisper-utils/vad-processing.cpp @@ -0,0 +1,237 @@ + +#include + +#include "transcription-filter-data.h" + +#include "vad-processing.h" + +vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state) +{ + uint32_t num_frames_from_infos = 0; + uint64_t start_timestamp_offset_ns = 0; + uint64_t end_timestamp_offset_ns = 0; + + { + // scoped lock the buffer mutex + std::lock_guard lock(gf->whisper_buf_mutex); + + if (gf->input_buffers[0].size == 0) { + return last_vad_state; + } + + obs_log(gf->log_level, + "vad based segmentation. 
currently %lu bytes in the audio input buffer", + gf->input_buffers[0].size); + + // max number of frames is 10 seconds worth of audio + const size_t max_num_frames = gf->sample_rate * 10; + + // pop all infos from the info buffer and mark the beginning timestamp from the first + // info as the beginning timestamp of the segment + struct transcription_filter_audio_info info_from_buf = {0}; + const size_t size_of_audio_info = sizeof(transcription_filter_audio_info); + while (gf->info_buffer.size >= size_of_audio_info) { + circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info); + num_frames_from_infos += info_from_buf.frames; + if (start_timestamp_offset_ns == 0) { + start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns; + } + // Check if we're within the needed segment length + if (num_frames_from_infos > max_num_frames) { + // too big, push the last info into the buffer's front where it was + num_frames_from_infos -= info_from_buf.frames; + circlebuf_push_front(&gf->info_buffer, &info_from_buf, + size_of_audio_info); + break; + } + } + // calculate the end timestamp from the info plus the number of frames in the packet + end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns + + info_from_buf.frames * 1000000000 / gf->sample_rate; + + if (start_timestamp_offset_ns > end_timestamp_offset_ns) { + // this may happen when the incoming media has a timestamp reset + // in this case, we should figure out the start timestamp from the end timestamp + // and the number of frames + start_timestamp_offset_ns = + end_timestamp_offset_ns - + num_frames_from_infos * 1000000000 / gf->sample_rate; + } + + for (size_t c = 0; c < gf->channels; c++) { + // zero the rest of copy_buffers + memset(gf->copy_buffers[c], 0, gf->frames * sizeof(float)); + } + + /* Pop from input circlebuf */ + for (size_t c = 0; c < gf->channels; c++) { + // Push the new data to copy_buffers[c] + circlebuf_pop_front(&gf->input_buffers[c], gf->copy_buffers[c], + 
num_frames_from_infos * sizeof(float)); + } + } + + obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos); + gf->last_num_frames = num_frames_from_infos; + + { + // resample to 16kHz + float *resampled_16khz[MAX_PREPROC_CHANNELS]; + uint32_t resampled_16khz_frames; + uint64_t ts_offset; + { + ProfileScope("resample"); + audio_resampler_resample(gf->resampler_to_whisper, + (uint8_t **)resampled_16khz, + &resampled_16khz_frames, &ts_offset, + (const uint8_t **)gf->copy_buffers, + (uint32_t)num_frames_from_infos); + } + + obs_log(gf->log_level, "resampled: %d channels, %d frames, %f ms", + (int)gf->channels, (int)resampled_16khz_frames, + (float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f); + circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0], + resampled_16khz_frames * sizeof(float)); + } + + const size_t vad_window_size_samples = gf->vad->get_window_size_samples() * sizeof(float); + const size_t min_vad_buffer_size = vad_window_size_samples * 15; + if (gf->resampled_buffer.size < min_vad_buffer_size) + return last_vad_state; + + size_t vad_num_windows = gf->resampled_buffer.size / vad_window_size_samples; + + std::vector vad_input; + vad_input.resize(vad_num_windows * gf->vad->get_window_size_samples()); + circlebuf_pop_front(&gf->resampled_buffer, vad_input.data(), + vad_input.size() * sizeof(float)); + + obs_log(gf->log_level, "sending %d frames to vad, %d windows, reset state? %s", + vad_input.size(), vad_num_windows, (!last_vad_state.vad_on) ? 
"yes" : "no"); + { + ProfileScope("vad->process"); + gf->vad->process(vad_input, !last_vad_state.vad_on); + } + + const uint64_t start_ts_offset_ms = start_timestamp_offset_ns / 1000000; + const uint64_t end_ts_offset_ms = end_timestamp_offset_ns / 1000000; + + vad_state current_vad_state = {false, start_ts_offset_ms, end_ts_offset_ms, + last_vad_state.last_partial_segment_end_ts}; + + std::vector stamps = gf->vad->get_speech_timestamps(); + if (stamps.size() == 0) { + obs_log(gf->log_level, "VAD detected no speech in %u frames", vad_input.size()); + if (last_vad_state.vad_on) { + obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference"); + run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms, + last_vad_state.end_ts_offset_ms, + VAD_STATE_WAS_ON); + current_vad_state.last_partial_segment_end_ts = 0; + } + + if (gf->enable_audio_chunks_callback) { + audio_chunk_callback(gf, vad_input.data(), vad_input.size(), + VAD_STATE_IS_OFF, + {DETECTION_RESULT_SILENCE, + "[silence]", + current_vad_state.start_ts_offest_ms, + current_vad_state.end_ts_offset_ms, + {}}); + } + + return current_vad_state; + } + + // process vad segments + for (size_t i = 0; i < stamps.size(); i++) { + int start_frame = stamps[i].start; + if (i > 0) { + // if this is not the first segment, start from the end of the previous segment + start_frame = stamps[i - 1].end; + } else { + // take at least 100ms of audio before the first speech segment, if available + start_frame = std::max(0, start_frame - WHISPER_SAMPLE_RATE / 10); + } + + int end_frame = stamps[i].end; + if (i == stamps.size() - 1 && stamps[i].end < (int)vad_input.size()) { + // take at least 100ms of audio after the last speech segment, if available + end_frame = std::min(end_frame + WHISPER_SAMPLE_RATE / 10, + (int)vad_input.size()); + } + + const int number_of_frames = end_frame - start_frame; + + // push the data into gf-whisper_buffer + circlebuf_push_back(&gf->whisper_buffer, vad_input.data() + 
start_frame, + number_of_frames * sizeof(float)); + + obs_log(gf->log_level, + "VAD segment %d/%d. pushed %d to %d (%d frames / %lu ms). current size: %lu bytes / %lu frames / %lu ms", + i, (stamps.size() - 1), start_frame, end_frame, number_of_frames, + number_of_frames * 1000 / WHISPER_SAMPLE_RATE, gf->whisper_buffer.size, + gf->whisper_buffer.size / sizeof(float), + gf->whisper_buffer.size / sizeof(float) * 1000 / WHISPER_SAMPLE_RATE); + + // segment "end" is in the middle of the buffer, send it to inference + if (stamps[i].end < (int)vad_input.size()) { + // new "ending" segment (not up to the end of the buffer) + obs_log(gf->log_level, "VAD segment end -> send to inference"); + // find the end timestamp of the segment + const uint64_t segment_end_ts = + start_ts_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE; + run_inference_and_callbacks( + gf, last_vad_state.start_ts_offest_ms, segment_end_ts, + last_vad_state.vad_on ? VAD_STATE_WAS_ON : VAD_STATE_WAS_OFF); + current_vad_state.vad_on = false; + current_vad_state.start_ts_offest_ms = current_vad_state.end_ts_offset_ms; + current_vad_state.end_ts_offset_ms = 0; + current_vad_state.last_partial_segment_end_ts = 0; + last_vad_state = current_vad_state; + continue; + } + + // end not reached - speech is ongoing + current_vad_state.vad_on = true; + if (last_vad_state.vad_on) { + current_vad_state.start_ts_offest_ms = last_vad_state.start_ts_offest_ms; + } else { + current_vad_state.start_ts_offest_ms = + start_ts_offset_ms + start_frame * 1000 / WHISPER_SAMPLE_RATE; + } + obs_log(gf->log_level, "end not reached. 
vad state: start ts: %llu, end ts: %llu", + current_vad_state.start_ts_offest_ms, current_vad_state.end_ts_offset_ms); + + last_vad_state = current_vad_state; + + // if partial transcription is enabled, check if we should send a partial segment + if (!gf->partial_transcription) { + continue; + } + + // current length of audio in buffer + const uint64_t current_length_ms = + (current_vad_state.end_ts_offset_ms > 0 + ? current_vad_state.end_ts_offset_ms + : current_vad_state.start_ts_offest_ms) - + (current_vad_state.last_partial_segment_end_ts > 0 + ? current_vad_state.last_partial_segment_end_ts + : current_vad_state.start_ts_offest_ms); + obs_log(gf->log_level, "current buffer length after last partial (%lu): %lu ms", + current_vad_state.last_partial_segment_end_ts, current_length_ms); + + if (current_length_ms > (uint64_t)gf->partial_latency) { + current_vad_state.last_partial_segment_end_ts = + current_vad_state.end_ts_offset_ms; + // send partial segment to inference + obs_log(gf->log_level, "Partial segment -> send to inference"); + run_inference_and_callbacks(gf, current_vad_state.start_ts_offest_ms, + current_vad_state.end_ts_offset_ms, + VAD_STATE_PARTIAL); + } + } + + return current_vad_state; +} diff --git a/src/whisper-utils/vad-processing.h b/src/whisper-utils/vad-processing.h new file mode 100644 index 0000000..237abe6 --- /dev/null +++ b/src/whisper-utils/vad-processing.h @@ -0,0 +1,15 @@ +#ifndef VAD_PROCESSING_H +#define VAD_PROCESSING_H + +enum VadState { VAD_STATE_WAS_ON = 0, VAD_STATE_WAS_OFF, VAD_STATE_IS_OFF, VAD_STATE_PARTIAL }; + +struct vad_state { + bool vad_on; + uint64_t start_ts_offest_ms; + uint64_t end_ts_offset_ms; + uint64_t last_partial_segment_end_ts; +}; + +vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state); + +#endif // VAD_PROCESSING_H diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 6d2d76e..17e817a 100644 --- 
a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -17,18 +17,12 @@ #endif #include "model-utils/model-find-utils.h" +#include "vad-processing.h" #include #include #include -struct vad_state { - bool vad_on; - uint64_t start_ts_offest_ms; - uint64_t end_ts_offset_ms; - uint64_t last_partial_segment_end_ts; -}; - struct whisper_context *init_whisper_context(const std::string &model_path_in, struct transcription_filter_data *gf) { @@ -161,6 +155,10 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter float *pcm32f_data = (float *)pcm32f_data_; size_t pcm32f_size = pcm32f_num_samples; + // incoming duration in ms + const uint64_t incoming_duration_ms = + (uint64_t)(pcm32f_num_samples * 1000 / WHISPER_SAMPLE_RATE); + if (pcm32f_num_samples < WHISPER_SAMPLE_RATE) { obs_log(gf->log_level, "Speech segment is less than 1 second, padding with zeros to 1 second"); @@ -175,7 +173,7 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter } // duration in ms - const uint64_t duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE); + const uint64_t whisper_duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE); std::lock_guard lock(gf->whisper_ctx_mutex); if (gf->whisper_context == nullptr) { @@ -185,7 +183,7 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter // run the inference int whisper_full_result = -1; - gf->whisper_params.duration_ms = (int)(duration_ms); + gf->whisper_params.duration_ms = (int)(whisper_duration_ms); try { whisper_full_result = whisper_full(gf->whisper_context, gf->whisper_params, pcm32f_data, (int)pcm32f_size); @@ -243,13 +241,14 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter // token ids https://huggingface.co/openai/whisper-large-v3/raw/main/tokenizer.json if (token.id > 50365 && token.id <= 51865) { const float time = ((float)token.id - 50365.0f) * 0.02f; - 
const float duration_s = (float)duration_ms / 1000.0f; + const float duration_s = (float)incoming_duration_ms / 1000.0f; const float ratio = std::max(time, duration_s) / std::min(time, duration_s); obs_log(gf->log_level, - "Time token found %d -> %.3f. Duration: %.3f. Ratio: %.3f.", - token.id, time, duration_s, ratio); - if (ratio > 3.0f) { + "Time token found %d -> %.3f. Duration: %.3f. Ratio: %.3f. Threshold %.2f", + token.id, time, duration_s, ratio, + gf->duration_filter_threshold); + if (ratio > gf->duration_filter_threshold) { // ratio is too high, skip this detection obs_log(gf->log_level, "Time token ratio too high, skipping"); @@ -327,233 +326,6 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o bfree(pcm32f_data); } -vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state) -{ - uint32_t num_frames_from_infos = 0; - uint64_t start_timestamp_offset_ns = 0; - uint64_t end_timestamp_offset_ns = 0; - size_t overlap_size = 0; - - for (size_t c = 0; c < gf->channels; c++) { - // zero the rest of copy_buffers - memset(gf->copy_buffers[c] + overlap_size, 0, - (gf->frames - overlap_size) * sizeof(float)); - } - - { - // scoped lock the buffer mutex - std::lock_guard lock(gf->whisper_buf_mutex); - - obs_log(gf->log_level, - "vad based segmentation. 
currently %lu bytes in the audio input buffer", - gf->input_buffers[0].size); - - // max number of frames is 10 seconds worth of audio - const size_t max_num_frames = gf->sample_rate * 10; - - // pop all infos from the info buffer and mark the beginning timestamp from the first - // info as the beginning timestamp of the segment - struct transcription_filter_audio_info info_from_buf = {0}; - const size_t size_of_audio_info = sizeof(transcription_filter_audio_info); - while (gf->info_buffer.size >= size_of_audio_info) { - circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info); - num_frames_from_infos += info_from_buf.frames; - if (start_timestamp_offset_ns == 0) { - start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns; - } - // Check if we're within the needed segment length - if (num_frames_from_infos > max_num_frames) { - // too big, push the last info into the buffer's front where it was - num_frames_from_infos -= info_from_buf.frames; - circlebuf_push_front(&gf->info_buffer, &info_from_buf, - size_of_audio_info); - break; - } - } - end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns; - - if (start_timestamp_offset_ns > end_timestamp_offset_ns) { - // this may happen when the incoming media has a timestamp reset - // in this case, we should figure out the start timestamp from the end timestamp - // and the number of frames - start_timestamp_offset_ns = - end_timestamp_offset_ns - - num_frames_from_infos * 1000000000 / gf->sample_rate; - } - - /* Pop from input circlebuf */ - for (size_t c = 0; c < gf->channels; c++) { - // Push the new data to copy_buffers[c] - circlebuf_pop_front(&gf->input_buffers[c], - gf->copy_buffers[c] + overlap_size, - num_frames_from_infos * sizeof(float)); - } - } - - obs_log(gf->log_level, "found %d frames from info buffer. 
%lu in overlap", - num_frames_from_infos, overlap_size); - gf->last_num_frames = num_frames_from_infos + overlap_size; - - { - // resample to 16kHz - float *resampled_16khz[MAX_PREPROC_CHANNELS]; - uint32_t resampled_16khz_frames; - uint64_t ts_offset; - { - ProfileScope("resample"); - audio_resampler_resample(gf->resampler_to_whisper, - (uint8_t **)resampled_16khz, - &resampled_16khz_frames, &ts_offset, - (const uint8_t **)gf->copy_buffers, - (uint32_t)num_frames_from_infos); - } - - obs_log(gf->log_level, "resampled: %d channels, %d frames, %f ms", - (int)gf->channels, (int)resampled_16khz_frames, - (float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f); - circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0], - resampled_16khz_frames * sizeof(float)); - } - - if (gf->resampled_buffer.size < (gf->vad->get_window_size_samples() * sizeof(float))) - return last_vad_state; - - size_t len = - gf->resampled_buffer.size / (gf->vad->get_window_size_samples() * sizeof(float)); - - std::vector vad_input; - vad_input.resize(len * gf->vad->get_window_size_samples()); - circlebuf_pop_front(&gf->resampled_buffer, vad_input.data(), - vad_input.size() * sizeof(float)); - - obs_log(gf->log_level, "sending %d frames to vad", vad_input.size()); - { - ProfileScope("vad->process"); - gf->vad->process(vad_input, !last_vad_state.vad_on); - } - - const uint64_t start_ts_offset_ms = start_timestamp_offset_ns / 1000000; - const uint64_t end_ts_offset_ms = end_timestamp_offset_ns / 1000000; - - vad_state current_vad_state = {false, start_ts_offset_ms, end_ts_offset_ms, - last_vad_state.last_partial_segment_end_ts}; - - std::vector stamps = gf->vad->get_speech_timestamps(); - if (stamps.size() == 0) { - obs_log(gf->log_level, "VAD detected no speech in %u frames", vad_input.size()); - if (last_vad_state.vad_on) { - obs_log(gf->log_level, "Last VAD was ON: segment end -> send to inference"); - run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms, - 
last_vad_state.end_ts_offset_ms, - VAD_STATE_WAS_ON); - current_vad_state.last_partial_segment_end_ts = 0; - } - - if (gf->enable_audio_chunks_callback) { - audio_chunk_callback(gf, vad_input.data(), vad_input.size(), - VAD_STATE_IS_OFF, - {DETECTION_RESULT_SILENCE, - "[silence]", - current_vad_state.start_ts_offest_ms, - current_vad_state.end_ts_offset_ms, - {}}); - } - - return current_vad_state; - } - - // process vad segments - for (size_t i = 0; i < stamps.size(); i++) { - int start_frame = stamps[i].start; - if (i > 0) { - // if this is not the first segment, start from the end of the previous segment - start_frame = stamps[i - 1].end; - } else { - // take at least 100ms of audio before the first speech segment, if available - start_frame = std::max(0, start_frame - WHISPER_SAMPLE_RATE / 10); - } - - int end_frame = stamps[i].end; - if (i == stamps.size() - 1 && stamps[i].end < (int)vad_input.size()) { - // take at least 100ms of audio after the last speech segment, if available - end_frame = std::min(end_frame + WHISPER_SAMPLE_RATE / 10, - (int)vad_input.size()); - } - - const int number_of_frames = end_frame - start_frame; - - // push the data into gf-whisper_buffer - circlebuf_push_back(&gf->whisper_buffer, vad_input.data() + start_frame, - number_of_frames * sizeof(float)); - - obs_log(gf->log_level, - "VAD segment %d. pushed %d to %d (%d frames / %lu ms). 
current size: %lu bytes / %lu frames / %lu ms", - i, start_frame, end_frame, number_of_frames, - number_of_frames * 1000 / WHISPER_SAMPLE_RATE, gf->whisper_buffer.size, - gf->whisper_buffer.size / sizeof(float), - gf->whisper_buffer.size / sizeof(float) * 1000 / WHISPER_SAMPLE_RATE); - - // segment "end" is in the middle of the buffer, send it to inference - if (stamps[i].end < (int)vad_input.size()) { - // new "ending" segment (not up to the end of the buffer) - obs_log(gf->log_level, "VAD segment end -> send to inference"); - // find the end timestamp of the segment - const uint64_t segment_end_ts = - start_ts_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE; - run_inference_and_callbacks( - gf, last_vad_state.start_ts_offest_ms, segment_end_ts, - last_vad_state.vad_on ? VAD_STATE_WAS_ON : VAD_STATE_WAS_OFF); - current_vad_state.vad_on = false; - current_vad_state.start_ts_offest_ms = current_vad_state.end_ts_offset_ms; - current_vad_state.end_ts_offset_ms = 0; - current_vad_state.last_partial_segment_end_ts = 0; - last_vad_state = current_vad_state; - continue; - } - - // end not reached - speech is ongoing - current_vad_state.vad_on = true; - if (last_vad_state.vad_on) { - current_vad_state.start_ts_offest_ms = last_vad_state.start_ts_offest_ms; - } else { - current_vad_state.start_ts_offest_ms = - start_ts_offset_ms + start_frame * 1000 / WHISPER_SAMPLE_RATE; - } - obs_log(gf->log_level, "end not reached. vad state: start ts: %llu, end ts: %llu", - current_vad_state.start_ts_offest_ms, current_vad_state.end_ts_offset_ms); - - last_vad_state = current_vad_state; - - // if partial transcription is enabled, check if we should send a partial segment - if (!gf->partial_transcription) { - continue; - } - - // current length of audio in buffer - const uint64_t current_length_ms = - (current_vad_state.end_ts_offset_ms > 0 - ? current_vad_state.end_ts_offset_ms - : current_vad_state.start_ts_offest_ms) - - (current_vad_state.last_partial_segment_end_ts > 0 - ? 
current_vad_state.last_partial_segment_end_ts - : current_vad_state.start_ts_offest_ms); - obs_log(gf->log_level, "current buffer length after last partial (%lu): %lu ms", - current_vad_state.last_partial_segment_end_ts, current_length_ms); - - if (current_length_ms > (uint64_t)gf->partial_latency) { - current_vad_state.last_partial_segment_end_ts = - current_vad_state.end_ts_offset_ms; - // send partial segment to inference - obs_log(gf->log_level, "Partial segment -> send to inference"); - run_inference_and_callbacks(gf, current_vad_state.start_ts_offest_ms, - current_vad_state.end_ts_offset_ms, - VAD_STATE_PARTIAL); - } - } - - return current_vad_state; -} - void whisper_loop(void *data) { if (data == nullptr) { diff --git a/src/whisper-utils/whisper-processing.h b/src/whisper-utils/whisper-processing.h index 5bc162b..a00f7cb 100644 --- a/src/whisper-utils/whisper-processing.h +++ b/src/whisper-utils/whisper-processing.h @@ -29,10 +29,10 @@ struct DetectionResultWithText { std::string language; }; -enum VadState { VAD_STATE_WAS_ON = 0, VAD_STATE_WAS_OFF, VAD_STATE_IS_OFF, VAD_STATE_PARTIAL }; - void whisper_loop(void *data); struct whisper_context *init_whisper_context(const std::string &model_path, struct transcription_filter_data *gf); +void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_offset_ms, + uint64_t end_offset_ms, int vad_state); #endif // WHISPER_PROCESSING_H From dcf368d38f60064889167f7f1e83e374f1b22445 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 23 Aug 2024 16:17:34 -0400 Subject: [PATCH 02/18] feat: Add max_sub_duration field to transcription filter data --- src/transcription-filter-callbacks.cpp | 28 ++++--- src/transcription-filter-data.h | 2 + src/transcription-filter-properties.cpp | 29 ++++++- src/transcription-filter.cpp | 2 + src/whisper-utils/token-buffer-thread.cpp | 92 +++++++++++++++++------ src/whisper-utils/token-buffer-thread.h | 28 ++++++- src/whisper-utils/vad-processing.cpp | 50 ++++++++++-- 
src/whisper-utils/vad-processing.h | 1 + src/whisper-utils/whisper-processing.cpp | 2 +- src/whisper-utils/whisper-utils.cpp | 21 +----- 10 files changed, 189 insertions(+), 66 deletions(-) diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index f5c2209..22e240b 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -199,11 +199,6 @@ void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &resultIn) { DetectionResultWithText result = resultIn; - if (!result.text.empty() && (result.result == DETECTION_RESULT_SPEECH || - result.result == DETECTION_RESULT_PARTIAL)) { - gf->last_sub_render_time = now_ms(); - gf->cleared_last_sub = false; - } std::string str_copy = result.text; @@ -243,10 +238,12 @@ void set_text_callback(struct transcription_filter_data *gf, str_copy = translated_sentence; } else { if (gf->buffered_output) { - if (result.result == DETECTION_RESULT_SPEECH) { - // buffered output - add the sentence to the monitor - gf->translation_monitor.addSentence(translated_sentence); - } + // buffered output - add the sentence to the monitor + gf->translation_monitor.addSentenceFromStdString( + translated_sentence, + get_time_point_from_ms(result.start_timestamp_ms), + get_time_point_from_ms(result.end_timestamp_ms), + result.result == DETECTION_RESULT_PARTIAL); } else { // non-buffered output - send the sentence to the selected source send_caption_to_source(gf->translation_output, translated_sentence, @@ -256,9 +253,10 @@ void set_text_callback(struct transcription_filter_data *gf, } if (gf->buffered_output) { - if (result.result == DETECTION_RESULT_SPEECH) { - gf->captions_monitor.addSentence(str_copy); - } + gf->captions_monitor.addSentenceFromStdString( + str_copy, get_time_point_from_ms(result.start_timestamp_ms), + get_time_point_from_ms(result.end_timestamp_ms), + result.result == DETECTION_RESULT_PARTIAL); } else { // non-buffered output 
- send the sentence to the selected source send_caption_to_source(gf->text_source_name, str_copy, gf); @@ -273,6 +271,12 @@ void set_text_callback(struct transcription_filter_data *gf, result.result == DETECTION_RESULT_SPEECH) { send_sentence_to_file(gf, result, str_copy, translated_sentence); } + + if (!result.text.empty() && (result.result == DETECTION_RESULT_SPEECH || + result.result == DETECTION_RESULT_PARTIAL)) { + gf->last_sub_render_time = now_ms(); + gf->cleared_last_sub = false; + } }; void recording_state_callback(enum obs_frontend_event event, void *data) diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 9cefd13..9205c96 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -36,6 +36,8 @@ struct transcription_filter_data { size_t sentence_number; // Minimal subtitle duration in ms size_t min_sub_duration; + // Maximal subtitle duration in ms + size_t max_sub_duration; // Last time a subtitle was rendered uint64_t last_sub_render_time; bool cleared_last_sub; diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index acecf4f..e689b8d 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -290,6 +290,30 @@ void add_buffered_output_group_properties(obs_properties_t *ppts) OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); obs_property_list_add_int(buffer_type_list, "Character", SEGMENTATION_TOKEN); obs_property_list_add_int(buffer_type_list, "Word", SEGMENTATION_WORD); + obs_property_list_add_int(buffer_type_list, "Sentence", SEGMENTATION_SENTENCE); + // add callback to the segmentation selection to set default values + obs_property_set_modified_callback(buffer_type_list, [](obs_properties_t *props, + obs_property_t *property, + obs_data_t *settings) { + UNUSED_PARAMETER(property); + const int segmentation_type = obs_data_get_int(settings, "buffer_output_type"); + // set default values for the number of lines 
and characters per line + switch (segmentation_type) { + case SEGMENTATION_TOKEN: + obs_data_set_int(settings, "buffer_num_lines", 2); + obs_data_set_int(settings, "buffer_num_chars_per_line", 30); + break; + case SEGMENTATION_WORD: + obs_data_set_int(settings, "buffer_num_lines", 2); + obs_data_set_int(settings, "buffer_num_chars_per_line", 10); + break; + case SEGMENTATION_SENTENCE: + obs_data_set_int(settings, "buffer_num_lines", 2); + obs_data_set_int(settings, "buffer_num_chars_per_line", 2); + break; + } + return true; + }); // add buffer lines parameter obs_properties_add_int_slider(buffered_output_group, "buffer_num_lines", MT_("buffer_num_lines"), 1, 5, 1); @@ -310,6 +334,8 @@ void add_advanced_group_properties(obs_properties_t *ppts, struct transcription_ obs_properties_add_int_slider(advanced_config_group, "min_sub_duration", MT_("min_sub_duration"), 1000, 5000, 50); + obs_properties_add_int_slider(advanced_config_group, "max_sub_duration", + MT_("max_sub_duration"), 1000, 5000, 50); obs_properties_add_float_slider(advanced_config_group, "sentence_psum_accept_thresh", MT_("sentence_psum_accept_thresh"), 0.0, 1.0, 0.05); @@ -535,7 +561,8 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_bool(s, "truncate_output_file", false); obs_data_set_default_bool(s, "only_while_recording", false); obs_data_set_default_bool(s, "rename_file_to_match_recording", true); - obs_data_set_default_int(s, "min_sub_duration", 3000); + obs_data_set_default_int(s, "min_sub_duration", 1000); + obs_data_set_default_int(s, "max_sub_duration", 3000); obs_data_set_default_bool(s, "advanced_settings", false); obs_data_set_default_bool(s, "translate", false); obs_data_set_default_string(s, "translate_target_language", "__es__"); diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index d47876a..357030f 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -187,6 +187,7 @@ void transcription_filter_update(void *data, 
obs_data_t *s) gf->sentence_number = 1; gf->process_while_muted = obs_data_get_bool(s, "process_while_muted"); gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration"); + gf->max_sub_duration = (int)obs_data_get_int(s, "max_sub_duration"); gf->last_sub_render_time = now_ms(); gf->duration_filter_threshold = (float)obs_data_get_double(s, "duration_filter_threshold"); gf->partial_transcription = obs_data_get_bool(s, "partial_group"); @@ -432,6 +433,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / MAX_MS_WORK_BUFFER)); gf->last_num_frames = 0; gf->min_sub_duration = (int)obs_data_get_int(settings, "min_sub_duration"); + gf->max_sub_duration = (int)obs_data_get_int(settings, "max_sub_duration"); gf->last_sub_render_time = now_ms(); gf->log_level = (int)obs_data_get_int(settings, "log_level"); gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt"); diff --git a/src/whisper-utils/token-buffer-thread.cpp b/src/whisper-utils/token-buffer-thread.cpp index ac34534..dc4b3fa 100644 --- a/src/whisper-utils/token-buffer-thread.cpp +++ b/src/whisper-utils/token-buffer-thread.cpp @@ -6,6 +6,9 @@ #include "whisper-utils.h" #include "transcription-utils.h" +#include +#include + #include #ifdef _WIN32 @@ -75,37 +78,74 @@ void TokenBufferThread::log_token_vector(const std::vector &tokens) obs_log(LOG_INFO, "TokenBufferThread::log_token_vector: '%s'", output.c_str()); } -void TokenBufferThread::addSentence(const std::string &sentence) +void TokenBufferThread::addSentenceFromStdString(const std::string &sentence, + TokenBufferTimePoint start_time, + TokenBufferTimePoint end_time, bool is_partial) { + if (sentence.empty()) { + return; + } #ifdef _WIN32 // on windows convert from multibyte to wide char int count = MultiByteToWideChar(CP_UTF8, 0, sentence.c_str(), (int)sentence.length(), NULL, 0); - std::wstring sentence_ws(count, 0); + TokenBufferString sentence_ws(count, 0); 
MultiByteToWideChar(CP_UTF8, 0, sentence.c_str(), (int)sentence.length(), &sentence_ws[0], count); #else - std::string sentence_ws = sentence; + TokenBufferString sentence_ws = sentence; #endif - // split to characters - std::vector characters; - for (const auto &c : sentence_ws) { - characters.push_back(TokenBufferString(1, c)); + + TokenBufferSentence sentence_for_add; + sentence_for_add.start_time = start_time; + sentence_for_add.end_time = end_time; + + if (this->segmentation == SEGMENTATION_WORD) { + // split the sentence to words + std::vector words; + std::basic_istringstream iss(sentence_ws); + TokenBufferString word; + while (iss >> word) { + words.push_back(word); + } + // add the words to a sentence + for (const auto &word : words) { + sentence_for_add.tokens.push_back({word, is_partial}); + sentence_for_add.tokens.push_back({SPACE, is_partial}); + } + } else if (this->segmentation == SEGMENTATION_TOKEN) { + // split to characters + std::vector characters; + for (const auto &c : sentence_ws) { + characters.push_back(TokenBufferString(1, c)); + } + // add the characters to a sentece + for (const auto &character : characters) { + sentence_for_add.tokens.push_back({character, is_partial}); + } + } else { + // add the whole sentence as a single token + sentence_for_add.tokens.push_back({sentence_ws, is_partial}); + sentence_for_add.tokens.push_back({SPACE, is_partial}); } + addSentence(sentence_for_add); +} - std::lock_guard lock(inputQueueMutex); +void TokenBufferThread::addSentence(const TokenBufferSentence &sentence) +{ + std::lock_guard lock(this->inputQueueMutex); - // add the characters to the inputQueue - for (const auto &character : characters) { + // add the tokens to the inputQueue + for (const auto &character : sentence.tokens) { inputQueue.push_back(character); } - inputQueue.push_back(SPACE); + inputQueue.push_back({SPACE, sentence.tokens.back().is_partial}); // add to the contribution queue as well - for (const auto &character : characters) { + 
for (const auto &character : sentence.tokens) { contributionQueue.push_back(character); } - contributionQueue.push_back(SPACE); + contributionQueue.push_back({SPACE, sentence.tokens.back().is_partial}); this->lastContributionTime = std::chrono::steady_clock::now(); } @@ -148,7 +188,7 @@ void TokenBufferThread::monitor() if (this->segmentation == SEGMENTATION_TOKEN) { // pop tokens until a space is found while (!presentationQueue.empty() && - presentationQueue.front() != SPACE) { + presentationQueue.front().token != SPACE) { presentationQueue.pop_front(); } } @@ -158,6 +198,13 @@ void TokenBufferThread::monitor() std::lock_guard lock(inputQueueMutex); if (!inputQueue.empty()) { + // if the input on the inputQueue is partial - first remove all partials + // from the end of the presentation queue + while (!presentationQueue.empty() && + presentationQueue.back().is_partial) { + presentationQueue.pop_back(); + } + // if there are token on the input queue // then add to the presentation queue based on the segmentation if (this->segmentation == SEGMENTATION_SENTENCE) { @@ -171,16 +218,17 @@ void TokenBufferThread::monitor() presentationQueue.push_back(inputQueue.front()); inputQueue.pop_front(); } else { + // SEGMENTATION_WORD // skip spaces in the beginning of the input queue while (!inputQueue.empty() && - inputQueue.front() == SPACE) { + inputQueue.front().token == SPACE) { inputQueue.pop_front(); } // add one word to the presentation queue - TokenBufferString word; + TokenBufferToken word; while (!inputQueue.empty() && - inputQueue.front() != SPACE) { - word += inputQueue.front(); + inputQueue.front().token != SPACE) { + word = inputQueue.front(); inputQueue.pop_front(); } presentationQueue.push_back(word); @@ -200,7 +248,7 @@ void TokenBufferThread::monitor() size_t wordsInSentence = 0; for (size_t i = 0; i < presentationQueue.size(); i++) { const auto &word = presentationQueue[i]; - sentences.back() += word + SPACE; + sentences.back() += word.token + SPACE; 
wordsInSentence++; if (wordsInSentence == this->numPerSentence) { sentences.push_back(TokenBufferString()); @@ -211,12 +259,12 @@ void TokenBufferThread::monitor() for (size_t i = 0; i < presentationQueue.size(); i++) { const auto &token = presentationQueue[i]; // skip spaces in the beginning of a sentence (tokensInSentence == 0) - if (token == SPACE && + if (token.token == SPACE && sentences.back().length() == 0) { continue; } - sentences.back() += token; + sentences.back() += token.token; if (sentences.back().length() == this->numPerSentence) { // if the next character is not a space - this is a broken word @@ -280,7 +328,7 @@ void TokenBufferThread::monitor() // take the contribution queue and send it to the output TokenBufferString contribution; for (const auto &token : contributionQueue) { - contribution += token; + contribution += token.token; } contributionQueue.clear(); #ifdef _WIN32 diff --git a/src/whisper-utils/token-buffer-thread.h b/src/whisper-utils/token-buffer-thread.h index 13be208..7666669 100644 --- a/src/whisper-utils/token-buffer-thread.h +++ b/src/whisper-utils/token-buffer-thread.h @@ -16,8 +16,10 @@ #ifdef _WIN32 typedef std::wstring TokenBufferString; +typedef wchar_t TokenBufferChar; #else typedef std::string TokenBufferString; +typedef char TokenBufferChar; #endif struct transcription_filter_data; @@ -27,6 +29,22 @@ enum TokenBufferSpeed { SPEED_SLOW = 0, SPEED_NORMAL, SPEED_FAST }; typedef std::chrono::time_point TokenBufferTimePoint; +inline std::chrono::time_point get_time_point_from_ms(uint64_t ms) +{ + return std::chrono::time_point(std::chrono::milliseconds(ms)); +} + +struct TokenBufferToken { + TokenBufferString token; + bool is_partial; +}; + +struct TokenBufferSentence { + std::vector tokens; + TokenBufferTimePoint start_time; + TokenBufferTimePoint end_time; +}; + class TokenBufferThread { public: // default constructor @@ -40,7 +58,9 @@ class TokenBufferThread { std::chrono::seconds maxTime_, TokenBufferSegmentation 
segmentation_ = SEGMENTATION_TOKEN); - void addSentence(const std::string &sentence); + void addSentenceFromStdString(const std::string &sentence, TokenBufferTimePoint start_time, + TokenBufferTimePoint end_time, bool is_partial = false); + void addSentence(const TokenBufferSentence &sentence); void clear(); void stopThread(); @@ -59,9 +79,9 @@ class TokenBufferThread { void log_token_vector(const std::vector &tokens); int getWaitTime(TokenBufferSpeed speed) const; struct transcription_filter_data *gf; - std::deque inputQueue; - std::deque presentationQueue; - std::deque contributionQueue; + std::deque inputQueue; + std::deque presentationQueue; + std::deque contributionQueue; std::thread workerThread; std::mutex inputQueueMutex; std::mutex presentationQueueMutex; diff --git a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp index 2a1f8d4..9f104b9 100644 --- a/src/whisper-utils/vad-processing.cpp +++ b/src/whisper-utils/vad-processing.cpp @@ -5,6 +5,11 @@ #include "vad-processing.h" +#ifdef _WIN32 +#define NOMINMAX +#include +#endif + vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state) { uint32_t num_frames_from_infos = 0; @@ -96,7 +101,7 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v } const size_t vad_window_size_samples = gf->vad->get_window_size_samples() * sizeof(float); - const size_t min_vad_buffer_size = vad_window_size_samples * 15; + const size_t min_vad_buffer_size = vad_window_size_samples * 8; if (gf->resampled_buffer.size < min_vad_buffer_size) return last_vad_state; @@ -156,11 +161,11 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v } int end_frame = stamps[i].end; - if (i == stamps.size() - 1 && stamps[i].end < (int)vad_input.size()) { - // take at least 100ms of audio after the last speech segment, if available - end_frame = std::min(end_frame + WHISPER_SAMPLE_RATE / 10, - (int)vad_input.size()); - } + // if (i == 
stamps.size() - 1 && stamps[i].end < (int)vad_input.size()) { + // // take at least 100ms of audio after the last speech segment, if available + // end_frame = std::min(end_frame + WHISPER_SAMPLE_RATE / 10, + // (int)vad_input.size()); + // } const int number_of_frames = end_frame - start_frame; @@ -196,12 +201,22 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v // end not reached - speech is ongoing current_vad_state.vad_on = true; if (last_vad_state.vad_on) { + obs_log(gf->log_level, + "last vad state was: ON, start ts: %llu, end ts: %llu", + last_vad_state.start_ts_offest_ms, last_vad_state.end_ts_offset_ms); current_vad_state.start_ts_offest_ms = last_vad_state.start_ts_offest_ms; } else { + obs_log(gf->log_level, + "last vad state was: OFF, start ts: %llu, end ts: %llu. start_ts_offset_ms: %llu, start_frame: %d", + last_vad_state.start_ts_offest_ms, last_vad_state.end_ts_offset_ms, + start_ts_offset_ms, start_frame); current_vad_state.start_ts_offest_ms = start_ts_offset_ms + start_frame * 1000 / WHISPER_SAMPLE_RATE; } - obs_log(gf->log_level, "end not reached. vad state: start ts: %llu, end ts: %llu", + current_vad_state.end_ts_offset_ms = + start_ts_offset_ms + end_frame * 1000 / WHISPER_SAMPLE_RATE; + obs_log(gf->log_level, + "end not reached. 
vad state: ON, start ts: %llu, end ts: %llu", current_vad_state.start_ts_offest_ms, current_vad_state.end_ts_offset_ms); last_vad_state = current_vad_state; @@ -235,3 +250,24 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v return current_vad_state; } + +void initialize_vad(transcription_filter_data *gf, const char *silero_vad_model_file) +{ + // initialize Silero VAD +#ifdef _WIN32 + // convert mbstring to wstring + int count = MultiByteToWideChar(CP_UTF8, 0, silero_vad_model_file, + strlen(silero_vad_model_file), NULL, 0); + std::wstring silero_vad_model_path(count, 0); + MultiByteToWideChar(CP_UTF8, 0, silero_vad_model_file, strlen(silero_vad_model_file), + &silero_vad_model_path[0], count); + obs_log(gf->log_level, "Create silero VAD: %S", silero_vad_model_path.c_str()); +#else + std::string silero_vad_model_path = silero_vad_model_file; + obs_log(gf->log_level, "Create silero VAD: %s", silero_vad_model_path.c_str()); +#endif + // roughly following https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py + // for silero vad parameters + gf->vad.reset(new VadIterator(silero_vad_model_path, WHISPER_SAMPLE_RATE, 32, 0.5f, 200, + 100, 100)); +} diff --git a/src/whisper-utils/vad-processing.h b/src/whisper-utils/vad-processing.h index 237abe6..1593d8a 100644 --- a/src/whisper-utils/vad-processing.h +++ b/src/whisper-utils/vad-processing.h @@ -11,5 +11,6 @@ struct vad_state { }; vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state); +void initialize_vad(transcription_filter_data *gf, const char *silero_vad_model_file); #endif // VAD_PROCESSING_H diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 17e817a..1a1e700 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -361,7 +361,7 @@ void whisper_loop(void *data) if (!gf->cleared_last_sub) { // check if we should clear the current 
sub depending on the minimum subtitle duration uint64_t now = now_ms(); - if ((now - gf->last_sub_render_time) > gf->min_sub_duration) { + if ((now - gf->last_sub_render_time) > gf->max_sub_duration) { // clear the current sub, call the callback with an empty string obs_log(gf->log_level, "Clearing current subtitle. now: %lu ms, last: %lu ms", now, diff --git a/src/whisper-utils/whisper-utils.cpp b/src/whisper-utils/whisper-utils.cpp index c2e4929..84f3b0a 100644 --- a/src/whisper-utils/whisper-utils.cpp +++ b/src/whisper-utils/whisper-utils.cpp @@ -2,13 +2,10 @@ #include "plugin-support.h" #include "model-utils/model-downloader.h" #include "whisper-processing.h" +#include "vad-processing.h" #include -#ifdef _WIN32 -#include -#endif - void shutdown_whisper_thread(struct transcription_filter_data *gf) { obs_log(gf->log_level, "shutdown_whisper_thread"); @@ -40,21 +37,7 @@ void start_whisper_thread_with_path(struct transcription_filter_data *gf, } // initialize Silero VAD -#ifdef _WIN32 - // convert mbstring to wstring - int count = MultiByteToWideChar(CP_UTF8, 0, silero_vad_model_file, - strlen(silero_vad_model_file), NULL, 0); - std::wstring silero_vad_model_path(count, 0); - MultiByteToWideChar(CP_UTF8, 0, silero_vad_model_file, strlen(silero_vad_model_file), - &silero_vad_model_path[0], count); - obs_log(gf->log_level, "Create silero VAD: %S", silero_vad_model_path.c_str()); -#else - std::string silero_vad_model_path = silero_vad_model_file; - obs_log(gf->log_level, "Create silero VAD: %s", silero_vad_model_path.c_str()); -#endif - // roughly following https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py - // for silero vad parameters - gf->vad.reset(new VadIterator(silero_vad_model_path, WHISPER_SAMPLE_RATE)); + initialize_vad(gf, silero_vad_model_file); obs_log(gf->log_level, "Create whisper context"); gf->whisper_context = init_whisper_context(whisper_model_path, gf); From a93cb6a5fddc241a5d00c2643026ea86464da211 Mon Sep 17 00:00:00 2001 
From: Roy Shilkrot Date: Fri, 23 Aug 2024 17:02:00 -0400 Subject: [PATCH 03/18] chore: Update VAD parameters for better segmentation accuracy --- src/transcription-filter-properties.cpp | 1 + src/whisper-utils/vad-processing.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index e689b8d..a0c7452 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -296,6 +296,7 @@ void add_buffered_output_group_properties(obs_properties_t *ppts) obs_property_t *property, obs_data_t *settings) { UNUSED_PARAMETER(property); + UNUSED_PARAMETER(props); const int segmentation_type = obs_data_get_int(settings, "buffer_output_type"); // set default values for the number of lines and characters per line switch (segmentation_type) { diff --git a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp index 9f104b9..9f14a32 100644 --- a/src/whisper-utils/vad-processing.cpp +++ b/src/whisper-utils/vad-processing.cpp @@ -268,6 +268,6 @@ void initialize_vad(transcription_filter_data *gf, const char *silero_vad_model_ #endif // roughly following https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py // for silero vad parameters - gf->vad.reset(new VadIterator(silero_vad_model_path, WHISPER_SAMPLE_RATE, 32, 0.5f, 200, + gf->vad.reset(new VadIterator(silero_vad_model_path, WHISPER_SAMPLE_RATE, 32, 0.5f, 100, 100, 100)); } From 16bb8be8ea562c387e8ab86c60f15543ea055eff Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 23 Aug 2024 19:38:28 -0400 Subject: [PATCH 04/18] feat: Add segment_duration field to transcription filter data --- src/transcription-filter-data.h | 1 + src/transcription-filter-properties.cpp | 8 +- src/transcription-filter.cpp | 1 + src/whisper-utils/vad-processing.cpp | 120 +++++++++++++++++++++-- src/whisper-utils/vad-processing.h | 1 + src/whisper-utils/whisper-processing.cpp | 7 +- 
6 files changed, 124 insertions(+), 14 deletions(-) diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 9205c96..577d8fb 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -87,6 +87,7 @@ struct transcription_filter_data { bool partial_transcription = false; int partial_latency = 1000; float duration_filter_threshold = 2.25f; + int segment_duration = 7000; // Last transcription result std::string last_text; diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index a0c7452..f77937a 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -297,7 +297,7 @@ void add_buffered_output_group_properties(obs_properties_t *ppts) obs_data_t *settings) { UNUSED_PARAMETER(property); UNUSED_PARAMETER(props); - const int segmentation_type = obs_data_get_int(settings, "buffer_output_type"); + const int segmentation_type = (int)obs_data_get_int(settings, "buffer_output_type"); // set default values for the number of lines and characters per line switch (segmentation_type) { case SEGMENTATION_TOKEN: @@ -350,6 +350,9 @@ void add_advanced_group_properties(obs_properties_t *ppts, struct transcription_ // add duration filter threshold slider obs_properties_add_float_slider(advanced_config_group, "duration_filter_threshold", MT_("duration_filter_threshold"), 0.1, 3.0, 0.05); + // add segment duration slider + obs_properties_add_int_slider(advanced_config_group, "segment_duration", + MT_("segment_duration"), 3000, 15000, 100); // add button to open filter and replace UI dialog obs_properties_add_button2( @@ -549,8 +552,9 @@ void transcription_filter_defaults(obs_data_t *s) (int)TokenBufferSegmentation::SEGMENTATION_TOKEN); obs_data_set_default_bool(s, "vad_enabled", true); - obs_data_set_default_double(s, "vad_threshold", 0.65); + obs_data_set_default_double(s, "vad_threshold", 0.2); obs_data_set_default_double(s, 
"duration_filter_threshold", 2.25); + obs_data_set_default_int(s, "segment_duration", 7000); obs_data_set_default_int(s, "log_level", LOG_DEBUG); obs_data_set_default_bool(s, "log_words", false); obs_data_set_default_bool(s, "caption_to_stream", false); diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 357030f..c0635ae 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -190,6 +190,7 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->max_sub_duration = (int)obs_data_get_int(s, "max_sub_duration"); gf->last_sub_render_time = now_ms(); gf->duration_filter_threshold = (float)obs_data_get_double(s, "duration_filter_threshold"); + gf->segment_duration = (int)obs_data_get_int(s, "segment_duration"); gf->partial_transcription = obs_data_get_bool(s, "partial_group"); gf->partial_latency = (int)obs_data_get_int(s, "partial_latency"); bool new_buffered_output = obs_data_get_bool(s, "buffered_output"); diff --git a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp index 9f14a32..20fdae0 100644 --- a/src/whisper-utils/vad-processing.cpp +++ b/src/whisper-utils/vad-processing.cpp @@ -10,22 +10,22 @@ #include #endif -vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state) +int get_data_from_buf_and_resample(transcription_filter_data *gf, + uint64_t &start_timestamp_offset_ns, + uint64_t &end_timestamp_offset_ns) { uint32_t num_frames_from_infos = 0; - uint64_t start_timestamp_offset_ns = 0; - uint64_t end_timestamp_offset_ns = 0; { // scoped lock the buffer mutex std::lock_guard lock(gf->whisper_buf_mutex); if (gf->input_buffers[0].size == 0) { - return last_vad_state; + return 1; } obs_log(gf->log_level, - "vad based segmentation. 
currently %lu bytes in the audio input buffer", + "segmentation: currently %lu bytes in the audio input buffer", gf->input_buffers[0].size); // max number of frames is 10 seconds worth of audio @@ -93,11 +93,28 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v (uint32_t)num_frames_from_infos); } - obs_log(gf->log_level, "resampled: %d channels, %d frames, %f ms", - (int)gf->channels, (int)resampled_16khz_frames, - (float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f); circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0], resampled_16khz_frames * sizeof(float)); + obs_log(gf->log_level, + "resampled: %d channels, %d frames, %f ms, current size: %lu bytes", + (int)gf->channels, (int)resampled_16khz_frames, + (float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f, + gf->resampled_buffer.size); + } + + return 0; +} + +vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state) +{ + // get data from buffer and resample + uint64_t start_timestamp_offset_ns = 0; + uint64_t end_timestamp_offset_ns = 0; + + const int ret = get_data_from_buf_and_resample(gf, start_timestamp_offset_ns, + end_timestamp_offset_ns); + if (ret != 0) { + return last_vad_state; } const size_t vad_window_size_samples = gf->vad->get_window_size_samples() * sizeof(float); @@ -251,6 +268,93 @@ vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_v return current_vad_state; } +vad_state hybrid_vad_segmentation(transcription_filter_data *gf, vad_state last_vad_state) +{ + // get data from buffer and resample + uint64_t start_timestamp_offset_ns = 0; + uint64_t end_timestamp_offset_ns = 0; + + if (get_data_from_buf_and_resample(gf, start_timestamp_offset_ns, + end_timestamp_offset_ns) != 0) { + return last_vad_state; + } + + last_vad_state.end_ts_offset_ms = end_timestamp_offset_ns / 1000000; + + // extract the data from the resampled buffer with circlebuf_pop_front into a temp buffer + // 
and then push it into the whisper buffer + const size_t resampled_buffer_size = gf->resampled_buffer.size; + uint8_t *temp_buffer = new uint8_t[resampled_buffer_size]; + circlebuf_pop_front(&gf->resampled_buffer, temp_buffer, resampled_buffer_size); + circlebuf_push_back(&gf->whisper_buffer, temp_buffer, resampled_buffer_size); + delete[] temp_buffer; + + obs_log(gf->log_level, "whisper buffer size: %lu bytes", gf->whisper_buffer.size); + + // use last_vad_state timestamps to calculate the duration of the current segment + if (last_vad_state.end_ts_offset_ms - last_vad_state.start_ts_offest_ms >= + gf->segment_duration) { + obs_log(gf->log_level, "%d seconds worth of audio -> send to inference", + gf->segment_duration); + run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms, + last_vad_state.end_ts_offset_ms, VAD_STATE_WAS_ON); + last_vad_state.start_ts_offest_ms = end_timestamp_offset_ns / 1000000; + last_vad_state.last_partial_segment_end_ts = 0; + return last_vad_state; + } + + // if partial transcription is enabled, check if we should send a partial segment + if (gf->partial_transcription) { + // current length of audio in buffer + const uint64_t current_length_ms = + (last_vad_state.end_ts_offset_ms > 0 ? last_vad_state.end_ts_offset_ms + : last_vad_state.start_ts_offest_ms) - + (last_vad_state.last_partial_segment_end_ts > 0 + ? 
last_vad_state.last_partial_segment_end_ts + : last_vad_state.start_ts_offest_ms); + obs_log(gf->log_level, "current buffer length after last partial (%lu): %lu ms", + last_vad_state.last_partial_segment_end_ts, current_length_ms); + + if (current_length_ms > (uint64_t)gf->partial_latency) { + // send partial segment to inference + obs_log(gf->log_level, "Partial segment -> send to inference"); + last_vad_state.last_partial_segment_end_ts = + last_vad_state.end_ts_offset_ms; + + // run vad on the current buffer + std::vector vad_input; + vad_input.resize(gf->whisper_buffer.size / sizeof(float)); + circlebuf_peek_front(&gf->whisper_buffer, vad_input.data(), + vad_input.size() * sizeof(float)); + + obs_log(gf->log_level, "sending %d frames to vad, %.1f ms", + vad_input.size(), + (float)vad_input.size() * 1000.0f / (float)WHISPER_SAMPLE_RATE); + { + ProfileScope("vad->process"); + gf->vad->process(vad_input, true); + } + + if (gf->vad->get_speech_timestamps().size() > 0) { + // VAD detected speech in the partial segment + run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms, + last_vad_state.end_ts_offset_ms, + VAD_STATE_PARTIAL); + } else { + // VAD detected silence in the partial segment + obs_log(gf->log_level, "VAD detected silence in partial segment"); + // pop the partial segment from the whisper buffer, save some audio for the next segment + const size_t num_bytes_to_keep = + (WHISPER_SAMPLE_RATE / 4) * sizeof(float); + circlebuf_pop_front(&gf->whisper_buffer, nullptr, + gf->whisper_buffer.size - num_bytes_to_keep); + } + } + } + + return last_vad_state; +} + void initialize_vad(transcription_filter_data *gf, const char *silero_vad_model_file) { // initialize Silero VAD diff --git a/src/whisper-utils/vad-processing.h b/src/whisper-utils/vad-processing.h index 1593d8a..b878a0f 100644 --- a/src/whisper-utils/vad-processing.h +++ b/src/whisper-utils/vad-processing.h @@ -11,6 +11,7 @@ struct vad_state { }; vad_state 
vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state); +vad_state hybrid_vad_segmentation(transcription_filter_data *gf, vad_state last_vad_state); void initialize_vad(transcription_filter_data *gf, const char *silero_vad_model_file); #endif // VAD_PROCESSING_H diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 1a1e700..ab9693a 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -242,8 +242,7 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter if (token.id > 50365 && token.id <= 51865) { const float time = ((float)token.id - 50365.0f) * 0.02f; const float duration_s = (float)incoming_duration_ms / 1000.0f; - const float ratio = - std::max(time, duration_s) / std::min(time, duration_s); + const float ratio = time / duration_s; obs_log(gf->log_level, "Time token found %d -> %.3f. Duration: %.3f. Ratio: %.3f. Threshold %.2f", token.id, time, duration_s, ratio, @@ -338,7 +337,7 @@ void whisper_loop(void *data) obs_log(gf->log_level, "Starting whisper thread"); - vad_state current_vad_state = {false, 0, 0, 0}; + vad_state current_vad_state = {false, now_ms(), 0, 0}; const char *whisper_loop_name = "Whisper loop"; profile_register_root(whisper_loop_name, 50 * 1000 * 1000); @@ -356,7 +355,7 @@ void whisper_loop(void *data) } } - current_vad_state = vad_based_segmentation(gf, current_vad_state); + current_vad_state = hybrid_vad_segmentation(gf, current_vad_state); if (!gf->cleared_last_sub) { // check if we should clear the current sub depending on the minimum subtitle duration From e572c358661e2adad6435443b16470cee8e2df3a Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 23 Aug 2024 22:35:47 -0400 Subject: [PATCH 05/18] feat: Optimize VAD processing for better performance --- src/whisper-utils/vad-processing.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp index 20fdae0..0e9c744 100644 --- a/src/whisper-utils/vad-processing.cpp +++ b/src/whisper-utils/vad-processing.cpp @@ -284,16 +284,16 @@ vad_state hybrid_vad_segmentation(transcription_filter_data *gf, vad_state last_ // extract the data from the resampled buffer with circlebuf_pop_front into a temp buffer // and then push it into the whisper buffer const size_t resampled_buffer_size = gf->resampled_buffer.size; - uint8_t *temp_buffer = new uint8_t[resampled_buffer_size]; - circlebuf_pop_front(&gf->resampled_buffer, temp_buffer, resampled_buffer_size); - circlebuf_push_back(&gf->whisper_buffer, temp_buffer, resampled_buffer_size); - delete[] temp_buffer; + std::vector<uint8_t> temp_buffer; + temp_buffer.resize(resampled_buffer_size); + circlebuf_pop_front(&gf->resampled_buffer, temp_buffer.data(), resampled_buffer_size); + circlebuf_push_back(&gf->whisper_buffer, temp_buffer.data(), resampled_buffer_size); obs_log(gf->log_level, "whisper buffer size: %lu bytes", gf->whisper_buffer.size); // use last_vad_state timestamps to calculate the duration of the current segment if (last_vad_state.end_ts_offset_ms - last_vad_state.start_ts_offest_ms >= - gf->segment_duration) { + (uint64_t)gf->segment_duration) { obs_log(gf->log_level, "%d seconds worth of audio -> send to inference", gf->segment_duration); run_inference_and_callbacks(gf, last_vad_state.start_ts_offest_ms, From a3700815abcae509c86a023e8f87ea77d483ed87 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 23 Aug 2024 22:46:38 -0400 Subject: [PATCH 06/18] feat: Refactor token buffer thread and whisper processing The code changes involve refactoring the token buffer thread and whisper processing. The token buffer thread now uses the variable name `word_token` instead of `word` for better clarity. In the whisper processing, the log message format has been updated to include the segment number and token number.
These changes aim to improve the performance and accuracy of VAD processing, as well as add new fields to the transcription filter data. --- src/whisper-utils/token-buffer-thread.cpp | 6 +++--- src/whisper-utils/whisper-processing.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/whisper-utils/token-buffer-thread.cpp b/src/whisper-utils/token-buffer-thread.cpp index dc4b3fa..3e3b002 100644 --- a/src/whisper-utils/token-buffer-thread.cpp +++ b/src/whisper-utils/token-buffer-thread.cpp @@ -104,9 +104,9 @@ void TokenBufferThread::addSentenceFromStdString(const std::string &sentence, // split the sentence to words std::vector words; std::basic_istringstream iss(sentence_ws); - TokenBufferString word; - while (iss >> word) { - words.push_back(word); + TokenBufferString word_token; + while (iss >> word_token) { + words.push_back(word_token); } // add the words to a sentence for (const auto &word : words) { diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index ab9693a..9be84e4 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -261,8 +261,8 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter text += token_str; tokens.push_back(token); } - obs_log(gf->log_level, "S %d, Token %d: %d\t%s\tp: %.3f [keep: %d]", - n_segment, j, token.id, token_str, token.p, keep); + obs_log(gf->log_level, "S %d, T %d: %d\t%s\tp: %.3f [keep: %d]", n_segment, + j, token.id, token_str, token.p, keep); } } sentence_p /= (float)tokens.size(); From 1f4bf65fc44c395b8ebf3fc7637da9e097a1c429 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 27 Aug 2024 11:44:41 -0400 Subject: [PATCH 07/18] Refactor token buffer thread and whisper processing --- data/locale/en-US.ini | 12 +++++++----- src/tests/localvocal-offline-test.cpp | 3 ++- src/transcription-filter-callbacks.cpp | 5 ++++- src/transcription-filter-data.h | 3 ++- 
src/transcription-filter-properties.cpp | 16 +++++++++++++--- src/transcription-filter.cpp | 5 +++-- src/whisper-utils/vad-processing.h | 1 + src/whisper-utils/whisper-processing.cpp | 6 +++++- 8 files changed, 37 insertions(+), 14 deletions(-) diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index 0f7b661..9ef4d18 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -1,12 +1,9 @@ LocalVocalPlugin="LocalVocal Plugin" transcription_filterAudioFilter="LocalVocal Transcription" -vad_enabled="VAD Enabled" vad_threshold="VAD Threshold" log_level="Internal Log Level" log_words="Log Output to Console" caption_to_stream="Stream Captions" -step_by_step_processing="Step-by-step processing (⚠️ increased processing)" -step_size_msec="Step size (ms)" subtitle_sources="Output Destination" none_no_output="None / No output" file_output_enable="Save to File" @@ -51,7 +48,6 @@ translate="Translation" translate_add_context="Translate with context" whisper_translate="Translate to English (Whisper)" buffer_size_msec="Buffer size (ms)" -overlap_size_msec="Overlap size (ms)" suppress_sentences="Suppress sentences (each line)" translate_output="Output Destination" dtw_token_timestamps="DTW token timestamps" @@ -85,4 +81,10 @@ buffered_output_parameters="Buffered Output Configuration" file_output_info="Note: Translation output will be saved to a file in the same directory with the target language added to the name, e.g. 'output_es.srt'." partial_transcription="Enable Partial Transcription" partial_transcription_info="Partial transcription will increase processing load on your machine to transcribe content in real-time, which may impact performance." 
-partial_latency="Latency (ms)" \ No newline at end of file +partial_latency="Latency (ms)" +vad_mode="VAD Mode" +Active_VAD="Active VAD" +Hybrid_VAD="Hybrid VAD" +translate_only_full_sentences="Translate only full sentences" +duration_filter_threshold="Duration filter" +segment_duration="Segment duration" \ No newline at end of file diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp index 8fec08b..27c2037 100644 --- a/src/tests/localvocal-offline-test.cpp +++ b/src/tests/localvocal-offline-test.cpp @@ -17,6 +17,7 @@ #include "transcription-filter.h" #include "transcription-utils.h" #include "whisper-utils/whisper-utils.h" +#include "whisper-utils/vad-processing.h" #include "audio-file-utils.h" #include "translation/language_codes.h" @@ -148,7 +149,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p // }, // 30, std::chrono::seconds(10)); - gf->vad_enabled = true; + gf->vad_mode = VAD_MODE_ACTIVE; gf->log_words = true; gf->caption_to_stream = false; gf->start_timestamp_ms = now_ms(); diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index 22e240b..ccdbf00 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -228,9 +228,12 @@ void set_text_callback(struct transcription_filter_data *gf, } } + bool should_translate = + gf->translate_only_full_sentences ? result.result == DETECTION_RESULT_SPEECH : true; + // send the sentence to translation (if enabled) std::string translated_sentence = - send_sentence_to_translation(str_copy, gf, result.language); + should_translate ? 
send_sentence_to_translation(str_copy, gf, result.language) : ""; if (gf->translate) { if (gf->translation_output == "none") { diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 577d8fb..34541f2 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -64,7 +64,7 @@ struct transcription_filter_data { float sentence_psum_accept_thresh; bool do_silence; - bool vad_enabled; + int vad_mode; int log_level = LOG_DEBUG; bool log_words; bool caption_to_stream; @@ -114,6 +114,7 @@ struct transcription_filter_data { struct translation_context translation_ctx; std::string translation_model_index; std::string translation_model_path_external; + bool translate_only_full_sentences; bool buffered_output = false; TokenBufferThread captions_monitor; diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index f77937a..10726cc 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -7,6 +7,7 @@ #include "transcription-filter.h" #include "transcription-filter-utils.h" #include "whisper-utils/whisper-language.h" +#include "whisper-utils/vad-processing.h" #include "model-utils/model-downloader-types.h" #include "translation/language_codes.h" #include "ui/filter-replace-dialog.h" @@ -212,8 +213,11 @@ void add_translation_group_properties(obs_properties_t *ppts) obs_property_t *prop_tgt = obs_properties_add_list( translation_group, "translate_target_language", MT_("target_language"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + obs_properties_add_bool(translation_group, "translate_add_context", MT_("translate_add_context")); + obs_properties_add_bool(translation_group, "translate_only_full_sentences", + MT_("translate_only_full_sentences")); // Populate the dropdown with the language codes for (const auto &language : language_codes) { @@ -343,7 +347,12 @@ void add_advanced_group_properties(obs_properties_t *ppts, struct transcription_ 
obs_properties_add_bool(advanced_config_group, "process_while_muted", MT_("process_while_muted")); - obs_properties_add_bool(advanced_config_group, "vad_enabled", MT_("vad_enabled")); + // add selection for Active VAD vs Hybrid VAD + obs_property_t *vad_mode_list = + obs_properties_add_list(advanced_config_group, "vad_mode", MT_("vad_mode"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); + obs_property_list_add_int(vad_mode_list, MT_("Active_VAD"), VAD_MODE_ACTIVE); + obs_property_list_add_int(vad_mode_list, MT_("Hybrid_VAD"), VAD_MODE_HYBRID); // add vad threshold slider obs_properties_add_float_slider(advanced_config_group, "vad_threshold", MT_("vad_threshold"), 0.0, 1.0, 0.05); @@ -551,8 +560,8 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_int(s, "buffer_output_type", (int)TokenBufferSegmentation::SEGMENTATION_TOKEN); - obs_data_set_default_bool(s, "vad_enabled", true); - obs_data_set_default_double(s, "vad_threshold", 0.2); + obs_data_set_default_bool(s, "vad_mode", VAD_MODE_ACTIVE); + obs_data_set_default_double(s, "vad_threshold", 0.65); obs_data_set_default_double(s, "duration_filter_threshold", 2.25); obs_data_set_default_int(s, "segment_duration", 7000); obs_data_set_default_int(s, "log_level", LOG_DEBUG); @@ -572,6 +581,7 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_bool(s, "translate", false); obs_data_set_default_string(s, "translate_target_language", "__es__"); obs_data_set_default_bool(s, "translate_add_context", true); + obs_data_set_default_bool(s, "translate_only_full_sentences", true); obs_data_set_default_string(s, "translate_model", "whisper-based-translation"); obs_data_set_default_string(s, "translation_model_path_external", ""); obs_data_set_default_int(s, "translate_input_tokenization_style", INPUT_TOKENIZAION_M2M100); diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index c0635ae..781ae9d 100644 --- a/src/transcription-filter.cpp +++ 
b/src/transcription-filter.cpp @@ -174,7 +174,7 @@ void transcription_filter_update(void *data, obs_data_t *s) obs_log(gf->log_level, "LocalVocal filter update"); gf->log_level = (int)obs_data_get_int(s, "log_level"); - gf->vad_enabled = obs_data_get_bool(s, "vad_enabled"); + gf->vad_mode = (int)obs_data_get_int(s, "vad_mode"); gf->log_words = obs_data_get_bool(s, "log_words"); gf->caption_to_stream = obs_data_get_bool(s, "caption_to_stream"); gf->save_to_file = obs_data_get_bool(s, "file_output_enable"); @@ -287,6 +287,7 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->translation_ctx.add_context = obs_data_get_bool(s, "translate_add_context"); gf->translation_ctx.input_tokenization_style = (InputTokenizationStyle)obs_data_get_int(s, "translate_input_tokenization_style"); + gf->translate_only_full_sentences = obs_data_get_bool(s, "translate_only_full_sentences"); gf->translation_output = obs_data_get_string(s, "translate_output"); std::string new_translate_model_index = obs_data_get_string(s, "translate_model"); std::string new_translation_model_path_external = @@ -393,7 +394,7 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->whisper_params.max_initial_ts = (float)obs_data_get_double(s, "max_initial_ts"); gf->whisper_params.length_penalty = (float)obs_data_get_double(s, "length_penalty"); - if (gf->vad_enabled && gf->vad) { + if (gf->vad) { const float vad_threshold = (float)obs_data_get_double(s, "vad_threshold"); gf->vad->set_threshold(vad_threshold); } diff --git a/src/whisper-utils/vad-processing.h b/src/whisper-utils/vad-processing.h index b878a0f..996002b 100644 --- a/src/whisper-utils/vad-processing.h +++ b/src/whisper-utils/vad-processing.h @@ -2,6 +2,7 @@ #define VAD_PROCESSING_H enum VadState { VAD_STATE_WAS_ON = 0, VAD_STATE_WAS_OFF, VAD_STATE_IS_OFF, VAD_STATE_PARTIAL }; +enum VadMode { VAD_MODE_ACTIVE = 0, VAD_MODE_HYBRID, VAD_MODE_DISABLED }; struct vad_state { bool vad_on; diff --git 
a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 9be84e4..fe858e4 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -355,7 +355,11 @@ void whisper_loop(void *data) } } - current_vad_state = hybrid_vad_segmentation(gf, current_vad_state); + if (gf->vad_mode == VAD_MODE_HYBRID) { + current_vad_state = hybrid_vad_segmentation(gf, current_vad_state); + } else if (gf->vad_mode == VAD_MODE_ACTIVE) { + current_vad_state = vad_based_segmentation(gf, current_vad_state); + } if (!gf->cleared_last_sub) { // check if we should clear the current sub depending on the minimum subtitle duration From 3ad5df1073dbbc65c92aa9e935e1a35bdd923c47 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Wed, 28 Aug 2024 13:20:21 -0400 Subject: [PATCH 08/18] refactor: Update translation context in transcription filter The code changes in this commit update the translation context in the transcription filter. The `translate_add_context` property has been changed from a boolean to an integer slider, allowing the user to specify the number of context lines to add to the translation. This change aims to provide more flexibility in controlling the context for translation and improve the accuracy of the translation output. 
--- src/tests/localvocal-offline-test.cpp | 2 +- src/transcription-filter-callbacks.cpp | 5 +++ src/transcription-filter-properties.cpp | 7 +-- src/transcription-filter.cpp | 2 +- src/translation/translation.cpp | 60 ++++++++++++++++++++----- src/translation/translation.h | 9 ++-- 6 files changed, 65 insertions(+), 20 deletions(-) diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp index 27c2037..ee936af 100644 --- a/src/tests/localvocal-offline-test.cpp +++ b/src/tests/localvocal-offline-test.cpp @@ -158,7 +158,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p gf->buffered_output = false; gf->target_lang = ""; - gf->translation_ctx.add_context = true; + gf->translation_ctx.add_context = 1; gf->translation_output = ""; gf->translate = false; gf->sentence_psum_accept_thresh = 0.4; diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index ccdbf00..3fc55f9 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -321,6 +321,11 @@ void reset_caption_state(transcription_filter_data *gf_) } send_caption_to_source(gf_->text_source_name, "", gf_); send_caption_to_source(gf_->translation_output, "", gf_); + // reset translation context + gf_->last_text = ""; + gf_->last_text_translation = ""; + gf_->translation_ctx.last_input_tokens.clear(); + gf_->translation_ctx.last_translation_tokens.clear(); // flush the buffer { std::lock_guard lock(gf_->whisper_buf_mutex); diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index 10726cc..436df91 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -214,8 +214,9 @@ void add_translation_group_properties(obs_properties_t *ppts) translation_group, "translate_target_language", MT_("target_language"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); - obs_properties_add_bool(translation_group, 
"translate_add_context", - MT_("translate_add_context")); + // add slider for number of context lines to add to the translation + obs_properties_add_int_slider(translation_group, "translate_add_context", + MT_("translate_add_context"), 0, 5, 1); obs_properties_add_bool(translation_group, "translate_only_full_sentences", MT_("translate_only_full_sentences")); @@ -580,7 +581,7 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_bool(s, "advanced_settings", false); obs_data_set_default_bool(s, "translate", false); obs_data_set_default_string(s, "translate_target_language", "__es__"); - obs_data_set_default_bool(s, "translate_add_context", true); + obs_data_set_default_int(s, "translate_add_context", 1); obs_data_set_default_bool(s, "translate_only_full_sentences", true); obs_data_set_default_string(s, "translate_model", "whisper-based-translation"); obs_data_set_default_string(s, "translation_model_path_external", ""); diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 781ae9d..fbc085b 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -284,7 +284,7 @@ void transcription_filter_update(void *data, obs_data_t *s) bool new_translate = obs_data_get_bool(s, "translate"); gf->target_lang = obs_data_get_string(s, "translate_target_language"); - gf->translation_ctx.add_context = obs_data_get_bool(s, "translate_add_context"); + gf->translation_ctx.add_context = (int)obs_data_get_int(s, "translate_add_context"); gf->translation_ctx.input_tokenization_style = (InputTokenizationStyle)obs_data_get_int(s, "translate_input_tokenization_style"); gf->translate_only_full_sentences = obs_data_get_bool(s, "translate_only_full_sentences"); diff --git a/src/translation/translation.cpp b/src/translation/translation.cpp index e11f072..6d0d277 100644 --- a/src/translation/translation.cpp +++ b/src/translation/translation.cpp @@ -114,31 +114,53 @@ int translate(struct translation_context &translation_ctx, const 
std::string &te if (translation_ctx.input_tokenization_style == INPUT_TOKENIZAION_M2M100) { // set input tokens std::vector input_tokens = {source_lang, ""}; - if (translation_ctx.add_context && + if (translation_ctx.add_context > 0 && translation_ctx.last_input_tokens.size() > 0) { - input_tokens.insert(input_tokens.end(), - translation_ctx.last_input_tokens.begin(), - translation_ctx.last_input_tokens.end()); + // add the last input tokens sentences to the input tokens + for (const auto &tokens : translation_ctx.last_input_tokens) { + input_tokens.insert(input_tokens.end(), tokens.begin(), + tokens.end()); + } } std::vector new_input_tokens = translation_ctx.tokenizer(text); input_tokens.insert(input_tokens.end(), new_input_tokens.begin(), new_input_tokens.end()); input_tokens.push_back(""); - translation_ctx.last_input_tokens = new_input_tokens; + // log the input tokens + std::string input_tokens_str; + for (const auto &token : input_tokens) { + input_tokens_str += token + ", "; + } + obs_log(LOG_INFO, "Input tokens: %s", input_tokens_str.c_str()); + + translation_ctx.last_input_tokens.push_back(new_input_tokens); + // remove the oldest input tokens + while (translation_ctx.last_input_tokens.size() > + translation_ctx.add_context) { + translation_ctx.last_input_tokens.pop_front(); + } const std::vector> batch = {input_tokens}; // get target prefix target_prefix = {target_lang}; - if (translation_ctx.add_context && + // add the last translation tokens to the target prefix + if (translation_ctx.add_context > 0 && translation_ctx.last_translation_tokens.size() > 0) { - target_prefix.insert( - target_prefix.end(), - translation_ctx.last_translation_tokens.begin(), - translation_ctx.last_translation_tokens.end()); + for (const auto &tokens : translation_ctx.last_translation_tokens) { + target_prefix.insert(target_prefix.end(), tokens.begin(), + tokens.end()); + } } + // log the target prefix + std::string target_prefix_str; + for (const auto &token : target_prefix) 
{ + target_prefix_str += token + ","; + } + obs_log(LOG_INFO, "Target prefix: %s", target_prefix_str.c_str()); + const std::vector<std::vector<std::string>> target_prefix_batch = { target_prefix}; results = translation_ctx.translator->translate_batch( @@ -161,7 +183,23 @@ int translate(struct translation_context &translation_ctx, const std::string &te std::vector<std::string> translation_tokens( tokens_result.begin() + target_prefix.size(), tokens_result.end()); - translation_ctx.last_translation_tokens = translation_tokens; + // log the translation tokens + std::string translation_tokens_str; + for (const auto &token : translation_tokens) { + translation_tokens_str += token + ", "; + } + obs_log(LOG_INFO, "Translation tokens: %s", translation_tokens_str.c_str()); + + // save the translation tokens + translation_ctx.last_translation_tokens.push_back(translation_tokens); + // remove the oldest translation tokens + while (translation_ctx.last_translation_tokens.size() > + translation_ctx.add_context) { + translation_ctx.last_translation_tokens.pop_front(); + } + obs_log(LOG_INFO, "Last translation tokens deque size: %d", + (int)translation_ctx.last_translation_tokens.size()); + // detokenize result = translation_ctx.detokenizer(translation_tokens); } catch (std::exception &e) { diff --git a/src/translation/translation.h b/src/translation/translation.h index 0d45080..c740726 100644 --- a/src/translation/translation.h +++ b/src/translation/translation.h @@ -3,6 +3,7 @@ #include #include +#include <deque> #include #include @@ -25,10 +26,10 @@ struct translation_context { std::unique_ptr options; std::function<std::vector<std::string>(const std::string &)> tokenizer; std::function<std::string(const std::vector<std::string> &)> detokenizer; - std::vector<std::string> last_input_tokens; - std::vector<std::string> last_translation_tokens; - // Use the last translation as context for the next translation - bool add_context; + std::deque<std::vector<std::string>> last_input_tokens; + std::deque<std::vector<std::string>> last_translation_tokens; + // How many sentences to use as context for the next translation + int add_context; InputTokenizationStyle
input_tokenization_style; }; From 0e3df02ba0e4b17260297b0946f45090700658fa Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Wed, 28 Aug 2024 23:02:13 -0400 Subject: [PATCH 09/18] refactor: Update last_text variable name in transcription filter callbacks --- src/transcription-filter-callbacks.cpp | 11 ++++++++--- src/transcription-filter-data.h | 4 +++- src/translation/translation.cpp | 4 ++-- src/whisper-utils/whisper-processing.cpp | 3 +++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index 3fc55f9..f3ffe30 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -53,8 +53,8 @@ std::string send_sentence_to_translation(const std::string &sentence, struct transcription_filter_data *gf, const std::string &source_language) { - const std::string last_text = gf->last_text; - gf->last_text = sentence; + const std::string last_text = gf->last_text_for_translation; + gf->last_text_for_translation = sentence; if (gf->translate && !sentence.empty()) { obs_log(gf->log_level, "Translating text. 
%s -> %s", source_language.c_str(), gf->target_lang.c_str()); @@ -279,6 +279,10 @@ void set_text_callback(struct transcription_filter_data *gf, result.result == DETECTION_RESULT_PARTIAL)) { gf->last_sub_render_time = now_ms(); gf->cleared_last_sub = false; + if (result.result == DETECTION_RESULT_SPEECH) { + // save the last subtitle if it was a full sentence + gf->last_transcription_sentence = result.text; + } } }; @@ -322,10 +326,11 @@ void reset_caption_state(transcription_filter_data *gf_) send_caption_to_source(gf_->text_source_name, "", gf_); send_caption_to_source(gf_->translation_output, "", gf_); // reset translation context - gf_->last_text = ""; + gf_->last_text_for_translation = ""; gf_->last_text_translation = ""; gf_->translation_ctx.last_input_tokens.clear(); gf_->translation_ctx.last_translation_tokens.clear(); + gf_->last_transcription_sentence = ""; // flush the buffer { std::lock_guard lock(gf_->whisper_buf_mutex); diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 34541f2..aa9d75b 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -90,9 +90,11 @@ struct transcription_filter_data { int segment_duration = 7000; // Last transcription result - std::string last_text; + std::string last_text_for_translation; std::string last_text_translation; + std::string last_transcription_sentence; + // Text source to output the subtitles std::string text_source_name; // Callback to set the text in the output text source (subtitles) diff --git a/src/translation/translation.cpp b/src/translation/translation.cpp index 6d0d277..f7b89dd 100644 --- a/src/translation/translation.cpp +++ b/src/translation/translation.cpp @@ -137,7 +137,7 @@ int translate(struct translation_context &translation_ctx, const std::string &te translation_ctx.last_input_tokens.push_back(new_input_tokens); // remove the oldest input tokens while (translation_ctx.last_input_tokens.size() > - translation_ctx.add_context) { + 
(size_t)translation_ctx.add_context) { translation_ctx.last_input_tokens.pop_front(); } @@ -194,7 +194,7 @@ int translate(struct translation_context &translation_ctx, const std::string &te translation_ctx.last_translation_tokens.push_back(translation_tokens); // remove the oldest translation tokens while (translation_ctx.last_translation_tokens.size() > - translation_ctx.add_context) { + (size_t)translation_ctx.add_context) { translation_ctx.last_translation_tokens.pop_front(); } obs_log(LOG_INFO, "Last translation tokens deque size: %d", diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index fe858e4..c33af52 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -181,6 +181,9 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter return {DETECTION_RESULT_UNKNOWN, "", t0, t1, {}, ""}; } + // obs_log(LOG_INFO, "initial prompt: %s", gf->last_transcription_sentence.c_str()); + // gf->whisper_params.initial_prompt = gf->last_transcription_sentence.c_str(); + // run the inference int whisper_full_result = -1; gf->whisper_params.duration_ms = (int)(whisper_duration_ms); From c302d3af923cbd72e83d2437a4aa4b4c21156143 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 29 Aug 2024 14:13:30 -0400 Subject: [PATCH 10/18] feat: Add translation language utilities This commit adds a new file, `translation-language-utils.h`, which contains utility functions for handling translation languages. The `remove_start_punctuation` function removes any leading punctuation from a given string. This utility will be used in the translation process to improve the quality of the translated output. 
--- CMakeLists.txt | 11 +- cmake/BuildICU.cmake | 112 ++++++++++++++++++ src/transcription-filter-callbacks.cpp | 8 +- src/transcription-filter-data.h | 4 +- src/transcription-filter-properties.cpp | 5 + src/transcription-filter.cpp | 2 + .../translation-language-utils.cpp | 33 ++++++ src/translation/translation-language-utils.h | 8 ++ src/translation/translation.cpp | 4 +- src/whisper-utils/whisper-processing.cpp | 11 +- 10 files changed, 190 insertions(+), 8 deletions(-) create mode 100644 cmake/BuildICU.cmake create mode 100644 src/translation/translation-language-utils.cpp create mode 100644 src/translation/translation-language-utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e5fa82a..4e86fe5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,11 @@ else() include(cmake/FetchOnnxruntime.cmake) endif() +include(cmake/BuildICU.cmake) +# Add ICU to the target +target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU::ICU) +target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR}) + target_sources( ${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c @@ -118,6 +123,7 @@ target_sources( src/translation/language_codes.cpp src/translation/translation.cpp src/translation/translation-utils.cpp + src/translation/translation-language-utils.cpp src/ui/filter-replace-dialog.cpp) set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name}) @@ -140,11 +146,12 @@ if(ENABLE_TESTS) src/whisper-utils/token-buffer-thread.cpp src/whisper-utils/vad-processing.cpp src/translation/language_codes.cpp - src/translation/translation.cpp) + src/translation/translation.cpp + src/translation/translation-language-utils.cpp) find_libav(${CMAKE_PROJECT_NAME}-tests) - target_link_libraries(${CMAKE_PROJECT_NAME}-tests PRIVATE ct2 sentencepiece Whispercpp Ort OBS::libobs) + target_link_libraries(${CMAKE_PROJECT_NAME}-tests PRIVATE ct2 sentencepiece Whispercpp Ort OBS::libobs ICU::ICU) 
target_include_directories(${CMAKE_PROJECT_NAME}-tests PRIVATE src) # install the tests to the release/test directory diff --git a/cmake/BuildICU.cmake b/cmake/BuildICU.cmake new file mode 100644 index 0000000..6c83d57 --- /dev/null +++ b/cmake/BuildICU.cmake @@ -0,0 +1,112 @@ +include(FetchContent) +include(ExternalProject) + +set(ICU_VERSION "75.1") +set(ICU_VERSION_UNDERSCORE "75_1") +set(ICU_VERSION_DASH "75-1") +set(ICU_VERSION_NO_MINOR "75") + +if(WIN32) + set(ICU_URL + "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_DASH}/icu4c-${ICU_VERSION_UNDERSCORE}-Win64-MSVC2022.zip" + ) + set(ICU_HASH "SHA256=7ac9c0dc6ccc1ec809c7d5689b8d831c5b8f6b11ecf70fdccc55f7ae8731ac8f") + + FetchContent_Declare( + ICU + URL ${ICU_URL} + URL_HASH ${ICU_HASH}) + + FetchContent_MakeAvailable(ICU) + + # Assuming the ZIP structure, adjust paths as necessary + set(ICU_INCLUDE_DIR "${icu_SOURCE_DIR}/include") + set(ICU_LIBRARY_DIR "${icu_SOURCE_DIR}/lib64") + set(ICU_BINARY_DIR "${icu_SOURCE_DIR}/bin64") + + # Add ICU libraries + find_library( + ICU_DATA_LIBRARY + NAMES icudt + PATHS ${ICU_LIBRARY_DIR} + NO_DEFAULT_PATH) + find_library( + ICU_UC_LIBRARY + NAMES icuuc + PATHS ${ICU_LIBRARY_DIR} + NO_DEFAULT_PATH) + find_library( + ICU_IN_LIBRARY + NAMES icuin + PATHS ${ICU_LIBRARY_DIR} + NO_DEFAULT_PATH) + + # find the dlls + find_file( + ICU_DATA_DLL + NAMES icudt${ICU_VERSION_NO_MINOR}.dll + PATHS ${ICU_BINARY_DIR} + NO_DEFAULT_PATH) + find_file( + ICU_UC_DLL + NAMES icuuc${ICU_VERSION_NO_MINOR}.dll + PATHS ${ICU_BINARY_DIR} + NO_DEFAULT_PATH) + find_file( + ICU_IN_DLL + NAMES icuin${ICU_VERSION_NO_MINOR}.dll + PATHS ${ICU_BINARY_DIR} + NO_DEFAULT_PATH) + + # Copy the DLLs to the output directory + install(FILES ${ICU_DATA_DLL} DESTINATION "obs-plugins/64bit") + install(FILES ${ICU_UC_DLL} DESTINATION "obs-plugins/64bit") + install(FILES ${ICU_IN_DLL} DESTINATION "obs-plugins/64bit") + +else() # Mac and Linux + set(ICU_URL + 
"https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_UNDERSCORE}/icu4c-${ICU_VERSION_UNDERSCORE}-src.tgz" + ) + set(ICU_HASH "SHA256=94bb97d88f13bb74ec0168446a845511bd92c1c49ee8e63df646a48c38dfde6d") + + set(ICU_INSTALL_DIR "${CMAKE_BINARY_DIR}/icu-install") + + ExternalProject_Add( + ICU + URL ${ICU_URL} + URL_HASH ${ICU_HASH} + CONFIGURE_COMMAND /source/runConfigureICU Linux --prefix=${ICU_INSTALL_DIR} + BUILD_COMMAND make -j4 + INSTALL_COMMAND make install + BUILD_IN_SOURCE 1) + + set(ICU_INCLUDE_DIR "${ICU_INSTALL_DIR}/include") + set(ICU_LIBRARY_DIR "${ICU_INSTALL_DIR}/lib") + + # Add ICU libraries + find_library( + ICU_DATA_LIBRARY + NAMES icudata + PATHS ${ICU_LIBRARY_DIR} + NO_DEFAULT_PATH) + find_library( + ICU_UC_LIBRARY + NAMES icuuc + PATHS ${ICU_LIBRARY_DIR} + NO_DEFAULT_PATH) + find_library( + ICU_I18N_LIBRARY + NAMES icui18n + PATHS ${ICU_LIBRARY_DIR} + NO_DEFAULT_PATH) +endif() + +# Create an interface target for ICU +add_library(ICU::ICU INTERFACE IMPORTED GLOBAL) +target_include_directories(ICU::ICU INTERFACE ${ICU_INCLUDE_DIR}) +target_link_libraries(ICU::ICU INTERFACE ${ICU_DATA_LIBRARY} ${ICU_UC_LIBRARY} + $,${ICU_IN_LIBRARY},${ICU_I18N_LIBRARY}>) + +if(NOT WIN32) + add_dependencies(ICU::ICU ICU) +endif() diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index f3ffe30..1dc0f0e 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -281,7 +281,11 @@ void set_text_callback(struct transcription_filter_data *gf, gf->cleared_last_sub = false; if (result.result == DETECTION_RESULT_SPEECH) { // save the last subtitle if it was a full sentence - gf->last_transcription_sentence = result.text; + gf->last_transcription_sentence.push_back(result.text); + // remove the oldest sentence if the buffer is too long + while (gf->last_transcription_sentence.size() > gf->n_context_sentences) { + gf->last_transcription_sentence.pop_front(); + } } } }; @@ 
-330,7 +334,7 @@ void reset_caption_state(transcription_filter_data *gf_) gf_->last_text_translation = ""; gf_->translation_ctx.last_input_tokens.clear(); gf_->translation_ctx.last_translation_tokens.clear(); - gf_->last_transcription_sentence = ""; + gf_->last_transcription_sentence.clear(); // flush the buffer { std::lock_guard lock(gf_->whisper_buf_mutex); diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index aa9d75b..e1af694 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -93,7 +93,9 @@ struct transcription_filter_data { std::string last_text_for_translation; std::string last_text_translation; - std::string last_transcription_sentence; + // Transcription context sentences + int n_context_sentences; + std::deque last_transcription_sentence; // Text source to output the subtitles std::string text_source_name; diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index 436df91..4a3693f 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -414,6 +414,10 @@ void add_whisper_params_group_properties(obs_properties_t *ppts) WHISPER_SAMPLING_BEAM_SEARCH); obs_property_list_add_int(whisper_sampling_method_list, "Greedy", WHISPER_SAMPLING_GREEDY); + // add int slider for context sentences + obs_properties_add_int_slider(whisper_params_group, "n_context_sentences", + MT_("n_context_sentences"), 0, 5, 1); + // int n_threads; obs_properties_add_int_slider(whisper_params_group, "n_threads", MT_("n_threads"), 1, 8, 1); // int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder @@ -600,6 +604,7 @@ void transcription_filter_defaults(obs_data_t *s) // Whisper parameters obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH); + obs_data_set_default_int(s, "n_context_sentences", 0); obs_data_set_default_string(s, "initial_prompt", ""); obs_data_set_default_int(s, 
"n_threads", 4); obs_data_set_default_int(s, "n_max_text_ctx", 16384); diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index fbc085b..657fea6 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -346,6 +346,8 @@ void transcription_filter_update(void *data, obs_data_t *s) { std::lock_guard lock(gf->whisper_ctx_mutex); + gf->n_context_sentences = (int)obs_data_get_int(s, "n_context_sentences"); + gf->sentence_psum_accept_thresh = (float)obs_data_get_double(s, "sentence_psum_accept_thresh"); diff --git a/src/translation/translation-language-utils.cpp b/src/translation/translation-language-utils.cpp new file mode 100644 index 0000000..685ca1a --- /dev/null +++ b/src/translation/translation-language-utils.cpp @@ -0,0 +1,33 @@ +#include "translation-language-utils.h" + +#include +#include + +std::string remove_start_punctuation(const std::string &text) +{ + if (text.empty()) { + return text; + } + + // Convert the input string to ICU's UnicodeString + icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(text); + + // Find the index of the first non-punctuation character + int32_t start = 0; + while (start < ustr.length()) { + UChar32 ch = ustr.char32At(start); + if (!u_ispunct(ch)) { + break; + } + start += U16_LENGTH(ch); + } + + // Create a new UnicodeString with punctuation removed from the start + icu::UnicodeString result = ustr.tempSubString(start); + + // Convert the result back to UTF-8 + std::string output; + result.toUTF8String(output); + + return output; +} diff --git a/src/translation/translation-language-utils.h b/src/translation/translation-language-utils.h new file mode 100644 index 0000000..44b450a --- /dev/null +++ b/src/translation/translation-language-utils.h @@ -0,0 +1,8 @@ +#ifndef TRANSLATION_LANGUAGE_UTILS_H +#define TRANSLATION_LANGUAGE_UTILS_H + +#include + +std::string remove_start_punctuation(const std::string &text); + +#endif // TRANSLATION_LANGUAGE_UTILS_H \ No newline at end of file diff 
--git a/src/translation/translation.cpp b/src/translation/translation.cpp index f7b89dd..0701d95 100644 --- a/src/translation/translation.cpp +++ b/src/translation/translation.cpp @@ -3,6 +3,7 @@ #include "model-utils/model-find-utils.h" #include "transcription-filter-data.h" #include "language_codes.h" +#include "translation-language-utils.h" #include #include @@ -201,7 +202,8 @@ int translate(struct translation_context &translation_ctx, const std::string &te (int)translation_ctx.last_translation_tokens.size()); // detokenize - result = translation_ctx.detokenizer(translation_tokens); + const std::string result_ = translation_ctx.detokenizer(translation_tokens); + result = remove_start_punctuation(result_); } catch (std::exception &e) { obs_log(LOG_ERROR, "Error: %s", e.what()); return OBS_POLYGLOT_TRANSLATION_FAIL; diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index c33af52..a5b3924 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -181,8 +181,15 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter return {DETECTION_RESULT_UNKNOWN, "", t0, t1, {}, ""}; } - // obs_log(LOG_INFO, "initial prompt: %s", gf->last_transcription_sentence.c_str()); - // gf->whisper_params.initial_prompt = gf->last_transcription_sentence.c_str(); + if (gf->n_context_sentences > 0 && !gf->last_transcription_sentence.empty()) { + // set the initial prompt to the last transcription sentences (concatenated) + std::string initial_prompt = gf->last_transcription_sentence[0]; + for (int i = 1; i < gf->last_transcription_sentence.size(); ++i) { + initial_prompt += " " + gf->last_transcription_sentence[i]; + } + gf->whisper_params.initial_prompt = initial_prompt.c_str(); + obs_log(gf->log_level, "Initial prompt: %s", gf->whisper_params.initial_prompt); + } // run the inference int whisper_full_result = -1; From 9861a6fa28a5d0221c236d57b6640adfcc0e78e6 Mon Sep 17 
00:00:00 2001 From: Roy Shilkrot Date: Thu, 29 Aug 2024 16:45:19 -0400 Subject: [PATCH 11/18] feat: Update ICU library configuration and dependencies This commit updates the configuration and dependencies of the ICU library. The `BuildICU.cmake` file has been modified to use the `INSTALL_DIR` variable instead of the `ICU_INSTALL_DIR` variable for setting the ICU library paths. Additionally, the `ICU_I18N_LIBRARY` variable has been renamed to `ICU_IN_LIBRARY` for better clarity. These changes aim to improve the build process and ensure proper linking of the ICU library. --- cmake/BuildICU.cmake | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/cmake/BuildICU.cmake b/cmake/BuildICU.cmake index 6c83d57..8d34498 100644 --- a/cmake/BuildICU.cmake +++ b/cmake/BuildICU.cmake @@ -69,19 +69,19 @@ else() # Mac and Linux ) set(ICU_HASH "SHA256=94bb97d88f13bb74ec0168446a845511bd92c1c49ee8e63df646a48c38dfde6d") - set(ICU_INSTALL_DIR "${CMAKE_BINARY_DIR}/icu-install") - ExternalProject_Add( ICU URL ${ICU_URL} URL_HASH ${ICU_HASH} - CONFIGURE_COMMAND /source/runConfigureICU Linux --prefix=${ICU_INSTALL_DIR} + CONFIGURE_COMMAND /source/runConfigureICU Linux --prefix= BUILD_COMMAND make -j4 INSTALL_COMMAND make install BUILD_IN_SOURCE 1) - set(ICU_INCLUDE_DIR "${ICU_INSTALL_DIR}/include") - set(ICU_LIBRARY_DIR "${ICU_INSTALL_DIR}/lib") + ExternalProject_Get_Property(ICU INSTALL_DIR) + + set(ICU_INCLUDE_DIR "${INSTALL_DIR}/include") + set(ICU_LIBRARY_DIR "${INSTALL_DIR}/lib") # Add ICU libraries find_library( @@ -95,7 +95,7 @@ else() # Mac and Linux PATHS ${ICU_LIBRARY_DIR} NO_DEFAULT_PATH) find_library( - ICU_I18N_LIBRARY + ICU_IN_LIBRARY NAMES icui18n PATHS ${ICU_LIBRARY_DIR} NO_DEFAULT_PATH) @@ -103,10 +103,6 @@ endif() # Create an interface target for ICU add_library(ICU::ICU INTERFACE IMPORTED GLOBAL) +add_dependencies(ICU::ICU ICU) target_include_directories(ICU::ICU INTERFACE ${ICU_INCLUDE_DIR}) -target_link_libraries(ICU::ICU INTERFACE 
${ICU_DATA_LIBRARY} ${ICU_UC_LIBRARY} - $,${ICU_IN_LIBRARY},${ICU_I18N_LIBRARY}>) - -if(NOT WIN32) - add_dependencies(ICU::ICU ICU) -endif() +target_link_libraries(ICU::ICU INTERFACE ${ICU_DATA_LIBRARY} ${ICU_UC_LIBRARY} ${ICU_IN_LIBRARY}) From 851f6a5f2b62bc7ea56095de355a6243146c2235 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 29 Aug 2024 20:50:15 -0400 Subject: [PATCH 12/18] refactor: Update ICU library configuration and dependencies --- CMakeLists.txt | 2 +- cmake/BuildICU.cmake | 83 +++++++++++++++--------- src/transcription-filter-callbacks.cpp | 3 +- src/whisper-utils/whisper-processing.cpp | 2 +- 4 files changed, 57 insertions(+), 33 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e86fe5..671d9ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,7 +98,7 @@ endif() include(cmake/BuildICU.cmake) # Add ICU to the target -target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU::ICU) +target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU) target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR}) target_sources( diff --git a/cmake/BuildICU.cmake b/cmake/BuildICU.cmake index 8d34498..7da9175 100644 --- a/cmake/BuildICU.cmake +++ b/cmake/BuildICU.cmake @@ -13,11 +13,11 @@ if(WIN32) set(ICU_HASH "SHA256=7ac9c0dc6ccc1ec809c7d5689b8d831c5b8f6b11ecf70fdccc55f7ae8731ac8f") FetchContent_Declare( - ICU + ICU_build URL ${ICU_URL} URL_HASH ${ICU_HASH}) - FetchContent_MakeAvailable(ICU) + FetchContent_MakeAvailable(ICU_build) # Assuming the ZIP structure, adjust paths as necessary set(ICU_INCLUDE_DIR "${icu_SOURCE_DIR}/include") @@ -63,46 +63,69 @@ if(WIN32) install(FILES ${ICU_UC_DLL} DESTINATION "obs-plugins/64bit") install(FILES ${ICU_IN_DLL} DESTINATION "obs-plugins/64bit") -else() # Mac and Linux + add_library(ICU::ICU_data SHARED IMPORTED GLOBAL) + set_target_properties(ICU::ICU_data PROPERTIES IMPORTED_LOCATION "${ICU_DATA_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES + "${ICU_INCLUDE_DIR}") + + 
add_library(ICU::ICU_uc SHARED IMPORTED GLOBAL) + set_target_properties(ICU::ICU_uc PROPERTIES IMPORTED_LOCATION "${ICU_UC_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES + "${ICU_INCLUDE_DIR}") + + add_library(ICU::ICU_in SHARED IMPORTED GLOBAL) + set_target_properties(ICU::ICU_in PROPERTIES IMPORTED_LOCATION "${ICU_IN_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES + "${ICU_INCLUDE_DIR}") + +else() set(ICU_URL - "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_UNDERSCORE}/icu4c-${ICU_VERSION_UNDERSCORE}-src.tgz" + "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_DASH}/icu4c-${ICU_VERSION_UNDERSCORE}-src.tgz" ) - set(ICU_HASH "SHA256=94bb97d88f13bb74ec0168446a845511bd92c1c49ee8e63df646a48c38dfde6d") + set(ICU_HASH "SHA256=cb968df3e4d2e87e8b11c49a5d01c787bd13b9545280fc6642f826527618caef") + if(APPLE) + set(ICU_PLATFORM "MacOSX") + else() + set(ICU_PLATFORM "Linux") + endif() ExternalProject_Add( - ICU - URL ${ICU_URL} - URL_HASH ${ICU_HASH} - CONFIGURE_COMMAND /source/runConfigureICU Linux --prefix= + ICU_build + GIT_REPOSITORY "https://github.com/unicode-org/icu.git" + GIT_TAG "release-${ICU_VERSION_DASH}" + CONFIGURE_COMMAND /icu4c/source/runConfigureICU ${ICU_PLATFORM} --prefix= --enable-static + --disable-shared BUILD_COMMAND make -j4 + BUILD_BYPRODUCTS + /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icuuc${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icuin${CMAKE_STATIC_LIBRARY_SUFFIX} INSTALL_COMMAND make install BUILD_IN_SOURCE 1) - ExternalProject_Get_Property(ICU INSTALL_DIR) + ExternalProject_Get_Property(ICU_build INSTALL_DIR) set(ICU_INCLUDE_DIR "${INSTALL_DIR}/include") set(ICU_LIBRARY_DIR "${INSTALL_DIR}/lib") - # Add ICU libraries - find_library( - ICU_DATA_LIBRARY - NAMES icudata - PATHS ${ICU_LIBRARY_DIR} - NO_DEFAULT_PATH) - find_library( - ICU_UC_LIBRARY - NAMES icuuc - PATHS ${ICU_LIBRARY_DIR} - NO_DEFAULT_PATH) - 
find_library( - ICU_IN_LIBRARY - NAMES icui18n - PATHS ${ICU_LIBRARY_DIR} - NO_DEFAULT_PATH) + add_library(ICU::ICU_data STATIC IMPORTED GLOBAL) + add_dependencies(ICU::ICU_data ICU_build) + set(ICU_DATA_LIBRARY "${ICU_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX}") + set_target_properties(ICU::ICU_data PROPERTIES IMPORTED_LOCATION "${ICU_DATA_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES + "${ICU_INCLUDE_DIR}") + + add_library(ICU::ICU_uc STATIC IMPORTED GLOBAL) + add_dependencies(ICU::ICU_uc ICU_build) + set(ICU_UC_LIBRARY "${ICU_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}icuuc${CMAKE_STATIC_LIBRARY_SUFFIX}") + set_target_properties(ICU::ICU_uc PROPERTIES IMPORTED_LOCATION "${ICU_UC_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES + "${ICU_INCLUDE_DIR}") + + add_library(ICU::ICU_in STATIC IMPORTED GLOBAL) + add_dependencies(ICU::ICU_in ICU_build) + set(ICU_IN_LIBRARY "${ICU_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}icui18n${CMAKE_STATIC_LIBRARY_SUFFIX}") + set_target_properties(ICU::ICU_in PROPERTIES IMPORTED_LOCATION "${ICU_IN_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES + "${ICU_INCLUDE_DIR}") endif() # Create an interface target for ICU -add_library(ICU::ICU INTERFACE IMPORTED GLOBAL) -add_dependencies(ICU::ICU ICU) -target_include_directories(ICU::ICU INTERFACE ${ICU_INCLUDE_DIR}) -target_link_libraries(ICU::ICU INTERFACE ${ICU_DATA_LIBRARY} ${ICU_UC_LIBRARY} ${ICU_IN_LIBRARY}) +add_library(ICU INTERFACE) +add_dependencies(ICU ICU_build) +target_link_libraries(ICU INTERFACE ICU::ICU_data ICU::ICU_uc ICU::ICU_in) +target_include_directories(ICU SYSTEM INTERFACE ${ICU_INCLUDE_DIR}) diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index 1dc0f0e..7b8208f 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -283,7 +283,8 @@ void set_text_callback(struct transcription_filter_data *gf, // save the last subtitle if it was a full sentence 
gf->last_transcription_sentence.push_back(result.text); // remove the oldest sentence if the buffer is too long - while (gf->last_transcription_sentence.size() > gf->n_context_sentences) { + while (gf->last_transcription_sentence.size() > + (size_t)gf->n_context_sentences) { gf->last_transcription_sentence.pop_front(); } } diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index a5b3924..6da91d9 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -184,7 +184,7 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter if (gf->n_context_sentences > 0 && !gf->last_transcription_sentence.empty()) { // set the initial prompt to the last transcription sentences (concatenated) std::string initial_prompt = gf->last_transcription_sentence[0]; - for (int i = 1; i < gf->last_transcription_sentence.size(); ++i) { + for (size_t i = 1; i < gf->last_transcription_sentence.size(); ++i) { initial_prompt += " " + gf->last_transcription_sentence[i]; } gf->whisper_params.initial_prompt = initial_prompt.c_str(); From 8ef418c50a705d2a2187e8dc3f6177c406701a40 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 29 Aug 2024 22:17:47 -0400 Subject: [PATCH 13/18] refactor: Update ICU library configuration and dependencies --- CMakeLists.txt | 2 +- cmake/BuildICU.cmake | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 671d9ad..e64f45c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,7 +151,7 @@ if(ENABLE_TESTS) find_libav(${CMAKE_PROJECT_NAME}-tests) - target_link_libraries(${CMAKE_PROJECT_NAME}-tests PRIVATE ct2 sentencepiece Whispercpp Ort OBS::libobs ICU::ICU) + target_link_libraries(${CMAKE_PROJECT_NAME}-tests PRIVATE ct2 sentencepiece Whispercpp Ort OBS::libobs ICU) target_include_directories(${CMAKE_PROJECT_NAME}-tests PRIVATE src) # install the tests to the release/test directory 
diff --git a/cmake/BuildICU.cmake b/cmake/BuildICU.cmake index 7da9175..a3d45cc 100644 --- a/cmake/BuildICU.cmake +++ b/cmake/BuildICU.cmake @@ -64,17 +64,16 @@ if(WIN32) install(FILES ${ICU_IN_DLL} DESTINATION "obs-plugins/64bit") add_library(ICU::ICU_data SHARED IMPORTED GLOBAL) - set_target_properties(ICU::ICU_data PROPERTIES IMPORTED_LOCATION "${ICU_DATA_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES - "${ICU_INCLUDE_DIR}") + set_target_properties(ICU::ICU_data PROPERTIES IMPORTED_LOCATION "${ICU_DATA_LIBRARY}" IMPORTED_IMPLIB + "${ICU_DATA_LIBRARY}") add_library(ICU::ICU_uc SHARED IMPORTED GLOBAL) - set_target_properties(ICU::ICU_uc PROPERTIES IMPORTED_LOCATION "${ICU_UC_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES - "${ICU_INCLUDE_DIR}") + set_target_properties(ICU::ICU_uc PROPERTIES IMPORTED_LOCATION "${ICU_UC_LIBRARY}" IMPORTED_IMPLIB + "${ICU_UC_LIBRARY}") add_library(ICU::ICU_in SHARED IMPORTED GLOBAL) - set_target_properties(ICU::ICU_in PROPERTIES IMPORTED_LOCATION "${ICU_IN_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES - "${ICU_INCLUDE_DIR}") - + set_target_properties(ICU::ICU_in PROPERTIES IMPORTED_LOCATION "${ICU_IN_LIBRARY}" IMPORTED_IMPLIB + "${ICU_IN_LIBRARY}") else() set(ICU_URL "https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_DASH}/icu4c-${ICU_VERSION_UNDERSCORE}-src.tgz" @@ -128,4 +127,4 @@ endif() add_library(ICU INTERFACE) add_dependencies(ICU ICU_build) target_link_libraries(ICU INTERFACE ICU::ICU_data ICU::ICU_uc ICU::ICU_in) -target_include_directories(ICU SYSTEM INTERFACE ${ICU_INCLUDE_DIR}) +target_include_directories(ICU INTERFACE $) From f53151e29eefdb7a51b9b0a43d5ddd3155d90a06 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 29 Aug 2024 22:30:14 -0400 Subject: [PATCH 14/18] refactor: Update ICU library configuration and dependencies --- cmake/BuildICU.cmake | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cmake/BuildICU.cmake b/cmake/BuildICU.cmake index a3d45cc..c440557 100644 --- 
a/cmake/BuildICU.cmake +++ b/cmake/BuildICU.cmake @@ -20,26 +20,26 @@ if(WIN32) FetchContent_MakeAvailable(ICU_build) # Assuming the ZIP structure, adjust paths as necessary - set(ICU_INCLUDE_DIR "${icu_SOURCE_DIR}/include") - set(ICU_LIBRARY_DIR "${icu_SOURCE_DIR}/lib64") - set(ICU_BINARY_DIR "${icu_SOURCE_DIR}/bin64") + set(ICU_INCLUDE_DIR "${icu_build_SOURCE_DIR}/include") + set(ICU_LIBRARY_DIR "${icu_build_SOURCE_DIR}/lib64") + set(ICU_BINARY_DIR "${icu_build_SOURCE_DIR}/bin64") # Add ICU libraries find_library( ICU_DATA_LIBRARY NAMES icudt PATHS ${ICU_LIBRARY_DIR} - NO_DEFAULT_PATH) + NO_DEFAULT_PATH REQUIRED) find_library( ICU_UC_LIBRARY NAMES icuuc PATHS ${ICU_LIBRARY_DIR} - NO_DEFAULT_PATH) + NO_DEFAULT_PATH REQUIRED) find_library( ICU_IN_LIBRARY NAMES icuin PATHS ${ICU_LIBRARY_DIR} - NO_DEFAULT_PATH) + NO_DEFAULT_PATH REQUIRED) # find the dlls find_file( @@ -81,8 +81,11 @@ else() set(ICU_HASH "SHA256=cb968df3e4d2e87e8b11c49a5d01c787bd13b9545280fc6642f826527618caef") if(APPLE) set(ICU_PLATFORM "MacOSX") + set(ICU_ADDITIONAL_CONFIGURE_COMMAND --with-library-bits=64 CFLAGS=-arch\ ${TARGET_ARCH} + CXXFLAGS=-arch\ ${TARGET_ARCH} LDFLAGS=-arch\ ${TARGET_ARCH}) else() set(ICU_PLATFORM "Linux") + set(ICU_ADDITIONAL_CONFIGURE_COMMAND "") endif() ExternalProject_Add( @@ -90,7 +93,7 @@ else() GIT_REPOSITORY "https://github.com/unicode-org/icu.git" GIT_TAG "release-${ICU_VERSION_DASH}" CONFIGURE_COMMAND /icu4c/source/runConfigureICU ${ICU_PLATFORM} --prefix= --enable-static - --disable-shared + --disable-shared --disable-debug --enable-release ${ICU_ADDITIONAL_CONFIGURE_COMMAND} BUILD_COMMAND make -j4 BUILD_BYPRODUCTS /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX} @@ -127,4 +130,4 @@ endif() add_library(ICU INTERFACE) add_dependencies(ICU ICU_build) target_link_libraries(ICU INTERFACE ICU::ICU_data ICU::ICU_uc ICU::ICU_in) -target_include_directories(ICU INTERFACE $) +target_include_directories(ICU SYSTEM INTERFACE $) From 
439e0ed3f1fe9145f5224bbf5a2e5ed13f9a8909 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 29 Aug 2024 23:23:13 -0400 Subject: [PATCH 15/18] refactor: Update ICU library configuration and dependencies --- cmake/BuildICU.cmake | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cmake/BuildICU.cmake b/cmake/BuildICU.cmake index c440557..567d8f2 100644 --- a/cmake/BuildICU.cmake +++ b/cmake/BuildICU.cmake @@ -81,19 +81,20 @@ else() set(ICU_HASH "SHA256=cb968df3e4d2e87e8b11c49a5d01c787bd13b9545280fc6642f826527618caef") if(APPLE) set(ICU_PLATFORM "MacOSX") - set(ICU_ADDITIONAL_CONFIGURE_COMMAND --with-library-bits=64 CFLAGS=-arch\ ${TARGET_ARCH} - CXXFLAGS=-arch\ ${TARGET_ARCH} LDFLAGS=-arch\ ${TARGET_ARCH}) + set(TARGET_ARCH -arch\ $ENV{MACOS_ARCH}) + set(ICU_ADDITIONAL_CONFIGURE_COMMAND_PREFIX CFLAGS=${TARGET_ARCH} CXXFLAGS=${TARGET_ARCH} LDFLAGS=${TARGET_ARCH}) else() set(ICU_PLATFORM "Linux") - set(ICU_ADDITIONAL_CONFIGURE_COMMAND "") + set(ICU_ADDITIONAL_CONFIGURE_COMMAND_PREFIX "A=A") endif() ExternalProject_Add( ICU_build GIT_REPOSITORY "https://github.com/unicode-org/icu.git" GIT_TAG "release-${ICU_VERSION_DASH}" - CONFIGURE_COMMAND /icu4c/source/runConfigureICU ${ICU_PLATFORM} --prefix= --enable-static - --disable-shared --disable-debug --enable-release ${ICU_ADDITIONAL_CONFIGURE_COMMAND} + CONFIGURE_COMMAND + ${CMAKE_COMMAND} -E env ${ICU_ADDITIONAL_CONFIGURE_COMMAND_PREFIX} /icu4c/source/runConfigureICU + ${ICU_PLATFORM} --prefix= --enable-static --disable-shared BUILD_COMMAND make -j4 BUILD_BYPRODUCTS /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX} From ca54053588b167d0feb7623e2c1c49ffae0bc64d Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 30 Aug 2024 11:54:05 -0400 Subject: [PATCH 16/18] refactor: Update ICU library configuration and dependencies --- cmake/BuildICU.cmake | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/BuildICU.cmake b/cmake/BuildICU.cmake 
index 567d8f2..77a5173 100644 --- a/cmake/BuildICU.cmake +++ b/cmake/BuildICU.cmake @@ -82,19 +82,19 @@ else() if(APPLE) set(ICU_PLATFORM "MacOSX") set(TARGET_ARCH -arch\ $ENV{MACOS_ARCH}) - set(ICU_ADDITIONAL_CONFIGURE_COMMAND_PREFIX CFLAGS=${TARGET_ARCH} CXXFLAGS=${TARGET_ARCH} LDFLAGS=${TARGET_ARCH}) + set(ICU_BUILD_ENV_VARS CFLAGS=${TARGET_ARCH} CXXFLAGS=${TARGET_ARCH} LDFLAGS=${TARGET_ARCH}) else() set(ICU_PLATFORM "Linux") - set(ICU_ADDITIONAL_CONFIGURE_COMMAND_PREFIX "A=A") + set(ICU_BUILD_ENV_VARS "A=A") endif() ExternalProject_Add( ICU_build + DOWNLOAD_EXTRACT_TIMESTAMP true GIT_REPOSITORY "https://github.com/unicode-org/icu.git" GIT_TAG "release-${ICU_VERSION_DASH}" - CONFIGURE_COMMAND - ${CMAKE_COMMAND} -E env ${ICU_ADDITIONAL_CONFIGURE_COMMAND_PREFIX} /icu4c/source/runConfigureICU - ${ICU_PLATFORM} --prefix= --enable-static --disable-shared + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${ICU_BUILD_ENV_VARS} /icu4c/source/runConfigureICU + ${ICU_PLATFORM} --prefix= --enable-static --disable-shared BUILD_COMMAND make -j4 BUILD_BYPRODUCTS /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX} From 1db1764d942572682d4d5afcc9d77de8f144e0f6 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 30 Aug 2024 12:16:09 -0400 Subject: [PATCH 17/18] refactor: Update ICU library configuration and dependencies --- cmake/BuildICU.cmake | 101 +++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 67 deletions(-) diff --git a/cmake/BuildICU.cmake b/cmake/BuildICU.cmake index 77a5173..a46e722 100644 --- a/cmake/BuildICU.cmake +++ b/cmake/BuildICU.cmake @@ -24,56 +24,29 @@ if(WIN32) set(ICU_LIBRARY_DIR "${icu_build_SOURCE_DIR}/lib64") set(ICU_BINARY_DIR "${icu_build_SOURCE_DIR}/bin64") - # Add ICU libraries - find_library( - ICU_DATA_LIBRARY - NAMES icudt - PATHS ${ICU_LIBRARY_DIR} - NO_DEFAULT_PATH REQUIRED) - find_library( - ICU_UC_LIBRARY - NAMES icuuc - PATHS ${ICU_LIBRARY_DIR} - NO_DEFAULT_PATH REQUIRED) - find_library( - 
ICU_IN_LIBRARY - NAMES icuin - PATHS ${ICU_LIBRARY_DIR} - NO_DEFAULT_PATH REQUIRED) - - # find the dlls - find_file( - ICU_DATA_DLL - NAMES icudt${ICU_VERSION_NO_MINOR}.dll - PATHS ${ICU_BINARY_DIR} - NO_DEFAULT_PATH) - find_file( - ICU_UC_DLL - NAMES icuuc${ICU_VERSION_NO_MINOR}.dll - PATHS ${ICU_BINARY_DIR} - NO_DEFAULT_PATH) - find_file( - ICU_IN_DLL - NAMES icuin${ICU_VERSION_NO_MINOR}.dll - PATHS ${ICU_BINARY_DIR} - NO_DEFAULT_PATH) - - # Copy the DLLs to the output directory - install(FILES ${ICU_DATA_DLL} DESTINATION "obs-plugins/64bit") - install(FILES ${ICU_UC_DLL} DESTINATION "obs-plugins/64bit") - install(FILES ${ICU_IN_DLL} DESTINATION "obs-plugins/64bit") - - add_library(ICU::ICU_data SHARED IMPORTED GLOBAL) - set_target_properties(ICU::ICU_data PROPERTIES IMPORTED_LOCATION "${ICU_DATA_LIBRARY}" IMPORTED_IMPLIB - "${ICU_DATA_LIBRARY}") - - add_library(ICU::ICU_uc SHARED IMPORTED GLOBAL) - set_target_properties(ICU::ICU_uc PROPERTIES IMPORTED_LOCATION "${ICU_UC_LIBRARY}" IMPORTED_IMPLIB - "${ICU_UC_LIBRARY}") - - add_library(ICU::ICU_in SHARED IMPORTED GLOBAL) - set_target_properties(ICU::ICU_in PROPERTIES IMPORTED_LOCATION "${ICU_IN_LIBRARY}" IMPORTED_IMPLIB - "${ICU_IN_LIBRARY}") + # Define the library names + set(ICU_LIBRARIES icudt icuuc icuin) + + foreach(lib ${ICU_LIBRARIES}) + # Add ICU library + find_library( + ICU_LIB_${lib} + NAMES ${lib} + PATHS ${ICU_LIBRARY_DIR} + NO_DEFAULT_PATH REQUIRED) + # find the dll + find_file( + ICU_DLL_${lib} + NAMES ${lib}${ICU_VERSION_NO_MINOR}.dll + PATHS ${ICU_BINARY_DIR} + NO_DEFAULT_PATH) + # Copy the DLLs to the output directory + install(FILES ${ICU_DLL_${lib}} DESTINATION "obs-plugins/64bit") + # add the library + add_library(ICU::${lib} SHARED IMPORTED GLOBAL) + set_target_properties(ICU::${lib} PROPERTIES IMPORTED_LOCATION "${ICU_LIB_${lib}}" IMPORTED_IMPLIB + "${ICU_LIB_${lib}}") + endforeach() else() set(ICU_URL 
"https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_DASH}/icu4c-${ICU_VERSION_UNDERSCORE}-src.tgz" @@ -108,27 +81,21 @@ else() set(ICU_INCLUDE_DIR "${INSTALL_DIR}/include") set(ICU_LIBRARY_DIR "${INSTALL_DIR}/lib") - add_library(ICU::ICU_data STATIC IMPORTED GLOBAL) - add_dependencies(ICU::ICU_data ICU_build) - set(ICU_DATA_LIBRARY "${ICU_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX}") - set_target_properties(ICU::ICU_data PROPERTIES IMPORTED_LOCATION "${ICU_DATA_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES - "${ICU_INCLUDE_DIR}") - - add_library(ICU::ICU_uc STATIC IMPORTED GLOBAL) - add_dependencies(ICU::ICU_uc ICU_build) - set(ICU_UC_LIBRARY "${ICU_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}icuuc${CMAKE_STATIC_LIBRARY_SUFFIX}") - set_target_properties(ICU::ICU_uc PROPERTIES IMPORTED_LOCATION "${ICU_UC_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES - "${ICU_INCLUDE_DIR}") + set(ICU_LIBRARIES icudata icuuc icui18n) - add_library(ICU::ICU_in STATIC IMPORTED GLOBAL) - add_dependencies(ICU::ICU_in ICU_build) - set(ICU_IN_LIBRARY "${ICU_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}icui18n${CMAKE_STATIC_LIBRARY_SUFFIX}") - set_target_properties(ICU::ICU_in PROPERTIES IMPORTED_LOCATION "${ICU_IN_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES - "${ICU_INCLUDE_DIR}") + foreach(lib ${ICU_LIBRARIES}) + add_library(ICU::${lib} STATIC IMPORTED GLOBAL) + add_dependencies(ICU::${lib} ICU_build) + set(ICU_LIBRARY "${ICU_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${lib}${CMAKE_STATIC_LIBRARY_SUFFIX}") + set_target_properties(ICU::${lib} PROPERTIES IMPORTED_LOCATION "${ICU_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES + "${ICU_INCLUDE_DIR}") + endforeach(lib ${ICU_LIBRARIES}) endif() # Create an interface target for ICU add_library(ICU INTERFACE) add_dependencies(ICU ICU_build) -target_link_libraries(ICU INTERFACE ICU::ICU_data ICU::ICU_uc ICU::ICU_in) +foreach(lib ${ICU_LIBRARIES}) + target_link_libraries(ICU INTERFACE ICU::${lib}) +endforeach() 
target_include_directories(ICU SYSTEM INTERFACE $) From 6f994e5cda2662da5ba9dfade9cf748d77c125a4 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 30 Aug 2024 12:46:12 -0400 Subject: [PATCH 18/18] refactor: Update ICU library configuration and dependencies This commit updates the `BuildICU.cmake` file to set the `CFLAGS`, `CXXFLAGS`, and `LDFLAGS` environment variables to `-fPIC` for Linux platforms. This change aims to ensure that the ICU library is built with position-independent code, improving compatibility and security. Additionally, the `icuin` library has been renamed to `icui18n` to align with the naming convention. These updates enhance the build process and maintain consistency in the ICU library configuration. --- cmake/BuildICU.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/BuildICU.cmake b/cmake/BuildICU.cmake index a46e722..a3c575d 100644 --- a/cmake/BuildICU.cmake +++ b/cmake/BuildICU.cmake @@ -58,7 +58,7 @@ else() set(ICU_BUILD_ENV_VARS CFLAGS=${TARGET_ARCH} CXXFLAGS=${TARGET_ARCH} LDFLAGS=${TARGET_ARCH}) else() set(ICU_PLATFORM "Linux") - set(ICU_BUILD_ENV_VARS "A=A") + set(ICU_BUILD_ENV_VARS CFLAGS=-fPIC CXXFLAGS=-fPIC LDFLAGS=-fPIC) endif() ExternalProject_Add( @@ -72,7 +72,7 @@ else() BUILD_BYPRODUCTS /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX} /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icuuc${CMAKE_STATIC_LIBRARY_SUFFIX} - /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icuin${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/${CMAKE_STATIC_LIBRARY_PREFIX}icui18n${CMAKE_STATIC_LIBRARY_SUFFIX} INSTALL_COMMAND make install BUILD_IN_SOURCE 1)