From 182a0d0b6b7d2e02d17d3442b8d96b29ce5ea42f Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 18 Apr 2024 00:41:23 -0400 Subject: [PATCH 1/6] Bump whisper, clblast, add buffered output --- .github/workflows/build-project.yaml | 2 +- .github/workflows/push.yaml | 1 + CMakeLists.txt | 3 +- cmake/BuildCTranslate2.cmake | 4 +- cmake/BuildWhispercpp.cmake | 28 ++- src/captions-thread.h | 127 ++++++++++ src/model-utils/model-infos.cpp | 16 +- src/transcription-filter-data.h | 4 + src/transcription-filter.cpp | 281 +++++++++++++++-------- src/utils.cpp | 21 ++ src/utils.h | 9 + src/whisper-utils/whisper-language.h | 200 ++++++++-------- src/whisper-utils/whisper-processing.cpp | 26 ++- src/whisper-utils/whisper-processing.h | 2 +- 14 files changed, 495 insertions(+), 229 deletions(-) create mode 100644 src/captions-thread.h create mode 100644 src/utils.cpp create mode 100644 src/utils.h diff --git a/.github/workflows/build-project.yaml b/.github/workflows/build-project.yaml index 21df812..addbd69 100644 --- a/.github/workflows/build-project.yaml +++ b/.github/workflows/build-project.yaml @@ -246,7 +246,7 @@ jobs: needs: check-event strategy: matrix: - cublas: [cpu, 12.2.0, 11.8.0] + cublas: [cpu, clblast, 12.2.0, 11.8.0] defaults: run: shell: pwsh diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml index e71de58..27a98e6 100644 --- a/.github/workflows/push.yaml +++ b/.github/workflows/push.yaml @@ -76,6 +76,7 @@ jobs: variants=( 'windows-x64-cpu;zip|exe' + 'windows-x64-clblast;zip|exe' 'windows-x64-11.8.0;zip|exe' 'windows-x64-12.2.0;zip|exe' 'macos-arm64;tar.xz|pkg' diff --git a/CMakeLists.txt b/CMakeLists.txt index c4f6946..c253488 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,6 +92,7 @@ target_sources( src/whisper-utils/whisper-processing.cpp src/whisper-utils/whisper-utils.cpp src/whisper-utils/silero-vad-onnx.cpp - src/translation/translation.cpp) + src/translation/translation.cpp + src/utils.cpp) 
set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name}) diff --git a/cmake/BuildCTranslate2.cmake b/cmake/BuildCTranslate2.cmake index df48fdc..41dcc1c 100644 --- a/cmake/BuildCTranslate2.cmake +++ b/cmake/BuildCTranslate2.cmake @@ -21,10 +21,10 @@ elseif(WIN32) # check CPU_OR_CUDA environment variable if(NOT DEFINED ENV{CPU_OR_CUDA}) - message(FATAL_ERROR "Please set the CPU_OR_CUDA environment variable to either CPU or CUDA") + message(FATAL_ERROR "Please set the CPU_OR_CUDA environment variable to either `cpu`, `clblast`, `12.2.0` or `11.8.0`") endif() - if($ENV{CPU_OR_CUDA} STREQUAL "cpu") + if($ENV{CPU_OR_CUDA} STREQUAL "cpu" OR $ENV{CPU_OR_CUDA} STREQUAL "clblast") FetchContent_Declare( ctranslate2_fetch URL https://github.com/occ-ai/obs-ai-ctranslate2-dep/releases/download/1.2.0/libctranslate2-windows-4.1.1-Release-cpu.zip diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index 6c78e61..50ad450 100644 --- a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -14,12 +14,12 @@ if(APPLE) endif(NOT DEFINED ENV{MACOS_ARCH}) set(WHISPER_CPP_URL - "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/0.0.1/whispercpp-macos-$ENV{MACOS_ARCH}-0.0.1.tar.gz" + "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/0.0.2/whispercpp-macos-$ENV{MACOS_ARCH}-0.0.2.tar.gz" ) if($ENV{MACOS_ARCH} STREQUAL "x86_64") - set(WHISPER_CPP_HASH "36F39F02F999AAF157EAD3460DD00C8BDAA3D6C4A769A9E4F64E327871B4B11F") + set(WHISPER_CPP_HASH "00C308AF0BFFF7619934403A8080CC9AFC4EDAA328D7587E617150A2C6A33313") elseif($ENV{MACOS_ARCH} STREQUAL "arm64") - set(WHISPER_CPP_HASH "6AF7BB904B03B6208B4281D44465B727FB608A32CABD1394B727937C5F4828A1") + set(WHISPER_CPP_HASH "0478E2079E07FA81BEE77506101003F4A4C8F0DF9E23757BD7E1D25DCBD1DB30") else() message( FATAL_ERROR @@ -45,24 +45,30 @@ elseif(WIN32) if(NOT DEFINED ENV{CPU_OR_CUDA}) message( FATAL_ERROR - "The CPU_OR_CUDA environment variable is not set. 
Please set it to either `cpu` or `11.8.0` or `12.2.0`") + "The CPU_OR_CUDA environment variable is not set. Please set it to either `cpu`, `clblast` or `11.8.0` or `12.2.0`") endif(NOT DEFINED ENV{CPU_OR_CUDA}) - set(CUDA_PREFIX $ENV{CPU_OR_CUDA}) - if(NOT $ENV{CPU_OR_CUDA} STREQUAL "cpu") - set(CUDA_PREFIX "cuda$ENV{CPU_OR_CUDA}") + set(ARCH_PREFIX $ENV{CPU_OR_CUDA}) + if(NOT $ENV{CPU_OR_CUDA} STREQUAL "cpu" AND NOT $ENV{CPU_OR_CUDA} STREQUAL "clblast") + set(ARCH_PREFIX "cuda$ENV{CPU_OR_CUDA}") add_compile_definitions("LOCALVOCAL_WITH_CUDA") + elseif($ENV{CPU_OR_CUDA} STREQUAL "cpu") + add_compile_definitions("LOCALVOCAL_WITH_CPU") + else() + add_compile_definitions("LOCALVOCAL_WITH_CLBLAST") endif() set(WHISPER_CPP_URL - "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/0.0.1/whispercpp-windows-${CUDA_PREFIX}-0.0.1.zip" + "https://github.com/occ-ai/occ-ai-dep-whispercpp/releases/download/0.0.2/whispercpp-windows-${ARCH_PREFIX}-0.0.2.zip" ) if($ENV{CPU_OR_CUDA} STREQUAL "cpu") - set(WHISPER_CPP_HASH "5261FCCD18BA52AE7ECD37617452F0514238FAB4B12713F1FCA491F4ABA170AA") + set(WHISPER_CPP_HASH "6DE628A51B9352624A1EC397231591FA3370E6BB42D9364F4F91F11DD18F77D2") + elseif($ENV{CPU_OR_CUDA} STREQUAL "clblast") + set(WHISPER_CPP_HASH "97BF58520F1818B7C9F4E996197F3097934E5E0BBA92B0B016C6B28BE9FF1642") elseif($ENV{CPU_OR_CUDA} STREQUAL "12.2.0") - set(WHISPER_CPP_HASH "1966A6C7347FCB9529140F8097AED306F31C6DDE328836FD6498A980E20B8E6C") + set(WHISPER_CPP_HASH "48C059A3364E0AAD9FB0D4194BA554865928D22A27ECE5E3C116DC672D5D6EDE") elseif($ENV{CPU_OR_CUDA} STREQUAL "11.8.0") - set(WHISPER_CPP_HASH "172F4021E888A89A694373AE0888C04DB99BC11F3A2633270248E03AF5AC762E") + set(WHISPER_CPP_HASH "29A5530E83896DE207F0199535CBBB24DF0D63B1373BA66139AD240BA67120EB") else() message( FATAL_ERROR diff --git a/src/captions-thread.h b/src/captions-thread.h new file mode 100644 index 0000000..5ebf17b --- /dev/null +++ b/src/captions-thread.h @@ -0,0 +1,127 @@ +#ifndef 
CAPTIONS_THREAD_H +#define CAPTIONS_THREAD_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "plugin-support.h" + +class CaptionMonitor { +public: + // default constructor + CaptionMonitor() = default; + + ~CaptionMonitor() + { + { + std::lock_guard lock(queueMutex); + stop = true; + } + condVar.notify_all(); + workerThread.join(); + } + + void initialize(std::function callback, size_t maxSize, + std::chrono::seconds maxTime) + { + obs_log(LOG_INFO, "CaptionMonitor::initialize"); + this->callback = callback; + this->maxSize = maxSize; + this->maxTime = maxTime; + this->initialized = true; + this->workerThread = std::thread(&CaptionMonitor::monitor, this); + } + + void addWords(const std::vector &words) + { + { + std::lock_guard lock(queueMutex); + for (const auto &word : words) { + wordQueue.push_back(word); + } + this->newDataAvailable = true; + } + obs_log(LOG_INFO, "CaptionMonitor::addWords: number of words in queue: %d", + wordQueue.size()); + condVar.notify_all(); + } + +private: + void monitor() + { + obs_log(LOG_INFO, "CaptionMonitor::monitor"); + auto startTime = std::chrono::steady_clock::now(); + while (true) { + std::unique_lock lock(this->queueMutex); + // wait for new data or stop signal + this->condVar.wait(lock, + [this] { return this->newDataAvailable || this->stop; }); + + if (this->stop) { + obs_log(LOG_INFO, "CaptionMonitor::monitor: stopping"); + break; + } + + if (this->wordQueue.empty()) { + continue; + } + + obs_log(LOG_INFO, "CaptionMonitor::monitor: wordQueue size: %d", + this->wordQueue.size()); + + // emit up to maxSize words from the wordQueue + std::vector emitted; + while (!this->wordQueue.empty() && emitted.size() <= this->maxSize) { + emitted.push_back(this->wordQueue.front()); + this->wordQueue.pop_front(); + } + // emit the caption, joining the words with a space + std::string output; + for (const auto &word : emitted) { + output += word + " "; + } + this->callback(output); 
+ // push back the words that were emitted, in reverse order + for (auto it = emitted.rbegin(); it != emitted.rend(); ++it) { + this->wordQueue.push_front(*it); + } + + if (this->wordQueue.size() >= this->maxSize || + std::chrono::steady_clock::now() - startTime >= this->maxTime) { + // flush the queue if it's full or we've reached the max time + size_t words_to_flush = + std::min(this->wordQueue.size(), this->maxSize); + obs_log(LOG_INFO, "CaptionMonitor::monitor: flushing %d words", + words_to_flush); + for (size_t i = 0; i < words_to_flush; ++i) { + wordQueue.pop_front(); + } + startTime = std::chrono::steady_clock::now(); + } + + newDataAvailable = false; + } + obs_log(LOG_INFO, "CaptionMonitor::monitor: done"); + } + + std::deque wordQueue; + std::thread workerThread; + std::mutex queueMutex; + std::condition_variable condVar; + std::function callback; + size_t maxSize; + std::chrono::seconds maxTime; + bool stop; + bool initialized = false; + bool newDataAvailable = false; +}; + +#endif // CAPTIONS_THREAD_H diff --git a/src/model-utils/model-infos.cpp b/src/model-utils/model-infos.cpp index cd00814..b7cb6b1 100644 --- a/src/model-utils/model-infos.cpp +++ b/src/model-utils/model-infos.cpp @@ -29,7 +29,7 @@ std::map models_info = {{ MODEL_TYPE_TRANSCRIPTION, {{"https://ggml.ggerganov.com/ggml-model-whisper-base-q5_1.bin", "422F1AE452ADE6F30A004D7E5C6A43195E4433BC370BF23FAC9CC591F01A8898"}}}}, - {"Whisper Base En q5 (57Mb)", + {"Whisper Base English q5 (57Mb)", {"Whisper Base En q5", "ggml-model-whisper-base-en-q5_1", MODEL_TYPE_TRANSCRIPTION, @@ -41,7 +41,7 @@ std::map models_info = {{ MODEL_TYPE_TRANSCRIPTION, {{"https://ggml.ggerganov.com/ggml-model-whisper-base.bin", "60ED5BC3DD14EEA856493D334349B405782DDCAF0028D4B5DF4088345FBA2EFE"}}}}, - {"Whisper Base En (141Mb)", + {"Whisper Base English (141Mb)", {"Whisper Base En", "ggml-model-whisper-base-en", MODEL_TYPE_TRANSCRIPTION, @@ -59,7 +59,7 @@ std::map models_info = {{ MODEL_TYPE_TRANSCRIPTION, 
{{"https://ggml.ggerganov.com/ggml-model-whisper-medium-q5_0.bin", "19FEA4B380C3A618EC4723C3EEF2EB785FFBA0D0538CF43F8F235E7B3B34220F"}}}}, - {"Whisper Medium En q5 (514Mb)", + {"Whisper Medium English q5 (514Mb)", {"Whisper Medium En q5", "ggml-model-whisper-medium-en-q5_0", MODEL_TYPE_TRANSCRIPTION, @@ -71,7 +71,7 @@ std::map models_info = {{ MODEL_TYPE_TRANSCRIPTION, {{"https://ggml.ggerganov.com/ggml-model-whisper-small-q5_1.bin", "AE85E4A935D7A567BD102FE55AFC16BB595BDB618E11B2FC7591BC08120411BB"}}}}, - {"Whisper Small En q5 (181Mb)", + {"Whisper Small English q5 (181Mb)", {"Whisper Small En q5", "ggml-model-whisper-small-en-q5_1", MODEL_TYPE_TRANSCRIPTION, @@ -83,7 +83,7 @@ std::map models_info = {{ MODEL_TYPE_TRANSCRIPTION, {{"https://ggml.ggerganov.com/ggml-model-whisper-small.bin", "1BE3A9B2063867B937E64E2EC7483364A79917E157FA98C5D94B5C1FFFEA987B"}}}}, - {"Whisper Small En (465Mb)", + {"Whisper Small English (465Mb)", {"Whisper Small En", "ggml-model-whisper-small-en", MODEL_TYPE_TRANSCRIPTION, @@ -101,19 +101,19 @@ std::map models_info = {{ MODEL_TYPE_TRANSCRIPTION, {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin", "818710568DA3CA15689E31A743197B520007872FF9576237BDA97BD1B469C3D7"}}}}, - {"Whisper Tiny En q5 (31Mb)", + {"Whisper Tiny English q5 (31Mb)", {"Whisper Tiny En q5", "ggml-model-whisper-tiny-en-q5_1", MODEL_TYPE_TRANSCRIPTION, {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin", "C77C5766F1CEF09B6B7D47F21B546CBDDD4157886B3B5D6D4F709E91E66C7C2B"}}}}, - {"Whisper Tiny En q8 (42Mb)", + {"Whisper Tiny English q8 (42Mb)", {"Whisper Tiny En q8", "ggml-model-whisper-tiny-en-q8_0", MODEL_TYPE_TRANSCRIPTION, {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q8_0.bin", "5BC2B3860AA151A4C6E7BB095E1FCCE7CF12C7B020CA08DCEC0C6D018BB7DD94"}}}}, - {"Whisper Tiny En (74Mb)", + {"Whisper Tiny English (74Mb)", {"Whisper Tiny En", "ggml-model-whisper-tiny-en", MODEL_TYPE_TRANSCRIPTION, diff --git a/src/transcription-filter-data.h 
b/src/transcription-filter-data.h index 0b34f5d..374ec2a 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -17,6 +17,7 @@ #include "translation/translation.h" #include "whisper-utils/silero-vad-onnx.h" +#include "captions-thread.h" #define MAX_PREPROC_CHANNELS 10 @@ -89,6 +90,7 @@ struct transcription_filter_data { bool translate = false; std::string source_lang; std::string target_lang; + bool buffered_output = false; // Last transcription result std::string last_text; @@ -113,6 +115,8 @@ struct transcription_filter_data { // translation context struct translation_context translation_ctx; + CaptionMonitor captions_monitor; + // ctor transcription_filter_data() { diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index f62fb2a..e7c28bf 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -10,6 +10,7 @@ #include "whisper-utils/whisper-utils.h" #include "translation/language_codes.h" #include "translation/translation.h" +#include "utils.h" #include #include @@ -220,6 +221,35 @@ inline bool is_valid_lead_byte(const uint8_t *c) return false; } +void send_caption_to_source(const std::string &str_copy, struct transcription_filter_data *gf) +{ + if (!gf->text_source_mutex) { + obs_log(LOG_ERROR, "text_source_mutex is null"); + return; + } + + if (!gf->text_source) { + // attempt to acquire a weak ref to the text source if it's yet available + acquire_weak_text_source_ref(gf); + } + + std::lock_guard lock(*gf->text_source_mutex); + + if (!gf->text_source) { + obs_log(gf->log_level, "text_source is null"); + return; + } + auto target = obs_weak_source_get_source(gf->text_source); + if (!target) { + obs_log(gf->log_level, "text_source target is null"); + return; + } + auto text_settings = obs_source_get_settings(target); + obs_data_set_string(text_settings, "text", str_copy.c_str()); + obs_source_update(target, text_settings); + obs_source_release(target); +} + void set_text_callback(struct 
transcription_filter_data *gf, const DetectionResultWithText &resultIn) { @@ -302,6 +332,8 @@ void set_text_callback(struct transcription_filter_data *gf, gf->last_text = str_copy; + gf->captions_monitor.addWords(split_words(str_copy)); + if (gf->caption_to_stream) { obs_output_t *streaming_output = obs_frontend_get_streaming_output(); if (streaming_output) { @@ -360,31 +392,10 @@ void set_text_callback(struct transcription_filter_data *gf, gf->sentence_number++; } } else { - if (!gf->text_source_mutex) { - obs_log(LOG_ERROR, "text_source_mutex is null"); - return; + if (!gf->buffered_output) { + // Send the caption to the text source + send_caption_to_source(str_copy, gf); } - - if (!gf->text_source) { - // attempt to acquire a weak ref to the text source if it's yet available - acquire_weak_text_source_ref(gf); - } - - std::lock_guard lock(*gf->text_source_mutex); - - if (!gf->text_source) { - obs_log(gf->log_level, "text_source is null"); - return; - } - auto target = obs_weak_source_get_source(gf->text_source); - if (!target) { - obs_log(gf->log_level, "text_source target is null"); - return; - } - auto text_settings = obs_source_get_settings(target); - obs_data_set_string(text_settings, "text", str_copy.c_str()); - obs_source_update(target, text_settings); - obs_source_release(target); } }; @@ -414,6 +425,7 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->process_while_muted = obs_data_get_bool(s, "process_while_muted"); gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration"); gf->last_sub_render_time = 0; + gf->buffered_output = obs_data_get_bool(s, "buffered_output"); bool new_translate = obs_data_get_bool(s, "translate"); gf->source_lang = obs_data_get_string(s, "translate_source_language"); @@ -552,6 +564,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) gf->rename_file_to_match_recording = obs_data_get_bool(settings, "rename_file_to_match_recording"); gf->process_while_muted = 
obs_data_get_bool(settings, "process_while_muted"); + gf->buffered_output = obs_data_get_bool(settings, "buffered_output"); for (size_t i = 0; i < gf->channels; i++) { circlebuf_init(&gf->input_buffers[i]); @@ -592,10 +605,62 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) obs_log(gf->log_level, "clear text source data"); gf->text_source = nullptr; const char *subtitle_sources = obs_data_get_string(settings, "subtitle_sources"); - if (subtitle_sources != nullptr) { - gf->text_source_name = bstrdup(subtitle_sources); + if (subtitle_sources == nullptr || strcmp(subtitle_sources, "none") == 0 || + strcmp(subtitle_sources, "(null)") == 0 || strlen(subtitle_sources) == 0) { + obs_log(LOG_INFO, "create text source"); + // check if a source called "LocalVocal Subtitles" exists + obs_source_t *source = obs_get_source_by_name("LocalVocal Subtitles"); + if (source) { + // source exists, release it + obs_source_release(source); + } else { + // create a new OBS text source called "LocalVocal Subtitles" + obs_source_t *scene_as_source = obs_frontend_get_current_scene(); + obs_scene_t *scene = obs_scene_from_source(scene_as_source); + source = obs_source_create("text_ft2_source_v2", "LocalVocal Subtitles", + nullptr, nullptr); + if (source) { + // add source to the current scene + obs_scene_add(scene, source); + // set source settings + obs_data_t *source_settings = obs_source_get_settings(source); + obs_data_set_bool(source_settings, "word_wrap", true); + obs_data_set_int(source_settings, "custom_width", 1760); + obs_data_t *font_data = obs_data_create(); + obs_data_set_string(font_data, "face", "Arial"); + obs_data_set_string(font_data, "style", "Regular"); + obs_data_set_int(font_data, "size", 72); + obs_data_set_int(font_data, "flags", 0); + obs_data_set_obj(source_settings, "font", font_data); + obs_data_release(font_data); + obs_source_update(source, source_settings); + obs_data_release(source_settings); + + // set transform settings + 
obs_transform_info transform_info; + transform_info.pos.x = 1852.0; + transform_info.pos.y = 1034.0; + transform_info.bounds.x = 1769.0; + transform_info.bounds.y = 145.0; + transform_info.bounds_type = + obs_bounds_type::OBS_BOUNDS_SCALE_INNER; + transform_info.scale.x = 1.0; + transform_info.scale.y = 1.0; + transform_info.rot = 0.0; + obs_sceneitem_t *source_sceneitem = + obs_scene_sceneitem_from_source(scene, source); + obs_sceneitem_set_info(source_sceneitem, &transform_info); + obs_sceneitem_release(source_sceneitem); + + obs_source_release(source); + } + obs_source_release(scene_as_source); + } + gf->text_source_name = bstrdup("LocalVocal Subtitles"); + obs_data_set_string(settings, "subtitle_sources", "LocalVocal Subtitles"); } else { - gf->text_source_name = nullptr; + // set the text source name + gf->text_source_name = bstrdup(subtitle_sources); } obs_log(gf->log_level, "clear paths and whisper context"); gf->whisper_model_file_currently_loaded = ""; @@ -603,6 +668,15 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) gf->whisper_model_path = std::string(""); // The update function will set the model path gf->whisper_context = nullptr; + gf->captions_monitor.initialize( + [gf](const std::string &text) { + obs_log(LOG_INFO, "Captions: %s", text.c_str()); + if (gf->buffered_output) { + send_caption_to_source(text, gf); + } + }, + 20, std::chrono::seconds(10)); + obs_log(gf->log_level, "run update"); // get the settings updated on the filter data struct transcription_filter_update(gf, settings); @@ -662,12 +736,11 @@ bool subs_output_select_changed(obs_properties_t *props, obs_property_t *propert // Show or hide the output filename selection input const char *new_output = obs_data_get_string(settings, "subtitle_sources"); const bool show_hide = (strcmp(new_output, "text_file") == 0); - obs_property_set_visible(obs_properties_get(props, "subtitle_output_filename"), show_hide); - obs_property_set_visible(obs_properties_get(props, 
"subtitle_save_srt"), show_hide); - obs_property_set_visible(obs_properties_get(props, "truncate_output_file"), show_hide); - obs_property_set_visible(obs_properties_get(props, "only_while_recording"), show_hide); - obs_property_set_visible(obs_properties_get(props, "rename_file_to_match_recording"), - show_hide); + for (const std::string &prop_name : + {"subtitle_output_filename", "subtitle_save_srt", "truncate_output_file", + "only_while_recording", "rename_file_to_match_recording"}) { + obs_property_set_visible(obs_properties_get(props, prop_name.c_str()), show_hide); + } return true; } @@ -691,9 +764,10 @@ void transcription_filter_defaults(obs_data_t *s) { obs_log(LOG_INFO, "filter defaults"); + obs_data_set_default_bool(s, "buffered_output", false); obs_data_set_default_bool(s, "vad_enabled", true); obs_data_set_default_int(s, "log_level", LOG_DEBUG); - obs_data_set_default_bool(s, "log_words", true); + obs_data_set_default_bool(s, "log_words", false); obs_data_set_default_bool(s, "caption_to_stream", false); obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny En (74Mb)"); obs_data_set_default_string(s, "whisper_language_select", "en"); @@ -742,76 +816,11 @@ void transcription_filter_defaults(obs_data_t *s) obs_properties_t *transcription_filter_properties(void *data) { - obs_log(LOG_DEBUG, "Add filter properties"); - struct transcription_filter_data *gf = static_cast(data); obs_properties_t *ppts = obs_properties_create(); - obs_properties_add_bool(ppts, "log_words", MT_("log_words")); - obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream")); - - obs_properties_add_int_slider(ppts, "buffer_size_msec", MT_("buffer_size_msec"), 1000, - DEFAULT_BUFFER_SIZE_MSEC, 50); - obs_properties_add_int_slider(ppts, "overlap_size_msec", MT_("overlap_size_msec"), 50, 300, - 50); - - obs_property_t *step_by_step_processing = obs_properties_add_bool( - ppts, "step_by_step_processing", MT_("step_by_step_processing")); - 
obs_properties_add_int_slider(ppts, "step_size_msec", MT_("step_size_msec"), 1000, - DEFAULT_BUFFER_SIZE_MSEC, 50); - obs_properties_add_int_slider(ppts, "min_sub_duration", MT_("min_sub_duration"), 1000, 5000, - 50); - - obs_property_set_modified_callback(step_by_step_processing, [](obs_properties_t *props, - obs_property_t *property, - obs_data_t *settings) { - UNUSED_PARAMETER(property); - // Show/Hide the step size input - obs_property_set_visible(obs_properties_get(props, "step_size_msec"), - obs_data_get_bool(settings, "step_by_step_processing")); - return true; - }); - - // add translation option group - obs_properties_t *translation_group = obs_properties_create(); - obs_property_t *translation_group_prop = obs_properties_add_group( - ppts, "translate", MT_("translate"), OBS_GROUP_CHECKABLE, translation_group); - // add target language selection - obs_property_t *prop_tgt = obs_properties_add_list( - translation_group, "translate_target_language", MT_("target_language"), - OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); - obs_property_t *prop_src = obs_properties_add_list( - translation_group, "translate_source_language", MT_("source_language"), - OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); - obs_properties_add_bool(translation_group, "translate_add_context", - MT_("translate_add_context")); - - // Populate the dropdown with the language codes - for (const auto &language : language_codes) { - obs_property_list_add_string(prop_tgt, language.second.c_str(), - language.first.c_str()); - obs_property_list_add_string(prop_src, language.second.c_str(), - language.first.c_str()); - } - - // add callback to enable/disable translation group - obs_property_set_modified_callback(translation_group_prop, [](obs_properties_t *props, - obs_property_t *property, - obs_data_t *settings) { - UNUSED_PARAMETER(property); - // Show/Hide the translation group - const bool translate_enabled = obs_data_get_bool(settings, "translate"); - for (const auto &prop : 
{"translate_target_language", "translate_source_language", - "translate_add_context"}) { - obs_property_set_visible(obs_properties_get(props, prop), - translate_enabled); - } - return true; - }); - - obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted")); obs_property_t *subs_output = obs_properties_add_list(ppts, "subtitle_sources", MT_("subtitle_sources"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); @@ -885,6 +894,43 @@ obs_properties_t *transcription_filter_properties(void *data) return true; }); + // add translation option group + obs_properties_t *translation_group = obs_properties_create(); + obs_property_t *translation_group_prop = obs_properties_add_group( + ppts, "translate", MT_("translate"), OBS_GROUP_CHECKABLE, translation_group); + // add target language selection + obs_property_t *prop_tgt = obs_properties_add_list( + translation_group, "translate_target_language", MT_("target_language"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + obs_property_t *prop_src = obs_properties_add_list( + translation_group, "translate_source_language", MT_("source_language"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + obs_properties_add_bool(translation_group, "translate_add_context", + MT_("translate_add_context")); + + // Populate the dropdown with the language codes + for (const auto &language : language_codes) { + obs_property_list_add_string(prop_tgt, language.second.c_str(), + language.first.c_str()); + obs_property_list_add_string(prop_src, language.second.c_str(), + language.first.c_str()); + } + + // add callback to enable/disable translation group + obs_property_set_modified_callback(translation_group_prop, [](obs_properties_t *props, + obs_property_t *property, + obs_data_t *settings) { + UNUSED_PARAMETER(property); + // Show/Hide the translation group + const bool translate_enabled = obs_data_get_bool(settings, "translate"); + for (const auto &prop : {"translate_target_language", "translate_source_language", + 
"translate_add_context"}) { + obs_property_set_visible(obs_properties_get(props, prop), + translate_enabled); + } + return true; + }); + obs_property_t *advanced_settings_prop = obs_properties_add_bool(ppts, "advanced_settings", MT_("advanced_settings")); obs_property_set_modified_callback(advanced_settings_prop, [](obs_properties_t *props, @@ -893,11 +939,44 @@ obs_properties_t *transcription_filter_properties(void *data) UNUSED_PARAMETER(property); // If advanced settings is enabled, show the advanced settings group const bool show_hide = obs_data_get_bool(settings, "advanced_settings"); - obs_property_set_visible(obs_properties_get(props, "whisper_params_group"), - show_hide); + for (const std::string &prop_name : + {"whisper_params_group", "log_words", "caption_to_stream", "buffer_size_msec", + "overlap_size_msec", "step_by_step_processing", "min_sub_duration", + "process_while_muted", "buffered_output"}) { + obs_property_set_visible(obs_properties_get(props, prop_name.c_str()), + show_hide); + } return true; }); + obs_properties_add_bool(ppts, "buffered_output", MT_("buffered_output")); + obs_properties_add_bool(ppts, "log_words", MT_("log_words")); + obs_properties_add_bool(ppts, "caption_to_stream", MT_("caption_to_stream")); + + obs_properties_add_int_slider(ppts, "buffer_size_msec", MT_("buffer_size_msec"), 1000, + DEFAULT_BUFFER_SIZE_MSEC, 250); + obs_properties_add_int_slider(ppts, "overlap_size_msec", MT_("overlap_size_msec"), 50, 300, + 50); + + obs_property_t *step_by_step_processing = obs_properties_add_bool( + ppts, "step_by_step_processing", MT_("step_by_step_processing")); + obs_properties_add_int_slider(ppts, "step_size_msec", MT_("step_size_msec"), 1000, + DEFAULT_BUFFER_SIZE_MSEC, 50); + obs_properties_add_int_slider(ppts, "min_sub_duration", MT_("min_sub_duration"), 1000, 5000, + 50); + + obs_property_set_modified_callback(step_by_step_processing, [](obs_properties_t *props, + obs_property_t *property, + obs_data_t *settings) { + 
/**
 * Splits a string into words separated by whitespace.
 *
 * Runs of whitespace are treated as one separator, so the result never
 * contains empty strings. Bytes are compared one at a time; multi-byte UTF-8
 * sequences are kept intact as part of the surrounding word.
 *
 * @param str_copy  text to split
 * @return the words in order of appearance; empty if there are none
 */
std::vector<std::string> split_words(const std::string &str_copy)
{
	std::vector<std::string> words;
	std::string word;
	for (const char c : str_copy) {
		// std::isspace() has undefined behaviour for negative values other
		// than EOF, which is what non-ASCII UTF-8 bytes are when char is
		// signed, so convert to unsigned char first
		if (std::isspace(static_cast<unsigned char>(c))) {
			if (!word.empty()) {
				words.push_back(word);
				word.clear();
			}
		} else {
			word += c;
		}
	}
	if (!word.empty()) {
		words.push_back(word);
	}
	return words;
}
"nl", - "dutch", + "Dutch", }, { "ar", - "arabic", + "Arabic", }, { "sv", - "swedish", + "Swedish", }, { "it", - "italian", + "Italian", }, { "id", - "indonesian", + "Indonesian", }, { "hi", - "hindi", + "Hindi", }, { "fi", - "finnish", + "Finnish", }, { "vi", - "vietnamese", + "Vietnamese", }, { "he", - "hebrew", + "Hebrew", }, { "uk", - "ukrainian", + "Ukrainian", }, { "el", - "greek", + "Greek", }, { "ms", - "malay", + "Malay", }, { "cs", - "czech", + "Czech", }, { "ro", - "romanian", + "Romanian", }, { "da", - "danish", + "Danish", }, { "hu", - "hungarian", + "Hungarian", }, { "ta", - "tamil", + "Tamil", }, { "no", - "norwegian", + "Norwegian", }, { "th", - "thai", + "Thai", }, { "ur", - "urdu", + "Urdu", }, { "hr", - "croatian", + "Croatian", }, { "bg", - "bulgarian", + "Bulgarian", }, { "lt", - "lithuanian", + "Lithuanian", }, { "la", - "latin", + "Latin", }, { "mi", - "maori", + "Maori", }, { "ml", - "malayalam", + "Malayalam", }, { "cy", - "welsh", + "Welsh", }, { "sk", - "slovak", + "Slovak", }, { "te", - "telugu", + "Telugu", }, { "fa", - "persian", + "Persian", }, { "lv", - "latvian", + "Latvian", }, { "bn", - "bengali", + "Bengali", }, { "sr", - "serbian", + "Serbian", }, { "az", - "azerbaijani", + "Azerbaijani", }, { "sl", - "slovenian", + "Slovenian", }, { "kn", - "kannada", + "Kannada", }, { "et", - "estonian", + "Estonian", }, { "mk", - "macedonian", + "Macedonian", }, { "br", - "breton", + "Breton", }, { "eu", - "basque", + "Basque", }, { "is", - "icelandic", + "Icelandic", }, { "hy", - "armenian", + "Armenian", }, { "ne", - "nepali", + "Nepali", }, { "mn", - "mongolian", + "Mongolian", }, { "bs", - "bosnian", + "Bosnian", }, { "kk", - "kazakh", + "Kazakh", }, { "sq", - "albanian", + "Albanian", }, { "sw", - "swahili", + "Swahili", }, { "gl", - "galician", + "Galician", }, { "mr", - "marathi", + "Marathi", }, { "pa", - "punjabi", + "Punjabi", }, { "si", - "sinhala", + "Sinhala", }, { "km", - "khmer", + "Khmer", }, { "sn", - "shona", + "Shona", }, { 
"yo", - "yoruba", + "Yoruba", }, { "so", - "somali", + "Somali", }, { "af", - "afrikaans", + "Afrikaans", }, { "oc", - "occitan", + "Occitan", }, { "ka", - "georgian", + "Georgian", }, { "be", - "belarusian", + "Belarusian", }, { "tg", - "tajik", + "Tajik", }, { "sd", - "sindhi", + "Sindhi", }, { "gu", - "gujarati", + "Gujarati", }, { "am", - "amharic", + "Amharic", }, { "yi", - "yiddish", + "Yiddish", }, { "lo", - "lao", + "Lao", }, { "uz", - "uzbek", + "Uzbek", }, { "fo", - "faroese", + "Faroese", }, { "ht", - "haitian", + "Haitian", }, { "ps", - "pashto", + "Pashto", }, { "tk", - "turkmen", + "Turkmen", }, { "nn", - "nynorsk", + "Nynorsk", }, { "mt", - "maltese", + "Maltese", }, { "sa", - "sanskrit", + "Sanskrit", }, { "lb", - "luxembourgish", + "Luxembourgish", }, { "my", - "myanmar", + "Myanmar", }, { "bo", - "tibetan", + "Tibetan", }, { "tl", - "tagalog", + "Tagalog", }, { "mg", - "malagasy", + "Malagasy", }, { "as", - "assamese", + "Assamese", }, { "tt", - "tatar", + "Tatar", }, { "haw", - "hawaiian", + "Hawaiian", }, { "ln", - "lingala", + "Lingala", }, { "ha", - "hausa", + "Hausa", }, { "ba", - "bashkir", + "Bashkir", }, { "jw", - "javanese", + "Javanese", }, { "su", - "sundanese", + "Sundanese", }, }; diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 3fb8944..807cb54 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -132,14 +132,21 @@ struct whisper_context *init_whisper_context(const std::string &model_path_in) [](enum ggml_log_level level, const char *text, void *user_data) { UNUSED_PARAMETER(level); UNUSED_PARAMETER(user_data); - obs_log(LOG_INFO, "Whisper: %s", text); + // remove trailing newline + char *text_copy = bstrdup(text); + text_copy[strcspn(text_copy, "\n")] = 0; + obs_log(LOG_INFO, "Whisper: %s", text_copy); + bfree(text_copy); }, nullptr); struct whisper_context_params cparams = whisper_context_default_params(); #ifdef 
LOCALVOCAL_WITH_CUDA cparams.use_gpu = true; - obs_log(LOG_INFO, "Using GPU for inference, device %d", cparams.gpu_device); + obs_log(LOG_INFO, "Using CUDA GPU for inference, device %d", cparams.gpu_device); +#elif defined(LOCALVOCAL_WITH_CLBLAST) + cparams.use_gpu = true; + obs_log(LOG_INFO, "Using OpenCL for inference"); #else cparams.use_gpu = false; obs_log(LOG_INFO, "Using CPU for inference"); @@ -191,6 +198,16 @@ struct whisper_context *init_whisper_context(const std::string &model_path_in) struct DetectionResultWithText run_whisper_inference(struct transcription_filter_data *gf, const float *pcm32f_data, size_t pcm32f_size) { + if (gf == nullptr) { + obs_log(LOG_ERROR, "run_whisper_inference: gf is null"); + return {DETECTION_RESULT_UNKNOWN, "", 0, 0}; + } + + if (pcm32f_data == nullptr || pcm32f_size == 0) { + obs_log(LOG_ERROR, "run_whisper_inference: pcm32f_data is null or size is 0"); + return {DETECTION_RESULT_UNKNOWN, "", 0, 0}; + } + obs_log(gf->log_level, "%s: processing %d samples, %.3f sec, %d threads", __func__, int(pcm32f_size), float(pcm32f_size) / WHISPER_SAMPLE_RATE, gf->whisper_params.n_threads); @@ -201,8 +218,6 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter return {DETECTION_RESULT_UNKNOWN, "", 0, 0}; } - // set duration in ms - const uint64_t duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE); // Get the duration in ms since the beginning of the stream (gf->start_timestamp_ms) const uint64_t offset_ms = (uint64_t)(std::chrono::duration_cast( @@ -226,6 +241,9 @@ struct DetectionResultWithText run_whisper_inference(struct transcription_filter obs_log(LOG_WARNING, "failed to process audio, error %d", whisper_full_result); return {DETECTION_RESULT_UNKNOWN, "", 0, 0}; } else { + // duration in ms + const uint64_t duration_ms = (uint64_t)(pcm32f_size * 1000 / WHISPER_SAMPLE_RATE); + const int n_segment = 0; const char *text = whisper_full_get_segment_text(gf->whisper_context, n_segment); const 
int64_t t0 = offset_ms; diff --git a/src/whisper-utils/whisper-processing.h b/src/whisper-utils/whisper-processing.h index edc8a66..3e189fe 100644 --- a/src/whisper-utils/whisper-processing.h +++ b/src/whisper-utils/whisper-processing.h @@ -2,7 +2,7 @@ #define WHISPER_PROCESSING_H // buffer size in msec -#define DEFAULT_BUFFER_SIZE_MSEC 3000 +#define DEFAULT_BUFFER_SIZE_MSEC 2000 // overlap in msec #define DEFAULT_OVERLAP_SIZE_MSEC 100 From be6f027c54e116d5aa3ffb6df68282c9765edcf9 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 18 Apr 2024 00:42:51 -0400 Subject: [PATCH 2/6] Update CPU_OR_CUDA environment variable error messages --- cmake/BuildCTranslate2.cmake | 3 ++- cmake/BuildWhispercpp.cmake | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/BuildCTranslate2.cmake b/cmake/BuildCTranslate2.cmake index 41dcc1c..c0a4eac 100644 --- a/cmake/BuildCTranslate2.cmake +++ b/cmake/BuildCTranslate2.cmake @@ -21,7 +21,8 @@ elseif(WIN32) # check CPU_OR_CUDA environment variable if(NOT DEFINED ENV{CPU_OR_CUDA}) - message(FATAL_ERROR "Please set the CPU_OR_CUDA environment variable to either `cpu`, `clblast`, `12.2.0` or `11.8.0`") + message( + FATAL_ERROR "Please set the CPU_OR_CUDA environment variable to either `cpu`, `clblast`, `12.2.0` or `11.8.0`") endif() if($ENV{CPU_OR_CUDA} STREQUAL "cpu" OR $ENV{CPU_OR_CUDA} STREQUAL "clblast") diff --git a/cmake/BuildWhispercpp.cmake b/cmake/BuildWhispercpp.cmake index 50ad450..d9b8d96 100644 --- a/cmake/BuildWhispercpp.cmake +++ b/cmake/BuildWhispercpp.cmake @@ -45,7 +45,8 @@ elseif(WIN32) if(NOT DEFINED ENV{CPU_OR_CUDA}) message( FATAL_ERROR - "The CPU_OR_CUDA environment variable is not set. Please set it to either `cpu`, `clblast` or `11.8.0` or `12.2.0`") + "The CPU_OR_CUDA environment variable is not set. 
Please set it to either `cpu`, `clblast` or `11.8.0` or `12.2.0`" + ) endif(NOT DEFINED ENV{CPU_OR_CUDA}) set(ARCH_PREFIX $ENV{CPU_OR_CUDA}) From 501a217923e5021f38b1e99573e2de44e7fb2f52 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 18 Apr 2024 00:55:13 -0400 Subject: [PATCH 3/6] Update Cublas validation in Package-Windows.ps1 and initialize function in captions-thread.h --- .github/scripts/Package-Windows.ps1 | 4 +++- src/captions-thread.h | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/scripts/Package-Windows.ps1 b/.github/scripts/Package-Windows.ps1 index cd12f90..e2c86e2 100644 --- a/.github/scripts/Package-Windows.ps1 +++ b/.github/scripts/Package-Windows.ps1 @@ -4,7 +4,7 @@ param( [string] $Target = 'x64', [ValidateSet('Debug', 'RelWithDebInfo', 'Release', 'MinSizeRel')] [string] $Configuration = 'RelWithDebInfo', - [ValidateSet('cpu', '12.2.0', '11.8.0')] + [ValidateSet('cpu', 'cblast', '12.2.0', '11.8.0')] [string] $Cublas = 'cpu', [switch] $BuildInstaller, [switch] $SkipDeps @@ -52,6 +52,8 @@ function Package { # Check if $cublas is cpu or cuda if ( $Cublas -eq 'cpu' ) { $CudaName = 'cpu' + } elseif ( $Cublas -eq 'cblast' ) { + $CudaName = 'cblast' } else { $CudaName = "cuda${Cublas}" } diff --git a/src/captions-thread.h b/src/captions-thread.h index 5ebf17b..5b15ca5 100644 --- a/src/captions-thread.h +++ b/src/captions-thread.h @@ -29,13 +29,13 @@ class CaptionMonitor { workerThread.join(); } - void initialize(std::function callback, size_t maxSize, - std::chrono::seconds maxTime) + void initialize(std::function callback_, size_t maxSize_, + std::chrono::seconds maxTime_) { obs_log(LOG_INFO, "CaptionMonitor::initialize"); - this->callback = callback; - this->maxSize = maxSize; - this->maxTime = maxTime; + this->callback = callback_; + this->maxSize = maxSize_; + this->maxTime = maxTime_; this->initialized = true; this->workerThread = std::thread(&CaptionMonitor::monitor, this); } From 
9385a0c987c8b6f7af0e46bdc0de3d6755ca9efc Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 18 Apr 2024 00:59:55 -0400 Subject: [PATCH 4/6] Update Cublas validation and fix typo in Package-Windows.ps1 --- .github/scripts/Package-Windows.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/Package-Windows.ps1 b/.github/scripts/Package-Windows.ps1 index e2c86e2..1721272 100644 --- a/.github/scripts/Package-Windows.ps1 +++ b/.github/scripts/Package-Windows.ps1 @@ -4,7 +4,7 @@ param( [string] $Target = 'x64', [ValidateSet('Debug', 'RelWithDebInfo', 'Release', 'MinSizeRel')] [string] $Configuration = 'RelWithDebInfo', - [ValidateSet('cpu', 'cblast', '12.2.0', '11.8.0')] + [ValidateSet('cpu', 'clblast', '12.2.0', '11.8.0')] [string] $Cublas = 'cpu', [switch] $BuildInstaller, [switch] $SkipDeps From 66f1afc54a55442a8df9325e607e3d32d4f45897 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 18 Apr 2024 08:57:39 -0400 Subject: [PATCH 5/6] Update default whisper model path to Whisper Tiny English (74Mb) --- src/transcription-filter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index e7c28bf..8c05743 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -769,7 +769,7 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_int(s, "log_level", LOG_DEBUG); obs_data_set_default_bool(s, "log_words", false); obs_data_set_default_bool(s, "caption_to_stream", false); - obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny En (74Mb)"); + obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny English (74Mb)"); obs_data_set_default_string(s, "whisper_language_select", "en"); obs_data_set_default_string(s, "subtitle_sources", "none"); obs_data_set_default_bool(s, "step_by_step_processing", false); From 7873def295294c92a241991af6829c35492c805b Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 
18 Apr 2024 09:21:45 -0400 Subject: [PATCH 6/6] Update translation strings for multiple locales --- data/locale/ar-SA.ini | 2 ++ data/locale/de-DE.ini | 2 ++ data/locale/en-US.ini | 4 +++- data/locale/es-ES.ini | 2 ++ data/locale/fr-FR.ini | 2 ++ data/locale/hi-IN.ini | 2 ++ data/locale/ja-JP.ini | 2 ++ data/locale/ko-KR.ini | 2 ++ data/locale/pl-PL.ini | 2 ++ data/locale/pt-BR.ini | 2 ++ data/locale/ru-RU.ini | 2 ++ data/locale/zh-CN.ini | 2 ++ src/captions-thread.h | 9 --------- src/transcription-filter.cpp | 26 +++++++++++++++----------- 14 files changed, 40 insertions(+), 21 deletions(-) diff --git a/data/locale/ar-SA.ini b/data/locale/ar-SA.ini index bdd2fe8..5c08e38 100644 --- a/data/locale/ar-SA.ini +++ b/data/locale/ar-SA.ini @@ -49,3 +49,5 @@ source_language="لغة المصدر" translate="ترجمة (⚠️ زيادة المعالجة)" translate_add_context="الترجمة مع السياق" whisper_translate="ترجمة إلى الإنجليزية (Whisper)" +buffer_size_msec="حجم الذاكرة المؤقتة (ملي ثانية)" +overlap_size_msec="حجم التداخل (ملي ثانية)" diff --git a/data/locale/de-DE.ini b/data/locale/de-DE.ini index e83f8b9..df2e450 100644 --- a/data/locale/de-DE.ini +++ b/data/locale/de-DE.ini @@ -49,3 +49,5 @@ source_language="Quellsprache" translate="Übersetzen (⚠️ erhöhte Verarbeitung)" translate_add_context="Mit Kontext übersetzen" whisper_translate="Ins Englische übersetzen (Flüstern)" +buffer_size_msec="Puffergröße (ms)" +overlap_size_msec="Überlappungsgröße (ms)" diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index 97dba52..6f7c1e9 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -12,7 +12,7 @@ text_file_output="Text File output" output_filename="Output filename" whisper_model="Whisper Model" external_model_file="External model file" -whisper_parameters="Advanced Settings" +whisper_parameters="Whisper Model Parameters" language="Language" whisper_sampling_method="Whisper Sampling Method" n_threads="Number of threads" @@ -49,3 +49,5 @@ source_language="Source language" 
translate="Translate (⚠️ increased processing)" translate_add_context="Translate with context" whisper_translate="Translate to English (Whisper)" +buffer_size_msec="Buffer size (ms)" +overlap_size_msec="Overlap size (ms)" diff --git a/data/locale/es-ES.ini b/data/locale/es-ES.ini index 8634c3f..59dc350 100644 --- a/data/locale/es-ES.ini +++ b/data/locale/es-ES.ini @@ -49,3 +49,5 @@ source_language="Idioma fuente" translate="Traducir (⚠️ procesamiento aumentado)" translate_add_context="Traducir con contexto" whisper_translate="Traducir al inglés (Whisper)" +buffer_size_msec="Tamaño del búfer (ms)" +overlap_size_msec="Tamaño de superposición (ms)" diff --git a/data/locale/fr-FR.ini b/data/locale/fr-FR.ini index 2a8759a..0b6cd0c 100644 --- a/data/locale/fr-FR.ini +++ b/data/locale/fr-FR.ini @@ -49,3 +49,5 @@ source_language="Langue source" translate="Traduire (⚠️ traitement accru)" translate_add_context="Traduire avec contexte" whisper_translate="Traduire en anglais (Whisper)" +buffer_size_msec="Taille du tampon (ms)" +overlap_size_msec="Taille de chevauchement (ms)" diff --git a/data/locale/hi-IN.ini b/data/locale/hi-IN.ini index 6e61512..a7b2d26 100644 --- a/data/locale/hi-IN.ini +++ b/data/locale/hi-IN.ini @@ -49,3 +49,5 @@ source_language="स्रोत भाषा" translate="अनुवाद करें (⚠️ बढ़ी प्रसंस्करण)" translate_add_context="संदर्भ के साथ अनुवाद करें" whisper_translate="अंग्रेजी में अनुवाद करें (व्हिस्पर)" +buffer_size_msec="बफ़र आकार (ms)" +overlap_size_msec="ओवरलैप आकार (ms)" diff --git a/data/locale/ja-JP.ini b/data/locale/ja-JP.ini index 8cce9b5..18445bc 100644 --- a/data/locale/ja-JP.ini +++ b/data/locale/ja-JP.ini @@ -49,3 +49,5 @@ source_language="ソース言語" translate="翻訳 (⚠️処理増加)" translate_add_context="コンテキスト付きで翻訳" whisper_translate="英語に翻訳(ウィスパー)" +buffer_size_msec="バッファサイズ(ms)" +overlap_size_msec="オーバーラップサイズ(ms)" diff --git a/data/locale/ko-KR.ini b/data/locale/ko-KR.ini index 4014456..8dbe564 100644 --- a/data/locale/ko-KR.ini +++ b/data/locale/ko-KR.ini @@ -49,3 
+49,5 @@ source_language="원본 언어" translate="번역 (⚠️ 처리 시간 증가)" translate_add_context="컨텍스트와 함께 번역" whisper_translate="영어로 번역 (속삭임)" +buffer_size_msec="버퍼 크기 (ms)" +overlap_size_msec="오버랩 크기 (ms)" diff --git a/data/locale/pl-PL.ini b/data/locale/pl-PL.ini index 1025bb1..64ee55b 100644 --- a/data/locale/pl-PL.ini +++ b/data/locale/pl-PL.ini @@ -49,3 +49,5 @@ source_language="Język źródłowy" translate="Tłumacz (⚠️ zwiększone przetwarzanie)" translate_add_context="Tłumacz z kontekstem" whisper_translate="Tłumacz na angielski (Whisper)" +buffer_size_msec="Rozmiar bufora (ms)" +overlap_size_msec="Rozmiar nakładki (ms)" diff --git a/data/locale/pt-BR.ini b/data/locale/pt-BR.ini index 73ab770..2f0a27e 100644 --- a/data/locale/pt-BR.ini +++ b/data/locale/pt-BR.ini @@ -49,3 +49,5 @@ source_language="Língua de origem" translate="Traduzir (⚠️ o processamento aumentará)" translate_add_context="Traduzir com contexto" whisper_translate="Traduzir para inglês (Whisper)" +buffer_size_msec="Tamanho do buffer (ms)" +overlap_size_msec="Tamanho da sobreposição (ms)" diff --git a/data/locale/ru-RU.ini b/data/locale/ru-RU.ini index 8428761..23090b6 100644 --- a/data/locale/ru-RU.ini +++ b/data/locale/ru-RU.ini @@ -48,3 +48,5 @@ source_language="Исходный язык" translate="Перевести (⚠️ обработка будет увеличена)" translate_add_context="Перевести с контекстом" whisper_translate="Перевести на английский (Whisper)" +buffer_size_msec="Размер буфера (мс)" +overlap_size_msec="Размер перекрытия (мс)" diff --git a/data/locale/zh-CN.ini b/data/locale/zh-CN.ini index a9e382e..a561f35 100644 --- a/data/locale/zh-CN.ini +++ b/data/locale/zh-CN.ini @@ -49,3 +49,5 @@ source_language="源语言" translate="翻译 (⚠️ 增加处理)" translate_add_context="带上下文翻译" whisper_translate="翻译为英语(Whisper)" +buffer_size_msec="缓冲区大小(毫秒)" +overlap_size_msec="重叠大小(毫秒)" diff --git a/src/captions-thread.h b/src/captions-thread.h index 5b15ca5..1cdb079 100644 --- a/src/captions-thread.h +++ b/src/captions-thread.h @@ -32,7 +32,6 @@ class 
CaptionMonitor { void initialize(std::function callback_, size_t maxSize_, std::chrono::seconds maxTime_) { - obs_log(LOG_INFO, "CaptionMonitor::initialize"); this->callback = callback_; this->maxSize = maxSize_; this->maxTime = maxTime_; @@ -49,8 +48,6 @@ class CaptionMonitor { } this->newDataAvailable = true; } - obs_log(LOG_INFO, "CaptionMonitor::addWords: number of words in queue: %d", - wordQueue.size()); condVar.notify_all(); } @@ -66,7 +63,6 @@ class CaptionMonitor { [this] { return this->newDataAvailable || this->stop; }); if (this->stop) { - obs_log(LOG_INFO, "CaptionMonitor::monitor: stopping"); break; } @@ -74,9 +70,6 @@ class CaptionMonitor { continue; } - obs_log(LOG_INFO, "CaptionMonitor::monitor: wordQueue size: %d", - this->wordQueue.size()); - // emit up to maxSize words from the wordQueue std::vector emitted; while (!this->wordQueue.empty() && emitted.size() <= this->maxSize) { @@ -99,8 +92,6 @@ class CaptionMonitor { // flush the queue if it's full or we've reached the max time size_t words_to_flush = std::min(this->wordQueue.size(), this->maxSize); - obs_log(LOG_INFO, "CaptionMonitor::monitor: flushing %d words", - words_to_flush); for (size_t i = 0; i < words_to_flush; ++i) { wordQueue.pop_front(); } diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 8c05743..c12b2a7 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -332,7 +332,9 @@ void set_text_callback(struct transcription_filter_data *gf, gf->last_text = str_copy; - gf->captions_monitor.addWords(split_words(str_copy)); + if (gf->buffered_output) { + gf->captions_monitor.addWords(split_words(str_copy)); + } if (gf->caption_to_stream) { obs_output_t *streaming_output = obs_frontend_get_streaming_output(); @@ -638,12 +640,14 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) // set transform settings obs_transform_info transform_info; - transform_info.pos.x = 1852.0; - transform_info.pos.y = 1034.0; + 
transform_info.pos.x = 962.0; + transform_info.pos.y = 959.0; transform_info.bounds.x = 1769.0; transform_info.bounds.y = 145.0; transform_info.bounds_type = obs_bounds_type::OBS_BOUNDS_SCALE_INNER; + transform_info.bounds_alignment = OBS_ALIGN_CENTER; + transform_info.alignment = OBS_ALIGN_CENTER; transform_info.scale.x = 1.0; transform_info.scale.y = 1.0; transform_info.rot = 0.0; @@ -942,7 +946,7 @@ obs_properties_t *transcription_filter_properties(void *data) for (const std::string &prop_name : {"whisper_params_group", "log_words", "caption_to_stream", "buffer_size_msec", "overlap_size_msec", "step_by_step_processing", "min_sub_duration", - "process_while_muted", "buffered_output"}) { + "process_while_muted", "buffered_output", "vad_enabled", "log_level"}) { obs_property_set_visible(obs_properties_get(props, prop_name.c_str()), show_hide); } @@ -977,18 +981,18 @@ obs_properties_t *transcription_filter_properties(void *data) obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted")); - obs_properties_t *whisper_params_group = obs_properties_create(); - obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"), - OBS_GROUP_NORMAL, whisper_params_group); + obs_properties_add_bool(ppts, "vad_enabled", MT_("vad_enabled")); - obs_properties_add_bool(whisper_params_group, "vad_enabled", MT_("vad_enabled")); - obs_property_t *list = obs_properties_add_list(whisper_params_group, "log_level", - MT_("log_level"), OBS_COMBO_TYPE_LIST, - OBS_COMBO_FORMAT_INT); + obs_property_t *list = obs_properties_add_list(ppts, "log_level", MT_("log_level"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); obs_property_list_add_int(list, "DEBUG", LOG_DEBUG); obs_property_list_add_int(list, "INFO", LOG_INFO); obs_property_list_add_int(list, "WARNING", LOG_WARNING); + obs_properties_t *whisper_params_group = obs_properties_create(); + obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"), + OBS_GROUP_NORMAL, 
whisper_params_group); + // Add language selector obs_property_t *whisper_language_select_list = obs_properties_add_list( whisper_params_group, "whisper_language_select", MT_("language"),