Skip to content

Commit f00b8c5

Browse files
committed
Merge remote-tracking branch 'origin/master' into roy.extract_filter_utils_offline_test
2 parents 13a8cc8 + e3c6951 commit f00b8c5

21 files changed

+934
-416
lines changed

CMakeLists.txt

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ else()
9696
include(cmake/FetchOnnxruntime.cmake)
9797
endif()
9898

99+
include(cmake/BuildICU.cmake)
100+
# Add ICU to the target
101+
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
102+
target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})
103+
99104
target_sources(
100105
${CMAKE_PROJECT_NAME}
101106
PRIVATE src/plugin-main.c
@@ -114,11 +119,13 @@ target_sources(
114119
src/whisper-utils/whisper-model-utils.cpp
115120
src/whisper-utils/silero-vad-onnx.cpp
116121
src/whisper-utils/token-buffer-thread.cpp
122+
src/whisper-utils/vad-processing.cpp
117123
src/translation/language_codes.cpp
118124
src/translation/translation.cpp
119125
src/translation/translation-utils.cpp
120-
src/ui/filter-replace-dialog.cpp
121-
src/ui/filter-replace-utils.cpp)
126+
src/ui/filter-replace-utils.cpp
127+
src/translation/translation-language-utils.cpp
128+
src/ui/filter-replace-dialog.cpp)
122129

123130
set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})
124131

@@ -138,13 +145,15 @@ if(ENABLE_TESTS)
138145
src/whisper-utils/whisper-utils.cpp
139146
src/whisper-utils/silero-vad-onnx.cpp
140147
src/whisper-utils/token-buffer-thread.cpp
148+
src/whisper-utils/vad-processing.cpp
141149
src/translation/language_codes.cpp
142150
src/translation/translation.cpp
143-
src/ui/filter-replace-utils.cpp)
151+
src/ui/filter-replace-utils.cpp
152+
src/translation/translation-language-utils.cpp)
144153

145154
find_libav(${CMAKE_PROJECT_NAME}-tests)
146155

147-
target_link_libraries(${CMAKE_PROJECT_NAME}-tests PRIVATE ct2 sentencepiece Whispercpp Ort OBS::libobs)
156+
target_link_libraries(${CMAKE_PROJECT_NAME}-tests PRIVATE ct2 sentencepiece Whispercpp Ort OBS::libobs ICU)
148157
target_include_directories(${CMAKE_PROJECT_NAME}-tests PRIVATE src)
149158

150159
# install the tests to the release/test directory

cmake/BuildICU.cmake

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# BuildICU.cmake
#
# Makes the ICU libraries available to the build and exposes them through a
# single `ICU` interface target:
#   * WIN32: downloads the prebuilt Win64 MSVC2022 binary archive with
#     FetchContent and wraps the import libraries as SHARED IMPORTED targets.
#   * otherwise: clones the ICU git repository and builds static libraries
#     with ExternalProject (runConfigureICU + make) at build time.
#
# Variables set for the including scope: ICU_INCLUDE_DIR, ICU_LIBRARY_DIR,
# ICU_LIBRARIES (and ICU_BINARY_DIR on Windows).
include(FetchContent)
2+
include(ExternalProject)
3+
4+
# The same ICU release spelled the different ways it appears in download URLs
# ("75_1"), release/git tags ("75-1") and Windows DLL file names ("75").
# ICU_VERSION ("75.1") is not referenced anywhere below.
set(ICU_VERSION "75.1")
5+
set(ICU_VERSION_UNDERSCORE "75_1")
6+
set(ICU_VERSION_DASH "75-1")
7+
set(ICU_VERSION_NO_MINOR "75")
8+
9+
if(WIN32)
10+
# Prebuilt binaries: the archive is downloaded, hash-checked and extracted
# by FetchContent_MakeAvailable() below.
set(ICU_URL
11+
"https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_DASH}/icu4c-${ICU_VERSION_UNDERSCORE}-Win64-MSVC2022.zip"
12+
)
13+
set(ICU_HASH "SHA256=7ac9c0dc6ccc1ec809c7d5689b8d831c5b8f6b11ecf70fdccc55f7ae8731ac8f")
14+
15+
FetchContent_Declare(
16+
ICU_build
17+
URL ${ICU_URL}
18+
URL_HASH ${ICU_HASH})
19+
20+
FetchContent_MakeAvailable(ICU_build)
21+
22+
# FetchContent exposes the extraction directory as <lowercase-name>_SOURCE_DIR.
# The include/lib64/bin64 layout is assumed from the release ZIP; adjust these
# paths if the archive layout changes.
set(ICU_INCLUDE_DIR "${icu_build_SOURCE_DIR}/include")
24+
set(ICU_LIBRARY_DIR "${icu_build_SOURCE_DIR}/lib64")
25+
set(ICU_BINARY_DIR "${icu_build_SOURCE_DIR}/bin64")
26+
27+
# Library base names as shipped in the Windows archive (they differ from the
# names used in the source-build branch below).
set(ICU_LIBRARIES icudt icuuc icuin)
29+
30+
foreach(lib ${ICU_LIBRARIES})
31+
# Locate the import library (.lib) only inside the extracted archive;
# REQUIRED makes configuration fail if it is missing.
find_library(
33+
ICU_LIB_${lib}
34+
NAMES ${lib}
35+
PATHS ${ICU_LIBRARY_DIR}
36+
NO_DEFAULT_PATH REQUIRED)
37+
# Locate the matching runtime DLL, e.g. icuuc75.dll.
# NOTE(review): this lookup is not REQUIRED; if the DLL is absent the
# variable holds "<var>-NOTFOUND" and is still passed to install() below.
find_file(
39+
ICU_DLL_${lib}
40+
NAMES ${lib}${ICU_VERSION_NO_MINOR}.dll
41+
PATHS ${ICU_BINARY_DIR}
42+
NO_DEFAULT_PATH)
43+
# Install rule (runs at install time, not a build-time copy): place the DLL
# in obs-plugins/64bit.
install(FILES ${ICU_DLL_${lib}} DESTINATION "obs-plugins/64bit")
45+
# Wrap the prebuilt library as an imported target ICU::<lib>.
# NOTE(review): both IMPORTED_LOCATION and IMPORTED_IMPLIB are set to the
# import library found above; for a SHARED IMPORTED target on Windows
# IMPORTED_LOCATION is normally the .dll (ICU_DLL_<lib>) - confirm intent.
add_library(ICU::${lib} SHARED IMPORTED GLOBAL)
47+
set_target_properties(ICU::${lib} PROPERTIES IMPORTED_LOCATION "${ICU_LIB_${lib}}" IMPORTED_IMPLIB
48+
"${ICU_LIB_${lib}}")
49+
endforeach()
50+
else()
51+
# NOTE(review): ICU_URL and ICU_HASH are set here but the ExternalProject_Add
# call below downloads via GIT_REPOSITORY/GIT_TAG, so they appear unused in
# this branch.
set(ICU_URL
52+
"https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION_DASH}/icu4c-${ICU_VERSION_UNDERSCORE}-src.tgz"
53+
)
54+
set(ICU_HASH "SHA256=cb968df3e4d2e87e8b11c49a5d01c787bd13b9545280fc6642f826527618caef")
55+
if(APPLE)
56+
# runConfigureICU platform name plus compiler/linker flags selecting the
# target architecture. The escaped space keeps "-arch <value>" a single list
# element. MACOS_ARCH is read from the environment at configure time.
# NOTE(review): if MACOS_ARCH is unset the flag degenerates to "-arch " -
# confirm the CI/build scripts always export it.
set(ICU_PLATFORM "MacOSX")
57+
set(TARGET_ARCH -arch\ $ENV{MACOS_ARCH})
58+
set(ICU_BUILD_ENV_VARS CFLAGS=${TARGET_ARCH} CXXFLAGS=${TARGET_ARCH} LDFLAGS=${TARGET_ARCH})
59+
else()
60+
# Position-independent code; presumably needed so the static ICU archives
# can be linked into the shared plugin module - TODO confirm.
set(ICU_PLATFORM "Linux")
61+
set(ICU_BUILD_ENV_VARS CFLAGS=-fPIC CXXFLAGS=-fPIC LDFLAGS=-fPIC)
62+
endif()
63+
64+
# Build ICU from the tagged git checkout at build time:
#   * configure: runConfigureICU for ICU_PLATFORM with the env vars above,
#     static libraries only, installing into the ExternalProject INSTALL_DIR;
#   * BUILD_BYPRODUCTS declares the three static archives so the generator
#     knows which step produces the files the imported targets point to;
#   * BUILD_IN_SOURCE runs configure/make inside the source checkout.
# NOTE(review): DOWNLOAD_EXTRACT_TIMESTAMP is documented for URL archive
# downloads; it likely has no effect with a git download - confirm.
# NOTE(review): "make -j4" hard-codes both the make program and the job
# count instead of inheriting them from the parent build.
ExternalProject_Add(
65+
ICU_build
66+
DOWNLOAD_EXTRACT_TIMESTAMP true
67+
GIT_REPOSITORY "https://github.com/unicode-org/icu.git"
68+
GIT_TAG "release-${ICU_VERSION_DASH}"
69+
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${ICU_BUILD_ENV_VARS} <SOURCE_DIR>/icu4c/source/runConfigureICU
70+
${ICU_PLATFORM} --prefix=<INSTALL_DIR> --enable-static --disable-shared
71+
BUILD_COMMAND make -j4
72+
BUILD_BYPRODUCTS
73+
<INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}icudata${CMAKE_STATIC_LIBRARY_SUFFIX}
74+
<INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}icuuc${CMAKE_STATIC_LIBRARY_SUFFIX}
75+
<INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}icui18n${CMAKE_STATIC_LIBRARY_SUFFIX}
76+
INSTALL_COMMAND make install
77+
BUILD_IN_SOURCE 1)
78+
79+
# Sets the variable INSTALL_DIR to the external project's install prefix.
ExternalProject_Get_Property(ICU_build INSTALL_DIR)
80+
81+
set(ICU_INCLUDE_DIR "${INSTALL_DIR}/include")
82+
set(ICU_LIBRARY_DIR "${INSTALL_DIR}/lib")
83+
84+
# Library base names produced by the source build.
set(ICU_LIBRARIES icudata icuuc icui18n)
85+
86+
# One STATIC IMPORTED target per library. The archive paths do not exist
# until ICU_build has run, hence the explicit add_dependencies().
# NOTE(review): ICU_INCLUDE_DIR also does not exist at first configure; CMake
# can reject imported targets whose INTERFACE_INCLUDE_DIRECTORIES is missing -
# verify on a clean build tree.
foreach(lib ${ICU_LIBRARIES})
87+
add_library(ICU::${lib} STATIC IMPORTED GLOBAL)
88+
add_dependencies(ICU::${lib} ICU_build)
89+
set(ICU_LIBRARY "${ICU_LIBRARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${lib}${CMAKE_STATIC_LIBRARY_SUFFIX}")
90+
set_target_properties(ICU::${lib} PROPERTIES IMPORTED_LOCATION "${ICU_LIBRARY}" INTERFACE_INCLUDE_DIRECTORIES
91+
"${ICU_INCLUDE_DIR}")
92+
# Legacy form: the arguments repeated in endforeach() are ignored by CMake.
endforeach(lib ${ICU_LIBRARIES})
93+
endif()
94+
95+
# Aggregate target: consumers link `ICU` and receive every ICU::<lib> target
# from the active branch plus the ICU include directory (marked SYSTEM).
# NOTE(review): in the WIN32 branch ICU_build is only a FetchContent name and
# no target of that name is created in this file - confirm add_dependencies()
# is valid there.
add_library(ICU INTERFACE)
97+
add_dependencies(ICU ICU_build)
98+
foreach(lib ${ICU_LIBRARIES})
99+
target_link_libraries(ICU INTERFACE ICU::${lib})
100+
endforeach()
101+
target_include_directories(ICU SYSTEM INTERFACE $<BUILD_INTERFACE:${ICU_INCLUDE_DIR}>)

data/locale/en-US.ini

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
LocalVocalPlugin="LocalVocal Plugin"
22
transcription_filterAudioFilter="LocalVocal Transcription"
3-
vad_enabled="VAD Enabled"
43
vad_threshold="VAD Threshold"
54
log_level="Internal Log Level"
65
log_words="Log Output to Console"
76
caption_to_stream="Stream Captions"
8-
step_by_step_processing="Step-by-step processing (⚠️ increased processing)"
9-
step_size_msec="Step size (ms)"
107
subtitle_sources="Output Destination"
118
none_no_output="None / No output"
129
file_output_enable="Save to File"
@@ -51,7 +48,6 @@ translate="Translation"
5148
translate_add_context="Translate with context"
5249
whisper_translate="Translate to English (Whisper)"
5350
buffer_size_msec="Buffer size (ms)"
54-
overlap_size_msec="Overlap size (ms)"
5551
suppress_sentences="Suppress sentences (each line)"
5652
translate_output="Output Destination"
5753
dtw_token_timestamps="DTW token timestamps"
@@ -85,4 +81,10 @@ buffered_output_parameters="Buffered Output Configuration"
8581
file_output_info="Note: Translation output will be saved to a file in the same directory with the target language added to the name, e.g. 'output_es.srt'."
8682
partial_transcription="Enable Partial Transcription"
8783
partial_transcription_info="Partial transcription will increase processing load on your machine to transcribe content in real-time, which may impact performance."
88-
partial_latency="Latency (ms)"
84+
partial_latency="Latency (ms)"
85+
vad_mode="VAD Mode"
86+
Active_VAD="Active VAD"
87+
Hybrid_VAD="Hybrid VAD"
88+
translate_only_full_sentences="Translate only full sentences"
89+
duration_filter_threshold="Duration filter"
90+
segment_duration="Segment duration"

src/model-utils/model-downloader-ui.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,10 @@ void ModelDownloader::closeEvent(QCloseEvent *e)
6666
{
6767
if (!this->mPrepareToClose)
6868
e->ignore();
69-
else
69+
else {
7070
QDialog::closeEvent(e);
71+
deleteLater();
72+
}
7173
}
7274

7375
void ModelDownloader::close()

src/model-utils/model-downloader-ui.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ public slots:
5050
private:
5151
QVBoxLayout *layout;
5252
QProgressBar *progress_bar;
53-
QThread *download_thread;
54-
ModelDownloadWorker *download_worker;
53+
QPointer<QThread> download_thread;
54+
QPointer<ModelDownloadWorker> download_worker;
5555
// Callback for when the download is finished
5656
download_finished_callback_t download_finished_callback;
5757
bool mPrepareToClose;

src/tests/localvocal-offline-test.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "transcription-filter.h"
1818
#include "transcription-utils.h"
1919
#include "whisper-utils/whisper-utils.h"
20+
#include "whisper-utils/vad-processing.h"
2021
#include "audio-file-utils.h"
2122
#include "translation/language_codes.h"
2223
#include "ui/filter-replace-utils.h"
@@ -149,7 +150,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
149150
// },
150151
// 30, std::chrono::seconds(10));
151152

152-
gf->vad_enabled = true;
153+
gf->vad_mode = VAD_MODE_ACTIVE;
153154
gf->log_words = true;
154155
gf->caption_to_stream = false;
155156
gf->start_timestamp_ms = now_ms();
@@ -158,7 +159,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
158159
gf->buffered_output = false;
159160

160161
gf->target_lang = "";
161-
gf->translation_ctx.add_context = true;
162+
gf->translation_ctx.add_context = 1;
162163
gf->translation_output = "";
163164
gf->translate = false;
164165
gf->sentence_psum_accept_thresh = 0.4;

src/transcription-filter-callbacks.cpp

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ std::string send_sentence_to_translation(const std::string &sentence,
5353
struct transcription_filter_data *gf,
5454
const std::string &source_language)
5555
{
56-
const std::string last_text = gf->last_text;
57-
gf->last_text = sentence;
56+
const std::string last_text = gf->last_text_for_translation;
57+
gf->last_text_for_translation = sentence;
5858
if (gf->translate && !sentence.empty()) {
5959
obs_log(gf->log_level, "Translating text. %s -> %s", source_language.c_str(),
6060
gf->target_lang.c_str());
@@ -199,11 +199,6 @@ void set_text_callback(struct transcription_filter_data *gf,
199199
const DetectionResultWithText &resultIn)
200200
{
201201
DetectionResultWithText result = resultIn;
202-
if (!result.text.empty() && (result.result == DETECTION_RESULT_SPEECH ||
203-
result.result == DETECTION_RESULT_PARTIAL)) {
204-
gf->last_sub_render_time = now_ms();
205-
gf->cleared_last_sub = false;
206-
}
207202

208203
std::string str_copy = result.text;
209204

@@ -233,20 +228,25 @@ void set_text_callback(struct transcription_filter_data *gf,
233228
}
234229
}
235230

231+
bool should_translate =
232+
gf->translate_only_full_sentences ? result.result == DETECTION_RESULT_SPEECH : true;
233+
236234
// send the sentence to translation (if enabled)
237235
std::string translated_sentence =
238-
send_sentence_to_translation(str_copy, gf, result.language);
236+
should_translate ? send_sentence_to_translation(str_copy, gf, result.language) : "";
239237

240238
if (gf->translate) {
241239
if (gf->translation_output == "none") {
242240
// overwrite the original text with the translated text
243241
str_copy = translated_sentence;
244242
} else {
245243
if (gf->buffered_output) {
246-
if (result.result == DETECTION_RESULT_SPEECH) {
247-
// buffered output - add the sentence to the monitor
248-
gf->translation_monitor.addSentence(translated_sentence);
249-
}
244+
// buffered output - add the sentence to the monitor
245+
gf->translation_monitor.addSentenceFromStdString(
246+
translated_sentence,
247+
get_time_point_from_ms(result.start_timestamp_ms),
248+
get_time_point_from_ms(result.end_timestamp_ms),
249+
result.result == DETECTION_RESULT_PARTIAL);
250250
} else {
251251
// non-buffered output - send the sentence to the selected source
252252
send_caption_to_source(gf->translation_output, translated_sentence,
@@ -256,9 +256,10 @@ void set_text_callback(struct transcription_filter_data *gf,
256256
}
257257

258258
if (gf->buffered_output) {
259-
if (result.result == DETECTION_RESULT_SPEECH) {
260-
gf->captions_monitor.addSentence(str_copy);
261-
}
259+
gf->captions_monitor.addSentenceFromStdString(
260+
str_copy, get_time_point_from_ms(result.start_timestamp_ms),
261+
get_time_point_from_ms(result.end_timestamp_ms),
262+
result.result == DETECTION_RESULT_PARTIAL);
262263
} else {
263264
// non-buffered output - send the sentence to the selected source
264265
send_caption_to_source(gf->text_source_name, str_copy, gf);
@@ -273,6 +274,21 @@ void set_text_callback(struct transcription_filter_data *gf,
273274
result.result == DETECTION_RESULT_SPEECH) {
274275
send_sentence_to_file(gf, result, str_copy, translated_sentence);
275276
}
277+
278+
if (!result.text.empty() && (result.result == DETECTION_RESULT_SPEECH ||
279+
result.result == DETECTION_RESULT_PARTIAL)) {
280+
gf->last_sub_render_time = now_ms();
281+
gf->cleared_last_sub = false;
282+
if (result.result == DETECTION_RESULT_SPEECH) {
283+
// save the last subtitle if it was a full sentence
284+
gf->last_transcription_sentence.push_back(result.text);
285+
// remove the oldest sentence if the buffer is too long
286+
while (gf->last_transcription_sentence.size() >
287+
(size_t)gf->n_context_sentences) {
288+
gf->last_transcription_sentence.pop_front();
289+
}
290+
}
291+
}
276292
};
277293

278294
void recording_state_callback(enum obs_frontend_event event, void *data)
@@ -314,6 +330,12 @@ void reset_caption_state(transcription_filter_data *gf_)
314330
}
315331
send_caption_to_source(gf_->text_source_name, "", gf_);
316332
send_caption_to_source(gf_->translation_output, "", gf_);
333+
// reset translation context
334+
gf_->last_text_for_translation = "";
335+
gf_->last_text_translation = "";
336+
gf_->translation_ctx.last_input_tokens.clear();
337+
gf_->translation_ctx.last_translation_tokens.clear();
338+
gf_->last_transcription_sentence.clear();
317339
// flush the buffer
318340
{
319341
std::lock_guard<std::mutex> lock(gf_->whisper_buf_mutex);

src/transcription-filter-data.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ struct transcription_filter_data {
3636
size_t sentence_number;
3737
// Minimal subtitle duration in ms
3838
size_t min_sub_duration;
39+
// Maximal subtitle duration in ms
40+
size_t max_sub_duration;
3941
// Last time a subtitle was rendered
4042
uint64_t last_sub_render_time;
4143
bool cleared_last_sub;
@@ -62,7 +64,7 @@ struct transcription_filter_data {
6264
float sentence_psum_accept_thresh;
6365

6466
bool do_silence;
65-
bool vad_enabled;
67+
int vad_mode;
6668
int log_level = LOG_DEBUG;
6769
bool log_words;
6870
bool caption_to_stream;
@@ -84,11 +86,17 @@ struct transcription_filter_data {
8486
bool initial_creation = true;
8587
bool partial_transcription = false;
8688
int partial_latency = 1000;
89+
float duration_filter_threshold = 2.25f;
90+
int segment_duration = 7000;
8791

8892
// Last transcription result
89-
std::string last_text;
93+
std::string last_text_for_translation;
9094
std::string last_text_translation;
9195

96+
// Transcription context sentences
97+
int n_context_sentences;
98+
std::deque<std::string> last_transcription_sentence;
99+
92100
// Text source to output the subtitles
93101
std::string text_source_name;
94102
// Callback to set the text in the output text source (subtitles)
@@ -110,6 +118,7 @@ struct transcription_filter_data {
110118
struct translation_context translation_ctx;
111119
std::string translation_model_index;
112120
std::string translation_model_path_external;
121+
bool translate_only_full_sentences;
113122

114123
bool buffered_output = false;
115124
TokenBufferThread captions_monitor;

0 commit comments

Comments
 (0)