Skip to content

Commit

Permalink
Streamline and refactor (#105)
Browse files Browse the repository at this point in the history
* refactor: Update whispercpp dependency to version 0.0.3

* refactor: Add buffered output parameters for transcription filter

* refactor: Remove unused parameter in set_source_signals function

* refactor: Fix character splitting bug in TokenBufferThread

* refactor: Update buffer size and overlap size in whisper-processing.cpp

* refactor: Remove unused parameter in set_source_signals function

* refactor: Fix floating point precision issue in whisper-processing.cpp

* refactor: Improve remove_leading_trailing_nonalpha function in transcription-utils.cpp

* refactor: Update VAD threshold in transcription filter

* refactor: Update VAD threshold parameter name in silero-vad-onnx.h

* refactor: Update VAD threshold parameter name in silero-vad-onnx.h

* refactor: Update lock_guard parameter name in TokenBufferThread
  • Loading branch information
royshil authored Jun 5, 2024
1 parent 9ecd759 commit 67993f3
Show file tree
Hide file tree
Showing 26 changed files with 1,045 additions and 765 deletions.
8 changes: 4 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ target_sources(
PRIVATE src/plugin-main.c
src/transcription-filter.cpp
src/transcription-filter.c
src/transcription-filter-callbacks.cpp
src/transcription-filter-utils.cpp
src/transcription-utils.cpp
src/model-utils/model-downloader.cpp
src/model-utils/model-downloader-ui.cpp
Expand All @@ -99,8 +101,7 @@ target_sources(
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
src/translation/translation-utils.cpp
src/utils.cpp)
src/translation/translation-utils.cpp)

set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})

Expand All @@ -121,8 +122,7 @@ if(ENABLE_TESTS)
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
src/utils.cpp)
src/translation/translation.cpp)

find_libav(${CMAKE_PROJECT_NAME}-tests)

Expand Down
10 changes: 10 additions & 0 deletions data/locale/en-US.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
LocalVocalPlugin="LocalVocal Plugin"
transcription_filterAudioFilter="LocalVocal Transcription"
vad_enabled="VAD Enabled"
vad_threshold="VAD Threshold"
log_level="Internal Log Level"
log_words="Log Output to Console"
caption_to_stream="Stream Captions"
Expand Down Expand Up @@ -61,3 +62,12 @@ sentence_psum_accept_thresh="Sentence prob. threshold"
external_model_folder="External model folder"
load_external_model="Load external model"
translate_input_tokenization_style="Input token style"
translation_sampling_temperature="Sampling temperature"
translation_repetition_penalty="Repetition penalty"
translation_beam_size="Beam size"
translation_max_decoding_length="Max decoding length"
translation_no_repeat_ngram_size="No-repeat ngram size"
translation_max_input_length="Max input length"
buffered_output_parameters="Buffered output parameters"
buffer_num_lines="Number of lines"
buffer_num_chars_per_line="Characters per line"
54 changes: 54 additions & 0 deletions src/model-utils/model-infos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,60 @@ std::map<std::string, ModelInfo> models_info = {{
"B6E77E474AEEA8F441363ACA7614317C06381F3EACFE10FB9856D5081D1074CC"},
{"https://huggingface.co/jncraton/m2m100_1.2B-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true",
"D8F7C76ED2A5E0822BE39F0A4F95A55EB19C78F4593CE609E2EDBC2AEA4D380A"}}}},
{"NLLB 200 1.3B (1.4Gb)",
{"NLLB 200 1.3B",
"nllb-200-1.3b",
MODEL_TYPE_TRANSLATION,
{{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/model.bin?download=true",
"72D7533DC7A0E8F10F19A650D4E90FAF9CBFA899DB5411AD124BD5802BD91263"},
{
"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/config.json?download=true",
"0C2F6FA2057C7264D052FB4A62BA3476EEAE70487ACDDFA8E779A53A00CBF44C",
},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/tokenizer.json?download=true",
"E316B82DE11D0F951F370943B3C438311629547285129B0B81DADABD01BCA665"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/shared_vocabulary.txt?download=true",
"A132A83330F45514C2476EB81D1D69B3C41762264D16CE0A7EA982E5D6C728E5"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/special_tokens_map.json?download=true",
"992BD4ED610D644D6823081937BCC91BB8878DD556CEA4AE5327F2480361330E"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/tokenizer_config.json?download=true",
"D1AA8C3697D3E35674F97B5B7E9C99D22B010F528E80140257D97316BE90D044"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true",
"14BB8DFB35C0FFDEA7BC01E56CEA38B9E3D5EFCDCB9C251D6B40538E1AAB555A"}}}},
{"NLLB 200 600M (650Mb)",
{"NLLB 200 600M",
"nllb-200-600m",
MODEL_TYPE_TRANSLATION,
{{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/model.bin?download=true",
"ED1BEAF75134DE7505315A5223162F56ACFF397EFF6B50638A500D3936FE707B"},
{
"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/config.json?download=true",
"0C2F6FA2057C7264D052FB4A62BA3476EEAE70487ACDDFA8E779A53A00CBF44C",
},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/tokenizer.json?download=true",
"E316B82DE11D0F951F370943B3C438311629547285129B0B81DADABD01BCA665"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/shared_vocabulary.txt?download=true",
"A132A83330F45514C2476EB81D1D69B3C41762264D16CE0A7EA982E5D6C728E5"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/special_tokens_map.json?download=true",
"992BD4ED610D644D6823081937BCC91BB8878DD556CEA4AE5327F2480361330E"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/tokenizer_config.json?download=true",
"D1AA8C3697D3E35674F97B5B7E9C99D22B010F528E80140257D97316BE90D044"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true",
"14BB8DFB35C0FFDEA7BC01E56CEA38B9E3D5EFCDCB9C251D6B40538E1AAB555A"}}}},
{"MADLAD 400 3B (2.9Gb)",
{"MADLAD 400 3B",
"madlad-400-3b",
MODEL_TYPE_TRANSLATION,
{{"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/model.bin?download=true",
"F3C87256A2C888100C179D7DCD7F41DF17C767469546C59D32C7DDE86C740A6B"},
{
"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/config.json?download=true",
"A428C51CD35517554523B3C6B6974A5928BC35E82B130869A543566A34A83B93",
},
{"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/shared_vocabulary.txt?download=true",
"C327551CE3CA6EFC7B437E11A267F79979893332DDA8A1D146E2C950815193F8"},
{"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/sentencepiece.model?download=true",
"EF11AC9A22C7503492F56D48DCE53BE20E339B63605983E9F27D2CD0E0F3922C"}}}},
{"Whisper Base q5 (57Mb)",
{"Whisper Base q5",
"whisper-base-q5",
Expand Down
30 changes: 16 additions & 14 deletions src/tests/localvocal-offline-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <nlohmann/json.hpp>

#include "transcription-filter-data.h"
#include "transcription-filter-utils.h"
#include "transcription-filter.h"
#include "transcription-utils.h"
#include "whisper-utils/whisper-utils.h"
Expand Down Expand Up @@ -84,7 +85,6 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->sample_rate = sample_rate;
gf->frames = (size_t)((float)gf->sample_rate * 10.0f);
gf->last_num_frames = 0;
gf->step_size_msec = 3000;
gf->min_sub_duration = 3000;
gf->last_sub_render_time = 0;
gf->save_srt = false;
Expand All @@ -110,8 +110,6 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
memset(gf->copy_buffers[0], 0, gf->channels * gf->frames * sizeof(float));
obs_log(LOG_INFO, " allocated %llu bytes ", gf->channels * gf->frames * sizeof(float));

gf->overlap_ms = 150;
gf->overlap_frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)gf->overlap_ms));
obs_log(gf->log_level, "channels %d, frames %d, sample_rate %d", (int)gf->channels,
(int)gf->frames, gf->sample_rate);

Expand Down Expand Up @@ -158,11 +156,12 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->whisper_params = whisper_full_default_params(whisper_sampling_method);
gf->whisper_params.duration_ms = 3000;
gf->whisper_params.language = "en";
gf->whisper_params.detect_language = false;
gf->whisper_params.initial_prompt = "";
gf->whisper_params.n_threads = 4;
gf->whisper_params.n_max_text_ctx = 16384;
gf->whisper_params.translate = false;
gf->whisper_params.no_context = true;
gf->whisper_params.no_context = false;
gf->whisper_params.single_segment = true;
gf->whisper_params.print_special = false;
gf->whisper_params.print_progress = false;
Expand All @@ -177,7 +176,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->whisper_params.speed_up = false;
gf->whisper_params.suppress_blank = true;
gf->whisper_params.suppress_non_speech_tokens = true;
gf->whisper_params.temperature = 0.1;
gf->whisper_params.temperature = 0.0;
gf->whisper_params.max_initial_ts = 1.0;
gf->whisper_params.length_penalty = -1;
gf->active = true;
Expand All @@ -204,7 +203,7 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
// numeral = "0" + numeral;
// }

// save the audio to a .wav file
// // save the audio to a .wav file
// std::string filename = "audio_chunk_" + numeral + vad_state_str + ".wav";
// obs_log(gf->log_level, "Saving %lu frames to %s", frames, filename.c_str());
// write_audio_wav_file(filename.c_str(), pcm32f_data, frames);
Expand Down Expand Up @@ -281,7 +280,7 @@ void set_text_callback(struct transcription_filter_data *gf,
str_copy.c_str(), translated_text.c_str());
}
// overwrite the original text with the translated text
str_copy = str_copy + " -> " + translated_text;
str_copy = str_copy + " | " + translated_text;
} else {
obs_log(gf->log_level, "Failed to translate text");
}
Expand Down Expand Up @@ -385,19 +384,22 @@ int wmain(int argc, wchar_t *argv[])
gf->suppress_sentences =
config["suppress_sentences"].get<std::string>();
}
if (config.contains("overlap_ms")) {
obs_log(LOG_INFO, "Setting overlap_ms to %d",
config["overlap_ms"].get<int>());
gf->overlap_ms = config["overlap_ms"];
gf->overlap_frames = (size_t)((float)gf->sample_rate /
(1000.0f / (float)gf->overlap_ms));
}
if (config.contains("enable_audio_chunks_callback")) {
obs_log(LOG_INFO, "Setting enable_audio_chunks_callback to %s",
config["enable_audio_chunks_callback"] ? "true" : "false");
gf->enable_audio_chunks_callback =
config["enable_audio_chunks_callback"];
}
if (config.contains("temperature")) {
obs_log(LOG_INFO, "Setting temperture to %f",
config["temperature"].get<float>());
gf->whisper_params.temperature = config["temperature"].get<float>();
}
if (config.contains("no_context")) {
obs_log(LOG_INFO, "Setting no_context to %s",
config["no_context"] ? "true" : "false");
gf->whisper_params.no_context = config["no_context"];
}
// set log level
if (logLevelStr == "debug") {
gf->log_level = LOG_DEBUG;
Expand Down
Loading

0 comments on commit 67993f3

Please sign in to comment.