
Streamline and refactor #105

Merged 12 commits on Jun 5, 2024
8 changes: 4 additions & 4 deletions CMakeLists.txt
@@ -87,6 +87,8 @@ target_sources(
PRIVATE src/plugin-main.c
src/transcription-filter.cpp
src/transcription-filter.c
src/transcription-filter-callbacks.cpp
src/transcription-filter-utils.cpp
src/transcription-utils.cpp
src/model-utils/model-downloader.cpp
src/model-utils/model-downloader-ui.cpp
@@ -99,8 +101,7 @@ target_sources(
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
src/translation/translation-utils.cpp
src/utils.cpp)
src/translation/translation-utils.cpp)

set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})

@@ -121,8 +122,7 @@ if(ENABLE_TESTS)
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
src/utils.cpp)
src/translation/translation.cpp)

find_libav(${CMAKE_PROJECT_NAME}-tests)

10 changes: 10 additions & 0 deletions data/locale/en-US.ini
@@ -1,6 +1,7 @@
LocalVocalPlugin="LocalVocal Plugin"
transcription_filterAudioFilter="LocalVocal Transcription"
vad_enabled="VAD Enabled"
vad_threshold="VAD Threshold"
log_level="Internal Log Level"
log_words="Log Output to Console"
caption_to_stream="Stream Captions"
@@ -61,3 +62,12 @@ sentence_psum_accept_thresh="Sentence prob. threshold"
external_model_folder="External model folder"
load_external_model="Load external model"
translate_input_tokenization_style="Input token style"
translation_sampling_temperature="Sampling temperature"
translation_repetition_penalty="Repetition penalty"
translation_beam_size="Beam size"
translation_max_decoding_length="Max decoding length"
translation_no_repeat_ngram_size="No-repeat ngram size"
translation_max_input_length="Max input length"
buffered_output_parameters="Buffered output parameters"
buffer_num_lines="Number of lines"
buffer_num_chars_per_line="Characters per line"
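These keys back the new translation and buffered-output settings this PR introduces. As a rough illustration of how such locale keys are typically wired into an OBS filter's properties callback, here is a minimal sketch; the helper name, value ranges, and step sizes are assumptions for illustration, not the plugin's actual code:

#include <obs-module.h>

// Hypothetical helper: register the new translation / buffered-output settings.
// Setting names match the locale keys above; ranges and steps are illustrative.
static void add_translation_properties(obs_properties_t *props)
{
	obs_properties_add_float_slider(props, "translation_sampling_temperature",
					obs_module_text("translation_sampling_temperature"),
					0.0, 1.0, 0.05);
	obs_properties_add_float_slider(props, "translation_repetition_penalty",
					obs_module_text("translation_repetition_penalty"),
					1.0, 5.0, 0.1);
	obs_properties_add_int(props, "translation_beam_size",
			       obs_module_text("translation_beam_size"), 1, 10, 1);
	obs_properties_add_int(props, "translation_max_decoding_length",
			       obs_module_text("translation_max_decoding_length"), 1, 512, 1);
	obs_properties_add_int(props, "buffer_num_lines",
			       obs_module_text("buffer_num_lines"), 1, 5, 1);
	obs_properties_add_int(props, "buffer_num_chars_per_line",
			       obs_module_text("buffer_num_chars_per_line"), 1, 100, 1);
}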
54 changes: 54 additions & 0 deletions src/model-utils/model-infos.cpp
@@ -45,6 +45,60 @@ std::map<std::string, ModelInfo> models_info = {{
"B6E77E474AEEA8F441363ACA7614317C06381F3EACFE10FB9856D5081D1074CC"},
{"https://huggingface.co/jncraton/m2m100_1.2B-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true",
"D8F7C76ED2A5E0822BE39F0A4F95A55EB19C78F4593CE609E2EDBC2AEA4D380A"}}}},
{"NLLB 200 1.3B (1.4Gb)",
{"NLLB 200 1.3B",
"nllb-200-1.3b",
MODEL_TYPE_TRANSLATION,
{{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/model.bin?download=true",
"72D7533DC7A0E8F10F19A650D4E90FAF9CBFA899DB5411AD124BD5802BD91263"},
{
"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/config.json?download=true",
"0C2F6FA2057C7264D052FB4A62BA3476EEAE70487ACDDFA8E779A53A00CBF44C",
},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/tokenizer.json?download=true",
"E316B82DE11D0F951F370943B3C438311629547285129B0B81DADABD01BCA665"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/shared_vocabulary.txt?download=true",
"A132A83330F45514C2476EB81D1D69B3C41762264D16CE0A7EA982E5D6C728E5"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/special_tokens_map.json?download=true",
"992BD4ED610D644D6823081937BCC91BB8878DD556CEA4AE5327F2480361330E"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/tokenizer_config.json?download=true",
"D1AA8C3697D3E35674F97B5B7E9C99D22B010F528E80140257D97316BE90D044"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true",
"14BB8DFB35C0FFDEA7BC01E56CEA38B9E3D5EFCDCB9C251D6B40538E1AAB555A"}}}},
{"NLLB 200 600M (650Mb)",
{"NLLB 200 600M",
"nllb-200-600m",
MODEL_TYPE_TRANSLATION,
{{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/model.bin?download=true",
"ED1BEAF75134DE7505315A5223162F56ACFF397EFF6B50638A500D3936FE707B"},
{
"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/config.json?download=true",
"0C2F6FA2057C7264D052FB4A62BA3476EEAE70487ACDDFA8E779A53A00CBF44C",
},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/tokenizer.json?download=true",
"E316B82DE11D0F951F370943B3C438311629547285129B0B81DADABD01BCA665"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/shared_vocabulary.txt?download=true",
"A132A83330F45514C2476EB81D1D69B3C41762264D16CE0A7EA982E5D6C728E5"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/special_tokens_map.json?download=true",
"992BD4ED610D644D6823081937BCC91BB8878DD556CEA4AE5327F2480361330E"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/tokenizer_config.json?download=true",
"D1AA8C3697D3E35674F97B5B7E9C99D22B010F528E80140257D97316BE90D044"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true",
"14BB8DFB35C0FFDEA7BC01E56CEA38B9E3D5EFCDCB9C251D6B40538E1AAB555A"}}}},
{"MADLAD 400 3B (2.9Gb)",
{"MADLAD 400 3B",
"madlad-400-3b",
MODEL_TYPE_TRANSLATION,
{{"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/model.bin?download=true",
"F3C87256A2C888100C179D7DCD7F41DF17C767469546C59D32C7DDE86C740A6B"},
{
"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/config.json?download=true",
"A428C51CD35517554523B3C6B6974A5928BC35E82B130869A543566A34A83B93",
},
{"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/shared_vocabulary.txt?download=true",
"C327551CE3CA6EFC7B437E11A267F79979893332DDA8A1D146E2C950815193F8"},
{"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/sentencepiece.model?download=true",
"EF11AC9A22C7503492F56D48DCE53BE20E339B63605983E9F27D2CD0E0F3922C"}}}},
{"Whisper Base q5 (57Mb)",
{"Whisper Base q5",
"whisper-base-q5",
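Each new entry maps a display name (with approximate download size) to a model descriptor: a friendly name, a local folder name, a model type, and the list of files to fetch, each paired with its SHA-256 checksum. The following sketch shows the shape of structures this initializer implies; field and type names are inferred from the initializer, not copied from the project's headers:

#include <map>
#include <string>
#include <vector>

// Inferred sketch only; the real definitions live in the model-utils headers.
enum ModelType { MODEL_TYPE_TRANSCRIPTION, MODEL_TYPE_TRANSLATION };

struct ModelFileDownloadInfo {
	std::string url;    // direct download link (HuggingFace "resolve" URL)
	std::string sha256; // expected checksum, verified after download
};

struct ModelInfo {
	std::string friendly_name;              // e.g. "NLLB 200 1.3B"
	std::string local_folder_name;          // e.g. "nllb-200-1.3b"
	ModelType type;                         // transcription vs. translation
	std::vector<ModelFileDownloadInfo> files; // every file the model needs
};

extern std::map<std::string, ModelInfo> models_info;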
30 changes: 16 additions & 14 deletions src/tests/localvocal-offline-test.cpp
@@ -13,6 +13,7 @@
#include <nlohmann/json.hpp>

#include "transcription-filter-data.h"
#include "transcription-filter-utils.h"
#include "transcription-filter.h"
#include "transcription-utils.h"
#include "whisper-utils/whisper-utils.h"
@@ -84,7 +85,6 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->sample_rate = sample_rate;
gf->frames = (size_t)((float)gf->sample_rate * 10.0f);
gf->last_num_frames = 0;
gf->step_size_msec = 3000;
gf->min_sub_duration = 3000;
gf->last_sub_render_time = 0;
gf->save_srt = false;
@@ -110,8 +110,6 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
memset(gf->copy_buffers[0], 0, gf->channels * gf->frames * sizeof(float));
obs_log(LOG_INFO, " allocated %llu bytes ", gf->channels * gf->frames * sizeof(float));

gf->overlap_ms = 150;
gf->overlap_frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)gf->overlap_ms));
obs_log(gf->log_level, "channels %d, frames %d, sample_rate %d", (int)gf->channels,
(int)gf->frames, gf->sample_rate);

@@ -158,11 +156,12 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->whisper_params = whisper_full_default_params(whisper_sampling_method);
gf->whisper_params.duration_ms = 3000;
gf->whisper_params.language = "en";
gf->whisper_params.detect_language = false;
gf->whisper_params.initial_prompt = "";
gf->whisper_params.n_threads = 4;
gf->whisper_params.n_max_text_ctx = 16384;
gf->whisper_params.translate = false;
gf->whisper_params.no_context = true;
gf->whisper_params.no_context = false;
gf->whisper_params.single_segment = true;
gf->whisper_params.print_special = false;
gf->whisper_params.print_progress = false;
@@ -177,7 +176,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->whisper_params.speed_up = false;
gf->whisper_params.suppress_blank = true;
gf->whisper_params.suppress_non_speech_tokens = true;
gf->whisper_params.temperature = 0.1;
gf->whisper_params.temperature = 0.0;
gf->whisper_params.max_initial_ts = 1.0;
gf->whisper_params.length_penalty = -1;
gf->active = true;
Expand All @@ -204,7 +203,7 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
// numeral = "0" + numeral;
// }

// save the audio to a .wav file
// // save the audio to a .wav file
// std::string filename = "audio_chunk_" + numeral + vad_state_str + ".wav";
// obs_log(gf->log_level, "Saving %lu frames to %s", frames, filename.c_str());
// write_audio_wav_file(filename.c_str(), pcm32f_data, frames);
@@ -281,7 +280,7 @@ void set_text_callback(struct transcription_filter_data *gf,
str_copy.c_str(), translated_text.c_str());
}
// overwrite the original text with the translated text
str_copy = str_copy + " -> " + translated_text;
str_copy = str_copy + " | " + translated_text;
} else {
obs_log(gf->log_level, "Failed to translate text");
}
@@ -385,19 +384,22 @@ int wmain(int argc, wchar_t *argv[])
gf->suppress_sentences =
config["suppress_sentences"].get<std::string>();
}
if (config.contains("overlap_ms")) {
obs_log(LOG_INFO, "Setting overlap_ms to %d",
config["overlap_ms"].get<int>());
gf->overlap_ms = config["overlap_ms"];
gf->overlap_frames = (size_t)((float)gf->sample_rate /
(1000.0f / (float)gf->overlap_ms));
}
if (config.contains("enable_audio_chunks_callback")) {
obs_log(LOG_INFO, "Setting enable_audio_chunks_callback to %s",
config["enable_audio_chunks_callback"] ? "true" : "false");
gf->enable_audio_chunks_callback =
config["enable_audio_chunks_callback"];
}
if (config.contains("temperature")) {
obs_log(LOG_INFO, "Setting temperture to %f",
config["temperature"].get<float>());
gf->whisper_params.temperature = config["temperature"].get<float>();
}
if (config.contains("no_context")) {
obs_log(LOG_INFO, "Setting no_context to %s",
config["no_context"] ? "true" : "false");
gf->whisper_params.no_context = config["no_context"];
}
// set log level
if (logLevelStr == "debug") {
gf->log_level = LOG_DEBUG;
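With the overlap_ms handling removed, the offline test driver now takes its whisper temperature and no_context flag from the JSON config instead. A minimal sketch of producing such a config with nlohmann::json follows; the output file name and the exact values are illustrative, and only the keys visible in the diff above are assumed to be supported:

#include <fstream>
#include <nlohmann/json.hpp>

int main()
{
	nlohmann::json config;
	config["temperature"] = 0.0f;              // overrides whisper_params.temperature
	config["no_context"] = false;              // keep prior text as context between runs
	config["enable_audio_chunks_callback"] = false;
	config["suppress_sentences"] = "";

	// Illustrative file name; pass whatever path the test driver expects.
	std::ofstream("offline-test-config.json") << config.dump(2);
	return 0;
}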