
Streamline and refactor #105

Merged 12 commits on Jun 5, 2024
8 changes: 4 additions & 4 deletions CMakeLists.txt
@@ -87,6 +87,8 @@ target_sources(
PRIVATE src/plugin-main.c
src/transcription-filter.cpp
src/transcription-filter.c
src/transcription-filter-callbacks.cpp
src/transcription-filter-utils.cpp
src/transcription-utils.cpp
src/model-utils/model-downloader.cpp
src/model-utils/model-downloader-ui.cpp
@@ -99,8 +101,7 @@ target_sources(
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
src/translation/translation-utils.cpp
src/utils.cpp)
src/translation/translation-utils.cpp)

set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})

@@ -121,8 +122,7 @@ if(ENABLE_TESTS)
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
src/utils.cpp)
src/translation/translation.cpp)

find_libav(${CMAKE_PROJECT_NAME}-tests)

10 changes: 10 additions & 0 deletions data/locale/en-US.ini
@@ -1,6 +1,7 @@
LocalVocalPlugin="LocalVocal Plugin"
transcription_filterAudioFilter="LocalVocal Transcription"
vad_enabled="VAD Enabled"
vad_threshold="VAD Threshold"
log_level="Internal Log Level"
log_words="Log Output to Console"
caption_to_stream="Stream Captions"
@@ -61,3 +62,12 @@ sentence_psum_accept_thresh="Sentence prob. threshold"
external_model_folder="External model folder"
load_external_model="Load external model"
translate_input_tokenization_style="Input token style"
translation_sampling_temperature="Sampling temperature"
translation_repetition_penalty="Repetition penalty"
translation_beam_size="Beam size"
translation_max_decoding_length="Max decoding length"
translation_no_repeat_ngram_size="No-repeat ngram size"
translation_max_input_length="Max input length"
buffered_output_parameters="Buffered output parameters"
buffer_num_lines="Number of lines"
buffer_num_chars_per_line="Characters per line"
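These keys back the new translation and buffered-output settings this PR introduces. As a rough illustration of how such locale keys are typically wired into an OBS filter's properties callback, here is a minimal sketch; the helper name, value ranges, and step sizes are assumptions for illustration, not the plugin's actual code:

#include <obs-module.h>

// Hypothetical helper: register the new translation / buffered-output settings.
// Setting names match the locale keys above; ranges and steps are illustrative.
static void add_translation_properties(obs_properties_t *props)
{
	obs_properties_add_float_slider(props, "translation_sampling_temperature",
					obs_module_text("translation_sampling_temperature"),
					0.0, 1.0, 0.05);
	obs_properties_add_float_slider(props, "translation_repetition_penalty",
					obs_module_text("translation_repetition_penalty"),
					1.0, 5.0, 0.1);
	obs_properties_add_int(props, "translation_beam_size",
			       obs_module_text("translation_beam_size"), 1, 10, 1);
	obs_properties_add_int(props, "translation_max_decoding_length",
			       obs_module_text("translation_max_decoding_length"), 1, 512, 1);
	obs_properties_add_int(props, "buffer_num_lines",
			       obs_module_text("buffer_num_lines"), 1, 5, 1);
	obs_properties_add_int(props, "buffer_num_chars_per_line",
			       obs_module_text("buffer_num_chars_per_line"), 1, 100, 1);
}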
54 changes: 54 additions & 0 deletions src/model-utils/model-infos.cpp
@@ -45,6 +45,60 @@ std::map<std::string, ModelInfo> models_info = {{
"B6E77E474AEEA8F441363ACA7614317C06381F3EACFE10FB9856D5081D1074CC"},
{"https://huggingface.co/jncraton/m2m100_1.2B-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true",
"D8F7C76ED2A5E0822BE39F0A4F95A55EB19C78F4593CE609E2EDBC2AEA4D380A"}}}},
{"NLLB 200 1.3B (1.4Gb)",
{"NLLB 200 1.3B",
"nllb-200-1.3b",
MODEL_TYPE_TRANSLATION,
{{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/model.bin?download=true",
"72D7533DC7A0E8F10F19A650D4E90FAF9CBFA899DB5411AD124BD5802BD91263"},
{
"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/config.json?download=true",
"0C2F6FA2057C7264D052FB4A62BA3476EEAE70487ACDDFA8E779A53A00CBF44C",
},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/tokenizer.json?download=true",
"E316B82DE11D0F951F370943B3C438311629547285129B0B81DADABD01BCA665"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/shared_vocabulary.txt?download=true",
"A132A83330F45514C2476EB81D1D69B3C41762264D16CE0A7EA982E5D6C728E5"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/special_tokens_map.json?download=true",
"992BD4ED610D644D6823081937BCC91BB8878DD556CEA4AE5327F2480361330E"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/tokenizer_config.json?download=true",
"D1AA8C3697D3E35674F97B5B7E9C99D22B010F528E80140257D97316BE90D044"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-1.3B-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true",
"14BB8DFB35C0FFDEA7BC01E56CEA38B9E3D5EFCDCB9C251D6B40538E1AAB555A"}}}},
{"NLLB 200 600M (650Mb)",
{"NLLB 200 600M",
"nllb-200-600m",
MODEL_TYPE_TRANSLATION,
{{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/model.bin?download=true",
"ED1BEAF75134DE7505315A5223162F56ACFF397EFF6B50638A500D3936FE707B"},
{
"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/config.json?download=true",
"0C2F6FA2057C7264D052FB4A62BA3476EEAE70487ACDDFA8E779A53A00CBF44C",
},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/tokenizer.json?download=true",
"E316B82DE11D0F951F370943B3C438311629547285129B0B81DADABD01BCA665"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/shared_vocabulary.txt?download=true",
"A132A83330F45514C2476EB81D1D69B3C41762264D16CE0A7EA982E5D6C728E5"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/special_tokens_map.json?download=true",
"992BD4ED610D644D6823081937BCC91BB8878DD556CEA4AE5327F2480361330E"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/tokenizer_config.json?download=true",
"D1AA8C3697D3E35674F97B5B7E9C99D22B010F528E80140257D97316BE90D044"},
{"https://huggingface.co/JustFrederik/nllb-200-distilled-600M-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true",
"14BB8DFB35C0FFDEA7BC01E56CEA38B9E3D5EFCDCB9C251D6B40538E1AAB555A"}}}},
{"MADLAD 400 3B (2.9Gb)",
{"MADLAD 400 3B",
"madlad-400-3b",
MODEL_TYPE_TRANSLATION,
{{"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/model.bin?download=true",
"F3C87256A2C888100C179D7DCD7F41DF17C767469546C59D32C7DDE86C740A6B"},
{
"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/config.json?download=true",
"A428C51CD35517554523B3C6B6974A5928BC35E82B130869A543566A34A83B93",
},
{"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/shared_vocabulary.txt?download=true",
"C327551CE3CA6EFC7B437E11A267F79979893332DDA8A1D146E2C950815193F8"},
{"https://huggingface.co/santhosh/madlad400-3b-ct2/resolve/main/sentencepiece.model?download=true",
"EF11AC9A22C7503492F56D48DCE53BE20E339B63605983E9F27D2CD0E0F3922C"}}}},
{"Whisper Base q5 (57Mb)",
{"Whisper Base q5",
"whisper-base-q5",
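Each new entry maps a display name (with approximate download size) to a model descriptor: a friendly name, a local folder name, a model type, and the list of files to fetch, each paired with its SHA-256 checksum. The following sketch shows the shape of structures this initializer implies; field and type names are inferred from the initializer, not copied from the project's headers:

#include <map>
#include <string>
#include <vector>

// Inferred sketch only; the real definitions live in the model-utils headers.
enum ModelType { MODEL_TYPE_TRANSCRIPTION, MODEL_TYPE_TRANSLATION };

struct ModelFileDownloadInfo {
	std::string url;    // direct download link (HuggingFace "resolve" URL)
	std::string sha256; // expected checksum, verified after download
};

struct ModelInfo {
	std::string friendly_name;              // e.g. "NLLB 200 1.3B"
	std::string local_folder_name;          // e.g. "nllb-200-1.3b"
	ModelType type;                         // transcription vs. translation
	std::vector<ModelFileDownloadInfo> files; // every file the model needs
};

extern std::map<std::string, ModelInfo> models_info;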
30 changes: 16 additions & 14 deletions src/tests/localvocal-offline-test.cpp
@@ -13,6 +13,7 @@
#include <nlohmann/json.hpp>

#include "transcription-filter-data.h"
#include "transcription-filter-utils.h"
#include "transcription-filter.h"
#include "transcription-utils.h"
#include "whisper-utils/whisper-utils.h"
@@ -84,7 +85,6 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->sample_rate = sample_rate;
gf->frames = (size_t)((float)gf->sample_rate * 10.0f);
gf->last_num_frames = 0;
gf->step_size_msec = 3000;
gf->min_sub_duration = 3000;
gf->last_sub_render_time = 0;
gf->save_srt = false;
@@ -110,8 +110,6 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
memset(gf->copy_buffers[0], 0, gf->channels * gf->frames * sizeof(float));
obs_log(LOG_INFO, " allocated %llu bytes ", gf->channels * gf->frames * sizeof(float));

gf->overlap_ms = 150;
gf->overlap_frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)gf->overlap_ms));
obs_log(gf->log_level, "channels %d, frames %d, sample_rate %d", (int)gf->channels,
(int)gf->frames, gf->sample_rate);

@@ -158,11 +156,12 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->whisper_params = whisper_full_default_params(whisper_sampling_method);
gf->whisper_params.duration_ms = 3000;
gf->whisper_params.language = "en";
gf->whisper_params.detect_language = false;
gf->whisper_params.initial_prompt = "";
gf->whisper_params.n_threads = 4;
gf->whisper_params.n_max_text_ctx = 16384;
gf->whisper_params.translate = false;
gf->whisper_params.no_context = true;
gf->whisper_params.no_context = false;
gf->whisper_params.single_segment = true;
gf->whisper_params.print_special = false;
gf->whisper_params.print_progress = false;
@@ -177,7 +176,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->whisper_params.speed_up = false;
gf->whisper_params.suppress_blank = true;
gf->whisper_params.suppress_non_speech_tokens = true;
gf->whisper_params.temperature = 0.1;
gf->whisper_params.temperature = 0.0;
gf->whisper_params.max_initial_ts = 1.0;
gf->whisper_params.length_penalty = -1;
gf->active = true;
Expand All @@ -204,7 +203,7 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
// numeral = "0" + numeral;
// }

// save the audio to a .wav file
// // save the audio to a .wav file
// std::string filename = "audio_chunk_" + numeral + vad_state_str + ".wav";
// obs_log(gf->log_level, "Saving %lu frames to %s", frames, filename.c_str());
// write_audio_wav_file(filename.c_str(), pcm32f_data, frames);
@@ -281,7 +280,7 @@ void set_text_callback(struct transcription_filter_data *gf,
str_copy.c_str(), translated_text.c_str());
}
// overwrite the original text with the translated text
str_copy = str_copy + " -> " + translated_text;
str_copy = str_copy + " | " + translated_text;
} else {
obs_log(gf->log_level, "Failed to translate text");
}
@@ -385,19 +384,22 @@ int wmain(int argc, wchar_t *argv[])
gf->suppress_sentences =
config["suppress_sentences"].get<std::string>();
}
if (config.contains("overlap_ms")) {
obs_log(LOG_INFO, "Setting overlap_ms to %d",
config["overlap_ms"].get<int>());
gf->overlap_ms = config["overlap_ms"];
gf->overlap_frames = (size_t)((float)gf->sample_rate /
(1000.0f / (float)gf->overlap_ms));
}
if (config.contains("enable_audio_chunks_callback")) {
obs_log(LOG_INFO, "Setting enable_audio_chunks_callback to %s",
config["enable_audio_chunks_callback"] ? "true" : "false");
gf->enable_audio_chunks_callback =
config["enable_audio_chunks_callback"];
}
if (config.contains("temperature")) {
obs_log(LOG_INFO, "Setting temperture to %f",
config["temperature"].get<float>());
gf->whisper_params.temperature = config["temperature"].get<float>();
}
if (config.contains("no_context")) {
obs_log(LOG_INFO, "Setting no_context to %s",
config["no_context"] ? "true" : "false");
gf->whisper_params.no_context = config["no_context"];
}
// set log level
if (logLevelStr == "debug") {
gf->log_level = LOG_DEBUG;
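With the overlap_ms handling removed, the offline test driver now takes its whisper temperature and no_context flag from the JSON config instead. A minimal sketch of producing such a config with nlohmann::json follows; the output file name and the exact values are illustrative, and only the keys visible in the diff above are assumed to be supported:

#include <fstream>
#include <nlohmann/json.hpp>

int main()
{
	nlohmann::json config;
	config["temperature"] = 0.0f;              // overrides whisper_params.temperature
	config["no_context"] = false;              // keep prior text as context between runs
	config["enable_audio_chunks_callback"] = false;
	config["suppress_sentences"] = "";

	// Illustrative file name; pass whatever path the test driver expects.
	std::ofstream("offline-test-config.json") << config.dump(2);
	return 0;
}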