From 419aee36e4a50f9701304d1829fa09ec97640c49 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Thu, 2 Jan 2025 00:18:09 +0900 Subject: [PATCH 01/14] Add an audio function to retrieve the audio data since last time. Without it, `stream --save-audio` produces somehow choppy wav: `stream` calculates t_diff in milliseconds and combine audio pieces which are about step_ms long. WHISPER_SAMPLE_RATE / 1000 == only 16 but surprisingly human ears seem to be able to hear the gap as a noise. --- examples/common-sdl.cpp | 38 ++++++++++++++++++++++++++------------ examples/common-sdl.h | 3 +++ examples/stream/stream.cpp | 3 +-- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/examples/common-sdl.cpp b/examples/common-sdl.cpp index b61f8cff5fd..6272ce838a7 100644 --- a/examples/common-sdl.cpp +++ b/examples/common-sdl.cpp @@ -130,6 +130,7 @@ bool audio_async::clear() { m_audio_pos = 0; m_audio_len = 0; + m_audio_nxt = 0; } return true; @@ -172,6 +173,28 @@ void audio_async::callback(uint8_t * stream, int len) { } void audio_async::get(int ms, std::vector & result) { + if (ms <= 0) { + ms = m_len_ms; + } + + size_t n_samples = std::min(m_audio_len, (m_sample_rate * ms) / 1000); + + get_n(n_samples, result); +} + +void audio_async::next(std::vector & result) { + size_t n_samples; + + if (m_audio_pos >= m_audio_nxt) { + n_samples = m_audio_pos - m_audio_nxt; + } else { + n_samples = m_audio_len - m_audio_nxt + m_audio_pos; + } + + get_n(n_samples, result); +} + +void audio_async::get_n(size_t n_samples, std::vector & result) { if (!m_dev_id_in) { fprintf(stderr, "%s: no audio device to get audio from!\n", __func__); return; @@ -182,20 +205,9 @@ void audio_async::get(int ms, std::vector & result) { return; } - result.clear(); - { std::lock_guard lock(m_mutex); - if (ms <= 0) { - ms = m_len_ms; - } - - size_t n_samples = (m_sample_rate * ms) / 1000; - if (n_samples > m_audio_len) { - n_samples = m_audio_len; - } - result.resize(n_samples); int s0 = m_audio_pos - 
n_samples; @@ -205,10 +217,12 @@ void audio_async::get(int ms, std::vector & result) { if (s0 + n_samples > m_audio.size()) { const size_t n0 = m_audio.size() - s0; + m_audio_nxt = n_samples - n0; memcpy(result.data(), &m_audio[s0], n0 * sizeof(float)); - memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float)); + memcpy(&result[n0], &m_audio[0], m_audio_nxt * sizeof(float)); } else { + m_audio_nxt = s0 + n_samples; memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float)); } } diff --git a/examples/common-sdl.h b/examples/common-sdl.h index 9ee8a320724..746493f7c83 100644 --- a/examples/common-sdl.h +++ b/examples/common-sdl.h @@ -30,6 +30,8 @@ class audio_async { // get audio data from the circular buffer void get(int ms, std::vector & audio); + void next(std::vector & audio); + void get_n(size_t n_samples, std::vector & audio); private: SDL_AudioDeviceID m_dev_id_in = 0; @@ -43,6 +45,7 @@ class audio_async { std::vector m_audio; size_t m_audio_pos = 0; size_t m_audio_len = 0; + size_t m_audio_nxt = 0; }; // Return false if need to quit diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 190f68a2c3b..1855329a065 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -241,7 +241,7 @@ int main(int argc, char ** argv) { if (!use_vad) { while (true) { - audio.get(params.step_ms, pcmf32_new); + audio.next(pcmf32_new); if ((int) pcmf32_new.size() > 2*n_samples_step) { fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__); @@ -250,7 +250,6 @@ int main(int argc, char ** argv) { } if ((int) pcmf32_new.size() >= n_samples_step) { - audio.clear(); break; } From 289946da8f9edcaf0ef3f178d2683098e821abce Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Thu, 2 Jan 2025 10:27:52 +0900 Subject: [PATCH 02/14] Simplify stream's pcmf32 handling Use one deque instead of two vectors (old and new). Old and new are length variables now. 
Basically: Get `step - new` samples every time. Then assign `new = (around) step;` The new audio data is simply appended to the deque. (Limit the deque size to 30 seconds.) Pass `old + new` samples to whisper inference. If the data has been consumed, let `old = 0; new = 0;` If some of the data should be kept for the next iter, `old = keep;` If you want to get only N samples next time, `new = step - N;` In VAD mode: `stream --interim --step -3000` will get 3000ms of audio. Run `vad_simple(step_ms)`. If nothing is detected, get 100ms more audio and retry. If nothing is detected and 3000ms have passed, go into interim mode, where `n_segments - 1` segments will be confirmed. (`old -= confirmed_t1`) If `n_segments == 1`, only show the first half of the result. Misc: Increase the default `max_tokens` because 32 is too small for 10 seconds. (Some Japanese speech was garbled.) Write wav as soon as the data is available. `no_timestamps` is the default even for VAD because that output is more useful when shown to the hard-of-hearing. --- examples/stream/stream.cpp | 246 ++++++++++++++++++++++--------------- 1 file changed, 146 insertions(+), 100 deletions(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 1855329a065..6665253392a 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -7,6 +7,7 @@ #include "whisper.h" #include +#include #include #include #include @@ -21,7 +22,7 @@ struct whisper_params { int32_t length_ms = 10000; int32_t keep_ms = 200; int32_t capture_id = -1; - int32_t max_tokens = 32; + int32_t max_tokens = 128; int32_t audio_ctx = 0; float vad_thold = 0.6f; @@ -36,6 +37,7 @@ struct whisper_params { bool save_audio = false; // save audio to wav file bool use_gpu = true; bool flash_attn = false; + bool interim = false; std::string language = "en"; std::string model = "models/ggml-base.en.bin"; @@ -65,6 +67,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params else if (arg == 
"-nf" || arg == "--no-fallback") { params.no_fallback = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; } + else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } @@ -72,6 +75,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; } + else if (arg == "-int" || arg == "--interim") { params.interim = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -102,6 +106,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true"); + fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? 
"true" : "false"); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); @@ -109,6 +114,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false"); fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n", params.flash_attn ? "true" : "false"); + fprintf(stderr, " -int, --interim [%-7s] show interim report in vad every step\n", params.interim ? "true" : "false"); fprintf(stderr, "\n"); } @@ -122,19 +128,16 @@ int main(int argc, char ** argv) { params.keep_ms = std::min(params.keep_ms, params.step_ms); params.length_ms = std::max(params.length_ms, params.step_ms); - const int n_samples_step = (1e-3*params.step_ms )*WHISPER_SAMPLE_RATE; - const int n_samples_len = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE; - const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE; - const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE; + const int n_samples_step = (1e-3*abs(params.step_ms))*WHISPER_SAMPLE_RATE; + const int n_samples_len = (1e-3*params.length_ms )*WHISPER_SAMPLE_RATE; + const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE; + const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE; + const int n_samples_100ms= (1e-3*100.0 )*WHISPER_SAMPLE_RATE; - const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD + const bool use_vad = params.step_ms <= 0; // sliding window mode uses VAD const int n_new_line = !use_vad ? 
std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line - params.no_timestamps = !use_vad; - params.no_context |= use_vad; - params.max_tokens = 0; - // init audio audio_async audio(params.length_ms); @@ -159,9 +162,10 @@ int main(int argc, char ** argv) { struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); - std::vector pcmf32 (n_samples_30s, 0.0f); - std::vector pcmf32_old; - std::vector pcmf32_new(n_samples_30s, 0.0f); + std::vector pcmf32(n_samples_30s, 0.0f); + std::deque pcmf32_deque; + int n_samples_new = 0; + int n_samples_old = 0; std::vector prompt_tokens; @@ -219,17 +223,17 @@ int main(int argc, char ** argv) { wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1); } - printf("[Start speaking]\n"); - fflush(stdout); + fprintf(stderr, "[Start speaking]\n"); + fflush(stderr); auto t_last = std::chrono::high_resolution_clock::now(); + auto t_interim = t_last; + bool is_interim = false; const auto t_start = t_last; + std::string s_to_delete = ""; // main audio loop while (is_running) { - if (params.save_audio) { - wavWriter.write(pcmf32_new.data(), pcmf32_new.size()); - } // handle Ctrl + C is_running = sdl_poll_events(); @@ -238,61 +242,74 @@ int main(int argc, char ** argv) { } // process new audio + const auto t_now = std::chrono::high_resolution_clock::now(); + const auto t_diff = std::chrono::duration_cast(t_now - t_last).count(); + + // get new audio + if (n_samples_new > n_samples_step) { + pcmf32.clear(); + } else if (t_diff < abs(params.step_ms)) { + std::this_thread::sleep_for(std::chrono::milliseconds(abs(params.step_ms) - t_diff)); + continue; + } else { + audio.next(pcmf32); + } - if (!use_vad) { - while (true) { - audio.next(pcmf32_new); - - if ((int) pcmf32_new.size() > 2*n_samples_step) { - fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__); - audio.clear(); - continue; - } - - if ((int) pcmf32_new.size() >= 
n_samples_step) { - break; - } - - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - } - - const int n_samples_new = pcmf32_new.size(); - - // take up to params.length_ms audio from previous iteration - const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new)); + const int n_samples_buf = pcmf32.size(); - //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size()); + if (params.save_audio && n_samples_buf > 0) { + wavWriter.write(pcmf32.data(), n_samples_buf); + } - pcmf32.resize(n_samples_new + n_samples_take); + copy(pcmf32.begin(), pcmf32.end(), back_inserter(pcmf32_deque)); + if (pcmf32_deque.size() > n_samples_30s) { + pcmf32_deque.erase(pcmf32_deque.begin(), pcmf32_deque.end() - n_samples_30s); + } - for (int i = 0; i < n_samples_take; i++) { - pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i]; - } + n_samples_new += n_samples_buf; + if (!is_interim && n_samples_new > 2*n_samples_step) { + fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__); + fprintf(stderr, "t_diff = %.2fs, new = %.2fs, buf = %.2fs\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(n_samples_buf)/WHISPER_SAMPLE_RATE); + n_samples_old = 0; + n_samples_new = 0; + t_last = t_now; + continue; + } + is_interim = false; - memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float)); + if (!use_vad){ + n_samples_old += n_samples_new; + n_samples_new = 0; + pcmf32.resize(n_samples_old); + copy(pcmf32_deque.end() - n_samples_old, pcmf32_deque.end(), pcmf32.begin()); - pcmf32_old = pcmf32; + t_last = t_now; } else { - const auto t_now = std::chrono::high_resolution_clock::now(); - const auto t_diff = std::chrono::duration_cast(t_now - t_last).count(); - - if (t_diff < 2000) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - - continue; - } - - 
audio.get(2000, pcmf32_new); - - if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) { - audio.get(params.length_ms, pcmf32); + pcmf32.resize(n_samples_step); + copy(pcmf32_deque.end() - n_samples_step, pcmf32_deque.end(), pcmf32.begin()); + if (::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) { + pcmf32.resize(n_samples_old + n_samples_new); + copy(pcmf32_deque.end() - n_samples_old - n_samples_new, pcmf32_deque.end(), pcmf32.begin()); + n_samples_new = 0; + n_samples_old = 0; + + t_last = t_now; } else { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - - continue; + const auto n_interim_diff_ms = std::chrono::duration_cast(t_now - t_interim).count(); + + if (params.interim && n_interim_diff_ms > abs(params.step_ms)) { + is_interim = (n_interim_diff_ms < params.length_ms - abs(params.step_ms)); + n_samples_old += n_samples_new; + n_samples_new = 0; + pcmf32.resize(n_samples_old); + copy(pcmf32_deque.end() - n_samples_old, pcmf32_deque.end(), pcmf32.begin()); + } else { + n_samples_new -= n_samples_100ms; + n_samples_old = std::min(n_samples_len, n_samples_old + n_samples_100ms); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + continue; + } } - - t_last = t_now; } // run the inference @@ -324,80 +341,109 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: failed to process audio\n", argv[0]); return 6; } + t_interim = std::chrono::high_resolution_clock::now(); // print result; + int n_segments; + bool is_unconfirmed = false; + std::ostringstream text; { - if (!use_vad) { + if (!use_vad || params.interim && params.no_timestamps && s_to_delete.size()) { printf("\33[2K\r"); // print long empty line to clear the previous line - printf("%s", std::string(100, ' ').c_str()); + printf("%s", std::string(s_to_delete.size(), ' ').c_str()); printf("\33[2K\r"); - } else { + } else if (use_vad && 
!params.no_timestamps) { const int64_t t1 = (t_last - t_start).count()/1000000; const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE); - printf("\n"); - printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1); - printf("\n"); + text << std::endl; + text << "### Transcription " << n_iter << " START | t0 = " << t0 << " ms | t1 = " << t1 << " ms" << std::endl; + text << std::endl; } - const int n_segments = whisper_full_n_segments(ctx); + n_segments = whisper_full_n_segments(ctx); + if (is_interim) { + if (n_segments < 2) { + is_unconfirmed = true; + } else { + n_segments--; + const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - 1) * 10; + t_last += std::chrono::milliseconds(t1_ms); + const auto n_confirmed = (1e-3*t1_ms)*WHISPER_SAMPLE_RATE; + pcmf32.resize(n_confirmed); + n_samples_old -= n_confirmed; + } + } for (int i = 0; i < n_segments; ++i) { - const char * text = whisper_full_get_segment_text(ctx, i); - - if (params.no_timestamps) { - printf("%s", text); - fflush(stdout); + std::string i_text = whisper_full_get_segment_text(ctx, i); - if (params.fname_out.length() > 0) { - fout << text; + if (!use_vad || params.no_timestamps) { + if (i > 0) { + text << std::endl; } + text << i_text; } else { - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + const int64_t t_end = (t_last - t_start).count()/1000000; + const int64_t t_beg = std::max(0.0, t_end - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE); + const int64_t t0 = t_beg/10 + whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = t_beg/10 + whisper_full_get_segment_t1(ctx, i); - std::string output = "[" + to_timestamp(t0, false) + " --> " + to_timestamp(t1, false) + "] " + text; + text << "[" << to_timestamp(t0, false) << " --> " << to_timestamp(t1, false) << "] " << i_text; if (whisper_full_get_segment_speaker_turn_next(ctx, i)) { - output += " [SPEAKER_TURN]"; + text << " 
[SPEAKER_TURN]"; } - output += "\n"; - - printf("%s", output.c_str()); - fflush(stdout); - - if (params.fname_out.length() > 0) { - fout << output; - } + text << std::endl; } } - if (params.fname_out.length() > 0) { - fout << std::endl; + if (use_vad && !params.no_timestamps) { + text << std::endl; + text << "### Transcription " << n_iter << " END"; + text << std::endl; } + } - if (use_vad) { - printf("\n"); - printf("### Transcription %d END\n", n_iter); - } + if (params.fname_out.length() > 0) { + fout << text.str(); + fout << std::endl; } ++n_iter; - if (!use_vad && (n_iter % n_new_line) == 0) { + if (is_unconfirmed) { + --n_iter; + // utf-8 cannot be simply cut into two + std::wstring_convert, char32_t> conv; + auto t_u32 = conv.from_bytes(text.str()); + auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() / 2)); + text.str(t_sub + "…"); + } + + printf("%s", text.str().c_str()); + + if (is_unconfirmed || !use_vad && n_samples_old < n_samples_len - n_samples_step) { + s_to_delete = text.str(); + } else { printf("\n"); + s_to_delete = ""; - // keep part of the audio for next iteration to try to mitigate word boundary issues - pcmf32_old = std::vector(pcmf32.end() - n_samples_keep, pcmf32.end()); + if (!use_vad) { + n_iter = 0; + if (n_samples_keep < n_samples_old) { + // keep part of the audio for next iteration to try to mitigate word boundary issues + n_samples_old = n_samples_keep; + } + } // Add tokens of the last full length segment as the prompt if (!params.no_context) { prompt_tokens.clear(); - const int n_segments = whisper_full_n_segments(ctx); for (int i = 0; i < n_segments; ++i) { const int token_count = whisper_full_n_tokens(ctx, i); for (int j = 0; j < token_count; ++j) { From b27fc1fd1db91be61f14a8aef0d14939ee203a32 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Thu, 2 Jan 2025 11:15:21 +0900 Subject: [PATCH 03/14] Add headers for gcc c++ --- examples/stream/stream.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 6665253392a..ac2116bca12 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include #include @@ -261,7 +263,7 @@ int main(int argc, char ** argv) { wavWriter.write(pcmf32.data(), n_samples_buf); } - copy(pcmf32.begin(), pcmf32.end(), back_inserter(pcmf32_deque)); + copy(pcmf32.begin(), pcmf32.end(), std::back_inserter(pcmf32_deque)); if (pcmf32_deque.size() > n_samples_30s) { pcmf32_deque.erase(pcmf32_deque.begin(), pcmf32_deque.end() - n_samples_30s); } From b114ec309f4ebcf0c51799edd70c63e0a26188ce Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Thu, 2 Jan 2025 11:59:50 +0900 Subject: [PATCH 04/14] Accept pipe to stream Now it is easy to test with raw PCM data. Try `cat pcmf32.raw | stream` (or `pv -qL 64000 pcmf32.raw | stream` in realtime) Note: I haven't tested WIN32 ifdefs. You can make such data by `ffmpeg -i jfk.wav -f f32le -acodec pcm_f32le jfk.raw` because wav header length (44) is a multiple of `sizeof float` (4) I decided to ignore the data before `[Start speaking]` because such premature data are not good for remote-transcription systems like: ``` mic2pcm | ssh -C remote "stream | lines2googledocs" ``` or ``` mic2some | ssh -C remote "ffmpeg -loglevel fatal -i pipe:0 -tune zerolatency -af atempo=1.1 -f f32le -ar 16000 -acodec pcm_f32le pipe:1 | stream" ``` So if you want to do a strict test, remove the "ignore" part. Otherwise quite a number of bytes will be ignored. 
--- examples/stream/stream.cpp | 117 +++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 6 deletions(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index ac2116bca12..5bbff7140a0 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -16,6 +16,38 @@ #include #include +#ifdef _WIN32 +#include +#include +#else +#include +#include +#endif + +void setStdinNonBlocking() { +#ifdef _WIN32 + DWORD mode; + HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE); + GetConsoleMode(stdinHandle, &mode); + mode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); + SetConsoleMode(stdinHandle, mode); +#else + fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) | O_NONBLOCK); +#endif +} + +void setStdinBlocking() { +#if defined(_WIN32) + DWORD mode; + HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE); + GetConsoleMode(stdinHandle, &mode); + mode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT; + SetConsoleMode(stdinHandle, mode); +#else + fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) & ~O_NONBLOCK); +#endif +} + // command-line parameters struct whisper_params { @@ -143,12 +175,22 @@ int main(int argc, char ** argv) { // init audio audio_async audio(params.length_ms); - if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) { - fprintf(stderr, "%s: audio.init() failed!\n", __func__); - return 1; - } + bool piped = !isatty(fileno(stdin)); + + if (piped) { + #ifdef _WIN32 + _setmode(_fileno(stdin), _O_BINARY); + #else + freopen(NULL, "rb", stdin); + #endif + } else { + if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) { + fprintf(stderr, "%s: audio.init() failed!\n", __func__); + return 1; + } - audio.resume(); + audio.resume(); + } // whisper init if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){ @@ -225,9 +267,43 @@ int main(int argc, char ** argv) { wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1); } + + // ignore premature stdin + int n_mod = 0; + if (piped) 
{ + const auto n_bytes_len = sizeof(float) * n_samples_len; + setStdinNonBlocking(); + while (true) { + const auto n_bytes_read = read(fileno(stdin), pcmf32.data(), n_bytes_len); + if (n_bytes_read == -1 && errno == EAGAIN) { + break; + } else if (n_bytes_read < 1) { + fprintf(stderr, "stdin ended too early\n"); + is_running = false; + break; + } + n_mod = n_bytes_read % sizeof(float); + if (n_bytes_read < n_bytes_len) { + break; + } + } + } + fprintf(stderr, "[Start speaking]\n"); fflush(stderr); + if (piped) { + // ignore the partial sample + if (n_mod > 0) { + const auto n_remain = sizeof(float) - n_mod; + setStdinBlocking(); + if (n_remain != fread(pcmf32.data(), 1, n_remain, stdin)) { + is_running = false; + } + } + setStdinNonBlocking(); + } + auto t_last = std::chrono::high_resolution_clock::now(); auto t_interim = t_last; bool is_interim = false; @@ -250,6 +326,33 @@ int main(int argc, char ** argv) { // get new audio if (n_samples_new > n_samples_step) { pcmf32.clear(); + } else if (piped) { + pcmf32.resize(n_samples_len); + char *p_buf = (char *)pcmf32.data(); + const auto n_bytes_min = (n_samples_step - n_samples_new) * sizeof(float); + auto n_bytes_wanted = n_samples_len * sizeof(float); + auto n_bytes_read = 0; + while (n_bytes_wanted > 0) { + const auto n_read = read(fileno(stdin), p_buf + n_bytes_read, n_bytes_wanted); + if (n_read == 0 || n_read == -1 && errno != EAGAIN) { + fprintf(stderr, "read(stdin) returned %zd, errno = %d\n", n_read, errno); + is_running = false; + break; + } + n_bytes_read += std::max(0L, n_read); + if (n_bytes_read < n_bytes_min) { + n_bytes_wanted = n_bytes_min - n_bytes_read; + } else { + n_bytes_wanted = n_bytes_read % sizeof(float); + } + if (n_bytes_wanted > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + pcmf32.resize(n_bytes_read / sizeof(float)); + if (!is_running) { + break; + } } else if (t_diff < abs(params.step_ms)) { 
std::this_thread::sleep_for(std::chrono::milliseconds(abs(params.step_ms) - t_diff)); continue; @@ -308,7 +411,9 @@ int main(int argc, char ** argv) { } else { n_samples_new -= n_samples_100ms; n_samples_old = std::min(n_samples_len, n_samples_old + n_samples_100ms); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if (!piped) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } continue; } } From 75099f9f87572738c887a9ef62d0d9b1d82b8b53 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Thu, 2 Jan 2025 12:32:01 +0900 Subject: [PATCH 05/14] Fix armv7-linux build --- examples/stream/stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 5bbff7140a0..2bbccffd3ba 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -339,7 +339,7 @@ int main(int argc, char ** argv) { is_running = false; break; } - n_bytes_read += std::max(0L, n_read); + n_bytes_read += std::max(0, n_read); if (n_bytes_read < n_bytes_min) { n_bytes_wanted = n_bytes_min - n_bytes_read; } else { From 03b25dd7f305c152ac9cb076efd52903eff9e172 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Thu, 2 Jan 2025 12:40:11 +0900 Subject: [PATCH 06/14] Remove unused n_new_line --- examples/stream/stream.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 2bbccffd3ba..3a56aa413b9 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -170,8 +170,6 @@ int main(int argc, char ** argv) { const bool use_vad = params.step_ms <= 0; // sliding window mode uses VAD - const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line - // init audio audio_async audio(params.length_ms); @@ -235,7 +233,7 @@ int main(int argc, char ** argv) { params.no_timestamps ? 
0 : 1); if (!use_vad) { - fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context); + fprintf(stderr, "%s: no_context = %d\n", __func__, params.no_context); } else { fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__); } From 61222da541957ebf72ffdae96bb0662e6eac7367 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Thu, 2 Jan 2025 12:44:39 +0900 Subject: [PATCH 07/14] Fix windows build (include fcntl.h) --- examples/stream/stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 3a56aa413b9..f29b8152e93 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -15,12 +15,12 @@ #include #include #include +#include #ifdef _WIN32 #include #include #else -#include #include #endif From 17c760041607796f8aa80fe831a5a6117f0c5f9d Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Thu, 2 Jan 2025 13:35:06 +0900 Subject: [PATCH 08/14] Fix inconsistency of ifdef --- examples/stream/stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index f29b8152e93..78ff1338dcf 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -37,7 +37,7 @@ void setStdinNonBlocking() { } void setStdinBlocking() { -#if defined(_WIN32) +#ifdef _WIN32 DWORD mode; HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE); GetConsoleMode(stdinHandle, &mode); From 425d3add590ca2a343e7ae3f8d48a803d1b647b0 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Thu, 2 Jan 2025 13:53:26 +0900 Subject: [PATCH 09/14] Fix windows build windows.h defines min unless NOMINMAX is defined --- examples/stream/stream.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 78ff1338dcf..a8e0f81ce0a 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp 
@@ -18,10 +18,11 @@ #include #ifdef _WIN32 -#include -#include + #define NOMINMAX + #include + #include #else -#include + #include #endif void setStdinNonBlocking() { From 0a84581f20ec9380c1f3eacc390e60b594cd23c7 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Fri, 3 Jan 2025 15:28:47 +0900 Subject: [PATCH 10/14] Make `stream` more test-friendly Run `stream --test-pipe --no-vt100 2>/dev/null < pcmf32.raw` to get nearly-reproducible results. If you want to do strict testing, use `--no-timestamps` as well. ``` cat jfk.raw | ./build/bin/stream -m models/ggml-large-v2.bin --step 2000 --test-pipe --no-vt100 2>/dev/null ( And so my fellow Americans...) ( And so my fellow Americans, ask...) ( And so my fellow Americans, ask not what your country will give you, but what your country will give you.) [00:00:00.000 --> 00:00:30.000] And so my fellow Americans, ask not what your country can do for you. ( Ask what you can do for your) [00:00:02.360 --> 00:00:32.360] Ask what you can do for your country. ``` VAD: ``` cat jfk.raw | ./build/bin/stream -m models/ggml-large-v2.bin --step -2000 --test-pipe --no-vt100 2>/dev/null [00:00:00.000 --> 00:00:03.000] And so, my fellow Americans. [00:00:00.000 --> 00:00:07.920] Ask not what your country can do for you, ask what you can do for your country.
``` --- examples/stream/stream.cpp | 166 ++++++++++++++++++++++++------------- 1 file changed, 107 insertions(+), 59 deletions(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index a8e0f81ce0a..f569a5aa0dc 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -73,6 +73,8 @@ struct whisper_params { bool use_gpu = true; bool flash_attn = false; bool interim = false; + bool delete_vt100 = true; + bool test_pipe = false; std::string language = "en"; std::string model = "models/ggml-base.en.bin"; @@ -111,6 +113,8 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; } else if (arg == "-int" || arg == "--interim") { params.interim = true; } + else if (arg == "-nvt" || arg == "--no-vt100") { params.delete_vt100 = false; } + else if ( arg == "--test-pipe") { params.test_pipe = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -150,6 +154,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n", params.flash_attn ? "true" : "false"); fprintf(stderr, " -int, --interim [%-7s] show interim report in vad every step\n", params.interim ? "true" : "false"); + fprintf(stderr, " -nvt, --no-vt100 [%-7s] do not delete unconfirmed result\n", params.delete_vt100 ? "false" : "true"); + fprintf(stderr, " --test-pipe [%-7s] use all data from pipe\n", params.test_pipe ? 
"true" : "false"); fprintf(stderr, "\n"); } @@ -160,8 +166,8 @@ int main(int argc, char ** argv) { return 1; } - params.keep_ms = std::min(params.keep_ms, params.step_ms); - params.length_ms = std::max(params.length_ms, params.step_ms); + params.keep_ms = std::min(params.keep_ms, abs(params.step_ms)); + params.length_ms = std::max(params.length_ms, abs(params.step_ms)); const int n_samples_step = (1e-3*abs(params.step_ms))*WHISPER_SAMPLE_RATE; const int n_samples_len = (1e-3*params.length_ms )*WHISPER_SAMPLE_RATE; @@ -269,7 +275,7 @@ int main(int argc, char ** argv) { // ignore premature stdin int n_mod = 0; - if (piped) { + if (piped && !params.test_pipe) { const auto n_bytes_len = sizeof(float) * n_samples_len; setStdinNonBlocking(); while (true) { @@ -349,9 +355,6 @@ int main(int argc, char ** argv) { } } pcmf32.resize(n_bytes_read / sizeof(float)); - if (!is_running) { - break; - } } else if (t_diff < abs(params.step_ms)) { std::this_thread::sleep_for(std::chrono::milliseconds(abs(params.step_ms) - t_diff)); continue; @@ -371,7 +374,7 @@ int main(int argc, char ** argv) { } n_samples_new += n_samples_buf; - if (!is_interim && n_samples_new > 2*n_samples_step) { + if (!use_vad && n_samples_new > 2*n_samples_step) { fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__); fprintf(stderr, "t_diff = %.2fs, new = %.2fs, buf = %.2fs\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(n_samples_buf)/WHISPER_SAMPLE_RATE); n_samples_old = 0; @@ -379,7 +382,13 @@ int main(int argc, char ** argv) { t_last = t_now; continue; } + + if (n_samples_old + n_samples_new == 0) { + continue; + } + is_interim = false; + bool is_aborted = true; if (!use_vad){ n_samples_old += n_samples_new; @@ -389,11 +398,17 @@ int main(int argc, char ** argv) { t_last = t_now; } else { - pcmf32.resize(n_samples_step); - copy(pcmf32_deque.end() - n_samples_step, pcmf32_deque.end(), pcmf32.begin()); - if (::vad_simple(pcmf32, 
WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) { - pcmf32.resize(n_samples_old + n_samples_new); - copy(pcmf32_deque.end() - n_samples_old - n_samples_new, pcmf32_deque.end(), pcmf32.begin()); + const auto n_samples = std::min(n_samples_len, n_samples_old + n_samples_new); + + is_aborted = (n_samples > n_samples_len); + if (is_running && !is_aborted) { + pcmf32.resize(n_samples_step); + copy(pcmf32_deque.end() - n_samples_step, pcmf32_deque.end(), pcmf32.begin()); + } + + if (!is_running || is_aborted || ::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) { + pcmf32.resize(n_samples); + copy(pcmf32_deque.end() - n_samples, pcmf32_deque.end(), pcmf32.begin()); n_samples_new = 0; n_samples_old = 0; @@ -443,25 +458,50 @@ int main(int argc, char ** argv) { wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data(); wparams.prompt_n_tokens = params.no_context ? 
0 : prompt_tokens.size(); - if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { - fprintf(stderr, "%s: failed to process audio\n", argv[0]); - return 6; + { + auto pcm_size = pcmf32.size(); + if (pcm_size < WHISPER_SAMPLE_RATE * 1.1) { + pcmf32.resize(pcm_size + WHISPER_SAMPLE_RATE, 0.0f); + } + if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { + fprintf(stderr, "%s: failed to process audio\n", argv[0]); + return 6; + } + pcmf32.resize(pcm_size); } t_interim = std::chrono::high_resolution_clock::now(); // print result; int n_segments; - bool is_unconfirmed = false; + bool no_confirmed = (!use_vad && n_samples_old < n_samples_len - n_samples_step); std::ostringstream text; { - if (!use_vad || params.interim && params.no_timestamps && s_to_delete.size()) { + if (params.delete_vt100 && s_to_delete.size()) { printf("\33[2K\r"); // print long empty line to clear the previous line printf("%s", std::string(s_to_delete.size(), ' ').c_str()); printf("\33[2K\r"); - } else if (use_vad && !params.no_timestamps) { + } + s_to_delete.clear(); + + n_segments = whisper_full_n_segments(ctx); + no_confirmed = (no_confirmed || is_interim && n_segments <= 1); + if (is_running && is_interim && !no_confirmed) { + const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - 2) * 10; + if (t1_ms < abs(params.step_ms)) { + // too short to confirm + no_confirmed = true; + } else { + t_last += std::chrono::milliseconds(t1_ms); + const auto n_samples_confirmed = (1e-3*t1_ms)*WHISPER_SAMPLE_RATE; + pcmf32.resize(n_samples_confirmed); // for timestamps + n_samples_old -= n_samples_confirmed; + } + } + + if (use_vad && !params.no_timestamps && (!is_running || !no_confirmed)) { const int64_t t1 = (t_last - t_start).count()/1000000; const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE); @@ -470,28 +510,42 @@ int main(int argc, char ** argv) { text << std::endl; } - n_segments = whisper_full_n_segments(ctx); - if (is_interim) { - 
if (n_segments < 2) { - is_unconfirmed = true; - } else { - n_segments--; - const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - 1) * 10; - t_last += std::chrono::milliseconds(t1_ms); - const auto n_confirmed = (1e-3*t1_ms)*WHISPER_SAMPLE_RATE; - pcmf32.resize(n_confirmed); - n_samples_old -= n_confirmed; - } - } for (int i = 0; i < n_segments; ++i) { std::string i_text = whisper_full_get_segment_text(ctx, i); - if (!use_vad || params.no_timestamps) { + // last segment may be s_to_delete + if (i == n_segments - 1 && is_running && (no_confirmed || is_interim)) { + if (params.no_timestamps && i > 0) { + text << std::endl; + } + if (is_interim) { + // utf-8 cannot be simply cut into two + std::wstring_convert, char32_t> conv; + const auto t_u32 = conv.from_bytes(i_text); + const auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() * 0.7)); + i_text = t_sub + "…"; + } + if (s_to_delete.size() > 0) { + s_to_delete += " "; + } + s_to_delete += i_text; + if (!params.delete_vt100) { + s_to_delete = "(" + s_to_delete + ")"; + } + break; + } + + if (is_running && no_confirmed) { + if (s_to_delete.size() > 0) { + s_to_delete += " "; + } + s_to_delete += i_text; + } else if (params.no_timestamps) { if (i > 0) { text << std::endl; } text << i_text; - } else { + } else if (!is_running || !(is_interim && i == n_segments - 1)) { const int64_t t_end = (t_last - t_start).count()/1000000; const int64_t t_beg = std::max(0.0, t_end - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE); const int64_t t0 = t_beg/10 + whisper_full_get_segment_t0(ctx, i); @@ -507,10 +561,13 @@ int main(int argc, char ** argv) { } } - if (use_vad && !params.no_timestamps) { + if (use_vad && !params.no_timestamps && (!is_running || !no_confirmed)) { text << std::endl; text << "### Transcription " << n_iter << " END"; text << std::endl; + if (s_to_delete.size() > 0) { + text << std::endl; + } } } @@ -519,42 +576,33 @@ int main(int argc, char ** argv) { fout << std::endl; } - ++n_iter; - - if 
(is_unconfirmed) { - --n_iter; - // utf-8 cannot be simply cut into two - std::wstring_convert, char32_t> conv; - auto t_u32 = conv.from_bytes(text.str()); - auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() / 2)); - text.str(t_sub + "…"); + if (!no_confirmed) { + ++n_iter; } printf("%s", text.str().c_str()); - if (is_unconfirmed || !use_vad && n_samples_old < n_samples_len - n_samples_step) { - s_to_delete = text.str(); + if (is_running && (no_confirmed || is_interim)) { + printf("%s%s", s_to_delete.c_str(), params.delete_vt100 ? "" : "\n"); + --n_segments; // exclude s_to_delete from context } else { printf("\n"); s_to_delete = ""; - if (!use_vad) { - n_iter = 0; - if (n_samples_keep < n_samples_old) { - // keep part of the audio for next iteration to try to mitigate word boundary issues - n_samples_old = n_samples_keep; - } + if (is_aborted) { + // keep part of the audio for next iteration to try to mitigate word boundary issues + n_samples_old = std::min(n_samples_old, n_samples_keep); } + } - // Add tokens of the last full length segment as the prompt - if (!params.no_context) { - prompt_tokens.clear(); + // Add tokens of the last full length segment as the prompt + if (!no_confirmed && !params.no_context) { + prompt_tokens.clear(); - for (int i = 0; i < n_segments; ++i) { - const int token_count = whisper_full_n_tokens(ctx, i); - for (int j = 0; j < token_count; ++j) { - prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j)); - } + for (int i = 0; i < n_segments; ++i) { + const int token_count = whisper_full_n_tokens(ctx, i); + for (int j = 0; j < token_count; ++j) { + prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j)); } } } From f99263e420b799077d55fb745bce2151a402eb6f Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Sun, 5 Jan 2025 08:47:22 +0900 Subject: [PATCH 11/14] Run vad_simple on entire pcmf32, not on the last step --- examples/stream/stream.cpp | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) 
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index f569a5aa0dc..97bf6fb0779 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -362,7 +362,7 @@ int main(int argc, char ** argv) { audio.next(pcmf32); } - const int n_samples_buf = pcmf32.size(); + int n_samples_buf = pcmf32.size(); if (params.save_audio && n_samples_buf > 0) { wavWriter.write(pcmf32.data(), n_samples_buf); @@ -390,25 +390,19 @@ int main(int argc, char ** argv) { is_interim = false; bool is_aborted = true; + n_samples_buf = std::min(n_samples_len, n_samples_old + n_samples_new); + pcmf32.resize(n_samples_buf); + copy(pcmf32_deque.end() - n_samples_buf, pcmf32_deque.end(), pcmf32.begin()); + if (!use_vad){ n_samples_old += n_samples_new; n_samples_new = 0; - pcmf32.resize(n_samples_old); - copy(pcmf32_deque.end() - n_samples_old, pcmf32_deque.end(), pcmf32.begin()); t_last = t_now; } else { - const auto n_samples = std::min(n_samples_len, n_samples_old + n_samples_new); - - is_aborted = (n_samples > n_samples_len); - if (is_running && !is_aborted) { - pcmf32.resize(n_samples_step); - copy(pcmf32_deque.end() - n_samples_step, pcmf32_deque.end(), pcmf32.begin()); - } + is_aborted = (n_samples_buf > n_samples_len); if (!is_running || is_aborted || ::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) { - pcmf32.resize(n_samples); - copy(pcmf32_deque.end() - n_samples, pcmf32_deque.end(), pcmf32.begin()); n_samples_new = 0; n_samples_old = 0; From 6302794b659e9ea7b9ada87eb4d3e94db46555a8 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Fri, 17 Jan 2025 15:26:33 +0900 Subject: [PATCH 12/14] Simplify pipe handling it was too complicated for reviewers to accept and had a bug in aligning to sizeof(float) this commit reduces the number of lines this time `stream` doesn't skip the input before `[Start speaking]` but it is usually not so problematic because `read()` reads more than 
`step_ms` when possible --- examples/stream/stream.cpp | 70 +++++++------------------------------- 1 file changed, 13 insertions(+), 57 deletions(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 97bf6fb0779..dfed6143e27 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -37,18 +37,6 @@ void setStdinNonBlocking() { #endif } -void setStdinBlocking() { -#ifdef _WIN32 - DWORD mode; - HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE); - GetConsoleMode(stdinHandle, &mode); - mode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT; - SetConsoleMode(stdinHandle, mode); -#else - fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) & ~O_NONBLOCK); -#endif -} - // command-line parameters struct whisper_params { @@ -74,7 +62,6 @@ struct whisper_params { bool flash_attn = false; bool interim = false; bool delete_vt100 = true; - bool test_pipe = false; std::string language = "en"; std::string model = "models/ggml-base.en.bin"; @@ -114,7 +101,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; } else if (arg == "-int" || arg == "--interim") { params.interim = true; } else if (arg == "-nvt" || arg == "--no-vt100") { params.delete_vt100 = false; } - else if ( arg == "--test-pipe") { params.test_pipe = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -155,7 +141,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n", params.flash_attn ? "true" : "false"); fprintf(stderr, " -int, --interim [%-7s] show interim report in vad every step\n", params.interim ? "true" : "false"); fprintf(stderr, " -nvt, --no-vt100 [%-7s] do not delete unconfirmed result\n", params.delete_vt100 ? "false" : "true"); - fprintf(stderr, " --test-pipe [%-7s] use all data from pipe\n", params.test_pipe ? 
"true" : "false"); fprintf(stderr, "\n"); } @@ -188,6 +173,7 @@ int main(int argc, char ** argv) { #else freopen(NULL, "rb", stdin); #endif + setStdinNonBlocking(); } else { if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) { fprintf(stderr, "%s: audio.init() failed!\n", __func__); @@ -273,42 +259,9 @@ int main(int argc, char ** argv) { wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1); } - // ignore premature stdin - int n_mod = 0; - if (piped && !params.test_pipe) { - const auto n_bytes_len = sizeof(float) * n_samples_len; - setStdinNonBlocking(); - while (true) { - const auto n_bytes_read = read(fileno(stdin), pcmf32.data(), n_bytes_len); - if (n_bytes_read == -1 && errno == EAGAIN) { - break; - } else if (n_bytes_read < 1) { - fprintf(stderr, "stdin ended too early\n"); - is_running = false; - break; - } - n_mod = n_bytes_read % sizeof(float); - if (n_bytes_read < n_bytes_len) { - break; - } - } - } - fprintf(stderr, "[Start speaking]\n"); fflush(stderr); - if (piped) { - // ignore the partial sample - if (n_mod > 0) { - const auto n_remain = sizeof(float) - n_mod; - setStdinBlocking(); - if (n_remain != fread(pcmf32.data(), 1, n_remain, stdin)) { - is_running = false; - } - } - setStdinNonBlocking(); - } - auto t_last = std::chrono::high_resolution_clock::now(); auto t_interim = t_last; bool is_interim = false; @@ -332,12 +285,15 @@ int main(int argc, char ** argv) { if (n_samples_new > n_samples_step) { pcmf32.clear(); } else if (piped) { - pcmf32.resize(n_samples_len); - char *p_buf = (char *)pcmf32.data(); + // need at least step_ms const auto n_bytes_min = (n_samples_step - n_samples_new) * sizeof(float); + // but try to get length_ms at first auto n_bytes_wanted = n_samples_len * sizeof(float); + pcmf32.resize(n_samples_len); + auto n_bytes_read = 0; while (n_bytes_wanted > 0) { + char *p_buf = (char *)pcmf32.data(); const auto n_read = read(fileno(stdin), p_buf + n_bytes_read, n_bytes_wanted); if (n_read == 0 || n_read == -1 && errno != EAGAIN) 
{ fprintf(stderr, "read(stdin) returned %zd, errno = %d\n", n_read, errno); @@ -348,11 +304,11 @@ int main(int argc, char ** argv) { if (n_bytes_read < n_bytes_min) { n_bytes_wanted = n_bytes_min - n_bytes_read; } else { - n_bytes_wanted = n_bytes_read % sizeof(float); - } - if (n_bytes_wanted > 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + const auto n_mod = n_bytes_read % sizeof(float); + n_bytes_wanted = (n_mod != 0) ? sizeof(float) - n_mod : 0; } + const auto est_ms = 1000 * n_bytes_wanted / sizeof(float) / WHISPER_SAMPLE_RATE; + std::this_thread::sleep_for(std::chrono::milliseconds(est_ms)); } pcmf32.resize(n_bytes_read / sizeof(float)); } else if (t_diff < abs(params.step_ms)) { @@ -374,7 +330,7 @@ int main(int argc, char ** argv) { } n_samples_new += n_samples_buf; - if (!use_vad && n_samples_new > 2*n_samples_step) { + if (!use_vad && !piped && n_samples_new > 2*n_samples_step) { fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__); fprintf(stderr, "t_diff = %.2fs, new = %.2fs, buf = %.2fs\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(n_samples_buf)/WHISPER_SAMPLE_RATE); n_samples_old = 0; @@ -513,10 +469,10 @@ int main(int argc, char ** argv) { text << std::endl; } if (is_interim) { - // utf-8 cannot be simply cut into two + // utf-8 cannot be simply cut std::wstring_convert, char32_t> conv; const auto t_u32 = conv.from_bytes(i_text); - const auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() * 0.7)); + const auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() * 0.9)); i_text = t_sub + "…"; } if (s_to_delete.size() > 0) { From f54a43921581f7c4d54d231a5e2dd580c5ec819b Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Sun, 19 Jan 2025 10:12:12 +0900 Subject: [PATCH 13/14] Make it more reviewer-friendly Update README Follow the behavior in README, in particular, the "sliding window" part Rename variables to easier-to-review names and rewrite 
if-conditions --- examples/stream/README.md | 13 +- examples/stream/stream.cpp | 252 +++++++++++++++++++------------------ 2 files changed, 144 insertions(+), 121 deletions(-) diff --git a/examples/stream/README.md b/examples/stream/README.md index f07cfb8915c..fb0ba37809d 100644 --- a/examples/stream/README.md +++ b/examples/stream/README.md @@ -12,7 +12,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a ## Sliding window mode with VAD -Setting the `--step` argument to `0` enables the sliding window mode: +Setting the `--step` argument to `0` or a negative value enables the sliding window mode: ```bash ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6 @@ -25,6 +25,17 @@ It's best to tune it to the specific use case, but a value around `0.6` should b When silence is detected, it will transcribe the last `--length` milliseconds of audio and output a transcription block that is suitable for parsing. +You can also set the `--interim` argument to force transcription before the VAD detects silence. + +```bash + ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step -2000 --length 10000 -vth 0.6 --interim --keep 200 +``` + +This will transcribe the audio, keeping the last segment unconfirmed, every two seconds +even if the VAD says the speech is still ongoing. In this mode, if the sentence doesn't end +within `--length` milliseconds, the time window will not slide. The audio will be cut there +to be transcribed anyway, keeping the last `--keep` milliseconds for the next inference. + ## Building The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone.
You can build it like this: diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index dfed6143e27..49a14603209 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -25,18 +25,6 @@ #include #endif -void setStdinNonBlocking() { -#ifdef _WIN32 - DWORD mode; - HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE); - GetConsoleMode(stdinHandle, &mode); - mode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); - SetConsoleMode(stdinHandle, mode); -#else - fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) | O_NONBLOCK); -#endif -} - // command-line parameters struct whisper_params { @@ -47,6 +35,7 @@ struct whisper_params { int32_t capture_id = -1; int32_t max_tokens = 128; int32_t audio_ctx = 0; + int32_t n_tmp_segs = 1; float vad_thold = 0.6f; float freq_thold = 100.0f; @@ -151,6 +140,9 @@ int main(int argc, char ** argv) { return 1; } + if (params.step_ms == 0) { + params.step_ms = -2000; // reasonable default for VAD + } params.keep_ms = std::min(params.keep_ms, abs(params.step_ms)); params.length_ms = std::max(params.length_ms, abs(params.step_ms)); @@ -161,26 +153,38 @@ int main(int argc, char ** argv) { const int n_samples_100ms= (1e-3*100.0 )*WHISPER_SAMPLE_RATE; const bool use_vad = params.step_ms <= 0; // sliding window mode uses VAD + const bool piped = !isatty(fileno(stdin)); // init audio audio_async audio(params.length_ms); - bool piped = !isatty(fileno(stdin)); - if (piped) { - #ifdef _WIN32 - _setmode(_fileno(stdin), _O_BINARY); - #else - freopen(NULL, "rb", stdin); - #endif - setStdinNonBlocking(); - } else { + if (!piped) { if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) { fprintf(stderr, "%s: audio.init() failed!\n", __func__); return 1; } audio.resume(); + } else { + fprintf(stderr, "%s: audio is from stdin, not from microphone\n", __func__); + + #ifdef _WIN32 + _setmode(_fileno(stdin), _O_BINARY); + #else + freopen(NULL, "rb", stdin); + #endif + + // non-blocking mode + #ifdef _WIN32 + DWORD mode; 
+ HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE); + GetConsoleMode(stdinHandle, &mode); + mode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT); + SetConsoleMode(stdinHandle, mode); + #else + fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) | O_NONBLOCK); + #endif } // whisper init @@ -225,10 +229,10 @@ int main(int argc, char ** argv) { params.translate ? "translate" : "transcribe", params.no_timestamps ? 0 : 1); - if (!use_vad) { - fprintf(stderr, "%s: no_context = %d\n", __func__, params.no_context); - } else { + fprintf(stderr, "%s: no_context = %d\n", __func__, params.no_context); + if (use_vad) { fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__); + fprintf(stderr, "%s: interim report = %d, temporary segments = %d\n", __func__, params.interim, params.n_tmp_segs); } fprintf(stderr, "\n"); @@ -258,15 +262,13 @@ int main(int argc, char ** argv) { wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1); } - fprintf(stderr, "[Start speaking]\n"); fflush(stderr); auto t_last = std::chrono::high_resolution_clock::now(); - auto t_interim = t_last; - bool is_interim = false; const auto t_start = t_last; - std::string s_to_delete = ""; + auto t_interim = t_last; + std::string s_tmp = ""; // main audio loop while (is_running) { @@ -281,13 +283,26 @@ int main(int argc, char ** argv) { const auto t_now = std::chrono::high_resolution_clock::now(); const auto t_diff = std::chrono::duration_cast(t_now - t_last).count(); - // get new audio - if (n_samples_new > n_samples_step) { - pcmf32.clear(); - } else if (piped) { - // need at least step_ms + if (!piped) { + const auto sleep_ms = abs(params.step_ms) - t_diff; + if (sleep_ms > 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); + continue; + } + + audio.next(pcmf32); + + if ((int) pcmf32.size() > 2*n_samples_step) { + fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__); + fprintf(stderr, "t_diff = %.2f sec, prev = 
%.2f sec, got = %.2f sec\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(pcmf32.size())/WHISPER_SAMPLE_RATE); + n_samples_old = 0; + n_samples_new = 0; + t_last = t_now; + continue; + } + } else { + // piped: need at least step_ms but try to get length_ms at first const auto n_bytes_min = (n_samples_step - n_samples_new) * sizeof(float); - // but try to get length_ms at first auto n_bytes_wanted = n_samples_len * sizeof(float); pcmf32.resize(n_samples_len); @@ -296,8 +311,8 @@ int main(int argc, char ** argv) { char *p_buf = (char *)pcmf32.data(); const auto n_read = read(fileno(stdin), p_buf + n_bytes_read, n_bytes_wanted); if (n_read == 0 || n_read == -1 && errno != EAGAIN) { - fprintf(stderr, "read(stdin) returned %zd, errno = %d\n", n_read, errno); - is_running = false; + fprintf(stderr, "read(stdin) returned %zd, errno = %s\n", n_read, strerror(errno)); + is_running = false; // flush all results break; } n_bytes_read += std::max(0, n_read); @@ -307,15 +322,10 @@ int main(int argc, char ** argv) { const auto n_mod = n_bytes_read % sizeof(float); n_bytes_wanted = (n_mod != 0) ? 
sizeof(float) - n_mod : 0; } - const auto est_ms = 1000 * n_bytes_wanted / sizeof(float) / WHISPER_SAMPLE_RATE; - std::this_thread::sleep_for(std::chrono::milliseconds(est_ms)); + const auto est_sleep_ms = 1000 * n_bytes_wanted / sizeof(float) / WHISPER_SAMPLE_RATE; + std::this_thread::sleep_for(std::chrono::milliseconds(est_sleep_ms)); } pcmf32.resize(n_bytes_read / sizeof(float)); - } else if (t_diff < abs(params.step_ms)) { - std::this_thread::sleep_for(std::chrono::milliseconds(abs(params.step_ms) - t_diff)); - continue; - } else { - audio.next(pcmf32); } int n_samples_buf = pcmf32.size(); @@ -330,49 +340,46 @@ int main(int argc, char ** argv) { } n_samples_new += n_samples_buf; - if (!use_vad && !piped && n_samples_new > 2*n_samples_step) { - fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__); - fprintf(stderr, "t_diff = %.2fs, new = %.2fs, buf = %.2fs\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(n_samples_buf)/WHISPER_SAMPLE_RATE); - n_samples_old = 0; - n_samples_new = 0; - t_last = t_now; - continue; - } if (n_samples_old + n_samples_new == 0) { continue; } - is_interim = false; - bool is_aborted = true; - - n_samples_buf = std::min(n_samples_len, n_samples_old + n_samples_new); + // prepare pcmf32 for inference + n_samples_buf = n_samples_old + n_samples_new; pcmf32.resize(n_samples_buf); copy(pcmf32_deque.end() - n_samples_buf, pcmf32_deque.end(), pcmf32.begin()); - if (!use_vad){ + // chop the audio unconditionally + bool use_keep_ms = ((!use_vad || params.interim) && n_samples_buf > n_samples_len); + + // interim report in vad mode: once every step_ms, + // run the inference even if vad returns false, + // confirm (n_segments - params.n_tmp_segs) segments, + // and print other segments as s_tmp, which will be deleted + bool is_interim = false; + + if (!use_vad || use_keep_ms || !is_running) { + use_keep_ms = true; n_samples_old += n_samples_new; n_samples_new = 0; t_last = t_now; } 
else { - is_aborted = (n_samples_buf > n_samples_len); - - if (!is_running || is_aborted || ::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) { + if (::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) { n_samples_new = 0; n_samples_old = 0; t_last = t_now; } else { - const auto n_interim_diff_ms = std::chrono::duration_cast(t_now - t_interim).count(); + const auto interim_diff_ms = std::chrono::duration_cast(t_now - t_interim).count(); - if (params.interim && n_interim_diff_ms > abs(params.step_ms)) { - is_interim = (n_interim_diff_ms < params.length_ms - abs(params.step_ms)); + if (params.interim && interim_diff_ms > abs(params.step_ms)) { + is_interim = true; n_samples_old += n_samples_new; n_samples_new = 0; - pcmf32.resize(n_samples_old); - copy(pcmf32_deque.end() - n_samples_old, pcmf32_deque.end(), pcmf32.begin()); } else { + // sliding window n_samples_new -= n_samples_100ms; n_samples_old = std::min(n_samples_len, n_samples_old + n_samples_100ms); if (!piped) { @@ -408,145 +415,150 @@ int main(int argc, char ** argv) { wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data(); wparams.prompt_n_tokens = params.no_context ? 
0 : prompt_tokens.size(); + // call whisper_full() with at least 1 sec of buffer { auto pcm_size = pcmf32.size(); if (pcm_size < WHISPER_SAMPLE_RATE * 1.1) { pcmf32.resize(pcm_size + WHISPER_SAMPLE_RATE, 0.0f); } - if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { + if (whisper_full(ctx, wparams, pcmf32.data(), pcm_size) != 0) { fprintf(stderr, "%s: failed to process audio\n", argv[0]); return 6; } pcmf32.resize(pcm_size); } - t_interim = std::chrono::high_resolution_clock::now(); + t_interim = std::chrono::high_resolution_clock::now(); // print result; int n_segments; - bool no_confirmed = (!use_vad && n_samples_old < n_samples_len - n_samples_step); - std::ostringstream text; + bool is_all_tmp = (!use_vad && n_samples_old < n_samples_len - n_samples_step); + std::ostringstream ss_output; + { - if (params.delete_vt100 && s_to_delete.size()) { + if (params.delete_vt100 && s_tmp.size()) { printf("\33[2K\r"); // print long empty line to clear the previous line - printf("%s", std::string(s_to_delete.size(), ' ').c_str()); + printf("%s", std::string(s_tmp.size(), ' ').c_str()); printf("\33[2K\r"); } - s_to_delete.clear(); + s_tmp.clear(); n_segments = whisper_full_n_segments(ctx); - no_confirmed = (no_confirmed || is_interim && n_segments <= 1); - if (is_running && is_interim && !no_confirmed) { - const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - 2) * 10; + is_all_tmp = (is_running && (is_all_tmp || is_interim && n_segments <= params.n_tmp_segs)); + if (is_running && is_interim && !is_all_tmp) { + const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - params.n_tmp_segs - 1) * 10; if (t1_ms < abs(params.step_ms)) { // too short to confirm - no_confirmed = true; + is_all_tmp = true; } else { t_last += std::chrono::milliseconds(t1_ms); const auto n_samples_confirmed = (1e-3*t1_ms)*WHISPER_SAMPLE_RATE; pcmf32.resize(n_samples_confirmed); // for timestamps - n_samples_old -= n_samples_confirmed; + n_samples_old -= 
n_samples_confirmed; // kept for next iteration } } - if (use_vad && !params.no_timestamps && (!is_running || !no_confirmed)) { + bool show_n_iter = (use_vad && !params.no_timestamps && !is_all_tmp); + + if (show_n_iter) { const int64_t t1 = (t_last - t_start).count()/1000000; const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE); - text << std::endl; - text << "### Transcription " << n_iter << " START | t0 = " << t0 << " ms | t1 = " << t1 << " ms" << std::endl; - text << std::endl; + ss_output << std::endl; + ss_output << "### Transcription " << n_iter << " START | t0 = " << t0 << " ms | t1 = " << t1 << " ms" << std::endl; + ss_output << std::endl; } for (int i = 0; i < n_segments; ++i) { - std::string i_text = whisper_full_get_segment_text(ctx, i); + std::string text = whisper_full_get_segment_text(ctx, i); - // last segment may be s_to_delete - if (i == n_segments - 1 && is_running && (no_confirmed || is_interim)) { + // last segment(s) may be s_tmp + if (i >= n_segments - params.n_tmp_segs && is_running && (is_all_tmp || is_interim)) { if (params.no_timestamps && i > 0) { - text << std::endl; + ss_output << std::endl; } if (is_interim) { - // utf-8 cannot be simply cut + // utf-8 cannot be simply cut, so use char32_t std::wstring_convert, char32_t> conv; - const auto t_u32 = conv.from_bytes(i_text); - const auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() * 0.9)); - i_text = t_sub + "…"; - } - if (s_to_delete.size() > 0) { - s_to_delete += " "; + const auto s_u32 = conv.from_bytes(text); + const auto s_sub = conv.to_bytes(s_u32.substr(0, s_u32.size() * 0.9)); + text = s_sub + "…"; } - s_to_delete += i_text; - if (!params.delete_vt100) { - s_to_delete = "(" + s_to_delete + ")"; + if (s_tmp.size() > 0) { + s_tmp += " "; } - break; + s_tmp += text; + continue; } - if (is_running && no_confirmed) { - if (s_to_delete.size() > 0) { - s_to_delete += " "; + if (is_all_tmp) { + if (s_tmp.size() > 0) { + s_tmp += " "; } - s_to_delete += 
i_text; + s_tmp += text; } else if (params.no_timestamps) { if (i > 0) { - text << std::endl; + ss_output << std::endl; } - text << i_text; - } else if (!is_running || !(is_interim && i == n_segments - 1)) { + ss_output << text; + } else { const int64_t t_end = (t_last - t_start).count()/1000000; const int64_t t_beg = std::max(0.0, t_end - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE); const int64_t t0 = t_beg/10 + whisper_full_get_segment_t0(ctx, i); const int64_t t1 = t_beg/10 + whisper_full_get_segment_t1(ctx, i); - text << "[" << to_timestamp(t0, false) << " --> " << to_timestamp(t1, false) << "] " << i_text; + ss_output << "[" << to_timestamp(t0, false) << " --> " << to_timestamp(t1, false) << "] " << text; if (whisper_full_get_segment_speaker_turn_next(ctx, i)) { - text << " [SPEAKER_TURN]"; + ss_output << " [SPEAKER_TURN]"; } - text << std::endl; + ss_output << std::endl; } } - if (use_vad && !params.no_timestamps && (!is_running || !no_confirmed)) { - text << std::endl; - text << "### Transcription " << n_iter << " END"; - text << std::endl; - if (s_to_delete.size() > 0) { - text << std::endl; + if (show_n_iter) { + ss_output << std::endl; + ss_output << "### Transcription " << n_iter << " END" << std::endl; + if (s_tmp.size() > 0) { + ss_output << std::endl; } } } if (params.fname_out.length() > 0) { - fout << text.str(); + fout << ss_output.str(); fout << std::endl; } - if (!no_confirmed) { + if (!is_all_tmp) { ++n_iter; } - printf("%s", text.str().c_str()); + printf("%s", ss_output.str().c_str()); + + if (s_tmp.size() > 0) { + if (!params.delete_vt100) { + s_tmp = "(" + s_tmp + ")\n"; + } + printf("%s", s_tmp.c_str()); - if (is_running && (no_confirmed || is_interim)) { - printf("%s%s", s_to_delete.c_str(), params.delete_vt100 ? "" : "\n"); - --n_segments; // exclude s_to_delete from context + // exclude s_tmp from context + n_segments -= is_all_tmp ? 
n_segments : params.n_tmp_segs; } else { printf("\n"); - s_to_delete = ""; + s_tmp = ""; - if (is_aborted) { + if (use_keep_ms) { // keep part of the audio for next iteration to try to mitigate word boundary issues n_samples_old = std::min(n_samples_old, n_samples_keep); } } // Add tokens of the last full length segment as the prompt - if (!no_confirmed && !params.no_context) { + if (n_segments > 0 && !params.no_context) { prompt_tokens.clear(); for (int i = 0; i < n_segments; ++i) { From 09111f0c110e69867a30093a6b6d0a7fd5af3100 Mon Sep 17 00:00:00 2001 From: Tamotsu Takahashi Date: Tue, 21 Jan 2025 21:06:29 +0900 Subject: [PATCH 14/14] Fix n_samples_new > n_samples_step case --- examples/stream/stream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index 49a14603209..38a15c448e4 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -302,11 +302,11 @@ int main(int argc, char ** argv) { } } else { // piped: need at least step_ms but try to get length_ms at first - const auto n_bytes_min = (n_samples_step - n_samples_new) * sizeof(float); + const auto n_bytes_min = std::max(0, (n_samples_step - n_samples_new) * sizeof(float)); auto n_bytes_wanted = n_samples_len * sizeof(float); pcmf32.resize(n_samples_len); - auto n_bytes_read = 0; + long n_bytes_read = 0; while (n_bytes_wanted > 0) { char *p_buf = (char *)pcmf32.data(); const auto n_read = read(fileno(stdin), p_buf + n_bytes_read, n_bytes_wanted);