From 419aee36e4a50f9701304d1829fa09ec97640c49 Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Thu, 2 Jan 2025 00:18:09 +0900
Subject: [PATCH 01/14] Add an audio function to retrieve the audio data since
 last time.

Without it, `stream --save-audio` produces a somewhat choppy WAV file:
`stream` calculates t_diff in milliseconds
and combines audio pieces that are each about step_ms long.

One millisecond corresponds to only WHISPER_SAMPLE_RATE / 1000 == 16 samples,

but surprisingly, human ears seem to be able to hear such a gap
as noise.
---
 examples/common-sdl.cpp    | 38 ++++++++++++++++++++++++++------------
 examples/common-sdl.h      |  3 +++
 examples/stream/stream.cpp |  3 +--
 3 files changed, 30 insertions(+), 14 deletions(-)
diff --git a/examples/common-sdl.cpp b/examples/common-sdl.cpp
index b61f8cff5fd..6272ce838a7 100644
--- a/examples/common-sdl.cpp
+++ b/examples/common-sdl.cpp
@@ -130,6 +130,7 @@ bool audio_async::clear() {
 
         m_audio_pos = 0;
         m_audio_len = 0;
+        m_audio_nxt = 0;
     }
 
     return true;
@@ -172,6 +173,28 @@ void audio_async::callback(uint8_t * stream, int len) {
 }
 
 void audio_async::get(int ms, std::vector<float> & result) {
+    if (ms <= 0) {
+        ms = m_len_ms;
+    }
+
+    size_t n_samples = std::min<size_t>(m_audio_len, (m_sample_rate * ms) / 1000);
+
+    get_n(n_samples, result);
+}
+
+void audio_async::next(std::vector<float> & result) {
+    size_t n_samples;
+
+    if (m_audio_pos >= m_audio_nxt) {
+        n_samples = m_audio_pos - m_audio_nxt;
+    } else {
+        n_samples = m_audio_len - m_audio_nxt + m_audio_pos;
+    }
+
+    get_n(n_samples, result);
+}
+
+void audio_async::get_n(size_t n_samples, std::vector<float> & result) {
     if (!m_dev_id_in) {
         fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
         return;
@@ -182,20 +205,9 @@ void audio_async::get(int ms, std::vector<float> & result) {
         return;
     }
 
-    result.clear();
-
     {
         std::lock_guard<std::mutex> lock(m_mutex);
 
-        if (ms <= 0) {
-            ms = m_len_ms;
-        }
-
-        size_t n_samples = (m_sample_rate * ms) / 1000;
-        if (n_samples > m_audio_len) {
-            n_samples = m_audio_len;
-        }
-
         result.resize(n_samples);
 
         int s0 = m_audio_pos - n_samples;
@@ -205,10 +217,12 @@ void audio_async::get(int ms, std::vector<float> & result) {
 
         if (s0 + n_samples > m_audio.size()) {
             const size_t n0 = m_audio.size() - s0;
+            m_audio_nxt = n_samples - n0;
 
             memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
+            memcpy(&result[n0], &m_audio[0], m_audio_nxt * sizeof(float));
         } else {
+            m_audio_nxt = s0 + n_samples;
             memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
         }
     }
diff --git a/examples/common-sdl.h b/examples/common-sdl.h
index 9ee8a320724..746493f7c83 100644
--- a/examples/common-sdl.h
+++ b/examples/common-sdl.h
@@ -30,6 +30,8 @@ class audio_async {
 
     // get audio data from the circular buffer
     void get(int ms, std::vector<float> & audio);
+    void next(std::vector<float> & audio);
+    void get_n(size_t n_samples, std::vector<float> & audio);
 
 private:
     SDL_AudioDeviceID m_dev_id_in = 0;
@@ -43,6 +45,7 @@ class audio_async {
     std::vector<float> m_audio;
     size_t             m_audio_pos = 0;
     size_t             m_audio_len = 0;
+    size_t             m_audio_nxt = 0;
 };
 
 // Return false if need to quit
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 190f68a2c3b..1855329a065 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -241,7 +241,7 @@ int main(int argc, char ** argv) {
 
         if (!use_vad) {
             while (true) {
-                audio.get(params.step_ms, pcmf32_new);
+                audio.next(pcmf32_new);
 
                 if ((int) pcmf32_new.size() > 2*n_samples_step) {
                     fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
@@ -250,7 +250,6 @@ int main(int argc, char ** argv) {
                 }
 
                 if ((int) pcmf32_new.size() >= n_samples_step) {
-                    audio.clear();
                     break;
                 }
 

From 289946da8f9edcaf0ef3f178d2683098e821abce Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Thu, 2 Jan 2025 10:27:52 +0900
Subject: [PATCH 02/14] Simplify stream's pcmf32 handling

Use one deque instead of two vectors (old and new).
`old` and `new` are now sample counts rather than buffers.

Basically: get `step - new` samples every time.
Then set `new` to (approximately) `step`.
The new audio data is simply appended to the deque.
(The deque size is limited to 30 seconds.)
Pass `old + new` samples to whisper inference.

If the data has been consumed, let `old = 0; new = 0;`
If some of the data should be kept for the next iter, `old = keep;`
If you want to get only N samples next time, `new = step - N;`

In VAD mode, `stream --interim --step -3000` will:
Get 3000 ms of audio.
Run `vad_simple(step_ms)`.
If nothing is detected, get 100 ms more audio and retry.
If nothing is detected and 3000 ms have passed,
go into the interim mode,
where `n_segments - 1` segments will be confirmed
(`old -= confirmed_t1`).
If `n_segments == 1`, only show the first half of the result.

Misc:
Increase the default `max_tokens` because 32 is too small for 10 seconds of audio.
(Some Japanese speech was garbled.)
Write the WAV data as soon as it is available.

`no_timestamps` is the default even for VAD
because that is more useful when showing the output to people who are hard of hearing.
---
 examples/stream/stream.cpp | 246 ++++++++++++++++++++++---------------
 1 file changed, 146 insertions(+), 100 deletions(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 1855329a065..6665253392a 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -7,6 +7,7 @@
 #include "whisper.h"
 
 #include <cassert>
+#include <codecvt>
 #include <cstdio>
 #include <string>
 #include <thread>
@@ -21,7 +22,7 @@ struct whisper_params {
     int32_t length_ms  = 10000;
     int32_t keep_ms    = 200;
     int32_t capture_id = -1;
-    int32_t max_tokens = 32;
+    int32_t max_tokens = 128;
     int32_t audio_ctx  = 0;
 
     float vad_thold    = 0.6f;
@@ -36,6 +37,7 @@ struct whisper_params {
     bool save_audio    = false; // save audio to wav file
     bool use_gpu       = true;
     bool flash_attn    = false;
+    bool interim       = false;
 
     std::string language  = "en";
     std::string model     = "models/ggml-base.en.bin";
@@ -65,6 +67,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-nf"   || arg == "--no-fallback")   { params.no_fallback   = true; }
         else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
         else if (arg == "-kc"   || arg == "--keep-context")  { params.no_context    = false; }
+        else if (arg == "-nt"   || arg == "--no-timestamps") { params.no_timestamps = true; }
         else if (arg == "-l"    || arg == "--language")      { params.language      = argv[++i]; }
         else if (arg == "-m"    || arg == "--model")         { params.model         = argv[++i]; }
         else if (arg == "-f"    || arg == "--file")          { params.fname_out     = argv[++i]; }
@@ -72,6 +75,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-sa"   || arg == "--save-audio")    { params.save_audio    = true; }
         else if (arg == "-ng"   || arg == "--no-gpu")        { params.use_gpu       = false; }
         else if (arg == "-fa"   || arg == "--flash-attn")    { params.flash_attn    = true; }
+        else if (arg == "-int"  || arg == "--interim")       { params.interim       = true; }
 
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -102,6 +106,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -nf,      --no-fallback   [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
     fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
     fprintf(stderr, "  -kc,      --keep-context  [%-7s] keep context between audio chunks\n",              params.no_context ? "false" : "true");
+    fprintf(stderr, "  -nt,      --no-timestamps [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
     fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                                params.language.c_str());
     fprintf(stderr, "  -m FNAME, --model FNAME   [%-7s] model path\n",                                     params.model.c_str());
     fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                          params.fname_out.c_str());
@@ -109,6 +114,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -sa,      --save-audio    [%-7s] save the recorded audio to a file\n",              params.save_audio ? "true" : "false");
     fprintf(stderr, "  -ng,      --no-gpu        [%-7s] disable GPU inference\n",                          params.use_gpu ? "false" : "true");
     fprintf(stderr, "  -fa,      --flash-attn    [%-7s] flash attention during inference\n",               params.flash_attn ? "true" : "false");
+    fprintf(stderr, "  -int,     --interim       [%-7s] show interim report in vad every step\n",          params.interim ? "true" : "false");
     fprintf(stderr, "\n");
 }
 
@@ -122,19 +128,16 @@ int main(int argc, char ** argv) {
     params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
     params.length_ms = std::max(params.length_ms, params.step_ms);
 
-    const int n_samples_step = (1e-3*params.step_ms  )*WHISPER_SAMPLE_RATE;
-    const int n_samples_len  = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
-    const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
-    const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
+    const int n_samples_step = (1e-3*abs(params.step_ms))*WHISPER_SAMPLE_RATE;
+    const int n_samples_len  = (1e-3*params.length_ms   )*WHISPER_SAMPLE_RATE;
+    const int n_samples_keep = (1e-3*params.keep_ms     )*WHISPER_SAMPLE_RATE;
+    const int n_samples_30s  = (1e-3*30000.0            )*WHISPER_SAMPLE_RATE;
+    const int n_samples_100ms= (1e-3*100.0              )*WHISPER_SAMPLE_RATE;
 
-    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
+    const bool use_vad = params.step_ms <= 0; // sliding window mode uses VAD
 
     const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
 
-    params.no_timestamps  = !use_vad;
-    params.no_context    |= use_vad;
-    params.max_tokens     = 0;
-
     // init audio
 
     audio_async audio(params.length_ms);
@@ -159,9 +162,10 @@ int main(int argc, char ** argv) {
 
     struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
 
-    std::vector<float> pcmf32    (n_samples_30s, 0.0f);
-    std::vector<float> pcmf32_old;
-    std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
+    std::vector<float> pcmf32(n_samples_30s, 0.0f);
+    std::deque<float> pcmf32_deque;
+    int n_samples_new = 0;
+    int n_samples_old = 0;
 
     std::vector<whisper_token> prompt_tokens;
 
@@ -219,17 +223,17 @@ int main(int argc, char ** argv) {
 
         wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1);
     }
-    printf("[Start speaking]\n");
-    fflush(stdout);
+    fprintf(stderr, "[Start speaking]\n");
+    fflush(stderr);
 
     auto t_last  = std::chrono::high_resolution_clock::now();
+    auto t_interim = t_last;
+    bool is_interim = false;
     const auto t_start = t_last;
+    std::string s_to_delete = "";
 
     // main audio loop
     while (is_running) {
-        if (params.save_audio) {
-            wavWriter.write(pcmf32_new.data(), pcmf32_new.size());
-        }
         // handle Ctrl + C
         is_running = sdl_poll_events();
 
@@ -238,61 +242,74 @@ int main(int argc, char ** argv) {
         }
 
         // process new audio
+        const auto t_now  = std::chrono::high_resolution_clock::now();
+        const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
+
+        // get new audio
+        if (n_samples_new > n_samples_step) {
+            pcmf32.clear();
+        } else if (t_diff < abs(params.step_ms)) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(abs(params.step_ms) - t_diff));
+            continue;
+        } else {
+            audio.next(pcmf32);
+        }
 
-        if (!use_vad) {
-            while (true) {
-                audio.next(pcmf32_new);
-
-                if ((int) pcmf32_new.size() > 2*n_samples_step) {
-                    fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
-                    audio.clear();
-                    continue;
-                }
-
-                if ((int) pcmf32_new.size() >= n_samples_step) {
-                    break;
-                }
-
-                std::this_thread::sleep_for(std::chrono::milliseconds(1));
-            }
-
-            const int n_samples_new = pcmf32_new.size();
-
-            // take up to params.length_ms audio from previous iteration
-            const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
+        const int n_samples_buf = pcmf32.size();
 
-            //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
+        if (params.save_audio && n_samples_buf > 0) {
+            wavWriter.write(pcmf32.data(), n_samples_buf);
+        }
 
-            pcmf32.resize(n_samples_new + n_samples_take);
+        copy(pcmf32.begin(), pcmf32.end(), back_inserter(pcmf32_deque));
+        if (pcmf32_deque.size() > n_samples_30s) {
+            pcmf32_deque.erase(pcmf32_deque.begin(), pcmf32_deque.end() - n_samples_30s);
+        }
 
-            for (int i = 0; i < n_samples_take; i++) {
-                pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
-            }
+        n_samples_new += n_samples_buf;
+        if (!is_interim && n_samples_new > 2*n_samples_step) {
+            fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__);
+            fprintf(stderr, "t_diff = %.2fs, new = %.2fs, buf = %.2fs\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(n_samples_buf)/WHISPER_SAMPLE_RATE);
+            n_samples_old = 0;
+            n_samples_new = 0;
+            t_last = t_now;
+            continue;
+        }
+        is_interim = false;
 
-            memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
+        if (!use_vad){
+            n_samples_old += n_samples_new;
+            n_samples_new = 0;
+            pcmf32.resize(n_samples_old);
+            copy(pcmf32_deque.end() - n_samples_old, pcmf32_deque.end(), pcmf32.begin());
 
-            pcmf32_old = pcmf32;
+            t_last = t_now;
         } else {
-            const auto t_now  = std::chrono::high_resolution_clock::now();
-            const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
-
-            if (t_diff < 2000) {
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-                continue;
-            }
-
-            audio.get(2000, pcmf32_new);
-
-            if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
-                audio.get(params.length_ms, pcmf32);
+            pcmf32.resize(n_samples_step);
+            copy(pcmf32_deque.end() - n_samples_step, pcmf32_deque.end(), pcmf32.begin());
+            if (::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) {
+                pcmf32.resize(n_samples_old + n_samples_new);
+                copy(pcmf32_deque.end() - n_samples_old - n_samples_new, pcmf32_deque.end(), pcmf32.begin());
+                n_samples_new = 0;
+                n_samples_old = 0;
+
+                t_last = t_now;
             } else {
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-                continue;
+                const auto n_interim_diff_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_interim).count();
+
+                if (params.interim && n_interim_diff_ms > abs(params.step_ms)) {
+                    is_interim = (n_interim_diff_ms < params.length_ms - abs(params.step_ms));
+                    n_samples_old += n_samples_new;
+                    n_samples_new = 0;
+                    pcmf32.resize(n_samples_old);
+                    copy(pcmf32_deque.end() - n_samples_old, pcmf32_deque.end(), pcmf32.begin());
+                } else {
+                    n_samples_new -= n_samples_100ms;
+                    n_samples_old = std::min(n_samples_len, n_samples_old + n_samples_100ms);
+                    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                    continue;
+                }
             }
-
-            t_last = t_now;
         }
 
         // run the inference
@@ -324,80 +341,109 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 return 6;
             }
+            t_interim  = std::chrono::high_resolution_clock::now();
 
             // print result;
+            int n_segments;
+            bool is_unconfirmed = false;
+            std::ostringstream text;
             {
-                if (!use_vad) {
+                if (!use_vad || params.interim && params.no_timestamps && s_to_delete.size()) {
                     printf("\33[2K\r");
 
                     // print long empty line to clear the previous line
-                    printf("%s", std::string(100, ' ').c_str());
+                    printf("%s", std::string(s_to_delete.size(), ' ').c_str());
 
                     printf("\33[2K\r");
-                } else {
+                } else if (use_vad && !params.no_timestamps) {
                     const int64_t t1 = (t_last - t_start).count()/1000000;
                     const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
 
-                    printf("\n");
-                    printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
-                    printf("\n");
+                    text << std::endl;
+                    text << "### Transcription " << n_iter << " START | t0 = " << t0 << " ms | t1 = " << t1 << " ms" << std::endl;
+                    text << std::endl;
                 }
 
-                const int n_segments = whisper_full_n_segments(ctx);
+                n_segments = whisper_full_n_segments(ctx);
+                if (is_interim) {
+                    if (n_segments < 2) {
+                        is_unconfirmed = true;
+                    } else {
+                        n_segments--;
+                        const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - 1) * 10;
+                        t_last += std::chrono::milliseconds(t1_ms);
+                        const auto n_confirmed = (1e-3*t1_ms)*WHISPER_SAMPLE_RATE;
+                        pcmf32.resize(n_confirmed);
+                        n_samples_old -= n_confirmed;
+                    }
+                }
                 for (int i = 0; i < n_segments; ++i) {
-                    const char * text = whisper_full_get_segment_text(ctx, i);
-
-                    if (params.no_timestamps) {
-                        printf("%s", text);
-                        fflush(stdout);
+                    std::string i_text = whisper_full_get_segment_text(ctx, i);
 
-                        if (params.fname_out.length() > 0) {
-                            fout << text;
+                    if (!use_vad || params.no_timestamps) {
+                        if (i > 0) {
+                            text << std::endl;
                         }
+                        text << i_text;
                     } else {
-                        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+                        const int64_t t_end = (t_last - t_start).count()/1000000;
+                        const int64_t t_beg = std::max(0.0, t_end - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
+                        const int64_t t0 = t_beg/10 + whisper_full_get_segment_t0(ctx, i);
+                        const int64_t t1 = t_beg/10 + whisper_full_get_segment_t1(ctx, i);
 
-                        std::string output = "[" + to_timestamp(t0, false) + " --> " + to_timestamp(t1, false) + "]  " + text;
+                        text << "[" << to_timestamp(t0, false) << " --> " << to_timestamp(t1, false) << "]  " << i_text;
 
                         if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
-                            output += " [SPEAKER_TURN]";
+                            text << " [SPEAKER_TURN]";
                         }
 
-                        output += "\n";
-
-                        printf("%s", output.c_str());
-                        fflush(stdout);
-
-                        if (params.fname_out.length() > 0) {
-                            fout << output;
-                        }
+                        text << std::endl;
                     }
                 }
 
-                if (params.fname_out.length() > 0) {
-                    fout << std::endl;
+                if (use_vad && !params.no_timestamps) {
+                    text << std::endl;
+                    text << "### Transcription " << n_iter << " END";
+                    text << std::endl;
                 }
+            }
 
-                if (use_vad) {
-                    printf("\n");
-                    printf("### Transcription %d END\n", n_iter);
-                }
+            if (params.fname_out.length() > 0) {
+                fout << text.str();
+                fout << std::endl;
             }
 
             ++n_iter;
 
-            if (!use_vad && (n_iter % n_new_line) == 0) {
+            if (is_unconfirmed) {
+                --n_iter;
+                // utf-8 cannot be simply cut into two
+                std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
+                auto t_u32 = conv.from_bytes(text.str());
+                auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() / 2));
+                text.str(t_sub + "…");
+            }
+
+            printf("%s", text.str().c_str());
+
+            if (is_unconfirmed || !use_vad && n_samples_old < n_samples_len - n_samples_step) {
+                s_to_delete = text.str();
+            } else {
                 printf("\n");
+                s_to_delete = "";
 
-                // keep part of the audio for next iteration to try to mitigate word boundary issues
-                pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
+                if (!use_vad) {
+                    n_iter = 0;
+                    if (n_samples_keep < n_samples_old) {
+                        // keep part of the audio for next iteration to try to mitigate word boundary issues
+                        n_samples_old = n_samples_keep;
+                    }
+                }
 
                 // Add tokens of the last full length segment as the prompt
                 if (!params.no_context) {
                     prompt_tokens.clear();
 
-                    const int n_segments = whisper_full_n_segments(ctx);
                     for (int i = 0; i < n_segments; ++i) {
                         const int token_count = whisper_full_n_tokens(ctx, i);
                         for (int j = 0; j < token_count; ++j) {

From b27fc1fd1db91be61f14a8aef0d14939ee203a32 Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Thu, 2 Jan 2025 11:15:21 +0900
Subject: [PATCH 03/14] Add headers for gcc c++

---
 examples/stream/stream.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 6665253392a..ac2116bca12 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -9,6 +9,8 @@
 #include <cassert>
 #include <codecvt>
 #include <cstdio>
+#include <deque>
+#include <locale>
 #include <string>
 #include <thread>
 #include <vector>
@@ -261,7 +263,7 @@ int main(int argc, char ** argv) {
             wavWriter.write(pcmf32.data(), n_samples_buf);
         }
 
-        copy(pcmf32.begin(), pcmf32.end(), back_inserter(pcmf32_deque));
+        copy(pcmf32.begin(), pcmf32.end(), std::back_inserter(pcmf32_deque));
         if (pcmf32_deque.size() > n_samples_30s) {
             pcmf32_deque.erase(pcmf32_deque.begin(), pcmf32_deque.end() - n_samples_30s);
         }

From b114ec309f4ebcf0c51799edd70c63e0a26188ce Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Thu, 2 Jan 2025 11:59:50 +0900
Subject: [PATCH 04/14] Accept pipe to stream

Now it is easy to test with raw PCM data.
Try `cat pcmf32.raw | stream`
(or `pv -qL 64000 pcmf32.raw | stream` to feed it in real time).

Note: I haven't tested the `_WIN32` `#ifdef` branches.

You can make such data with
`ffmpeg -i jfk.wav -f f32le -acodec pcm_f32le jfk.raw`
because the WAV header length (44) is a multiple of `sizeof(float)` (4).

I decided to ignore the data before `[Start speaking]`
because such premature data are not good
for remote-transcription systems like:

```
mic2pcm | ssh -C remote "stream | lines2googledocs"
```

or

```
mic2some | ssh -C remote "ffmpeg -loglevel fatal -i pipe:0 -tune zerolatency -af atempo=1.1 -f f32le -ar 16000 -acodec pcm_f32le pipe:1 | stream"
```

So if you want to do a strict test, remove the "ignore" part.
Otherwise, a considerable number of bytes will be ignored.
---
 examples/stream/stream.cpp | 117 +++++++++++++++++++++++++++++++++++--
 1 file changed, 111 insertions(+), 6 deletions(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index ac2116bca12..5bbff7140a0 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -16,6 +16,38 @@
 #include <vector>
 #include <fstream>
 
+#ifdef _WIN32
+#include <windows.h>
+#include <io.h>
+#else
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+void setStdinNonBlocking() {
+#ifdef _WIN32
+    DWORD mode;
+    HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE);
+    GetConsoleMode(stdinHandle, &mode);
+    mode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
+    SetConsoleMode(stdinHandle, mode);
+#else
+    fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) | O_NONBLOCK);
+#endif
+}
+
+void setStdinBlocking() {
+#if defined(_WIN32)
+    DWORD mode;
+    HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE);
+    GetConsoleMode(stdinHandle, &mode);
+    mode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
+    SetConsoleMode(stdinHandle, mode);
+#else
+    fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) & ~O_NONBLOCK);
+#endif
+}
+
 
 // command-line parameters
 struct whisper_params {
@@ -143,12 +175,22 @@ int main(int argc, char ** argv) {
     // init audio
 
     audio_async audio(params.length_ms);
-    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
-        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
-        return 1;
-    }
+    bool piped = !isatty(fileno(stdin));
+
+    if (piped) {
+        #ifdef _WIN32
+        _setmode(_fileno(stdin), _O_BINARY);
+        #else
+        freopen(NULL, "rb", stdin);
+        #endif
+    } else {
+        if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
+            fprintf(stderr, "%s: audio.init() failed!\n", __func__);
+            return 1;
+        }
 
-    audio.resume();
+        audio.resume();
+    }
 
     // whisper init
     if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1){
@@ -225,9 +267,43 @@ int main(int argc, char ** argv) {
 
         wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1);
     }
+
+    // ignore premature stdin
+    int n_mod = 0;
+    if (piped) {
+        const auto n_bytes_len = sizeof(float) * n_samples_len;
+        setStdinNonBlocking();
+        while (true) {
+            const auto n_bytes_read = read(fileno(stdin), pcmf32.data(), n_bytes_len);
+            if (n_bytes_read == -1 && errno == EAGAIN) {
+                break;
+            } else if (n_bytes_read < 1) {
+                fprintf(stderr, "stdin ended too early\n");
+                is_running = false;
+                break;
+            }
+            n_mod = n_bytes_read % sizeof(float);
+            if (n_bytes_read < n_bytes_len) {
+                break;
+            }
+        }
+    }
+
     fprintf(stderr, "[Start speaking]\n");
     fflush(stderr);
 
+    if (piped) {
+        // ignore the partial sample
+        if (n_mod > 0) {
+            const auto n_remain = sizeof(float) - n_mod;
+            setStdinBlocking();
+            if (n_remain != fread(pcmf32.data(), 1, n_remain, stdin)) {
+                is_running = false;
+            }
+        }
+        setStdinNonBlocking();
+    }
+
     auto t_last  = std::chrono::high_resolution_clock::now();
     auto t_interim = t_last;
     bool is_interim = false;
@@ -250,6 +326,33 @@ int main(int argc, char ** argv) {
         // get new audio
         if (n_samples_new > n_samples_step) {
             pcmf32.clear();
+        } else if (piped) {
+            pcmf32.resize(n_samples_len);
+            char *p_buf = (char *)pcmf32.data();
+            const auto n_bytes_min = (n_samples_step - n_samples_new) * sizeof(float);
+            auto n_bytes_wanted = n_samples_len * sizeof(float);
+            auto n_bytes_read = 0;
+            while (n_bytes_wanted > 0) {
+                const auto n_read = read(fileno(stdin), p_buf + n_bytes_read, n_bytes_wanted);
+                if (n_read == 0 || n_read == -1 && errno != EAGAIN) {
+                    fprintf(stderr, "read(stdin) returned %zd, errno = %d\n", n_read, errno);
+                    is_running = false;
+                    break;
+                }
+                n_bytes_read += std::max(0L, n_read);
+                if (n_bytes_read < n_bytes_min) {
+                    n_bytes_wanted = n_bytes_min - n_bytes_read;
+                } else {
+                    n_bytes_wanted = n_bytes_read % sizeof(float);
+                }
+                if (n_bytes_wanted > 0) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                }
+            }
+            pcmf32.resize(n_bytes_read / sizeof(float));
+            if (!is_running) {
+                break;
+            }
         } else if (t_diff < abs(params.step_ms)) {
             std::this_thread::sleep_for(std::chrono::milliseconds(abs(params.step_ms) - t_diff));
             continue;
@@ -308,7 +411,9 @@ int main(int argc, char ** argv) {
                 } else {
                     n_samples_new -= n_samples_100ms;
                     n_samples_old = std::min(n_samples_len, n_samples_old + n_samples_100ms);
-                    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                    if (!piped) {
+                        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                    }
                     continue;
                 }
             }

From 75099f9f87572738c887a9ef62d0d9b1d82b8b53 Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Thu, 2 Jan 2025 12:32:01 +0900
Subject: [PATCH 05/14] Fix armv7-linux build

---
 examples/stream/stream.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 5bbff7140a0..2bbccffd3ba 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -339,7 +339,7 @@ int main(int argc, char ** argv) {
                     is_running = false;
                     break;
                 }
-                n_bytes_read += std::max(0L, n_read);
+                n_bytes_read += std::max<long>(0, n_read);
                 if (n_bytes_read < n_bytes_min) {
                     n_bytes_wanted = n_bytes_min - n_bytes_read;
                 } else {

From 03b25dd7f305c152ac9cb076efd52903eff9e172 Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Thu, 2 Jan 2025 12:40:11 +0900
Subject: [PATCH 06/14] Remove unused n_new_line

---
 examples/stream/stream.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 2bbccffd3ba..3a56aa413b9 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -170,8 +170,6 @@ int main(int argc, char ** argv) {
 
     const bool use_vad = params.step_ms <= 0; // sliding window mode uses VAD
 
-    const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
-
     // init audio
 
     audio_async audio(params.length_ms);
@@ -235,7 +233,7 @@ int main(int argc, char ** argv) {
                 params.no_timestamps ? 0 : 1);
 
         if (!use_vad) {
-            fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
+            fprintf(stderr, "%s: no_context = %d\n", __func__, params.no_context);
         } else {
             fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
         }

From 61222da541957ebf72ffdae96bb0662e6eac7367 Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Thu, 2 Jan 2025 12:44:39 +0900
Subject: [PATCH 07/14] Fix windows build (include fcntl.h)

---
 examples/stream/stream.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 3a56aa413b9..f29b8152e93 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -15,12 +15,12 @@
 #include <thread>
 #include <vector>
 #include <fstream>
+#include <fcntl.h>
 
 #ifdef _WIN32
 #include <windows.h>
 #include <io.h>
 #else
-#include <fcntl.h>
 #include <unistd.h>
 #endif
 

From 17c760041607796f8aa80fe831a5a6117f0c5f9d Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Thu, 2 Jan 2025 13:35:06 +0900
Subject: [PATCH 08/14] Fix inconsistency of ifdef

---
 examples/stream/stream.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index f29b8152e93..78ff1338dcf 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -37,7 +37,7 @@ void setStdinNonBlocking() {
 }
 
 void setStdinBlocking() {
-#if defined(_WIN32)
+#ifdef _WIN32
     DWORD mode;
     HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE);
     GetConsoleMode(stdinHandle, &mode);

From 425d3add590ca2a343e7ae3f8d48a803d1b647b0 Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Thu, 2 Jan 2025 13:53:26 +0900
Subject: [PATCH 09/14] Fix windows build

windows.h defines min unless NOMINMAX is defined
---
 examples/stream/stream.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 78ff1338dcf..a8e0f81ce0a 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -18,10 +18,11 @@
 #include <fcntl.h>
 
 #ifdef _WIN32
-#include <windows.h>
-#include <io.h>
+    #define NOMINMAX
+    #include <windows.h>
+    #include <io.h>
 #else
-#include <unistd.h>
+    #include <unistd.h>
 #endif
 
 void setStdinNonBlocking() {

From 0a84581f20ec9380c1f3eacc390e60b594cd23c7 Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Fri, 3 Jan 2025 15:28:47 +0900
Subject: [PATCH 10/14] Make `stream` more test-friendly

Run `stream --test-pipe --no-vt100 2>/dev/null < pcmf32.raw`
to get nearly reproducible results.
For strict testing, use `--no-timestamps` as well.

```
cat jfk.raw | ./build/bin/stream -m models/ggml-large-v2.bin --step 2000 --test-pipe --no-vt100 2>/dev/null
( And so my fellow Americans...)
( And so my fellow Americans, ask...)
( And so my fellow Americans, ask not what your country will give you, but what your country will give you.)
[00:00:00.000 --> 00:00:30.000]   And so my fellow Americans, ask not what your country can do for you.

( Ask what you can do for your)
[00:00:02.360 --> 00:00:32.360]   Ask what you can do for your country.
```

VAD:

```
cat jfk.raw | ./build/bin/stream -m models/ggml-large-v2.bin --step -2000 --test-pipe --no-vt100 2>/dev/null

[00:00:00.000 --> 00:00:03.000]   And so, my fellow Americans.

[00:00:00.000 --> 00:00:07.920]   Ask not what your country can do for you, ask what you can do for your country.

```
---
 examples/stream/stream.cpp | 166 ++++++++++++++++++++++++-------------
 1 file changed, 107 insertions(+), 59 deletions(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index a8e0f81ce0a..f569a5aa0dc 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -73,6 +73,8 @@ struct whisper_params {
     bool use_gpu       = true;
     bool flash_attn    = false;
     bool interim       = false;
+    bool delete_vt100  = true;
+    bool test_pipe     = false;
 
     std::string language  = "en";
     std::string model     = "models/ggml-base.en.bin";
@@ -111,6 +113,8 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-ng"   || arg == "--no-gpu")        { params.use_gpu       = false; }
         else if (arg == "-fa"   || arg == "--flash-attn")    { params.flash_attn    = true; }
         else if (arg == "-int"  || arg == "--interim")       { params.interim       = true; }
+        else if (arg == "-nvt"  || arg == "--no-vt100")      { params.delete_vt100  = false; }
+        else if (                  arg == "--test-pipe")     { params.test_pipe     = true; }
 
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -150,6 +154,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -ng,      --no-gpu        [%-7s] disable GPU inference\n",                          params.use_gpu ? "false" : "true");
     fprintf(stderr, "  -fa,      --flash-attn    [%-7s] flash attention during inference\n",               params.flash_attn ? "true" : "false");
     fprintf(stderr, "  -int,     --interim       [%-7s] show interim report in vad every step\n",          params.interim ? "true" : "false");
+    fprintf(stderr, "  -nvt,     --no-vt100      [%-7s] do not delete unconfirmed result\n",               params.delete_vt100 ? "false" : "true");
+    fprintf(stderr, "            --test-pipe     [%-7s] use all data from pipe\n",                         params.test_pipe ? "true" : "false");
     fprintf(stderr, "\n");
 }
 
@@ -160,8 +166,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
-    params.length_ms = std::max(params.length_ms, params.step_ms);
+    params.keep_ms   = std::min(params.keep_ms,   abs(params.step_ms));
+    params.length_ms = std::max(params.length_ms, abs(params.step_ms));
 
     const int n_samples_step = (1e-3*abs(params.step_ms))*WHISPER_SAMPLE_RATE;
     const int n_samples_len  = (1e-3*params.length_ms   )*WHISPER_SAMPLE_RATE;
@@ -269,7 +275,7 @@ int main(int argc, char ** argv) {
 
     // ignore premature stdin
     int n_mod = 0;
-    if (piped) {
+    if (piped && !params.test_pipe) {
         const auto n_bytes_len = sizeof(float) * n_samples_len;
         setStdinNonBlocking();
         while (true) {
@@ -349,9 +355,6 @@ int main(int argc, char ** argv) {
                 }
             }
             pcmf32.resize(n_bytes_read / sizeof(float));
-            if (!is_running) {
-                break;
-            }
         } else if (t_diff < abs(params.step_ms)) {
             std::this_thread::sleep_for(std::chrono::milliseconds(abs(params.step_ms) - t_diff));
             continue;
@@ -371,7 +374,7 @@ int main(int argc, char ** argv) {
         }
 
         n_samples_new += n_samples_buf;
-        if (!is_interim && n_samples_new > 2*n_samples_step) {
+        if (!use_vad && n_samples_new > 2*n_samples_step) {
             fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__);
             fprintf(stderr, "t_diff = %.2fs, new = %.2fs, buf = %.2fs\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(n_samples_buf)/WHISPER_SAMPLE_RATE);
             n_samples_old = 0;
@@ -379,7 +382,13 @@ int main(int argc, char ** argv) {
             t_last = t_now;
             continue;
         }
+
+        if (n_samples_old + n_samples_new == 0) {
+            continue;
+        }
+
         is_interim = false;
+        bool is_aborted = true;
 
         if (!use_vad){
             n_samples_old += n_samples_new;
@@ -389,11 +398,17 @@ int main(int argc, char ** argv) {
 
             t_last = t_now;
         } else {
-            pcmf32.resize(n_samples_step);
-            copy(pcmf32_deque.end() - n_samples_step, pcmf32_deque.end(), pcmf32.begin());
-            if (::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) {
-                pcmf32.resize(n_samples_old + n_samples_new);
-                copy(pcmf32_deque.end() - n_samples_old - n_samples_new, pcmf32_deque.end(), pcmf32.begin());
+            const auto n_samples = std::min(n_samples_len, n_samples_old + n_samples_new);
+
+            is_aborted = (n_samples > n_samples_len);
+            if (is_running && !is_aborted) {
+                pcmf32.resize(n_samples_step);
+                copy(pcmf32_deque.end() - n_samples_step, pcmf32_deque.end(), pcmf32.begin());
+            }
+
+            if (!is_running || is_aborted || ::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) {
+                pcmf32.resize(n_samples);
+                copy(pcmf32_deque.end() - n_samples, pcmf32_deque.end(), pcmf32.begin());
                 n_samples_new = 0;
                 n_samples_old = 0;
 
@@ -443,25 +458,50 @@ int main(int argc, char ** argv) {
             wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
             wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();
 
-            if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
-                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-                return 6;
+            {
+                auto pcm_size = pcmf32.size();
+                if (pcm_size < WHISPER_SAMPLE_RATE * 1.1) {
+                    pcmf32.resize(pcm_size + WHISPER_SAMPLE_RATE, 0.0f);
+                }
+                if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+                    fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+                    return 6;
+                }
+                pcmf32.resize(pcm_size);
             }
             t_interim  = std::chrono::high_resolution_clock::now();
 
             // print result;
             int n_segments;
-            bool is_unconfirmed = false;
+            bool no_confirmed = (!use_vad && n_samples_old < n_samples_len - n_samples_step);
             std::ostringstream text;
             {
-                if (!use_vad || params.interim && params.no_timestamps && s_to_delete.size()) {
+                if (params.delete_vt100 && s_to_delete.size()) {
                     printf("\33[2K\r");
 
                     // print long empty line to clear the previous line
                     printf("%s", std::string(s_to_delete.size(), ' ').c_str());
 
                     printf("\33[2K\r");
-                } else if (use_vad && !params.no_timestamps) {
+                }
+                s_to_delete.clear();
+
+                n_segments = whisper_full_n_segments(ctx);
+                no_confirmed = (no_confirmed || is_interim && n_segments <= 1);
+                if (is_running && is_interim && !no_confirmed) {
+                    const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - 2) * 10;
+                    if (t1_ms < abs(params.step_ms)) {
+                        // too short to confirm
+                        no_confirmed = true;
+                    } else {
+                        t_last += std::chrono::milliseconds(t1_ms);
+                        const auto n_samples_confirmed = (1e-3*t1_ms)*WHISPER_SAMPLE_RATE;
+                        pcmf32.resize(n_samples_confirmed); // for timestamps
+                        n_samples_old -= n_samples_confirmed;
+                    }
+                }
+
+                if (use_vad && !params.no_timestamps && (!is_running || !no_confirmed)) {
                     const int64_t t1 = (t_last - t_start).count()/1000000;
                     const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
 
@@ -470,28 +510,42 @@ int main(int argc, char ** argv) {
                     text << std::endl;
                 }
 
-                n_segments = whisper_full_n_segments(ctx);
-                if (is_interim) {
-                    if (n_segments < 2) {
-                        is_unconfirmed = true;
-                    } else {
-                        n_segments--;
-                        const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - 1) * 10;
-                        t_last += std::chrono::milliseconds(t1_ms);
-                        const auto n_confirmed = (1e-3*t1_ms)*WHISPER_SAMPLE_RATE;
-                        pcmf32.resize(n_confirmed);
-                        n_samples_old -= n_confirmed;
-                    }
-                }
                 for (int i = 0; i < n_segments; ++i) {
                     std::string i_text = whisper_full_get_segment_text(ctx, i);
 
-                    if (!use_vad || params.no_timestamps) {
+                    // last segment may be s_to_delete
+                    if (i == n_segments - 1 && is_running && (no_confirmed || is_interim)) {
+                        if (params.no_timestamps && i > 0) {
+                            text << std::endl;
+                        }
+                        if (is_interim) {
+                            // utf-8 cannot be simply cut into two
+                            std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
+                            const auto t_u32 = conv.from_bytes(i_text);
+                            const auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() * 0.7));
+                            i_text = t_sub + "…";
+                        }
+                        if (s_to_delete.size() > 0) {
+                            s_to_delete += " ";
+                        }
+                        s_to_delete += i_text;
+                        if (!params.delete_vt100) {
+                            s_to_delete = "(" + s_to_delete + ")";
+                        }
+                        break;
+                    }
+
+                    if (is_running && no_confirmed) {
+                        if (s_to_delete.size() > 0) {
+                            s_to_delete += " ";
+                        }
+                        s_to_delete += i_text;
+                    } else if (params.no_timestamps) {
                         if (i > 0) {
                             text << std::endl;
                         }
                         text << i_text;
-                    } else {
+                    } else if (!is_running || !(is_interim && i == n_segments - 1)) {
                         const int64_t t_end = (t_last - t_start).count()/1000000;
                         const int64_t t_beg = std::max(0.0, t_end - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
                         const int64_t t0 = t_beg/10 + whisper_full_get_segment_t0(ctx, i);
@@ -507,10 +561,13 @@ int main(int argc, char ** argv) {
                     }
                 }
 
-                if (use_vad && !params.no_timestamps) {
+                if (use_vad && !params.no_timestamps && (!is_running || !no_confirmed)) {
                     text << std::endl;
                     text << "### Transcription " << n_iter << " END";
                     text << std::endl;
+                    if (s_to_delete.size() > 0) {
+                        text << std::endl;
+                    }
                 }
             }
 
@@ -519,42 +576,33 @@ int main(int argc, char ** argv) {
                 fout << std::endl;
             }
 
-            ++n_iter;
-
-            if (is_unconfirmed) {
-                --n_iter;
-                // utf-8 cannot be simply cut into two
-                std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
-                auto t_u32 = conv.from_bytes(text.str());
-                auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() / 2));
-                text.str(t_sub + "…");
+            if (!no_confirmed) {
+                ++n_iter;
             }
 
             printf("%s", text.str().c_str());
 
-            if (is_unconfirmed || !use_vad && n_samples_old < n_samples_len - n_samples_step) {
-                s_to_delete = text.str();
+            if (is_running && (no_confirmed || is_interim)) {
+                printf("%s%s", s_to_delete.c_str(), params.delete_vt100 ? "" : "\n");
+                --n_segments; // exclude s_to_delete from context
             } else {
                 printf("\n");
                 s_to_delete = "";
 
-                if (!use_vad) {
-                    n_iter = 0;
-                    if (n_samples_keep < n_samples_old) {
-                        // keep part of the audio for next iteration to try to mitigate word boundary issues
-                        n_samples_old = n_samples_keep;
-                    }
+                if (is_aborted) {
+                    // keep part of the audio for next iteration to try to mitigate word boundary issues
+                    n_samples_old = std::min(n_samples_old, n_samples_keep);
                 }
+            }
 
-                // Add tokens of the last full length segment as the prompt
-                if (!params.no_context) {
-                    prompt_tokens.clear();
+            // Add tokens of the last full length segment as the prompt
+            if (!no_confirmed && !params.no_context) {
+                prompt_tokens.clear();
 
-                    for (int i = 0; i < n_segments; ++i) {
-                        const int token_count = whisper_full_n_tokens(ctx, i);
-                        for (int j = 0; j < token_count; ++j) {
-                            prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
-                        }
+                for (int i = 0; i < n_segments; ++i) {
+                    const int token_count = whisper_full_n_tokens(ctx, i);
+                    for (int j = 0; j < token_count; ++j) {
+                        prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
                     }
                 }
             }

From f99263e420b799077d55fb745bce2151a402eb6f Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Sun, 5 Jan 2025 08:47:22 +0900
Subject: [PATCH 11/14] Run vad_simple on entire pcmf32, not on the last step

---
 examples/stream/stream.cpp | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index f569a5aa0dc..97bf6fb0779 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -362,7 +362,7 @@ int main(int argc, char ** argv) {
             audio.next(pcmf32);
         }
 
-        const int n_samples_buf = pcmf32.size();
+        int n_samples_buf = pcmf32.size();
 
         if (params.save_audio && n_samples_buf > 0) {
             wavWriter.write(pcmf32.data(), n_samples_buf);
@@ -390,25 +390,19 @@ int main(int argc, char ** argv) {
         is_interim = false;
         bool is_aborted = true;
 
+        n_samples_buf = std::min(n_samples_len, n_samples_old + n_samples_new);
+        pcmf32.resize(n_samples_buf);
+        copy(pcmf32_deque.end() - n_samples_buf, pcmf32_deque.end(), pcmf32.begin());
+
         if (!use_vad){
             n_samples_old += n_samples_new;
             n_samples_new = 0;
-            pcmf32.resize(n_samples_old);
-            copy(pcmf32_deque.end() - n_samples_old, pcmf32_deque.end(), pcmf32.begin());
 
             t_last = t_now;
         } else {
-            const auto n_samples = std::min(n_samples_len, n_samples_old + n_samples_new);
-
-            is_aborted = (n_samples > n_samples_len);
-            if (is_running && !is_aborted) {
-                pcmf32.resize(n_samples_step);
-                copy(pcmf32_deque.end() - n_samples_step, pcmf32_deque.end(), pcmf32.begin());
-            }
+            is_aborted = (n_samples_buf > n_samples_len);
 
             if (!is_running || is_aborted || ::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) {
-                pcmf32.resize(n_samples);
-                copy(pcmf32_deque.end() - n_samples, pcmf32_deque.end(), pcmf32.begin());
                 n_samples_new = 0;
                 n_samples_old = 0;
 

From 6302794b659e9ea7b9ada87eb4d3e94db46555a8 Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Fri, 17 Jan 2025 15:26:33 +0900
Subject: [PATCH 12/14] Simplify pipe handling

The previous pipe handling was too complicated for reviewers to accept
and had a bug in aligning reads to sizeof(float).

this commit reduces the number of lines

With this change, `stream` no longer skips input that arrives before
`[Start speaking]`, but that is usually not a problem
because `read()` reads more than `step_ms` of audio when possible.
---
 examples/stream/stream.cpp | 70 +++++++-------------------------------
 1 file changed, 13 insertions(+), 57 deletions(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 97bf6fb0779..dfed6143e27 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -37,18 +37,6 @@ void setStdinNonBlocking() {
 #endif
 }
 
-void setStdinBlocking() {
-#ifdef _WIN32
-    DWORD mode;
-    HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE);
-    GetConsoleMode(stdinHandle, &mode);
-    mode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
-    SetConsoleMode(stdinHandle, mode);
-#else
-    fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) & ~O_NONBLOCK);
-#endif
-}
-
 
 // command-line parameters
 struct whisper_params {
@@ -74,7 +62,6 @@ struct whisper_params {
     bool flash_attn    = false;
     bool interim       = false;
     bool delete_vt100  = true;
-    bool test_pipe     = false;
 
     std::string language  = "en";
     std::string model     = "models/ggml-base.en.bin";
@@ -114,7 +101,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-fa"   || arg == "--flash-attn")    { params.flash_attn    = true; }
         else if (arg == "-int"  || arg == "--interim")       { params.interim       = true; }
         else if (arg == "-nvt"  || arg == "--no-vt100")      { params.delete_vt100  = false; }
-        else if (                  arg == "--test-pipe")     { params.test_pipe     = true; }
 
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -155,7 +141,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -fa,      --flash-attn    [%-7s] flash attention during inference\n",               params.flash_attn ? "true" : "false");
     fprintf(stderr, "  -int,     --interim       [%-7s] show interim report in vad every step\n",          params.interim ? "true" : "false");
     fprintf(stderr, "  -nvt,     --no-vt100      [%-7s] do not delete unconfirmed result\n",               params.delete_vt100 ? "false" : "true");
-    fprintf(stderr, "            --test-pipe     [%-7s] use all data from pipe\n",                         params.test_pipe ? "true" : "false");
     fprintf(stderr, "\n");
 }
 
@@ -188,6 +173,7 @@ int main(int argc, char ** argv) {
         #else
         freopen(NULL, "rb", stdin);
         #endif
+        setStdinNonBlocking();
     } else {
         if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
             fprintf(stderr, "%s: audio.init() failed!\n", __func__);
@@ -273,42 +259,9 @@ int main(int argc, char ** argv) {
         wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1);
     }
 
-    // ignore premature stdin
-    int n_mod = 0;
-    if (piped && !params.test_pipe) {
-        const auto n_bytes_len = sizeof(float) * n_samples_len;
-        setStdinNonBlocking();
-        while (true) {
-            const auto n_bytes_read = read(fileno(stdin), pcmf32.data(), n_bytes_len);
-            if (n_bytes_read == -1 && errno == EAGAIN) {
-                break;
-            } else if (n_bytes_read < 1) {
-                fprintf(stderr, "stdin ended too early\n");
-                is_running = false;
-                break;
-            }
-            n_mod = n_bytes_read % sizeof(float);
-            if (n_bytes_read < n_bytes_len) {
-                break;
-            }
-        }
-    }
-
     fprintf(stderr, "[Start speaking]\n");
     fflush(stderr);
 
-    if (piped) {
-        // ignore the partial sample
-        if (n_mod > 0) {
-            const auto n_remain = sizeof(float) - n_mod;
-            setStdinBlocking();
-            if (n_remain != fread(pcmf32.data(), 1, n_remain, stdin)) {
-                is_running = false;
-            }
-        }
-        setStdinNonBlocking();
-    }
-
     auto t_last  = std::chrono::high_resolution_clock::now();
     auto t_interim = t_last;
     bool is_interim = false;
@@ -332,12 +285,15 @@ int main(int argc, char ** argv) {
         if (n_samples_new > n_samples_step) {
             pcmf32.clear();
         } else if (piped) {
-            pcmf32.resize(n_samples_len);
-            char *p_buf = (char *)pcmf32.data();
+            // need at least step_ms
             const auto n_bytes_min = (n_samples_step - n_samples_new) * sizeof(float);
+            // but try to get length_ms at first
             auto n_bytes_wanted = n_samples_len * sizeof(float);
+            pcmf32.resize(n_samples_len);
+
             auto n_bytes_read = 0;
             while (n_bytes_wanted > 0) {
+                char *p_buf = (char *)pcmf32.data();
                 const auto n_read = read(fileno(stdin), p_buf + n_bytes_read, n_bytes_wanted);
                 if (n_read == 0 || n_read == -1 && errno != EAGAIN) {
                     fprintf(stderr, "read(stdin) returned %zd, errno = %d\n", n_read, errno);
@@ -348,11 +304,11 @@ int main(int argc, char ** argv) {
                 if (n_bytes_read < n_bytes_min) {
                     n_bytes_wanted = n_bytes_min - n_bytes_read;
                 } else {
-                    n_bytes_wanted = n_bytes_read % sizeof(float);
-                }
-                if (n_bytes_wanted > 0) {
-                    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                    const auto n_mod = n_bytes_read % sizeof(float);
+                    n_bytes_wanted = (n_mod != 0) ? sizeof(float) - n_mod : 0;
                 }
+                const auto est_ms = 1000 * n_bytes_wanted / sizeof(float) / WHISPER_SAMPLE_RATE;
+                std::this_thread::sleep_for(std::chrono::milliseconds(est_ms));
             }
             pcmf32.resize(n_bytes_read / sizeof(float));
         } else if (t_diff < abs(params.step_ms)) {
@@ -374,7 +330,7 @@ int main(int argc, char ** argv) {
         }
 
         n_samples_new += n_samples_buf;
-        if (!use_vad && n_samples_new > 2*n_samples_step) {
+        if (!use_vad && !piped && n_samples_new > 2*n_samples_step) {
             fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__);
             fprintf(stderr, "t_diff = %.2fs, new = %.2fs, buf = %.2fs\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(n_samples_buf)/WHISPER_SAMPLE_RATE);
             n_samples_old = 0;
@@ -513,10 +469,10 @@ int main(int argc, char ** argv) {
                             text << std::endl;
                         }
                         if (is_interim) {
-                            // utf-8 cannot be simply cut into two
+                            // utf-8 cannot be simply cut
                             std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
                             const auto t_u32 = conv.from_bytes(i_text);
-                            const auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() * 0.7));
+                            const auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() * 0.9));
                             i_text = t_sub + "…";
                         }
                         if (s_to_delete.size() > 0) {

From f54a43921581f7c4d54d231a5e2dd580c5ec819b Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Sun, 19 Jan 2025 10:12:12 +0900
Subject: [PATCH 13/14] Make it more reviewer-friendly

Update README

Follow the behavior in README,
in particular, the "sliding window" part

Rename variables to easier-to-review names
and rewrite if-conditions
---
 examples/stream/README.md  |  13 +-
 examples/stream/stream.cpp | 252 +++++++++++++++++++------------------
 2 files changed, 144 insertions(+), 121 deletions(-)

diff --git a/examples/stream/README.md b/examples/stream/README.md
index f07cfb8915c..fb0ba37809d 100644
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@@ -12,7 +12,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
 
 ## Sliding window mode with VAD
 
-Setting the `--step` argument to `0` enables the sliding window mode:
+Setting the `--step` argument to `0` or a negative value enables the sliding window mode:
 
 ```bash
  ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
@@ -25,6 +25,17 @@ It's best to tune it to the specific use case, but a value around `0.6` should b
 When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
 a transcription block that is suitable for parsing.
 
+You can also set the `--interim` argument to force transcription before the VAD detects silence.
+
+```bash
+ ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step -2000 --length 10000 -vth 0.6 --interim --keep 200
+```
+
+This transcribes the audio every two seconds, even while the VAD reports that speech is still
+ongoing, and keeps the last segment unconfirmed. In this mode the time window does not slide:
+if a sentence does not end within `--length` milliseconds, the audio is cut at that point and
+transcribed anyway, keeping the last `--keep` milliseconds for the next inference.
+
 ## Building
 
 The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index dfed6143e27..49a14603209 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -25,18 +25,6 @@
     #include <unistd.h>
 #endif
 
-void setStdinNonBlocking() {
-#ifdef _WIN32
-    DWORD mode;
-    HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE);
-    GetConsoleMode(stdinHandle, &mode);
-    mode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
-    SetConsoleMode(stdinHandle, mode);
-#else
-    fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) | O_NONBLOCK);
-#endif
-}
-
 
 // command-line parameters
 struct whisper_params {
@@ -47,6 +35,7 @@ struct whisper_params {
     int32_t capture_id = -1;
     int32_t max_tokens = 128;
     int32_t audio_ctx  = 0;
+    int32_t n_tmp_segs = 1;
 
     float vad_thold    = 0.6f;
     float freq_thold   = 100.0f;
@@ -151,6 +140,9 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.step_ms == 0) {
+        params.step_ms = -2000; // reasonable default for VAD
+    }
     params.keep_ms   = std::min(params.keep_ms,   abs(params.step_ms));
     params.length_ms = std::max(params.length_ms, abs(params.step_ms));
 
@@ -161,26 +153,38 @@ int main(int argc, char ** argv) {
     const int n_samples_100ms= (1e-3*100.0              )*WHISPER_SAMPLE_RATE;
 
     const bool use_vad = params.step_ms <= 0; // sliding window mode uses VAD
+    const bool piped = !isatty(fileno(stdin));
 
     // init audio
 
     audio_async audio(params.length_ms);
-    bool piped = !isatty(fileno(stdin));
 
-    if (piped) {
-        #ifdef _WIN32
-        _setmode(_fileno(stdin), _O_BINARY);
-        #else
-        freopen(NULL, "rb", stdin);
-        #endif
-        setStdinNonBlocking();
-    } else {
+    if (!piped) {
         if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
             fprintf(stderr, "%s: audio.init() failed!\n", __func__);
             return 1;
         }
 
         audio.resume();
+    } else {
+        fprintf(stderr, "%s: audio is from stdin, not from microphone\n", __func__);
+
+        #ifdef _WIN32
+        _setmode(_fileno(stdin), _O_BINARY);
+        #else
+        freopen(NULL, "rb", stdin);
+        #endif
+
+        // non-blocking mode
+        #ifdef _WIN32
+        DWORD mode;
+        HANDLE stdinHandle = GetStdHandle(STD_INPUT_HANDLE);
+        GetConsoleMode(stdinHandle, &mode);
+        mode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
+        SetConsoleMode(stdinHandle, mode);
+        #else
+        fcntl(fileno(stdin), F_SETFL, fcntl(fileno(stdin), F_GETFL, 0) | O_NONBLOCK);
+        #endif
     }
 
     // whisper init
@@ -225,10 +229,10 @@ int main(int argc, char ** argv) {
                 params.translate ? "translate" : "transcribe",
                 params.no_timestamps ? 0 : 1);
 
-        if (!use_vad) {
-            fprintf(stderr, "%s: no_context = %d\n", __func__, params.no_context);
-        } else {
+        fprintf(stderr, "%s: no_context = %d\n", __func__, params.no_context);
+        if (use_vad) {
             fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
+            fprintf(stderr, "%s: interim report = %d, temporary segments = %d\n", __func__, params.interim, params.n_tmp_segs);
         }
 
         fprintf(stderr, "\n");
@@ -258,15 +262,13 @@ int main(int argc, char ** argv) {
 
         wavWriter.open(filename, WHISPER_SAMPLE_RATE, 16, 1);
     }
-
     fprintf(stderr, "[Start speaking]\n");
     fflush(stderr);
 
     auto t_last  = std::chrono::high_resolution_clock::now();
-    auto t_interim = t_last;
-    bool is_interim = false;
     const auto t_start = t_last;
-    std::string s_to_delete = "";
+    auto t_interim = t_last;
+    std::string s_tmp = "";
 
     // main audio loop
     while (is_running) {
@@ -281,13 +283,26 @@ int main(int argc, char ** argv) {
         const auto t_now  = std::chrono::high_resolution_clock::now();
         const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
 
-        // get new audio
-        if (n_samples_new > n_samples_step) {
-            pcmf32.clear();
-        } else if (piped) {
-            // need at least step_ms
+        if (!piped) {
+            const auto sleep_ms = abs(params.step_ms) - t_diff;
+            if (sleep_ms > 0) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms));
+                continue;
+            }
+
+            audio.next(pcmf32);
+
+            if ((int) pcmf32.size() > 2*n_samples_step) {
+                fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__);
+                fprintf(stderr, "t_diff = %.2f sec, prev = %.2f sec, got = %.2f sec\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(pcmf32.size())/WHISPER_SAMPLE_RATE);
+                n_samples_old = 0;
+                n_samples_new = 0;
+                t_last = t_now;
+                continue;
+            }
+        } else {
+            // piped: need at least step_ms but try to get length_ms at first
             const auto n_bytes_min = (n_samples_step - n_samples_new) * sizeof(float);
-            // but try to get length_ms at first
             auto n_bytes_wanted = n_samples_len * sizeof(float);
             pcmf32.resize(n_samples_len);
 
@@ -296,8 +311,8 @@ int main(int argc, char ** argv) {
                 char *p_buf = (char *)pcmf32.data();
                 const auto n_read = read(fileno(stdin), p_buf + n_bytes_read, n_bytes_wanted);
                 if (n_read == 0 || n_read == -1 && errno != EAGAIN) {
-                    fprintf(stderr, "read(stdin) returned %zd, errno = %d\n", n_read, errno);
-                    is_running = false;
+                    fprintf(stderr, "read(stdin) returned %zd, errno = %s\n", n_read, strerror(errno));
+                    is_running = false; // flush all results
                     break;
                 }
                 n_bytes_read += std::max<long>(0, n_read);
@@ -307,15 +322,10 @@ int main(int argc, char ** argv) {
                     const auto n_mod = n_bytes_read % sizeof(float);
                     n_bytes_wanted = (n_mod != 0) ? sizeof(float) - n_mod : 0;
                 }
-                const auto est_ms = 1000 * n_bytes_wanted / sizeof(float) / WHISPER_SAMPLE_RATE;
-                std::this_thread::sleep_for(std::chrono::milliseconds(est_ms));
+                const auto est_sleep_ms = 1000 * n_bytes_wanted / sizeof(float) / WHISPER_SAMPLE_RATE;
+                std::this_thread::sleep_for(std::chrono::milliseconds(est_sleep_ms));
             }
             pcmf32.resize(n_bytes_read / sizeof(float));
-        } else if (t_diff < abs(params.step_ms)) {
-            std::this_thread::sleep_for(std::chrono::milliseconds(abs(params.step_ms) - t_diff));
-            continue;
-        } else {
-            audio.next(pcmf32);
         }
 
         int n_samples_buf = pcmf32.size();
@@ -330,49 +340,46 @@ int main(int argc, char ** argv) {
         }
 
         n_samples_new += n_samples_buf;
-        if (!use_vad && !piped && n_samples_new > 2*n_samples_step) {
-            fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n", __func__);
-            fprintf(stderr, "t_diff = %.2fs, new = %.2fs, buf = %.2fs\n\n", 1e-3*t_diff, float(n_samples_new)/WHISPER_SAMPLE_RATE, float(n_samples_buf)/WHISPER_SAMPLE_RATE);
-            n_samples_old = 0;
-            n_samples_new = 0;
-            t_last = t_now;
-            continue;
-        }
 
         if (n_samples_old + n_samples_new == 0) {
             continue;
         }
 
-        is_interim = false;
-        bool is_aborted = true;
-
-        n_samples_buf = std::min(n_samples_len, n_samples_old + n_samples_new);
+        // prepare pcmf32 for inference
+        n_samples_buf = n_samples_old + n_samples_new;
         pcmf32.resize(n_samples_buf);
         copy(pcmf32_deque.end() - n_samples_buf, pcmf32_deque.end(), pcmf32.begin());
 
-        if (!use_vad){
+        // chop the audio unconditionally
+        bool use_keep_ms = ((!use_vad || params.interim) && n_samples_buf > n_samples_len);
+
+        // interim report in vad mode: once every step_ms,
+        // run the inference even if vad returns false,
+        // confirm (n_segments - params.n_tmp_segs) segments,
+        // and print other segments as s_tmp, which will be deleted
+        bool is_interim = false;
+
+        if (!use_vad || use_keep_ms || !is_running) {
+            use_keep_ms = true;
             n_samples_old += n_samples_new;
             n_samples_new = 0;
 
             t_last = t_now;
         } else {
-            is_aborted = (n_samples_buf > n_samples_len);
-
-            if (!is_running || is_aborted || ::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) {
+            if (::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, std::min(1000, abs(params.step_ms) / 2), params.vad_thold, params.freq_thold, false)) {
                 n_samples_new = 0;
                 n_samples_old = 0;
 
                 t_last = t_now;
             } else {
-                const auto n_interim_diff_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_interim).count();
+                const auto interim_diff_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_interim).count();
 
-                if (params.interim && n_interim_diff_ms > abs(params.step_ms)) {
-                    is_interim = (n_interim_diff_ms < params.length_ms - abs(params.step_ms));
+                if (params.interim && interim_diff_ms > abs(params.step_ms)) {
+                    is_interim = true;
                     n_samples_old += n_samples_new;
                     n_samples_new = 0;
-                    pcmf32.resize(n_samples_old);
-                    copy(pcmf32_deque.end() - n_samples_old, pcmf32_deque.end(), pcmf32.begin());
                 } else {
+                    // sliding window
                     n_samples_new -= n_samples_100ms;
                     n_samples_old = std::min(n_samples_len, n_samples_old + n_samples_100ms);
                     if (!piped) {
@@ -408,145 +415,150 @@ int main(int argc, char ** argv) {
             wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
             wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();
 
+            // call whisper_full() with at least 1 sec of buffer
             {
                 auto pcm_size = pcmf32.size();
                 if (pcm_size < WHISPER_SAMPLE_RATE * 1.1) {
                     pcmf32.resize(pcm_size + WHISPER_SAMPLE_RATE, 0.0f);
                 }
-                if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+                if (whisper_full(ctx, wparams, pcmf32.data(), pcm_size) != 0) {
                     fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                     return 6;
                 }
                 pcmf32.resize(pcm_size);
             }
-            t_interim  = std::chrono::high_resolution_clock::now();
+            t_interim = std::chrono::high_resolution_clock::now();
 
             // print result;
             int n_segments;
-            bool no_confirmed = (!use_vad && n_samples_old < n_samples_len - n_samples_step);
-            std::ostringstream text;
+            bool is_all_tmp = (!use_vad && n_samples_old < n_samples_len - n_samples_step);
+            std::ostringstream ss_output;
+
             {
-                if (params.delete_vt100 && s_to_delete.size()) {
+                if (params.delete_vt100 && s_tmp.size()) {
                     printf("\33[2K\r");
 
                     // print long empty line to clear the previous line
-                    printf("%s", std::string(s_to_delete.size(), ' ').c_str());
+                    printf("%s", std::string(s_tmp.size(), ' ').c_str());
 
                     printf("\33[2K\r");
                 }
-                s_to_delete.clear();
+                s_tmp.clear();
 
                 n_segments = whisper_full_n_segments(ctx);
-                no_confirmed = (no_confirmed || is_interim && n_segments <= 1);
-                if (is_running && is_interim && !no_confirmed) {
-                    const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - 2) * 10;
+                is_all_tmp = (is_running && (is_all_tmp || is_interim && n_segments <= params.n_tmp_segs));
+                if (is_running && is_interim && !is_all_tmp) {
+                    const int64_t t1_ms = whisper_full_get_segment_t1(ctx, n_segments - params.n_tmp_segs - 1) * 10;
                     if (t1_ms < abs(params.step_ms)) {
                         // too short to confirm
-                        no_confirmed = true;
+                        is_all_tmp = true;
                     } else {
                         t_last += std::chrono::milliseconds(t1_ms);
                         const auto n_samples_confirmed = (1e-3*t1_ms)*WHISPER_SAMPLE_RATE;
                         pcmf32.resize(n_samples_confirmed); // for timestamps
-                        n_samples_old -= n_samples_confirmed;
+                        n_samples_old -= n_samples_confirmed; // kept for next iteration
                     }
                 }
 
-                if (use_vad && !params.no_timestamps && (!is_running || !no_confirmed)) {
+                bool show_n_iter = (use_vad && !params.no_timestamps && !is_all_tmp);
+
+                if (show_n_iter) {
                     const int64_t t1 = (t_last - t_start).count()/1000000;
                     const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
 
-                    text << std::endl;
-                    text << "### Transcription " << n_iter << " START | t0 = " << t0 << " ms | t1 = " << t1 << " ms" << std::endl;
-                    text << std::endl;
+                    ss_output << std::endl;
+                    ss_output << "### Transcription " << n_iter << " START | t0 = " << t0 << " ms | t1 = " << t1 << " ms" << std::endl;
+                    ss_output << std::endl;
                 }
 
                 for (int i = 0; i < n_segments; ++i) {
-                    std::string i_text = whisper_full_get_segment_text(ctx, i);
+                    std::string text = whisper_full_get_segment_text(ctx, i);
 
-                    // last segment may be s_to_delete
-                    if (i == n_segments - 1 && is_running && (no_confirmed || is_interim)) {
+                    // last segment(s) may be s_tmp
+                    if (i >= n_segments - params.n_tmp_segs && is_running && (is_all_tmp || is_interim)) {
                         if (params.no_timestamps && i > 0) {
-                            text << std::endl;
+                            ss_output << std::endl;
                         }
                         if (is_interim) {
-                            // utf-8 cannot be simply cut
+                            // utf-8 cannot be simply cut, so use char32_t
                             std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
-                            const auto t_u32 = conv.from_bytes(i_text);
-                            const auto t_sub = conv.to_bytes(t_u32.substr(0, t_u32.size() * 0.9));
-                            i_text = t_sub + "…";
-                        }
-                        if (s_to_delete.size() > 0) {
-                            s_to_delete += " ";
+                            const auto s_u32 = conv.from_bytes(text);
+                            const auto s_sub = conv.to_bytes(s_u32.substr(0, s_u32.size() * 0.9));
+                            text = s_sub + "…";
                         }
-                        s_to_delete += i_text;
-                        if (!params.delete_vt100) {
-                            s_to_delete = "(" + s_to_delete + ")";
+                        if (s_tmp.size() > 0) {
+                            s_tmp += " ";
                         }
-                        break;
+                        s_tmp += text;
+                        continue;
                     }
 
-                    if (is_running && no_confirmed) {
-                        if (s_to_delete.size() > 0) {
-                            s_to_delete += " ";
+                    if (is_all_tmp) {
+                        if (s_tmp.size() > 0) {
+                            s_tmp += " ";
                         }
-                        s_to_delete += i_text;
+                        s_tmp += text;
                     } else if (params.no_timestamps) {
                         if (i > 0) {
-                            text << std::endl;
+                            ss_output << std::endl;
                         }
-                        text << i_text;
-                    } else if (!is_running || !(is_interim && i == n_segments - 1)) {
+                        ss_output << text;
+                    } else {
                         const int64_t t_end = (t_last - t_start).count()/1000000;
                         const int64_t t_beg = std::max(0.0, t_end - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
                         const int64_t t0 = t_beg/10 + whisper_full_get_segment_t0(ctx, i);
                         const int64_t t1 = t_beg/10 + whisper_full_get_segment_t1(ctx, i);
 
-                        text << "[" << to_timestamp(t0, false) << " --> " << to_timestamp(t1, false) << "]  " << i_text;
+                        ss_output << "[" << to_timestamp(t0, false) << " --> " << to_timestamp(t1, false) << "]  " << text;
 
                         if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
-                            text << " [SPEAKER_TURN]";
+                            ss_output << " [SPEAKER_TURN]";
                         }
 
-                        text << std::endl;
+                        ss_output << std::endl;
                     }
                 }
 
-                if (use_vad && !params.no_timestamps && (!is_running || !no_confirmed)) {
-                    text << std::endl;
-                    text << "### Transcription " << n_iter << " END";
-                    text << std::endl;
-                    if (s_to_delete.size() > 0) {
-                        text << std::endl;
+                if (show_n_iter) {
+                    ss_output << std::endl;
+                    ss_output << "### Transcription " << n_iter << " END" << std::endl;
+                    if (s_tmp.size() > 0) {
+                        ss_output << std::endl;
                     }
                 }
             }
 
             if (params.fname_out.length() > 0) {
-                fout << text.str();
+                fout << ss_output.str();
                 fout << std::endl;
             }
 
-            if (!no_confirmed) {
+            if (!is_all_tmp) {
                 ++n_iter;
             }
 
-            printf("%s", text.str().c_str());
+            printf("%s", ss_output.str().c_str());
+
+            if (s_tmp.size() > 0) {
+                if (!params.delete_vt100) {
+                    s_tmp = "(" + s_tmp + ")\n";
+                }
+                printf("%s", s_tmp.c_str());
 
-            if (is_running && (no_confirmed || is_interim)) {
-                printf("%s%s", s_to_delete.c_str(), params.delete_vt100 ? "" : "\n");
-                --n_segments; // exclude s_to_delete from context
+                // exclude s_tmp from context
+                n_segments -= is_all_tmp ? n_segments : params.n_tmp_segs;
             } else {
                 printf("\n");
-                s_to_delete = "";
+                s_tmp = "";
 
-                if (is_aborted) {
+                if (use_keep_ms) {
                     // keep part of the audio for next iteration to try to mitigate word boundary issues
                     n_samples_old = std::min(n_samples_old, n_samples_keep);
                 }
             }
 
             // Add tokens of the last full length segment as the prompt
-            if (!no_confirmed && !params.no_context) {
+            if (n_segments > 0 && !params.no_context) {
                 prompt_tokens.clear();
 
                 for (int i = 0; i < n_segments; ++i) {

From 09111f0c110e69867a30093a6b6d0a7fd5af3100 Mon Sep 17 00:00:00 2001
From: Tamotsu Takahashi <ttakah+github@gmail.com>
Date: Tue, 21 Jan 2025 21:06:29 +0900
Subject: [PATCH 14/14] Fix n_samples_new > n_samples_step case

---
 examples/stream/stream.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index 49a14603209..38a15c448e4 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -302,11 +302,11 @@ int main(int argc, char ** argv) {
             }
         } else {
             // piped: need at least step_ms but try to get length_ms at first
-            const auto n_bytes_min = (n_samples_step - n_samples_new) * sizeof(float);
+            const auto n_bytes_min = std::max<long>(0, (n_samples_step - n_samples_new) * sizeof(float));
             auto n_bytes_wanted = n_samples_len * sizeof(float);
             pcmf32.resize(n_samples_len);
 
-            auto n_bytes_read = 0;
+            long n_bytes_read = 0;
             while (n_bytes_wanted > 0) {
                 char *p_buf = (char *)pcmf32.data();
                 const auto n_read = read(fileno(stdin), p_buf + n_bytes_read, n_bytes_wanted);