@@ -19,6 +19,7 @@ struct whisper_params {
19
19
int32_t max_len = 0 ;
20
20
int32_t best_of = 5 ;
21
21
int32_t beam_size = -1 ;
22
+ int32_t audio_ctx = 0 ;
22
23
23
24
float word_thold = 0 .01f ;
24
25
float entropy_thold = 2 .4f ;
@@ -46,6 +47,8 @@ struct whisper_params {
46
47
47
48
std::vector<std::string> fname_inp = {};
48
49
std::vector<std::string> fname_out = {};
50
+
51
+ std::vector<float > pcmf32 = {}; // mono-channel F32 PCM
49
52
};
50
53
51
54
struct whisper_print_user_data {
@@ -125,13 +128,12 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
125
128
// Log callback that ignores all of its arguments and does nothing.
// run() installs it through whisper_log_set() when params.no_prints is set,
// presumably so that library log messages are dropped (confirm against
// whisper_log_set's contract).
void cb_log_disable (enum ggml_log_level, const char *, void *) {}
126
129
127
130
int run (whisper_params ¶ms, std::vector<std::vector<std::string>> &result) {
128
-
129
131
if (params.no_prints ) {
130
132
whisper_log_set (cb_log_disable, NULL );
131
133
}
132
134
133
- if (params.fname_inp .empty ()) {
134
- fprintf (stderr, " error: no input files specified\n " );
135
+ if (params.fname_inp .empty () && params. pcmf32 . empty () ) {
136
+ fprintf (stderr, " error: no input files or audio buffer specified\n " );
135
137
return 2 ;
136
138
}
137
139
@@ -151,16 +153,29 @@ int run(whisper_params ¶ms, std::vector<std::vector<std::string>> &result) {
151
153
return 3 ;
152
154
}
153
155
156
+ // if params.pcmf32 is provided, set params.fname_inp to "buffer"
157
+ // this is simpler than further modifications in the code
158
+ if (!params.pcmf32 .empty ()) {
159
+ fprintf (stderr, " info: using audio buffer as input\n " );
160
+ params.fname_inp .clear ();
161
+ params.fname_inp .emplace_back (" buffer" );
162
+ }
163
+
154
164
for (int f = 0 ; f < (int ) params.fname_inp .size (); ++f) {
155
165
const auto fname_inp = params.fname_inp [f];
156
166
const auto fname_out = f < (int )params.fname_out .size () && !params.fname_out [f].empty () ? params.fname_out [f] : params.fname_inp [f];
157
167
158
168
std::vector<float > pcmf32; // mono-channel F32 PCM
159
169
std::vector<std::vector<float >> pcmf32s; // stereo-channel F32 PCM
160
170
161
- if (!::read_wav (fname_inp, pcmf32, pcmf32s, params.diarize )) {
162
- fprintf (stderr, " error: failed to read WAV file '%s'\n " , fname_inp.c_str ());
163
- continue ;
171
+ // read the input audio file if params.pcmf32 is not provided
172
+ if (params.pcmf32 .empty ()) {
173
+ if (!::read_wav (fname_inp, pcmf32, pcmf32s, params.diarize )) {
174
+ fprintf (stderr, " error: failed to read WAV file '%s'\n " , fname_inp.c_str ());
175
+ continue ;
176
+ }
177
+ } else {
178
+ pcmf32 = params.pcmf32 ;
164
179
}
165
180
166
181
// print system information
@@ -180,12 +195,13 @@ int run(whisper_params ¶ms, std::vector<std::vector<std::string>> &result) {
180
195
fprintf (stderr, " %s: WARNING: model is not multilingual, ignoring language and translation options\n " , __func__);
181
196
}
182
197
}
183
- fprintf (stderr, " %s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n " ,
198
+ fprintf (stderr, " %s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n " ,
184
199
__func__, fname_inp.c_str (), int (pcmf32.size ()), float (pcmf32.size ())/WHISPER_SAMPLE_RATE,
185
200
params.n_threads , params.n_processors ,
186
201
params.language .c_str (),
187
202
params.translate ? " translate" : " transcribe" ,
188
- params.no_timestamps ? 0 : 1 );
203
+ params.no_timestamps ? 0 : 1 ,
204
+ params.audio_ctx );
189
205
190
206
fprintf (stderr, " \n " );
191
207
}
@@ -212,6 +228,7 @@ int run(whisper_params ¶ms, std::vector<std::vector<std::string>> &result) {
212
228
wparams.entropy_thold = params.entropy_thold ;
213
229
wparams.logprob_thold = params.logprob_thold ;
214
230
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len ;
231
+ wparams.audio_ctx = params.audio_ctx ;
215
232
216
233
wparams.speed_up = params.speed_up ;
217
234
@@ -311,14 +328,28 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
311
328
bool use_gpu = whisper_params.Get (" use_gpu" ).As <Napi::Boolean>();
312
329
bool no_prints = whisper_params.Get (" no_prints" ).As <Napi::Boolean>();
313
330
bool no_timestamps = whisper_params.Get (" no_timestamps" ).As <Napi::Boolean>();
331
+ int32_t audio_ctx = whisper_params.Get (" audio_ctx" ).As <Napi::Number>();
314
332
bool comma_in_time = whisper_params.Get (" comma_in_time" ).As <Napi::Boolean>();
315
333
334
+ Napi::Value pcmf32Value = whisper_params.Get (" pcmf32" );
335
+ std::vector<float > pcmf32_vec;
336
+ if (pcmf32Value.IsTypedArray ()) {
337
+ Napi::Float32Array pcmf32 = pcmf32Value.As <Napi::Float32Array>();
338
+ size_t length = pcmf32.ElementLength ();
339
+ pcmf32_vec.reserve (length);
340
+ for (size_t i = 0 ; i < length; i++) {
341
+ pcmf32_vec.push_back (pcmf32[i]);
342
+ }
343
+ }
344
+
316
345
params.language = language;
317
346
params.model = model;
318
347
params.fname_inp .emplace_back (input);
319
348
params.use_gpu = use_gpu;
320
349
params.no_prints = no_prints;
321
350
params.no_timestamps = no_timestamps;
351
+ params.audio_ctx = audio_ctx;
352
+ params.pcmf32 = pcmf32_vec;
322
353
params.comma_in_time = comma_in_time;
323
354
324
355
Napi::Function callback = info[1 ].As <Napi::Function>();
0 commit comments