Skip to content

Commit 3928dbd

Browse files
pprobst and ggerganov authored
node : add audio_ctx and audio buffer params (ggml-org#2123)
* node : add audio_ctx param
* node : support passing audio buffer directly
* node : parse audio_ctx in index.js

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent 2ced6f0 commit 3928dbd

File tree

3 files changed

+48
-9
lines changed

3 files changed

+48
-9
lines changed

examples/addon.node/__test__/whisper.spec.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ const whisperParamsMock = {
1616
comma_in_time: false,
1717
translate: true,
1818
no_timestamps: false,
19+
audio_ctx: 0,
1920
};
2021

2122
describe("Run whisper.node", () => {

examples/addon.node/addon.cpp

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ struct whisper_params {
1919
int32_t max_len = 0;
2020
int32_t best_of = 5;
2121
int32_t beam_size = -1;
22+
int32_t audio_ctx = 0;
2223

2324
float word_thold = 0.01f;
2425
float entropy_thold = 2.4f;
@@ -46,6 +47,8 @@ struct whisper_params {
4647

4748
std::vector<std::string> fname_inp = {};
4849
std::vector<std::string> fname_out = {};
50+
51+
std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
4952
};
5053

5154
struct whisper_print_user_data {
@@ -125,13 +128,12 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
125128
// No-op log callback: ignores all of its arguments and does nothing.
// run() passes it to whisper_log_set() when params.no_prints is set.
void cb_log_disable(enum ggml_log_level, const char *, void *) {}
126129

127130
int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
128-
129131
if (params.no_prints) {
130132
whisper_log_set(cb_log_disable, NULL);
131133
}
132134

133-
if (params.fname_inp.empty()) {
134-
fprintf(stderr, "error: no input files specified\n");
135+
if (params.fname_inp.empty() && params.pcmf32.empty()) {
136+
fprintf(stderr, "error: no input files or audio buffer specified\n");
135137
return 2;
136138
}
137139

@@ -151,16 +153,29 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
151153
return 3;
152154
}
153155

156+
// if params.pcmf32 is provided, set params.fname_inp to "buffer"
157+
// this is simpler than further modifications in the code
158+
if (!params.pcmf32.empty()) {
159+
fprintf(stderr, "info: using audio buffer as input\n");
160+
params.fname_inp.clear();
161+
params.fname_inp.emplace_back("buffer");
162+
}
163+
154164
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
155165
const auto fname_inp = params.fname_inp[f];
156166
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
157167

158168
std::vector<float> pcmf32; // mono-channel F32 PCM
159169
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
160170

161-
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
162-
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
163-
continue;
171+
// read the input audio file if params.pcmf32 is not provided
172+
if (params.pcmf32.empty()) {
173+
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
174+
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
175+
continue;
176+
}
177+
} else {
178+
pcmf32 = params.pcmf32;
164179
}
165180

166181
// print system information
@@ -180,12 +195,13 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
180195
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
181196
}
182197
}
183-
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
198+
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
184199
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
185200
params.n_threads, params.n_processors,
186201
params.language.c_str(),
187202
params.translate ? "translate" : "transcribe",
188-
params.no_timestamps ? 0 : 1);
203+
params.no_timestamps ? 0 : 1,
204+
params.audio_ctx);
189205

190206
fprintf(stderr, "\n");
191207
}
@@ -212,6 +228,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
212228
wparams.entropy_thold = params.entropy_thold;
213229
wparams.logprob_thold = params.logprob_thold;
214230
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
231+
wparams.audio_ctx = params.audio_ctx;
215232

216233
wparams.speed_up = params.speed_up;
217234

@@ -311,14 +328,28 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
311328
bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
312329
bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
313330
bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
331+
int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
314332
bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
315333

334+
Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
335+
std::vector<float> pcmf32_vec;
336+
if (pcmf32Value.IsTypedArray()) {
337+
Napi::Float32Array pcmf32 = pcmf32Value.As<Napi::Float32Array>();
338+
size_t length = pcmf32.ElementLength();
339+
pcmf32_vec.reserve(length);
340+
for (size_t i = 0; i < length; i++) {
341+
pcmf32_vec.push_back(pcmf32[i]);
342+
}
343+
}
344+
316345
params.language = language;
317346
params.model = model;
318347
params.fname_inp.emplace_back(input);
319348
params.use_gpu = use_gpu;
320349
params.no_prints = no_prints;
321350
params.no_timestamps = no_timestamps;
351+
params.audio_ctx = audio_ctx;
352+
params.pcmf32 = pcmf32_vec;
322353
params.comma_in_time = comma_in_time;
323354

324355
Napi::Function callback = info[1].As<Napi::Function>();

examples/addon.node/index.js

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,20 @@ const whisperParams = {
1616
comma_in_time: false,
1717
translate: true,
1818
no_timestamps: false,
19+
audio_ctx: 0,
1920
};
2021

2122
const arguments = process.argv.slice(2);
2223
const params = Object.fromEntries(
2324
arguments.reduce((pre, item) => {
2425
if (item.startsWith("--")) {
25-
return [...pre, item.slice(2).split("=")];
26+
const [key, value] = item.slice(2).split("=");
27+
if (key === "audio_ctx") {
28+
whisperParams[key] = parseInt(value);
29+
} else {
30+
whisperParams[key] = value;
31+
}
32+
return pre;
2633
}
2734
return pre;
2835
}, [])

0 commit comments

Comments
 (0)