// Copyright 2020-2021 Alpha Cephei Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* This header contains the C API for the Vosk speech recognition system */
#ifndef VOSK_API_H
#define VOSK_API_H
#ifdef __cplusplus
extern "C" {
#endif
/** Model stores all the data required for recognition.
 *  It contains static data and can be shared across processing
 *  threads. */
typedef struct VoskModel VoskModel;
/** Speaker model is the same as model but contains the data
* for speaker identification. */
typedef struct VoskSpkModel VoskSpkModel;
/** Recognizer object is the main object which processes data.
 *  Each recognizer usually runs in its own thread and takes audio as input.
 *  Once audio is processed, the recognizer returns a JSON object as a string
 *  which represents the decoded information - words, confidences, times, n-best lists,
 *  speaker information and so on. */
typedef struct VoskRecognizer VoskRecognizer;
/**
* Batch model object
*/
typedef struct VoskBatchModel VoskBatchModel;
/**
* Batch recognizer object
*/
typedef struct VoskBatchRecognizer VoskBatchRecognizer;
/** Loads model data from the file and returns the model object
 *
 *  @param model_path: the path of the model on the filesystem
 *  @returns model object or NULL if a problem occurred */
VoskModel *vosk_model_new(const char *model_path);
/** Releases the model memory
 *
 *  The model object is reference-counted, so if some recognizer
 *  depends on this model, the model might still stay alive. When the
 *  last recognizer is released, the model will be released too. */
void vosk_model_free(VoskModel *model);
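/** Example (usage sketch, not part of the original header; the "model"
 *  directory path below is a placeholder) - loading a model and releasing it
 *  once all recognizers that use it have been freed:
 *
 * <pre>
 *   VoskModel *model = vosk_model_new("model");
 *   if (model == NULL) {
 *       fprintf(stderr, "Failed to load model\n");
 *       return 1;
 *   }
 *   // ... create and use recognizers sharing this model ...
 *   vosk_model_free(model);
 * </pre>
 */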
/** Checks if a word can be recognized by the model
 *  @param word: the word
 *  @returns the word symbol if @param word exists inside the model
 *  or -1 otherwise.
 *  Note that word symbol 0 is reserved for <epsilon> */
int vosk_model_find_word(VoskModel *model, const char *word);
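/** Example (usage sketch): checking that a word is in the model vocabulary,
 *  e.g. before adding it to a grammar. The word "seven" is just an illustration.
 *
 * <pre>
 *   if (vosk_model_find_word(model, "seven") == -1)
 *       fprintf(stderr, "'seven' is not in the model vocabulary\n");
 * </pre>
 */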
/** Loads speaker model data from the file and returns the model object
 *
 *  @param model_path: the path of the model on the filesystem
 *  @returns model object or NULL if a problem occurred */
VoskSpkModel *vosk_spk_model_new(const char *model_path);
/** Releases the speaker model memory
 *
 *  The model object is reference-counted, so if some recognizer
 *  depends on this model, the model might still stay alive. When the
 *  last recognizer is released, the model will be released too. */
void vosk_spk_model_free(VoskSpkModel *model);
/** Creates the recognizer object
 *
 *  The recognizer processes speech and returns text using shared model data.
 *  @param model       VoskModel containing static data for the recognizer. The model can be
 *                      shared across recognizers, even ones running in different threads.
 *  @param sample_rate  The sample rate of the audio you are going to feed into the recognizer.
 *                      Make sure this rate matches the audio content; a mismatch is a common
 *                      cause of accuracy problems.
 *  @returns recognizer object or NULL if a problem occurred */
VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate);
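/** Example (usage sketch, assuming 16 kHz 16-bit mono input audio and the
 *  model loaded in the earlier example):
 *
 * <pre>
 *   VoskRecognizer *recognizer = vosk_recognizer_new(model, 16000.0f);
 *   if (recognizer == NULL) {
 *       fprintf(stderr, "Failed to create recognizer\n");
 *       return 1;
 *   }
 * </pre>
 */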
/** Creates the recognizer object with speaker recognition
 *
 *  In speaker recognition mode the recognizer not only recognizes
 *  text but also returns speaker vectors one can use for speaker identification
 *
 *  @param model       VoskModel containing static data for the recognizer. The model can be
 *                      shared across recognizers, even ones running in different threads.
 *  @param sample_rate  The sample rate of the audio you are going to feed into the recognizer.
 *                      Make sure this rate matches the audio content; a mismatch is a common
 *                      cause of accuracy problems.
 *  @param spk_model    speaker model for speaker identification
 *  @returns recognizer object or NULL if a problem occurred */
VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, VoskSpkModel *spk_model);
/** Creates the recognizer object with the phrase list
 *
 *  When you want to improve recognition accuracy and don't need to recognize
 *  a large vocabulary, you can specify a list of phrases to recognize. This
 *  improves recognizer speed and accuracy, but the recognizer might return [unk]
 *  if the user said something different.
 *
 *  Only recognizers with lookahead models support this type of quick configuration.
 *  Precompiled HCLG graph models are not supported.
 *
 *  @param model       VoskModel containing static data for the recognizer. The model can be
 *                      shared across recognizers, even ones running in different threads.
 *  @param sample_rate  The sample rate of the audio you are going to feed into the recognizer.
 *                      Make sure this rate matches the audio content; a mismatch is a common
 *                      cause of accuracy problems.
 *  @param grammar      The list of phrases to recognize as a JSON array of strings,
 *                      for example: ["one two three four five", "[unk]"]
 *
 *  @returns recognizer object or NULL if a problem occurred */
VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, const char *grammar);
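/** Example (usage sketch): restricting recognition to spoken digits plus [unk].
 *  Note that the JSON array is written as an escaped C string literal.
 *
 * <pre>
 *   const char *grammar =
 *       "[\"zero one two three four five six seven eight nine\", \"[unk]\"]";
 *   VoskRecognizer *recognizer = vosk_recognizer_new_grm(model, 16000.0f, grammar);
 * </pre>
 */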
/** Adds a speaker model to an already initialized recognizer
 *
 *  Adds a speaker recognition model to an already created recognizer. This helps
 *  to enable speaker recognition for a grammar-based recognizer.
 *
 *  @param spk_model Speaker recognition model */
void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model);
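/** Example (usage sketch; the "spk-model" path is a placeholder):
 *
 * <pre>
 *   VoskSpkModel *spk_model = vosk_spk_model_new("spk-model");
 *   if (spk_model != NULL)
 *       vosk_recognizer_set_spk_model(recognizer, spk_model);
 * </pre>
 */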
/** Reconfigures the recognizer to use a grammar
 *
 *  @param recognizer Already running VoskRecognizer
 *  @param grammar Set of phrases in a JSON array of strings, or "[]" to use the default model graph.
 *                 See also vosk_recognizer_new_grm
 */
void vosk_recognizer_set_grm(VoskRecognizer *recognizer, char const *grammar);
/** Configures the recognizer to output n-best results
 *
 * <pre>
 *   {
 *      "alternatives": [
 *          { "text": "one two three four five", "confidence": 0.97 },
 *          { "text": "one two three for five", "confidence": 0.03 }
 *      ]
 *   }
 * </pre>
 *
 * @param max_alternatives - maximum alternatives to return from recognition results
 */
void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_alternatives);
/** Enables words with times in the output
*
* <pre>
* "result" : [{
* "conf" : 1.000000,
* "end" : 1.110000,
* "start" : 0.870000,
* "word" : "what"
* }, {
* "conf" : 1.000000,
* "end" : 1.530000,
* "start" : 1.110000,
* "word" : "zero"
* }, {
* "conf" : 1.000000,
* "end" : 1.950000,
* "start" : 1.530000,
* "word" : "zero"
* }, {
* "conf" : 1.000000,
* "end" : 2.340000,
* "start" : 1.950000,
* "word" : "zero"
* }, {
* "conf" : 1.000000,
* "end" : 2.610000,
* "start" : 2.340000,
* "word" : "one"
* }],
* </pre>
*
* @param words - boolean value
*/
void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words);
/** Like above, but returns words and confidences in partial results
 *
 * @param partial_words - boolean value
 */
void vosk_recognizer_set_partial_words(VoskRecognizer *recognizer, int partial_words);
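/** Example (usage sketch): a typical output configuration that enables
 *  word-level details in both final and partial results; pass 0 to turn
 *  either option back off:
 *
 * <pre>
 *   vosk_recognizer_set_words(recognizer, 1);          // words with times in final results
 *   vosk_recognizer_set_partial_words(recognizer, 1);  // words in partial results
 * </pre>
 */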
/** Set NLSML output
* @param nlsml - boolean value
*/
void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml);
/** Accept voice data
 *
 *  Accepts and processes a new chunk of voice data
 *
 *  @param data - audio data in PCM 16-bit mono format
 *  @param length - length of the audio data
 *  @returns 1 if silence occurred and you can retrieve a new utterance with the result method
 *           0 if decoding continues
 *           -1 if an exception occurred */
int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length);
/** Same as above, but takes short data for language bindings where you have
 *  audio as an array of shorts */
int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length);
/** Same as above, but takes float data for language bindings where you have
 *  audio as an array of floats */
int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length);
/** Returns speech recognition result
 *
 * @returns the result in JSON format which contains the decoded line, decoded
 *          words, times in seconds and confidences. You can parse this result
 *          with any JSON parser
 *
 * <pre>
 *  {
 *    "text" : "what zero zero zero one"
 *  }
 * </pre>
 *
 * If alternatives are enabled, the result contains alternatives; see also vosk_recognizer_set_max_alternatives().
 *
 * If word times are enabled, the result contains word times; see also vosk_recognizer_set_words().
 */
const char *vosk_recognizer_result(VoskRecognizer *recognizer);
/** Returns partial speech recognition result
 *
 * @returns partial speech recognition text which is not yet finalized.
 *          The result may change as the recognizer processes more data.
 *
 * <pre>
 * {
 *    "partial" : "cyril one eight zero"
 * }
 * </pre>
 */
const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer);
/** Returns speech recognition result. Same as result, but doesn't wait for silence.
 *  You usually call it at the end of the stream to get the final bits of audio. It
 *  flushes the feature pipeline, so all remaining audio chunks get processed.
 *
 *  @returns speech result in JSON format.
 */
const char *vosk_recognizer_final_result(VoskRecognizer *recognizer);
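/** Example (usage sketch): a typical streaming decode loop over raw 16-bit
 *  mono PCM audio. Here audio_file is an already opened FILE * with raw PCM
 *  data and the buffer size is arbitrary; error handling is omitted:
 *
 * <pre>
 *   char buf[3200];
 *   size_t nread;
 *   while ((nread = fread(buf, 1, sizeof(buf), audio_file)) > 0) {
 *       if (vosk_recognizer_accept_waveform(recognizer, buf, (int)nread))
 *           printf("%s\n", vosk_recognizer_result(recognizer));          // utterance finished
 *       else
 *           printf("%s\n", vosk_recognizer_partial_result(recognizer));  // still decoding
 *   }
 *   printf("%s\n", vosk_recognizer_final_result(recognizer));            // flush remaining audio
 * </pre>
 */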
/** Resets the recognizer
*
* Resets current results so the recognition can continue from scratch */
void vosk_recognizer_reset(VoskRecognizer *recognizer);
/** Releases recognizer object
 *
 *  The underlying model is also unreferenced and, if needed, released */
void vosk_recognizer_free(VoskRecognizer *recognizer);
/** Set log level for Kaldi messages
 *
 *  @param log_level the level
 *     0 - default value, prints info and error messages but no debug
 *     less than 0 - don't print info messages
 *     greater than 0 - more verbose mode
 */
void vosk_set_log_level(int log_level);
/**
* Init, automatically select a CUDA device and allow multithreading.
* Must be called once from the main thread.
* Has no effect if HAVE_CUDA flag is not set.
*/
void vosk_gpu_init();
/**
* Init CUDA device in a multi-threaded environment.
* Must be called for each thread.
* Has no effect if HAVE_CUDA flag is not set.
*/
void vosk_gpu_thread_init();
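/** Example (usage sketch): in a CUDA-enabled build, call vosk_gpu_init() once
 *  from the main thread and vosk_gpu_thread_init() from each worker thread
 *  that performs recognition:
 *
 * <pre>
 *   // main thread
 *   vosk_gpu_init();
 *   // in every worker thread, before using a recognizer:
 *   vosk_gpu_thread_init();
 * </pre>
 */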
/** Creates the batch model object
 *
 *  @param model_path: the path of the model on the filesystem
 *  @returns model object or NULL if a problem occurred */
VoskBatchModel *vosk_batch_model_new(const char *model_path);
/** Releases batch model object */
void vosk_batch_model_free(VoskBatchModel *model);
/** Wait for the processing */
void vosk_batch_model_wait(VoskBatchModel *model);
/** Creates batch recognizer object
 *  @returns recognizer object or NULL if a problem occurred */
VoskBatchRecognizer *vosk_batch_recognizer_new(VoskBatchModel *model, float sample_rate);
/** Releases batch recognizer object */
void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer);
/** Accept batch voice data */
void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, const char *data, int length);
/** Set NLSML output
* @param nlsml - boolean value
*/
void vosk_batch_recognizer_set_nlsml(VoskBatchRecognizer *recognizer, int nlsml);
/** Closes the stream */
void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer);
/** Returns the first (front) result */
const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer);
/** Release and free first retrieved result */
void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer);
/** Gets the number of pending chunks for more intelligent waiting */
int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer);
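/** Example (rough usage sketch; this call sequence is inferred from the
 *  declarations above and may differ from the reference Vosk examples -
 *  check those for the exact protocol). Audio feeding is abbreviated:
 *
 * <pre>
 *   vosk_batch_recognizer_accept_waveform(recognizer, buf, (int)nread);  // feed audio chunks
 *   vosk_batch_recognizer_finish_stream(recognizer);                     // no more audio
 *   while (vosk_batch_recognizer_get_pending_chunks(recognizer) > 0)
 *       vosk_batch_model_wait(model);                                    // wait for processing
 *   printf("%s\n", vosk_batch_recognizer_front_result(recognizer));      // read a result
 *   vosk_batch_recognizer_pop(recognizer);                               // release it
 * </pre>
 */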
#ifdef __cplusplus
}
#endif
#endif /* VOSK_API_H */