Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VAD based segmentation #97

Merged
merged 9 commits into from
May 16, 2024
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ target_sources(
src/whisper-utils/whisper-model-utils.cpp
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
src/translation/translation-utils.cpp
src/utils.cpp)
Expand All @@ -111,13 +112,15 @@ if(ENABLE_TESTS)
target_sources(
${CMAKE_PROJECT_NAME}-tests
PRIVATE src/tests/localvocal-offline-test.cpp
src/tests/audio-file-utils.cpp
src/transcription-utils.cpp
src/model-utils/model-infos.cpp
src/model-utils/model-find-utils.cpp
src/whisper-utils/whisper-processing.cpp
src/whisper-utils/whisper-utils.cpp
src/whisper-utils/silero-vad-onnx.cpp
src/whisper-utils/token-buffer-thread.cpp
src/translation/language_codes.cpp
src/translation/translation.cpp
src/utils.cpp)

Expand Down
2 changes: 1 addition & 1 deletion cmake/linux/compilerconfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ set(_obs_gcc_c_options
# cmake-format: sortable
-fno-strict-aliasing
-fopenmp-simd
-Wdeprecated-declarations
-Wempty-body
-Wenum-conversion
-Werror=return-type
-Wextra
-Wformat
-Wformat-security
-Wno-conversion
-Wno-deprecated-declarations
-Wno-error=deprecated-declarations
-Wno-float-conversion
-Wno-implicit-fallthrough
Expand Down
4 changes: 4 additions & 0 deletions data/locale/en-US.ini
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,7 @@ dtw_token_timestamps="DTW token timestamps"
buffered_output="Buffered output (Experimental)"
translate_model="Translation Model"
Whisper-Based-Translation="Whisper-Based Translation"
sentence_psum_accept_thresh="Sentence prob. threshold"
external_model_folder="External model folder"
load_external_model="Load external model"
translate_input_tokenization_style="Input token style"
2 changes: 1 addition & 1 deletion src/tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ The JSON config file can look e.g. like
"silero_vad_model_file": ".../obs-localvocal/data/models/silero-vad/silero_vad.onnx",
"ct2_model_folder": ".../obs-localvocal/models/m2m-100-418M",
"fix_utf8": true,
"suppress_sentences": "끝까지 시청해주셔서 감사합니다/n구독과 좋아요 부탁드립니다!/nMBC 뉴스 안영백입니다./nMBC 뉴스 이덕영입니다/n구독과 좋아요 눌러주세요!/n구독과 좋아요 부탁드",
"suppress_sentences": "다음 영상에서 만나요!\nMBC 뉴스 김지경입니다\nMBC 뉴스 김성현입니다\n구독과 좋아요 눌러주세요!\n구독과 좋아요는 저에게 아주 큰\n다음 영상에서 만나요\n끝까지 시청해주셔서 감사합니다\n구독과 좋아요 부탁드립니다!\nMBC 뉴스 이준범입니다\nMBC 뉴스 문재인입니다\nMBC 뉴스 김지연입니다\nMBC 뉴스 안영백입니다.\nMBC 뉴스 이덕영입니다\nMBC 뉴스 김상현입니다\n구독과 좋아요 눌러주세요!\n구독과 좋아요 부탁드",
"overlap_ms": 150,
"log_level": "debug",
"whisper_sampling_method": 0
Expand Down
272 changes: 272 additions & 0 deletions src/tests/audio-file-utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@

#include "audio-file-utils.h"
#include "plugin-support.h"

#include <obs-module.h>

#include <vector>
#include <functional>

#if defined(_WIN32) || defined(__APPLE__)

extern "C" {
#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>
#include <libavutil/frame.h>
#include <libavutil/mem.h>
#include <libavutil/opt.h>
#include <libswresample/swresample.h>
#include <libavutil/log.h>
}

std::vector<std::vector<uint8_t>>
read_audio_file(const char *filename, std::function<void(int, int)> initialization_callback)
{
av_log_set_level(AV_LOG_QUIET);

obs_log(LOG_INFO, "Reading audio file %s", filename);

AVFormatContext *formatContext = nullptr;
int ret = avformat_open_input(&formatContext, filename, nullptr, nullptr);
if (ret != 0) {
char errbuf[AV_ERROR_MAX_STRING_SIZE];
av_make_error_string(errbuf, AV_ERROR_MAX_STRING_SIZE, ret);
obs_log(LOG_ERROR, "Error opening file: %s", errbuf);
return {};
}

if (avformat_find_stream_info(formatContext, nullptr) < 0) {
obs_log(LOG_ERROR, "Error finding stream information");
return {};
}

int audioStreamIndex = -1;
for (unsigned int i = 0; i < formatContext->nb_streams; i++) {
if (formatContext->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
audioStreamIndex = i;
break;
}
}

if (audioStreamIndex == -1) {
obs_log(LOG_ERROR, "No audio stream found");
return {};
}

// print information about the file
av_dump_format(formatContext, 0, filename, 0);

// if the sample format is not float, return
if (formatContext->streams[audioStreamIndex]->codecpar->format != AV_SAMPLE_FMT_FLTP) {
obs_log(LOG_ERROR,
"Sample format is not float (it is %s). Encode the audio file with float planar sample format."
" For example, use the command 'ffmpeg -i input.mp3 -f f32le -acodec pcm_f32le output.f32le'",
"convert the audio file to float format.",
av_get_sample_fmt_name(
(AVSampleFormat)formatContext->streams[audioStreamIndex]
->codecpar->format));
return {};
}

initialization_callback(formatContext->streams[audioStreamIndex]->codecpar->sample_rate,
formatContext->streams[audioStreamIndex]->codecpar->channels);

AVCodecParameters *codecParams = formatContext->streams[audioStreamIndex]->codecpar;
const AVCodec *codec = avcodec_find_decoder(codecParams->codec_id);
if (!codec) {
obs_log(LOG_ERROR, "Decoder not found");
return {};
}

AVCodecContext *codecContext = avcodec_alloc_context3(codec);
if (!codecContext) {
obs_log(LOG_ERROR, "Failed to allocate codec context");
return {};
}

if (avcodec_parameters_to_context(codecContext, codecParams) < 0) {
obs_log(LOG_ERROR, "Failed to copy codec parameters to codec context");
return {};
}

if (avcodec_open2(codecContext, codec, nullptr) < 0) {
obs_log(LOG_ERROR, "Failed to open codec");
return {};
}

AVFrame *frame = av_frame_alloc();
AVPacket packet;

std::vector<std::vector<uint8_t>> buffer(
formatContext->streams[audioStreamIndex]->codecpar->channels);

while (av_read_frame(formatContext, &packet) >= 0) {
if (packet.stream_index == audioStreamIndex) {
if (avcodec_send_packet(codecContext, &packet) == 0) {
while (avcodec_receive_frame(codecContext, frame) == 0) {
// push data to the buffer
for (int j = 0; j < codecContext->channels; j++) {
buffer[j].insert(buffer[j].end(), frame->data[j],
frame->data[j] +
frame->linesize[0]);
}
}
}
}
av_packet_unref(&packet);
}

av_frame_free(&frame);
avcodec_free_context(&codecContext);
avformat_close_input(&formatContext);

return buffer;
}

void write_audio_wav_file(const std::string &filename, const float *pcm32f_data,
const size_t frames)
{
av_log_set_level(AV_LOG_QUIET);

AVFormatContext *formatContext = nullptr;
AVCodecContext *codecContext = nullptr;
AVStream *stream = nullptr;
AVFrame *frame = nullptr;
AVPacket packet;
int ret = 0;

avformat_alloc_output_context2(&formatContext, nullptr, nullptr, filename.c_str());
if (!formatContext) {
obs_log(LOG_ERROR, "Failed to allocate output context");
return;
}

const AVCodec *codec = avcodec_find_encoder(AV_CODEC_ID_PCM_F32LE);
if (!codec) {
obs_log(LOG_ERROR, "Failed to find encoder");
return;
}

stream = avformat_new_stream(formatContext, codec);
if (!stream) {
obs_log(LOG_ERROR, "Failed to create new stream");
return;
}

codecContext = avcodec_alloc_context3(codec);
if (!codecContext) {
obs_log(LOG_ERROR, "Failed to allocate codec context");
return;
}

codecContext->sample_fmt = AV_SAMPLE_FMT_FLTP;
codecContext->sample_rate = 16000;
codecContext->channels = 1;
codecContext->channel_layout = AV_CH_LAYOUT_MONO;
codecContext->bit_rate = 64000;
codecContext->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;

if (avcodec_open2(codecContext, codec, nullptr) < 0) {
obs_log(LOG_ERROR, "Failed to open codec");
return;
}

if (avcodec_parameters_from_context(stream->codecpar, codecContext) < 0) {
obs_log(LOG_ERROR, "Failed to copy codec parameters to stream");
return;
}

if (avio_open(&formatContext->pb, filename.c_str(), AVIO_FLAG_WRITE) < 0) {
obs_log(LOG_ERROR, "Failed to open file");
return;
}

if (avformat_write_header(formatContext, nullptr) < 0) {
obs_log(LOG_ERROR, "Failed to write header");
return;
}

const int frame_size = 1024;
const int frame_size_in_bytes = frame_size * sizeof(float);
frame = av_frame_alloc();
frame->nb_samples = frame_size;
frame->format = codecContext->sample_fmt;
frame->ch_layout = codecContext->ch_layout;

ret = av_frame_get_buffer(frame, 0);
if (ret < 0) {
char errbuf[AV_ERROR_MAX_STRING_SIZE];
av_make_error_string(errbuf, AV_ERROR_MAX_STRING_SIZE, ret);
obs_log(LOG_ERROR, "Failed to allocate frame buffer: %s", errbuf);
return;
}

for (size_t i = 0; i < frames; i += frame_size) {
av_init_packet(&packet);
packet.data = nullptr;
packet.size = 0;

for (int k = 0; k < codecContext->channels; k++) {
if (i + frame_size < frames) {
memcpy(frame->data[k], pcm32f_data + i, frame_size_in_bytes);
} else {
// zero pad the last frame
memset(frame->data[k], 0, frame_size_in_bytes);
memcpy(frame->data[k], pcm32f_data + i,
(frames - i) * sizeof(float));
}
}

ret = avcodec_send_frame(codecContext, frame);
if (ret < 0) {
obs_log(LOG_ERROR, "Failed to send frame");
break;
}

ret = avcodec_receive_packet(codecContext, &packet);
if (ret < 0) {
obs_log(LOG_ERROR, "Failed to receive packet");
break;
}

av_packet_rescale_ts(&packet, codecContext->time_base, stream->time_base);
packet.stream_index = stream->index;

ret = av_interleaved_write_frame(formatContext, &packet);
if (ret < 0) {
obs_log(LOG_ERROR, "Failed to write frame");
break;
}

av_packet_unref(&packet);
}

if (ret >= 0) {
av_write_trailer(formatContext);
}

av_frame_free(&frame);
avcodec_free_context(&codecContext);
avformat_free_context(formatContext);

if (ret < 0) {
obs_log(LOG_ERROR, "Failed to write audio file %s", filename.c_str());
}
}

#else

/**
 * Fallback used when the FFmpeg-backed implementation is compiled out (neither
 * _WIN32 nor __APPLE__ is defined): logs an error and yields no audio data.
 * Neither argument is used and the callback is never invoked.
 */
std::vector<std::vector<uint8_t>>
read_audio_file(const char *filename, std::function<void(int, int)> initialization_callback)
{
	// Both parameters are intentionally ignored in this stub.
	static_cast<void>(filename);
	static_cast<void>(initialization_callback);

	obs_log(LOG_ERROR, "Reading audio files is not supported on this platform");

	std::vector<std::vector<uint8_t>> no_channels;
	return no_channels;
}

/**
 * Fallback used when the FFmpeg-backed implementation is compiled out (neither
 * _WIN32 nor __APPLE__ is defined): logs an error and writes nothing.
 */
void write_audio_wav_file(const std::string &filename, const float *pcm32f_data,
			  const size_t frames)
{
	// All parameters are intentionally ignored in this stub.
	static_cast<void>(filename);
	static_cast<void>(pcm32f_data);
	static_cast<void>(frames);

	obs_log(LOG_ERROR, "Writing audio files is not supported on this platform");
}

#endif
10 changes: 10 additions & 0 deletions src/tests/audio-file-utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

#ifndef AUDIO_FILE_UTILS_H
#define AUDIO_FILE_UTILS_H

#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

/**
 * Read an audio file and return its decoded samples as raw bytes, one vector per channel.
 *
 * @param filename Path of the file to read.
 * @param initialization_callback Called with (sample rate, channel count) of the audio
 *        stream before decoding starts.
 * @return One byte vector per channel, or an empty vector on failure. On platforms
 *         without the FFmpeg-backed implementation this always logs an error and
 *         returns an empty vector.
 */
std::vector<std::vector<uint8_t>>
read_audio_file(const char *filename, std::function<void(int, int)> initialization_callback);

/**
 * Write 32-bit float PCM samples to an audio file (16000 Hz, mono in the FFmpeg-backed
 * implementation). Errors are logged; nothing is reported back to the caller. On
 * platforms without the FFmpeg-backed implementation this only logs an error.
 *
 * @param filename    Destination path.
 * @param pcm32f_data Pointer to `frames` float samples.
 * @param frames      Number of samples to write.
 */
void write_audio_wav_file(const std::string &filename, const float *pcm32f_data,
			  const size_t frames);

#endif // AUDIO_FILE_UTILS_H
Loading
Loading