diff --git a/Makefile b/Makefile
index d45ecb4..4e2c93a 100644
--- a/Makefile
+++ b/Makefile
@@ -10,8 +10,8 @@ XAV_SO = $(PRIV_DIR)/libxav.so
 # uncomment to compile with debug logs
 # XAV_DEBUG_LOGS = -DXAV_DEBUG=1
 
-HEADERS = $(XAV_DIR)/reader.h $(XAV_DIR)/decoder.h $(XAV_DIR)/utils.h
-SOURCES = $(XAV_DIR)/xav_nif.c $(XAV_DIR)/reader.c $(XAV_DIR)/decoder.c $(XAV_DIR)/utils.c
+HEADERS = $(XAV_DIR)/reader.h $(XAV_DIR)/decoder.h $(XAV_DIR)/converter.h $(XAV_DIR)/utils.h
+SOURCES = $(XAV_DIR)/xav_nif.c $(XAV_DIR)/reader.c $(XAV_DIR)/decoder.c $(XAV_DIR)/converter.c $(XAV_DIR)/utils.c
 
 CFLAGS = $(XAV_DEBUG_LOGS) -fPIC -shared
 IFLAGS = -I$(ERTS_INCLUDE_DIR) -I$(XAV_DIR)
diff --git a/c_src/xav/converter.c b/c_src/xav/converter.c
new file mode 100644
index 0000000..cecaf99
--- /dev/null
+++ b/c_src/xav/converter.c
@@ -0,0 +1,68 @@
+#include "converter.h"
+#include <libavutil/channel_layout.h>
+#include <libavutil/frame.h>
+#include <libavutil/opt.h>
+#include <libavutil/samplefmt.h>
+#include <libswresample/swresample.h>
+
+#include "utils.h"
+
+int converter_init(struct Converter *c, AVChannelLayout in_chlayout, int in_sample_rate,
+                   enum AVSampleFormat in_sample_fmt, AVChannelLayout out_chlayout,
+                   int out_sample_rate, enum AVSampleFormat out_sample_fmt) {
+  c->swr_ctx = swr_alloc();
+  c->in_sample_rate = in_sample_rate;
+  c->out_sample_rate = out_sample_rate;
+  c->out_chlayout = out_chlayout;
+  c->out_sample_fmt = out_sample_fmt;
+
+  av_opt_set_chlayout(c->swr_ctx, "in_chlayout", &in_chlayout, 0);
+  av_opt_set_chlayout(c->swr_ctx, "out_chlayout", &out_chlayout, 0);
+
+  av_opt_set_int(c->swr_ctx, "in_sample_rate", in_sample_rate, 0);
+  av_opt_set_int(c->swr_ctx, "out_sample_rate", out_sample_rate, 0);
+
+  av_opt_set_sample_fmt(c->swr_ctx, "in_sample_fmt", in_sample_fmt, 0);
+  av_opt_set_sample_fmt(c->swr_ctx, "out_sample_fmt", out_sample_fmt, 0);
+
+  return swr_init(c->swr_ctx);
+}
+
+int converter_convert(struct Converter *c, AVFrame *src_frame, uint8_t ***out_data,
+                      int *out_samples, int *out_size) {
+  uint8_t **out_data_tmp = NULL;
+  int max_out_nb_samples = swr_get_out_samples(c->swr_ctx, src_frame->nb_samples);
+  int out_bytes_per_sample = av_get_bytes_per_sample(c->out_sample_fmt);
+
+  // Some parts of ffmpeg require buffers to be divisible by 32
+  // to use fast/aligned SIMD routines - this is what the align option is used for.
+  // See https://stackoverflow.com/questions/35678041/what-is-linesize-alignment-meaning
+  // Because we return the binary straight to Erlang, we can disable it.
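+  // av_samples_alloc_array_and_samples allocates both the pointer array and the
+  // samples buffer itself; freeing both is left to the caller.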
+  int ret = av_samples_alloc_array_and_samples(&out_data_tmp, NULL, c->out_chlayout.nb_channels,
+                                               max_out_nb_samples, c->out_sample_fmt, 1);
+
+  if (ret < 0) {
+    XAV_LOG_DEBUG("Couldn't allocate array for out samples.");
+    return ret;
+  }
+
+  *out_data = out_data_tmp;
+
+  *out_samples = swr_convert(c->swr_ctx, out_data_tmp, max_out_nb_samples,
+                             (const uint8_t **)src_frame->data, src_frame->nb_samples);
+
+  if (*out_samples < 0) {
+    XAV_LOG_DEBUG("Couldn't convert samples: %d", *out_samples);
+    return -1;
+  }
+
+  XAV_LOG_DEBUG("Converted %d samples per channel", *out_samples);
+
+  *out_size = *out_samples * out_bytes_per_sample * c->out_chlayout.nb_channels;
+
+  return 0;
+}
+
+void converter_free(struct Converter *c) { swr_free(&c->swr_ctx); }
\ No newline at end of file
diff --git a/c_src/xav/converter.h b/c_src/xav/converter.h
new file mode 100644
index 0000000..f2d1ec2
--- /dev/null
+++ b/c_src/xav/converter.h
@@ -0,0 +1,21 @@
+#ifndef CONVERTER_H
+#define CONVERTER_H
+#include <libavutil/channel_layout.h>
+#include <libavutil/frame.h>
+#include <libswresample/swresample.h>
+
+struct Converter {
+  SwrContext *swr_ctx;
+  int64_t in_sample_rate;
+  int64_t out_sample_rate;
+  AVChannelLayout out_chlayout;
+  enum AVSampleFormat out_sample_fmt;
+};
+
+int converter_init(struct Converter *c, AVChannelLayout in_chlayout, int in_sample_rate,
+                   enum AVSampleFormat in_sample_fmt, AVChannelLayout out_chlayout,
+                   int out_sample_rate, enum AVSampleFormat out_sample_fmt);
+int converter_convert(struct Converter *c, AVFrame *src_frame, uint8_t ***out_data,
+                      int *out_samples, int *out_size);
+void converter_free(struct Converter *converter);
+#endif
\ No newline at end of file
diff --git a/c_src/xav/decoder.c b/c_src/xav/decoder.c
index a949ede..516c332 100644
--- a/c_src/xav/decoder.c
+++ b/c_src/xav/decoder.c
@@ -3,6 +3,7 @@
 
 int decoder_init(struct Decoder *decoder, const char *codec) {
   decoder->swr_ctx = NULL;
+  decoder->out_data = NULL;
 
   if (strcmp(codec, "opus") == 0) {
     decoder->media_type = AVMEDIA_TYPE_AUDIO;
@@ -30,6 +31,20 @@ int decoder_init(struct Decoder *decoder, const char *codec) {
     return -1;
   }
 
+  if (decoder->media_type == AVMEDIA_TYPE_AUDIO) {
+    // Always convert decoded audio to packed float, keeping the input channel layout and rate.
+    AVChannelLayout out_chlayout = decoder->c->ch_layout;
+    int out_sample_rate = decoder->c->sample_rate;
+    enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT;
+
+    int ret = converter_init(&decoder->converter, decoder->c->ch_layout, decoder->c->sample_rate,
+                             decoder->c->sample_fmt, out_chlayout, out_sample_rate, out_sample_fmt);
+
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
   return 0;
 }
 
@@ -59,23 +73,10 @@ int decoder_decode(struct Decoder *decoder, AVPacket *pkt, AVFrame *frame) {
       decoder->frame_data = frame->data;
       decoder->frame_linesize = frame->linesize;
     }
-  } else if (decoder->media_type == AVMEDIA_TYPE_AUDIO &&
-             av_sample_fmt_is_planar(frame->format) == 1) {
-    if (decoder->swr_ctx == NULL) {
-      if (init_swr_ctx_from_frame(&decoder->swr_ctx, frame) != 0) {
-        return -1;
-      }
-    }
-
-    if (convert_to_interleaved(decoder->swr_ctx, frame, decoder->rgb_dst_data,
-                               decoder->rgb_dst_linesize) != 0) {
-      return -1;
-    }
-
-    decoder->frame_data = decoder->rgb_dst_data;
-    decoder->frame_linesize = decoder->rgb_dst_linesize;
-  } else {
-    decoder->frame_data = frame->extended_data;
+  } else if (decoder->media_type == AVMEDIA_TYPE_AUDIO) {
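+    // converter_convert fills decoder->out_data; the NIF frees it after building the frame term.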
+#include "converter.h" + struct Decoder { enum AVMediaType media_type; const AVCodec *codec; @@ -14,6 +16,16 @@ struct Decoder { uint8_t **frame_data; int *frame_linesize; + + struct Converter converter; + // Buffer where audio samples are written after conversion. + // We always convet to packed format, so only out_data[0] is set. + uint8_t **out_data; + // Number of samples in out_data buffer + int out_samples; + // Size of out_data buffer. + // This is the same as out_samples * bytes_per_sample(out_format) * out_channels. + int out_size; }; int decoder_init(struct Decoder *decoder, const char *codec); diff --git a/c_src/xav/reader.c b/c_src/xav/reader.c index 9d8998b..59c54cc 100644 --- a/c_src/xav/reader.c +++ b/c_src/xav/reader.c @@ -1,5 +1,6 @@ #include "reader.h" #include "utils.h" +#include #include int reader_init(struct Reader *reader, unsigned char *path, size_t path_size, int device_flag, @@ -21,6 +22,7 @@ int reader_init(struct Reader *reader, unsigned char *path, size_t path_size, in reader->media_type = media_type; reader->in_format_name = NULL; reader->out_format_name = NULL; + reader->out_data = NULL; if (device_flag == 1) { avdevice_register_all(); @@ -69,23 +71,13 @@ int reader_init(struct Reader *reader, unsigned char *path, size_t path_size, in } if (reader->media_type == AVMEDIA_TYPE_AUDIO) { - reader->swr_ctx = swr_alloc(); - enum AVSampleFormat out_sample_fmt = av_get_alt_sample_fmt(reader->c->sample_fmt, 0); - -#if LIBAVUTIL_VERSION_MAJOR >= 58 - av_opt_set_chlayout(reader->swr_ctx, "in_chlayout", &reader->c->ch_layout, 0); - av_opt_set_chlayout(reader->swr_ctx, "out_chlayout", &reader->c->ch_layout, 0); -#else - av_opt_set_channel_layout(reader->swr_ctx, "in_channel_layout", reader->c->channel_layout, 0); - av_opt_set_channel_layout(reader->swr_ctx, "out_channel_layout", reader->c->channel_layout, 0); -#endif - - av_opt_set_int(reader->swr_ctx, "in_sample_rate", reader->c->sample_rate, 0); - av_opt_set_int(reader->swr_ctx, "out_sample_rate", reader->c->sample_rate, 0); - av_opt_set_sample_fmt(reader->swr_ctx, "in_sample_fmt", reader->c->sample_fmt, 0); - av_opt_set_sample_fmt(reader->swr_ctx, "out_sample_fmt", out_sample_fmt, 0); - - ret = swr_init(reader->swr_ctx); + AVChannelLayout out_chlayout = AV_CHANNEL_LAYOUT_MONO; + int out_sample_rate = 16000; + enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT; + + int ret = converter_init(&reader->converter, reader->c->ch_layout, reader->c->sample_rate, + reader->c->sample_fmt, out_chlayout, out_sample_rate, out_sample_fmt); + if (ret < 0) { return ret; } @@ -155,7 +147,6 @@ int reader_next_frame(struct Reader *reader) { ret = avcodec_receive_frame(reader->c, reader->frame); if (ret == 0) { - XAV_LOG_DEBUG("Received frame"); frame_ready = 1; } else if (ret == AVERROR_EOF) { XAV_LOG_DEBUG("EOF"); @@ -193,26 +184,17 @@ int reader_next_frame(struct Reader *reader) { fin: if (reader->media_type == AVMEDIA_TYPE_VIDEO && reader->frame->format != AV_PIX_FMT_RGB24) { - XAV_LOG_DEBUG("Converting to RGB"); + XAV_LOG_DEBUG("Converting video to RGB"); convert_to_rgb(reader->frame, reader->rgb_dst_data, reader->rgb_dst_linesize); reader->frame_data = reader->rgb_dst_data; reader->frame_linesize = reader->rgb_dst_linesize; } else if (reader->media_type == AVMEDIA_TYPE_VIDEO) { reader->frame_data = reader->frame->data; reader->frame_linesize = reader->frame->linesize; - } else if (reader->media_type == AVMEDIA_TYPE_AUDIO && - av_sample_fmt_is_planar(reader->frame->format) == 1) { - XAV_LOG_DEBUG("Converting to interleaved"); - - 
+    XAV_LOG_DEBUG("Converting audio to desired out format");
+    return converter_convert(&reader->converter, reader->frame, &reader->out_data,
+                             &reader->out_samples, &reader->out_size);
   }
 
   return 0;
 
@@ -226,6 +208,7 @@ void reader_free_frame(struct Reader *reader) {
       reader->frame_data == reader->rgb_dst_data) {
     av_freep(&reader->frame_data[0]);
   }
+  free(reader->out_data);
 }
 
 void reader_free(struct Reader *reader) {
diff --git a/c_src/xav/reader.h b/c_src/xav/reader.h
index 6a7a31d..3522502 100644
--- a/c_src/xav/reader.h
+++ b/c_src/xav/reader.h
@@ -8,8 +8,8 @@
 
 #include 
 #include 
 #include 
-#include <libswresample/swresample.h>
+#include "converter.h"
 #include "utils.h"
 
 struct Reader {
@@ -39,6 +39,16 @@ struct Reader {
   // whether convertion to rgb was needed
   uint8_t **frame_data;
   int *frame_linesize;
+
+  struct Converter converter;
+  // Buffer where audio samples are written after conversion.
+  // We always convert to the packed format, so only out_data[0] is set.
+  uint8_t **out_data;
+  // Number of samples in the out_data buffer.
+  int out_samples;
+  // Size of the out_data buffer.
+  // This is the same as out_samples * bytes_per_sample(out_format) * out_channels.
+  int out_size;
 };
 
 int reader_init(struct Reader *reader, unsigned char *path, size_t path_size, int device_flag,
diff --git a/c_src/xav/utils.c b/c_src/xav/utils.c
index 3cc0c02..78610d5 100644
--- a/c_src/xav/utils.c
+++ b/c_src/xav/utils.c
@@ -1,4 +1,7 @@
 #include "utils.h"
+#include 
+#include 
+#include 
 
 void print_supported_pix_fmts(AVCodec *codec) {
   if (codec->pix_fmts == NULL) {
@@ -42,31 +45,6 @@ void convert_to_rgb(AVFrame *src_frame, uint8_t *dst_data[], int dst_linesize[])
                 src_frame->height, dst_data, dst_linesize);
 }
 
-int convert_to_interleaved(SwrContext *swr_ctx, AVFrame *src_frame, uint8_t **dst_data,
-                           int *dst_linesize) {
-#if LIBAVUTIL_VERSION_MAJOR >= 58
-  int channels = src_frame->ch_layout.nb_channels;
-#else
-  int channels = src_frame->channels;
-#endif
-
-  int samples_per_channel = src_frame->nb_samples;
-
-  int ret =
-      av_samples_alloc(dst_data, dst_linesize, channels, samples_per_channel, src_frame->format, 0);
-  if (ret < 0) {
-    return ret;
-  }
-
-  ret = swr_convert(swr_ctx, dst_data, samples_per_channel, (const uint8_t **)src_frame->data,
-                    samples_per_channel);
-  if (ret < 0) {
-    return ret;
-  }
-
-  return 0;
-}
-
 ERL_NIF_TERM xav_nif_ok(ErlNifEnv *env, ERL_NIF_TERM data_term) {
   ERL_NIF_TERM ok_term = enif_make_atom(env, "ok");
   return enif_make_tuple(env, 2, ok_term, data_term);
@@ -83,24 +61,16 @@ ERL_NIF_TERM xav_nif_raise(ErlNifEnv *env, char *msg) {
   return enif_raise_exception(env, reason);
 }
 
-ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, AVFrame *frame, unsigned char *data[],
-                                         const char *format_name) {
+ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, uint8_t **out_data, int out_samples,
+                                         int out_size, const char *out_format, int pts) {
   ERL_NIF_TERM data_term;
 
-#if LIBAVUTIL_VERSION_MAJOR >= 58
-  size_t unpadded_linesize =
-      frame->nb_samples * av_get_bytes_per_sample(frame->format) * frame->ch_layout.nb_channels;
-#else
-  size_t unpadded_linesize =
-      frame->nb_samples * av_get_bytes_per_sample(frame->format) * frame->channels;
-#endif
-
-  unsigned char *ptr = enif_make_new_binary(env, unpadded_linesize, &data_term);
-  memcpy(ptr, data[0], unpadded_linesize);
+  unsigned char *ptr = enif_make_new_binary(env, out_size, &data_term);
+  memcpy(ptr, out_data[0], out_size);
 
-  ERL_NIF_TERM samples_term = enif_make_int(env, frame->nb_samples);
-  ERL_NIF_TERM format_term = enif_make_atom(env, format_name);
-  ERL_NIF_TERM pts_term = enif_make_int(env, frame->pts);
+  ERL_NIF_TERM samples_term = enif_make_int(env, out_samples);
+  ERL_NIF_TERM format_term = enif_make_atom(env, out_format);
+  ERL_NIF_TERM pts_term = enif_make_int(env, pts);
 
   return enif_make_tuple(env, 4, data_term, format_term, samples_term, pts_term);
 }
diff --git a/c_src/xav/utils.h b/c_src/xav/utils.h
index 0e9ca0f..7339cda 100644
--- a/c_src/xav/utils.h
+++ b/c_src/xav/utils.h
@@ -20,13 +20,11 @@
 void print_supported_pix_fmts(AVCodec *codec);
 int init_swr_ctx_from_frame(SwrContext **swr_ctx, AVFrame *frame);
 void convert_to_rgb(AVFrame *src_frame, uint8_t *dst_data[], int dst_linesize[]);
-int convert_to_interleaved(SwrContext *swr_ctx, AVFrame *src_frame, uint8_t **dst_data,
-                           int *dst_linesize);
 ERL_NIF_TERM xav_nif_ok(ErlNifEnv *env, ERL_NIF_TERM data_term);
 ERL_NIF_TERM xav_nif_error(ErlNifEnv *env, char *reason);
 ERL_NIF_TERM xav_nif_raise(ErlNifEnv *env, char *msg);
 ERL_NIF_TERM xav_nif_video_frame_to_term(ErlNifEnv *env, AVFrame *frame, unsigned char *data[],
                                          int *linesize, const char *out_format_name);
-ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, AVFrame *frame, unsigned char *data[],
-                                         const char *out_format_name);
+ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, uint8_t **out_data, int out_samples,
+                                         int out_size, const char *out_format, int pts);
diff --git a/c_src/xav/xav_nif.c b/c_src/xav/xav_nif.c
index be48433..7f52e96 100644
--- a/c_src/xav/xav_nif.c
+++ b/c_src/xav/xav_nif.c
@@ -93,8 +93,10 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     frame_term = xav_nif_video_frame_to_term(env, reader->frame, reader->frame_data,
                                              reader->frame_linesize, reader->out_format_name);
   } else if (reader->media_type == AVMEDIA_TYPE_AUDIO) {
-    frame_term = xav_nif_audio_frame_to_term(env, reader->frame, reader->frame_data,
-                                             reader->out_format_name);
+    const char *out_format = av_get_sample_fmt_name(reader->converter.out_sample_fmt);
+
+    frame_term = xav_nif_audio_frame_to_term(env, reader->out_data, reader->out_samples,
+                                             reader->out_size, out_format, reader->frame->pts);
   }
 
   reader_free_frame(reader);
@@ -188,8 +190,10 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
                                               decoder->frame_linesize, "rgb");
   } else if (decoder->media_type == AVMEDIA_TYPE_AUDIO) {
-    frame_term =
-        xav_nif_audio_frame_to_term(env, frame, decoder->frame_data, decoder->out_format_name);
+    const char *out_format = av_get_sample_fmt_name(decoder->converter.out_sample_fmt);
+
+    frame_term = xav_nif_audio_frame_to_term(env, decoder->out_data, decoder->out_samples,
+                                             decoder->out_size, out_format, frame->pts);
   }
 
   term = xav_nif_ok(env, frame_term);
 
@@ -197,6 +201,11 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
 cleanup:
   av_frame_free(&frame);
   av_packet_free(&pkt);
+
+  if (decoder->out_data != NULL) {
+    free(decoder->out_data);
+  }
+
   return term;
 }
 
diff --git a/test/fixtures/README.md b/test/fixtures/README.md
new file mode 100644
index 0000000..cfa4773
--- /dev/null
+++ b/test/fixtures/README.md
@@ -0,0 +1,6 @@
+# Fixtures
+
+* [melnet_sample_0.mp3](https://audio-samples.github.io/) - 22050Hz, 1 channel, fltp
+* [harvard.wav](https://www.kaggle.com/datasets/pavanelisetty/sample-audio-files-for-speech-recognition) - 44100Hz, 2 channels, s16
+* harvard.mp3 - created by calling `ffmpeg -i harvard.wav harvard.mp3` - 44100Hz, 2 channels, fltp
+
diff --git a/test/fixtures/harvard.mp3 b/test/fixtures/harvard.mp3
new file mode 100644
index 0000000..2349a45
Binary files /dev/null and b/test/fixtures/harvard.mp3 differ
diff --git a/test/fixtures/harvard.wav b/test/fixtures/harvard.wav
new file mode 100644
index 0000000..b05ec79
Binary files /dev/null and b/test/fixtures/harvard.wav differ
diff --git a/test/reader_test.exs b/test/reader_test.exs
index 7ebc5d4..2dd441c 100644
--- a/test/reader_test.exs
+++ b/test/reader_test.exs
@@ -41,11 +41,29 @@ defmodule Xav.ReaderTest do
     end
   end)
 
-  @tag :debug
   test "speech to text" do
-    # This file has been downloaded from https://audio-samples.github.io/
-    # Section: Samples from the model without biasing or priming.
-    reader = Xav.Reader.new!("./test/fixtures/melnet_sample_0.mp3", read: :audio)
+    for {path, expected_output} <- [
+          # This file has been downloaded from https://audio-samples.github.io/
+          # Section: Samples from the model without biasing or priming.
+          {"./test/fixtures/melnet_sample_0.mp3",
+           """
+           My thought, I have nobody by a beauty and will as you poured. \
+           Mr. Rochester has served and that so don't find a simple and \
+           devoted aboud to what might in a\
+           """},
+          {"./test/fixtures/harvard.wav",
+           """
+           The stale smell of old beer lingers. It takes heat to bring out the odor. \
+           A cold dip restores health in zest. A salt pickle tastes fine with ham. \
+           Tacos all pastora are my favorite. A zestful food is the hot cross bun.\
+           """}
+        ] do
+      test_speech_to_text(path, expected_output)
+    end
+  end
+
+  defp test_speech_to_text(path, expected_output) do
+    reader = Xav.Reader.new!(path, read: :audio)
 
     {:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
     {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
@@ -65,15 +83,7 @@
     batch = Nx.Defn.jit_apply(&Function.identity/1, [batch])
 
     assert %{chunks: chunks} = Nx.Serving.run(serving, batch)
-    assert [
-             %{
-               text: """
-               My thought I have nobody by a beauty and will as you poured. \
-               Mr. Rochester has served in that so-done fine-simpless and \
-               devoted to bowed, to let might in a\
-               """
-             }
-           ] = chunks
+    assert [%{text: ^expected_output}] = chunks
   end
 
   defp read_frames(reader, acc \\ []) do