diff --git a/demos/common/export_models/README.md b/demos/common/export_models/README.md index 16fd33fb6c..e4ed134772 100644 --- a/demos/common/export_models/README.md +++ b/demos/common/export_models/README.md @@ -98,7 +98,7 @@ options: #### Text Generation CPU Deployment ```console -python export_model.py text_generation --source_model meta-llama/Meta-Llama-3-8B-Instruct --weight-format fp16 --kv_cache_precision u8 --config_file_path models/config_all.json --model_repository_path models +python demos\common\export_models\export_model.py text_generation --source_model meta-llama/Llama-3.2-1B-Instruct --weight-format int4 --kv_cache_precision u8 --config_file_path config.json --model_repository_path audio ``` #### GPU Deployment (Low Concurrency, Limited Memory) diff --git a/src/BUILD b/src/BUILD index 7e8d8b04b4..2446bca768 100644 --- a/src/BUILD +++ b/src/BUILD @@ -562,6 +562,7 @@ ovms_cc_library( "//conditions:default": [], "//:not_disable_mediapipe" : [ "//src/image_gen:image_gen_calculator", + "//src/speech:speech_calculator", "//src/image_gen:imagegen_init", "//src/llm:openai_completions_api_handler", "//src/embeddings:embeddingscalculator", diff --git a/src/http_rest_api_handler.cpp b/src/http_rest_api_handler.cpp index 05df6ce7ac..48ed47b155 100644 --- a/src/http_rest_api_handler.cpp +++ b/src/http_rest_api_handler.cpp @@ -506,6 +506,8 @@ static Status createV3HttpPayload( } else { SPDLOG_DEBUG("Model name from deduced from MultiPart field: {}", modelName); } + auto stream = multiPartParser->getFieldByName("stream"); + SPDLOG_ERROR("{}", stream); ensureJsonParserInErrorState(parsedJson); } else if (isApplicationJson) { { diff --git a/src/mediapipe_internal/mediapipegraphdefinition.cpp b/src/mediapipe_internal/mediapipegraphdefinition.cpp index ef4dbeda3b..241c7de283 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.cpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.cpp @@ -61,6 +61,7 @@ const std::string MediapipeGraphDefinition::SCHEDULER_CLASS_NAME{"Mediapipe"}; const std::string MediapipeGraphDefinition::PYTHON_NODE_CALCULATOR_NAME{"PythonExecutorCalculator"}; const std::string MediapipeGraphDefinition::LLM_NODE_CALCULATOR_NAME{"LLMCalculator"}; const std::string MediapipeGraphDefinition::IMAGE_GEN_CALCULATOR_NAME{"ImageGenCalculator"}; +const std::string MediapipeGraphDefinition::SPEECH_NODE_CALCULATOR_NAME{"SpeechCalculator"}; const std::string MediapipeGraphDefinition::EMBEDDINGS_NODE_CALCULATOR_NAME{"EmbeddingsCalculatorOV"}; const std::string MediapipeGraphDefinition::RERANK_NODE_CALCULATOR_NAME{"RerankCalculatorOV"}; @@ -554,6 +555,28 @@ Status MediapipeGraphDefinition::initializeNodes() { rerankServableMap.insert(std::pair>(nodeName, std::move(servable))); rerankServablesCleaningGuard.disableCleaning(); } + if (endsWith(config.node(i).calculator(), SPEECH_NODE_CALCULATOR_NAME)) { + auto& speechServableMap = this->sidePacketMaps.speechServableMap; + ResourcesCleaningGuard speechServablesCleaningGuard(speechServableMap); + if (!config.node(i).node_options().size()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Speech node missing options in graph: {}. ", this->name); + return StatusCode::LLM_NODE_MISSING_OPTIONS; + } + if (config.node(i).name().empty()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Speech node name is missing in graph: {}. 
", this->name); + return StatusCode::LLM_NODE_MISSING_NAME; + } + std::string nodeName = config.node(i).name(); + if (speechServableMap.find(nodeName) != speechServableMap.end()) { + SPDLOG_LOGGER_ERROR(modelmanager_logger, "Speech node name: {} already used in graph: {}. ", nodeName, this->name); + return StatusCode::LLM_NODE_NAME_ALREADY_EXISTS; + } + mediapipe::SpeechCalculatorOptions nodeOptions; + config.node(i).node_options(0).UnpackTo(&nodeOptions); + std::shared_ptr servable = std::make_shared(nodeOptions.models_path(), nodeOptions.device(), mgconfig.getBasePath(), nodeOptions.mode()); + speechServableMap.insert(std::pair>(nodeName, std::move(servable))); + speechServablesCleaningGuard.disableCleaning(); + } } return StatusCode::OK; } diff --git a/src/mediapipe_internal/mediapipegraphdefinition.hpp b/src/mediapipe_internal/mediapipegraphdefinition.hpp index 1a6e98bfcf..e5d8a700de 100644 --- a/src/mediapipe_internal/mediapipegraphdefinition.hpp +++ b/src/mediapipe_internal/mediapipegraphdefinition.hpp @@ -46,6 +46,7 @@ #include "../sidepacket_servable.hpp" #include "../embeddings/embeddings_servable.hpp" #include "../rerank/rerank_servable.hpp" +#include "../speech/speech_servable.hpp" namespace ovms { class MediapipeGraphDefinitionUnloadGuard; @@ -62,6 +63,7 @@ struct ImageGenerationPipelines; using PythonNodeResourcesMap = std::unordered_map>; using GenAiServableMap = std::unordered_map>; using RerankServableMap = std::unordered_map>; +using SpeechServableMap = std::unordered_map>; using EmbeddingsServableMap = std::unordered_map>; using ImageGenerationPipelinesMap = std::unordered_map>; @@ -71,19 +73,22 @@ struct GraphSidePackets { ImageGenerationPipelinesMap imageGenPipelinesMap; EmbeddingsServableMap embeddingsServableMap; RerankServableMap rerankServableMap; + SpeechServableMap speechServableMap; void clear() { pythonNodeResourcesMap.clear(); genAiServableMap.clear(); imageGenPipelinesMap.clear(); embeddingsServableMap.clear(); rerankServableMap.clear(); + speechServableMap.clear(); } bool empty() { return (pythonNodeResourcesMap.empty() && genAiServableMap.empty() && imageGenPipelinesMap.empty() && embeddingsServableMap.empty() && - rerankServableMap.empty()); + rerankServableMap.empty() && + speechServableMap.empty()); } }; @@ -124,6 +129,7 @@ class MediapipeGraphDefinition { static const std::string IMAGE_GEN_CALCULATOR_NAME; static const std::string EMBEDDINGS_NODE_CALCULATOR_NAME; static const std::string RERANK_NODE_CALCULATOR_NAME; + static const std::string SPEECH_NODE_CALCULATOR_NAME; Status waitForLoaded(std::unique_ptr& unloadGuard, const uint32_t waitForLoadedTimeoutMicroseconds = WAIT_FOR_LOADED_DEFAULT_TIMEOUT_MICROSECONDS); // Pipelines are not versioned and any available definition has constant version equal 1. 
diff --git a/src/mediapipe_internal/mediapipegraphexecutor.cpp b/src/mediapipe_internal/mediapipegraphexecutor.cpp index aa95bf88ec..5545b13966 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.cpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.cpp @@ -47,6 +47,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( const GenAiServableMap& llmNodeResourcesMap, const EmbeddingsServableMap& embeddingsServableMap, const RerankServableMap& rerankServableMap, + const SpeechServableMap& speechServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter) : name(name), @@ -56,7 +57,7 @@ MediapipeGraphExecutor::MediapipeGraphExecutor( outputTypes(std::move(outputTypes)), inputNames(std::move(inputNames)), outputNames(std::move(outputNames)), - sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap}), + sidePacketMaps({pythonNodeResourcesMap, llmNodeResourcesMap, {}, embeddingsServableMap, rerankServableMap, speechServableMap}), pythonBackend(pythonBackend), currentStreamTimestamp(STARTING_TIMESTAMP), mediapipeServableMetricReporter(mediapipeServableMetricReporter) {} @@ -88,6 +89,7 @@ const std::string MediapipeGraphExecutor::LLM_SESSION_SIDE_PACKET_TAG = "llm"; const std::string MediapipeGraphExecutor::IMAGE_GEN_SESSION_SIDE_PACKET_TAG = "pipes"; const std::string MediapipeGraphExecutor::EMBEDDINGS_SESSION_SIDE_PACKET_TAG = "embeddings_servable"; const std::string MediapipeGraphExecutor::RERANK_SESSION_SIDE_PACKET_TAG = "rerank_servable"; +const std::string MediapipeGraphExecutor::SPEECH_SESSION_SIDE_PACKET_TAG = "speech_servable"; const ::mediapipe::Timestamp MediapipeGraphExecutor::STARTING_TIMESTAMP = ::mediapipe::Timestamp(0); } // namespace ovms diff --git a/src/mediapipe_internal/mediapipegraphexecutor.hpp b/src/mediapipe_internal/mediapipegraphexecutor.hpp index b2468f5540..f3c6907aaa 100644 --- a/src/mediapipe_internal/mediapipegraphexecutor.hpp +++ b/src/mediapipe_internal/mediapipegraphexecutor.hpp @@ -93,6 +93,7 @@ class MediapipeGraphExecutor { static const std::string IMAGE_GEN_SESSION_SIDE_PACKET_TAG; static const std::string EMBEDDINGS_SESSION_SIDE_PACKET_TAG; static const std::string RERANK_SESSION_SIDE_PACKET_TAG; + static const std::string SPEECH_SESSION_SIDE_PACKET_TAG; static const ::mediapipe::Timestamp STARTING_TIMESTAMP; MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -103,6 +104,7 @@ class MediapipeGraphExecutor { const GenAiServableMap& llmNodeResourcesMap, const EmbeddingsServableMap& embeddingsServableMap, const RerankServableMap& rerankServableMap, + const SpeechServableMap& speechServableMap, PythonBackend* pythonBackend, MediapipeServableMetricReporter* mediapipeServableMetricReporter); MediapipeGraphExecutor(const std::string& name, const std::string& version, const ::mediapipe::CalculatorGraphConfig& config, @@ -151,6 +153,8 @@ class MediapipeGraphExecutor { inputSidePackets[EMBEDDINGS_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.embeddingsServableMap).At(STARTING_TIMESTAMP); inputSidePackets[RERANK_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.rerankServableMap).At(STARTING_TIMESTAMP); + inputSidePackets[SPEECH_SESSION_SIDE_PACKET_TAG] = mediapipe::MakePacket(this->sidePacketMaps.speechServableMap).At(STARTING_TIMESTAMP); + MP_RETURN_ON_FAIL(graph.StartRun(inputSidePackets), std::string("start MediaPipe graph: ") + this->name, 
StatusCode::MEDIAPIPE_GRAPH_START_ERROR); ::mediapipe::Packet packet; diff --git a/src/speech/BUILD b/src/speech/BUILD new file mode 100644 index 0000000000..680a015cc6 --- /dev/null +++ b/src/speech/BUILD @@ -0,0 +1,61 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library") +load("//:common_settings.bzl", "ovms_cc_library") + +ovms_cc_library( + name = "speech_servable", + hdrs = ["speech_servable.hpp"], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +ovms_cc_library( + name = "llm_engine", # in fact this is genai library + srcs = [], + deps = ["@llm_engine//:llm_engine"], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +ovms_cc_library( + name = "speech_calculator", + srcs = ["http_speech_calculator.cc"], + hdrs = ["dr_wav.h", "dr_mp3.h"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + "//src:httppayload", + "//src:libovmslogging", + "speech_calculator_cc_proto", + ":speech_servable", + ]+ select({ + "//conditions:default": ["//third_party:genai", ":llm_engine"], + "//:not_genai_bin" : [":llm_engine"], + }), + visibility = ["//visibility:public"], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "speech_calculator_proto", + srcs = ["speech_calculator.proto"], + visibility = ["//visibility:private"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_options_proto", + "@mediapipe//mediapipe/framework:calculator_proto", + ], +) diff --git a/src/speech/http_speech_calculator.cc b/src/speech/http_speech_calculator.cc new file mode 100644 index 0000000000..bd1a0496aa --- /dev/null +++ b/src/speech/http_speech_calculator.cc @@ -0,0 +1,392 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/port/canonical_errors.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "../http_payload.hpp" +#include "../logging.hpp" +#include +#include + +#pragma warning(push) +#pragma warning(disable : 6001 4324 6385 6386) +#include "absl/strings/escaping.h" +#include "absl/strings/str_cat.h" +#pragma warning(pop) + +#define DR_WAV_IMPLEMENTATION +#include "dr_wav.h" +#define DR_MP3_IMPLEMENTATION +#pragma warning(push) +#pragma warning(disable : 6386 6262) +#include "dr_mp3.h" +#pragma warning(pop) +#include "openvino/genai/whisper_pipeline.hpp" +#include "openvino/genai/speech_generation/text2speech_pipeline.hpp" +#include "speech_servable.hpp" + +#ifdef _WIN32 +# include +# include +#endif + +using namespace ovms; + +namespace mediapipe { + +// using SpeechPipelinesMap = std::unordered_map>; + + +const std::string SPEECH_SESSION_SIDE_PACKET_TAG = "SPEECH_NODE_RESOURCES"; + +#define COMMON_SAMPLE_RATE 16000 + +bool is_wav_buffer(const std::string buf) { + // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format + // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") { + return false; + } + + uint32_t chunk_size = *reinterpret_cast(buf.data() + 4); + if (chunk_size + 8 != buf.size()) { + return false; + } + + return true; +} + +ov::genai::RawSpeechInput read_wav(const std::string_view& wav_data) { + drwav wav; + +// if (filename == "-") { +// { +// #ifdef _WIN32 +// _setmode(_fileno(stdin), _O_BINARY); +// #endif + +// uint8_t buf[1024]; +// while (true) { +// const size_t n = fread(buf, 1, sizeof(buf), stdin); +// if (n == 0) { +// break; +// } +// wav_data.insert(wav_data.end(), buf, buf + n); +// } +// } + +// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), +// "Failed to open WAV file from stdin"); + +// fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); +// } else if (is_wav_buffer(filename)) { +// OPENVINO_ASSERT(drwav_init_memory(&wav, filename.c_str(), filename.size(), nullptr), +// "Failed to open WAV file from fname buffer"); +// } else if (!drwav_init_file(&wav, filename.c_str(), nullptr)) { +// #if defined(WHISPER_FFMPEG) +// OPENVINO_ASSERT(ffmpeg_decode_audio(fname, wav_data) == 0, "Failed to ffmpeg decode") + +// OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), +// "Failed to read wav data as wav") +// #else +// throw std::runtime_error("failed to open as WAV file"); +// #endif +// } + auto result = drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr); + if(result == false){ + SPDLOG_ERROR("FILE PARSING FAILED {}", result); + throw std::runtime_error("FILE PARSING FAILED"); + } + if (wav.channels != 1 && wav.channels != 2) { + drwav_uninit(&wav); + throw std::runtime_error("WAV file must be mono or stereo"); + } + + if (wav.sampleRate != COMMON_SAMPLE_RATE) { + drwav_uninit(&wav); + throw std::runtime_error("WAV file must be " + std::string{COMMON_SAMPLE_RATE / 1000} + " kHz"); + } + + const uint64_t n = + wav_data.empty() ? 
wav.totalPCMFrameCount : wav_data.size() / (wav.channels * wav.bitsPerSample / 8ul); + + std::vector pcm16; + pcm16.resize(n * wav.channels); + drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); + drwav_uninit(&wav); + + // convert to mono, float + std::vector pcmf32; + pcmf32.resize(n); + if (wav.channels == 1) { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[i]) / 32768.0f; + } + } else { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; + } + } + + return pcmf32; +} + +float* resample_audio(const float* input, + size_t input_length, + float input_rate, + float target_rate, + size_t* output_length) { + if (input_rate == target_rate) { + *output_length = input_length; + float* output = (float*)malloc(input_length * sizeof(float)); + if (output) { + memcpy(output, input, input_length * sizeof(float)); + } + return output; + } + + float ratio = input_rate / target_rate; + *output_length = (size_t)(input_length / ratio); + float* output = (float*)malloc(*output_length * sizeof(float)); + + if (!output) { + return NULL; + } + + for (size_t i = 0; i < *output_length; i++) { + float src_idx = i * ratio; + size_t idx0 = (size_t)src_idx; + size_t idx1 = idx0 + 1; + + if (idx1 >= input_length) { + output[i] = input[input_length - 1]; + } else { + float frac = src_idx - idx0; + output[i] = input[idx0] * (1.0f - frac) + input[idx1] * frac; + } + } + + return output; +} + +ov::genai::RawSpeechInput read_mp3(const std::string_view& mp3_data) { + drmp3 mp3; + + SPDLOG_ERROR("1"); + auto result = drmp3_init_memory(&mp3, mp3_data.data(), mp3_data.size(), nullptr); + if(result == false){ + SPDLOG_ERROR("FILE PARSING FAILED {}", result); + throw std::runtime_error("FILE PARSING FAILED"); + } + SPDLOG_ERROR("2"); + if (mp3.channels != 1 && mp3.channels != 2) { + drmp3_uninit(&mp3); + throw std::runtime_error("WAV file must be mono or stereo"); + } + SPDLOG_ERROR("3 {}", mp3.sampleRate); + // if (mp3.sampleRate != COMMON_SAMPLE_RATE) { + // drmp3_uninit(&mp3); + // throw std::runtime_error("WAV file must be " + std::string{COMMON_SAMPLE_RATE / 1000} + " kHz"); + // } + SPDLOG_ERROR("4"); + const uint64_t n = mp3.totalPCMFrameCount; + SPDLOG_ERROR("mp3.totalPCMFrameCount {} : n {}", mp3.totalPCMFrameCount, n); + std::vector pcmf32; + pcmf32.resize(n * mp3.channels); + drmp3_read_pcm_frames_f32(&mp3, n, pcmf32.data()); + drmp3_uninit(&mp3); + SPDLOG_ERROR("5"); + // convert to mono, float + // std::vector pcmf32; + // pcmf32.resize(n); + // if (mp3.channels == 1) { + // for (uint64_t i = 0; i < n; i++) { + // pcmf32[i] = float(pcm16[i]) / 32768.0f; + // } + // } else { + // for (uint64_t i = 0; i < n; i++) { + // pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; + // } + // } + size_t output_length; + auto buffer = resample_audio(reinterpret_cast(pcmf32.data()), pcmf32.size(), mp3.sampleRate, 16000, &output_length); + std::vector output(buffer, buffer + output_length); + return output; +} + +std::variant> getFileFromPayload(const ovms::MultiPartParser& parser, const std::string& keyName) { + std::string_view value = parser.getFileContentByFieldName(keyName); + if (value.empty()) { + return std::nullopt; + } + return value; +} + +#define SET_OR_RETURN(TYPE, NAME, RHS) \ + auto NAME##_OPT = RHS; \ + RETURN_IF_HOLDS_STATUS(NAME##_OPT) \ + auto NAME = std::get(NAME##_OPT); + +#define RETURN_IF_HOLDS_STATUS(NAME) \ + if (std::holds_alternative(NAME)) { \ + return std::get(NAME); \ + } + +class SpeechCalculator : public CalculatorBase { 
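+    // Serves two OpenAI-compatible audio endpoints from a single MediaPipe node. The servable
+    // (Whisper and/or Text2Speech pipeline plus its mutex) is looked up by node name in the
+    // SPEECH_NODE_RESOURCES input side packet.
+    //  - /v3/audio/transcriptions: multipart request; the "file" field is decoded with dr_mp3,
+    //    resampled to 16 kHz mono and transcribed by the Whisper pipeline; the response is
+    //    {"text": "..."} JSON.
+    //  - /v3/audio/speech: JSON request; the required "input" string is synthesized by the
+    //    text-to-speech pipeline and returned as in-memory WAV bytes.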
+ static const std::string INPUT_TAG_NAME; + static const std::string OUTPUT_TAG_NAME; + +public: + static absl::Status GetContract(CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + cc->Inputs().Tag(INPUT_TAG_NAME).Set(); + cc->InputSidePackets().Tag(SPEECH_SESSION_SIDE_PACKET_TAG).Set(); // TODO: template? + cc->Outputs().Tag(OUTPUT_TAG_NAME).Set(); + return absl::OkStatus(); + } + + absl::Status Close(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {} ] Close", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Open start", cc->NodeName()); + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) final { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Process start", cc->NodeName()); + + SpeechServableMap pipelinesMap = cc->InputSidePackets().Tag(SPEECH_SESSION_SIDE_PACKET_TAG).Get(); + auto it = pipelinesMap.find(cc->NodeName()); + RET_CHECK(it != pipelinesMap.end()) << "Could not find initialized Speech node named: " << cc->NodeName(); + auto pipe = it->second; + + auto payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); + + std::unique_ptr images; // output + std::unique_ptr output; + if (absl::StartsWith(payload.uri, "/v3/audio/transcriptions")) { + if (payload.multipartParser->hasParseError()) + return absl::InvalidArgumentError("Failed to parse multipart data"); + + SET_OR_RETURN(std::optional, file, getFileFromPayload(*payload.multipartParser, "file")); + auto stream = getFileFromPayload(*payload.multipartParser, "stream"); + if(!std::holds_alternative(stream)){ + SPDLOG_ERROR("NO VALUE"); + } + else{ + SPDLOG_ERROR("{}", (std::get>(stream)).value()); + } + if(!file.has_value()){ + return absl::InvalidArgumentError(absl::StrCat("File parsing fails")); + } + // ov::genai::WhisperGenerationConfig config = pipe->whisperPipeline->get_generation_config(); + // // 'task' and 'language' parameters are supported for multilingual models only + // config.language = "<|en|>"; // can switch to <|zh|> for Chinese language + // config.task = "transcribe"; + // config.return_timestamps = true; + ov::genai::RawSpeechInput raw_speech; + try { + raw_speech = read_mp3(file.value()); + } catch(std::exception&){ + return absl::InvalidArgumentError("Audio file pasing failed"); + } + std::string result = "{\"text\": \""; + std::unique_lock lock(pipe->whisperPipelineMutex); + result += pipe->whisperPipeline->generate(raw_speech); + result.append("\"}"); + SPDLOG_ERROR("{}",result); + output = std::make_unique(result); + } else if(absl::StartsWith(payload.uri, "/v3/audio/speech")){ + if (payload.parsedJson->HasParseError()) + return absl::InvalidArgumentError("Failed to parse JSON"); + + if (!payload.parsedJson->IsObject()) { + return absl::InvalidArgumentError("JSON body must be an object"); + } + auto inputIt = payload.parsedJson->FindMember("input"); + if (inputIt == payload.parsedJson->MemberEnd()) { + return absl::InvalidArgumentError("input field is missing in JSON body"); + } + if (!inputIt->value.IsString()) { + return absl::InvalidArgumentError("input field is not a string"); + } + auto streamIt = payload.parsedJson->FindMember("stream_format"); + if (streamIt != payload.parsedJson->MemberEnd()) { + SPDLOG_ERROR("STREAM: {}", streamIt->value.GetString()); + } + else{ + SPDLOG_ERROR("NO STREAM"); + } + 
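+        // Synthesize the requested text under the TTS pipeline mutex, then serialize the returned
+        // float waveform with dr_wav into an in-memory, single-channel, 16 kHz IEEE-float WAV that
+        // is sent back as the raw response body.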
SPDLOG_ERROR("INPUT: {}", inputIt->value.GetString()); + std::unique_lock lock(pipe->text2SpeechPipelineMutex); + auto gen_speech = pipe->text2SpeechPipeline->generate(inputIt->value.GetString()); + lock.unlock(); + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + format.channels = 1; + format.sampleRate = 16000; // assume it is always 16 KHz + format.bitsPerSample = gen_speech.speeches[0].get_element_type().bitwidth(); + SPDLOG_ERROR("1"); + drwav wav; + void* ppData; + size_t pDataSize; + + auto waveform_size = gen_speech.speeches[0].get_size(); + size_t total_samples = waveform_size * format.channels; + auto waveform_ptr = gen_speech.speeches[0].data(); + OPENVINO_ASSERT(drwav_init_memory_write_sequential_pcm_frames(&wav, &ppData, &pDataSize, &format, total_samples, nullptr), + "Failed to initialize WAV writer"); + SPDLOG_ERROR("2"); + drwav_uint64 frames_written = drwav_write_pcm_frames(&wav, total_samples, waveform_ptr); + OPENVINO_ASSERT(frames_written == total_samples, "Failed to write not all frames"); + SPDLOG_ERROR("3"); + output = std::make_unique(reinterpret_cast(ppData), pDataSize); + drwav_uninit(&wav); + SPDLOG_ERROR("4"); + //drwav_free(ppData, NULL); + }else { + return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri)); + } + + // auto outputOrStatus = generateJSONResponseFromOvTensor(*images); + // RETURN_IF_HOLDS_STATUS(outputOrStatus); + // auto output = std::move(std::get>(outputOrStatus)); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(output.release(), cc->InputTimestamp()); + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "SpeechCalculator [Node: {}] Process end", cc->NodeName()); + + return absl::OkStatus(); + } +}; + +const std::string SpeechCalculator::INPUT_TAG_NAME{"HTTP_REQUEST_PAYLOAD"}; +const std::string SpeechCalculator::OUTPUT_TAG_NAME{"HTTP_RESPONSE_PAYLOAD"}; + +REGISTER_CALCULATOR(SpeechCalculator); + +} // namespace mediapipe diff --git a/src/speech/speech_calculator.proto b/src/speech/speech_calculator.proto new file mode 100644 index 0000000000..80ed29ccd6 --- /dev/null +++ b/src/speech/speech_calculator.proto @@ -0,0 +1,40 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +syntax = "proto2"; +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + + +message SpeechCalculatorOptions { + extend mediapipe.CalculatorOptions { + // https://github.com/google/mediapipe/issues/634 have to be unique in app + // no rule to obtain this + optional SpeechCalculatorOptions ext = 116423755; + } + + // fields required for GenAI pipeline initialization + required string models_path = 1; + optional string device = 2; + optional string plugin_config = 3; + enum Mode { + TEXT_TO_SPEECH = 0; + SPEECH_TO_TEXT = 1; + } + + required Mode mode = 4 [default = TEXT_TO_SPEECH]; +} diff --git a/src/speech/speech_servable.hpp b/src/speech/speech_servable.hpp new file mode 100644 index 0000000000..8097552d64 --- /dev/null +++ b/src/speech/speech_servable.hpp @@ -0,0 +1,62 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "mediapipe/framework/calculator_graph.h" +#pragma GCC diagnostic pop +#pragma warning(pop) + +#include "openvino/genai/whisper_pipeline.hpp" +#include "openvino/genai/speech_generation/text2speech_pipeline.hpp" +#include "src/speech/speech_calculator.pb.h" + + +namespace ovms { + +struct SpeechServable { + std::filesystem::path parsedModelsPath; + std::shared_ptr whisperPipeline; + std::shared_ptr text2SpeechPipeline; + std::mutex whisperPipelineMutex; + std::mutex text2SpeechPipelineMutex; + + SpeechServable(const std::string& modelDir, const std::string& targetDevice, const std::string& graphPath, mediapipe::SpeechCalculatorOptions::Mode mode) { + auto fsModelsPath = std::filesystem::path(modelDir); + if (fsModelsPath.is_relative()) { + parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath); + } else { + parsedModelsPath = fsModelsPath.string(); + } + if(mode == mediapipe::SpeechCalculatorOptions::TEXT_TO_SPEECH){ + text2SpeechPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); + } + else{ + whisperPipeline = std::make_shared(parsedModelsPath.string(), targetDevice); + } + } +}; + +using SpeechServableMap = std::unordered_map>; +} // namespace ovms diff --git a/src/version.hpp b/src/version.hpp index fbebc03710..a1f181f6cc 100644 --- a/src/version.hpp +++ b/src/version.hpp @@ -16,7 +16,7 @@ #ifndef SRC_VERSION_HPP_ #define SRC_VERSION_HPP_ #define PROJECT_NAME "OpenVINO Model Server" -#define PROJECT_VERSION "REPLACE_PROJECT_VERSION" -#define OPENVINO_NAME "REPLACE_OPENVINO_NAME" -#define BAZEL_BUILD_FLAGS "REPLACE_BAZEL_BUILD_FLAGS" +#define PROJECT_VERSION "2025.3.0.8524d72c" 
+#define OPENVINO_NAME "2025.3.0.0.dev20250812"
+#define BAZEL_BUILD_FLAGS "--config=win_mp_on_py_off"
 #endif  // SRC_VERSION_HPP_
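The request shapes handled by the new calculator can be exercised with plain curl. This is a minimal sketch only: the host, port and servable names (`whisper`, `tts`) are placeholders, and the calculator currently reads just the multipart `file` field for transcriptions and the JSON `input` field for speech; other OpenAI audio parameters are ignored.

```console
# Speech to text: multipart upload, response is {"text": "..."} JSON
curl http://localhost:8000/v3/audio/transcriptions \
  -F model=whisper \
  -F file=@sample.mp3

# Text to speech: JSON body with a required "input" string, response is WAV bytes
curl http://localhost:8000/v3/audio/speech \
  -H "Content-Type: application/json" \
  -d '{"model": "tts", "input": "hello from the speech calculator"}' \
  -o speech.wav
```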