diff --git a/CMakeLists.txt b/CMakeLists.txt index a0f4a38..14187b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ add_subdirectory(natural_language_processing/fugumt-en-ja) add_subdirectory(natural_language_processing/sentence_transformers) add_subdirectory(natural_language_processing/t5_whisper_medical) add_subdirectory(natural_language_processing/multilingual-e5) +add_subdirectory(natural_language_processing/soundchoice-g2p) add_subdirectory(object_detection/yolov3-tiny) add_subdirectory(object_detection/yolox) diff --git a/README.md b/README.md index 2ede73c..1f7fdec 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ cd object_detection/yolox |[sentence_transformers](/natural_language_processing/sentence_transformers) | [sentence transformers](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) | Pytorch | 1.2.7 and later | |[t5_whisper_medical](/natural_language_processing/t5_whisper_medical) | error correction of medical terms using t5 | Pytorch | 1.2.13 and later | | |[multilingual-e5](/natural_language_processing/multilingual-e5) | [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) | Pytorch | 1.2.15 and later | [JP](https://medium.com/axinc/multilingual-e5-%E5%A4%9A%E8%A8%80%E8%AA%9E%E3%81%AE%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%82%92embedding%E3%81%99%E3%82%8B%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E3%83%A2%E3%83%87%E3%83%AB-71f1dec7c4f0) | +|[soundchoice-g2p](/natural_language_processing/soundchoice-g2p) | [Hugging Face - speechbrain/soundchoice-g2p](https://huggingface.co/speechbrain/soundchoice-g2p) | Pytorch | 1.2.16 and later | | ## Object detection diff --git a/natural_language_processing/soundchoice-g2p/CMakeLists.txt b/natural_language_processing/soundchoice-g2p/CMakeLists.txt new file mode 100644 index 0000000..fd9356d --- /dev/null +++ b/natural_language_processing/soundchoice-g2p/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.1) + +set (PROJECT_NAME soundchoice-g2p) +set (SRC_FILES ${PROJECT_NAME}.cpp) +set (CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + +message(${INCLUDE_PATH}) +message(${LIBRARY_PATH}) + +project(${PROJECT_NAME} CXX) + +include_directories(${INCLUDE_PATH}) +link_directories(${LIBRARY_PATH}) + +add_executable(${PROJECT_NAME} ${SRC_FILES}) + +target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_11) +target_link_libraries(${PROJECT_NAME} ailia) +set (CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR}) +install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION .) diff --git a/natural_language_processing/soundchoice-g2p/LICENSE b/natural_language_processing/soundchoice-g2p/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/natural_language_processing/soundchoice-g2p/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/natural_language_processing/soundchoice-g2p/README.md b/natural_language_processing/soundchoice-g2p/README.md
new file mode 100644
index 0000000..c66449a
--- /dev/null
+++ b/natural_language_processing/soundchoice-g2p/README.md
@@ -0,0 +1,56 @@
+# SoundChoice: Grapheme-to-Phoneme Models with Semantic Disambiguation
+
+## Input
+
+Text to convert to phonemes
+
+- Example
+```
+To be or not to be, that is the question
+```
+
+## Output
+
+Phonemes
+```
+T-UW- -B-IY- -AO-R- -N-AA-T- -T-UW- -B-IY- -DH-AE-T- -IH-Z- -DH-AH- -K-W-EH-S-CH-AH-N
+```
+
+## Usage
+The onnx and prototxt files are downloaded automatically on the first run.
+An internet connection is required while downloading.
+
+For the sample text,
+```bash
+$ ./soundchoice-g2p.sh
+```
+
+If you want to specify the input text, put the text after the `--input` option.
+```bash
+$ ./soundchoice-g2p.sh --input TEXT
+```
+
+## Reference
+
+- [Hugging Face - speechbrain/soundchoice-g2p](https://huggingface.co/speechbrain/soundchoice-g2p)
+
+## Framework
+
+Pytorch
+
+## Model Format
+
+ONNX opset=17
+
+## Netron
+
+[soundchoice-g2p_atn.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/soundchoice-g2p/soundchoice-g2p_atn.onnx.prototxt)
+[soundchoice-g2p_emb.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/soundchoice-g2p/soundchoice-g2p_emb.onnx.prototxt)
+[rnn_beam_searcher.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/soundchoice-g2p/rnn_beam_searcher.onnx.prototxt)
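+
+## Output Format Note
+
+The output string joins ARPABET phonemes with `-`, and word boundaries appear as a plain space token. As a minimal, hypothetical post-processing sketch (the helper `split_phonemes` is not part of the sample code), the string can be split back into a phoneme list like this:
+
+```cpp
+#include <string>
+#include <vector>
+
+// Split a hyphen-joined phoneme string such as "T-UW- -B-IY"
+// into the tokens {"T", "UW", " ", "B", "IY"}.
+std::vector<std::string> split_phonemes(const std::string &s) {
+    std::vector<std::string> tokens;
+    std::string cur;
+    for (char c : s) {
+        if (c == '-') { tokens.push_back(cur); cur.clear(); }
+        else { cur += c; }
+    }
+    if (!cur.empty()) tokens.push_back(cur);
+    return tokens;
+}
+```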
diff --git a/natural_language_processing/soundchoice-g2p/soundchoice-g2p.bat b/natural_language_processing/soundchoice-g2p/soundchoice-g2p.bat
new file mode 100644
index 0000000..44d2f51
--- /dev/null
+++ b/natural_language_processing/soundchoice-g2p/soundchoice-g2p.bat
@@ -0,0 +1,32 @@
+@echo off
+setlocal enabledelayedexpansion
+cd %~dp0
+
+set MODEL=soundchoice-g2p
+set FILE1=soundchoice-g2p_atn.onnx
+set FILE2=soundchoice-g2p_emb.onnx
+set FILE3=rnn_beam_searcher.onnx
+set FILE4=vocab.txt
+
+rem download
+if not "%1" == "-h" if not "%1" == "--help" (
+    if not exist %FILE1% (
+        echo Downloading onnx file... ^(save path: %FILE1%^)
+        curl https://storage.googleapis.com/ailia-models/%MODEL%/%FILE1% -o %FILE1%
+    )
+    if not exist %FILE2% (
+        echo Downloading onnx file... ^(save path: %FILE2%^)
+        curl https://storage.googleapis.com/ailia-models/%MODEL%/%FILE2% -o %FILE2%
+    )
+    if not exist %FILE3% (
+        echo Downloading onnx file... ^(save path: %FILE3%^)
+        curl https://storage.googleapis.com/ailia-models/%MODEL%/%FILE3% -o %FILE3%
+    )
+    if not exist %FILE4% (
+        echo Downloading onnx file... ^(save path: %FILE4%^)
+        curl https://storage.googleapis.com/ailia-models/%MODEL%/%FILE4% -o %FILE4%
+    )
+    echo ONNX files are prepared^^!
+)
+rem execute
+.\%MODEL%.exe %*
diff --git a/natural_language_processing/soundchoice-g2p/soundchoice-g2p.cpp b/natural_language_processing/soundchoice-g2p/soundchoice-g2p.cpp
new file mode 100644
index 0000000..da0b0bf
--- /dev/null
+++ b/natural_language_processing/soundchoice-g2p/soundchoice-g2p.cpp
@@ -0,0 +1,676 @@
+/*******************************************************************
+*
+*    DESCRIPTION:
+*      AILIA SoundChoice G2P sample
+*    AUTHOR:
+*
+*    DATE:2024/04/30
+*
+*******************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <regex>
+#include <algorithm>
+#include <chrono>
+
+#undef UNICODE
+
+#include "ailia.h"
+
+bool debug = true;
+bool debug_token = false;
+
+
+// ======================
+// Parameters
+// ======================
+
+#if defined(_WIN32) || defined(_WIN64)
+#define PRINT_OUT(...) fprintf_s(stdout, __VA_ARGS__)
+#define PRINT_ERR(...) fprintf_s(stderr, __VA_ARGS__)
+#else
+#define PRINT_OUT(...) fprintf(stdout, __VA_ARGS__)
+#define PRINT_ERR(...) fprintf(stderr, __VA_ARGS__)
+#endif
+
+#define BENCHMARK_ITERS 5
+
+#define MODEL_N 3
+
+#define MODEL_BERT 0
+#define MODEL_ENCODER 1
+#define MODEL_DECODER 2
+
+const char *MODEL_NAME[3] = {"soundchoice-g2p_emb.onnx", "soundchoice-g2p_atn.onnx", "rnn_beam_searcher.onnx"};
+
+static bool benchmark = false;
+static int args_env_id = -1;
+
+//const int REF_TOKEN_SIZE = 13;
+//std::string reference_text = "To be or not to be, that is the question";
+//const int reference_token[REF_TOKEN_SIZE] = {101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 2003, 1996, 3160, 102};
+
+const int REF_TOKEN_SIZE = 14;
+std::string reference_text = "To be or not to be, that is the questionary";
+const int reference_token[REF_TOKEN_SIZE] = {101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 2003, 1996, 3160, 5649, 102};
+
+const int BERT_EMBEDDING_SIZE = 768;
+const int BERT_HIDDEN_LAYER_N = 4;
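+
+// Note: this sample ships no WordPiece tokenizer, so the BERT token ids of
+// the reference text are hardcoded above. 101 and 102 are the [CLS] and
+// [SEP] special tokens of bert-base-uncased; "questionary" is split into
+// "question" plus a "##" continuation piece, hence 14 tokens instead of 13.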
+
+// ======================
+// Argument Parser
+// ======================
+
+static void print_usage()
+{
+    PRINT_OUT("usage: soundchoice-g2p [-h] [-i TEXT] [-b] [-e ENV_ID]\n");
+    return;
+}
+
+
+static void print_help()
+{
+    PRINT_OUT("\n");
+    PRINT_OUT("soundchoice-g2p model\n");
+    PRINT_OUT("\n");
+    PRINT_OUT("optional arguments:\n");
+    PRINT_OUT("  -h, --help            show this help message and exit\n");
+    PRINT_OUT("  -i TEXT, --input TEXT\n");
+    PRINT_OUT("                        The input text.\n");
+    PRINT_OUT("  -b, --benchmark       Running the inference on the same input 5 times to\n");
+    PRINT_OUT("                        measure execution performance.\n");
+    PRINT_OUT("  -e ENV_ID, --env_id ENV_ID\n");
+    PRINT_OUT("                        The backend environment id.\n");
+    return;
+}
+
+
+static void print_error(std::string arg)
+{
+    PRINT_ERR("soundchoice-g2p: error: unrecognized arguments: %s\n", arg.c_str());
+    return;
+}
+
+
+static int argument_parser(int argc, char **argv)
+{
+    int status = 0;
+
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+        if (status == 0) {
+            if (arg == "-i" || arg == "--input") {
+                status = 1;
+            }
+            else if (arg == "-b" || arg == "--benchmark") {
+                benchmark = true;
+            }
+            else if (arg == "-h" || arg == "--help") {
+                print_usage();
+                print_help();
+                return -1;
+            }
+            else if (arg == "-e" || arg == "--env_id") {
+                status = 4;
+            }
+            else {
+                print_usage();
+                print_error(arg);
+                return -1;
+            }
+        }
+        else if (arg[0] != '-') {
+            switch (status) {
+            case 1:
+                reference_text = std::string(arg);
+                break;
+            case 4:
+                args_env_id = atoi(arg.c_str());
+                break;
+            default:
+                print_usage();
+                print_error(arg);
+                return -1;
+            }
+            status = 0;
+        }
+        else {
+            print_usage();
+            print_error(arg);
+            return -1;
+        }
+    }
+
+    return AILIA_STATUS_SUCCESS;
+}
+
+// ======================
+// Main functions
+// ======================
+
+void setErrorDetail(const char *func, const char *detail){
+    PRINT_ERR("Error %s Detail %s\n", func, detail);
+    throw(func);
+}
+
+struct AILIATensor{
+    std::vector<float> data;
+    AILIAShape shape;
+};
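+
+// Generic inference helper: binds each input tensor to its input blob
+// (shape first, then data), runs ailiaUpdate(), and copies every output
+// blob into the reusable outputs vector, resizing a buffer only when its
+// shape changed since the previous call.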
+void forward(AILIANetwork *ailia, std::vector<AILIATensor*> &inputs, std::vector<AILIATensor> &outputs){
+    int status;
+
+    unsigned int input_blob_cnt;
+    status = ailiaGetInputBlobCount(ailia, &input_blob_cnt);
+    if (status != AILIA_STATUS_SUCCESS) {
+        setErrorDetail("ailiaGetInputBlobCount", ailiaGetErrorDetail(ailia));
+    }
+
+    if (input_blob_cnt != inputs.size()){
+        setErrorDetail("input blob cnt and input tensor size must be same", "");
+    }
+
+    for (int i = 0; i < inputs.size(); i++){
+        unsigned int input_blob_idx = 0;
+        status = ailiaGetBlobIndexByInputIndex(ailia, &input_blob_idx, i);
+        if (status != AILIA_STATUS_SUCCESS) {
+            setErrorDetail("ailiaGetBlobIndexByInputIndex", ailiaGetErrorDetail(ailia));
+        }
+
+        if (debug){
+            PRINT_OUT("input blob shape %d %d %d %d dims %d\n", inputs[i]->shape.x, inputs[i]->shape.y, inputs[i]->shape.z, inputs[i]->shape.w, inputs[i]->shape.dim);
+        }
+
+        status = ailiaSetInputBlobShape(ailia, &inputs[i]->shape, input_blob_idx, AILIA_SHAPE_VERSION);
+        if (status != AILIA_STATUS_SUCCESS){
+            setErrorDetail("ailiaSetInputBlobShape", ailiaGetErrorDetail(ailia));
+        }
+
+        status = ailiaSetInputBlobData(ailia, &(inputs[i]->data)[0], inputs[i]->data.size() * sizeof(float), input_blob_idx);
+        if (status != AILIA_STATUS_SUCCESS) {
+            setErrorDetail("ailiaSetInputBlobData", ailiaGetErrorDetail(ailia));
+        }
+    }
+
+    status = ailiaUpdate(ailia);
+    if (status != AILIA_STATUS_SUCCESS) {
+        setErrorDetail("ailiaUpdate", ailiaGetErrorDetail(ailia));
+    }
+
+    unsigned int output_blob_cnt;
+    status = ailiaGetOutputBlobCount(ailia, &output_blob_cnt);
+    if (status != AILIA_STATUS_SUCCESS) {
+        setErrorDetail("ailiaGetOutputBlobCount", ailiaGetErrorDetail(ailia));
+    }
+
+    for (int i = 0; i < output_blob_cnt; i++){
+        unsigned int output_blob_idx = 0;
+        status = ailiaGetBlobIndexByOutputIndex(ailia, &output_blob_idx, i);
+        if (status != AILIA_STATUS_SUCCESS) {
+            setErrorDetail("ailiaGetBlobIndexByOutputIndex", ailiaGetErrorDetail(ailia));
+        }
+
+        AILIAShape output_blob_shape;
+        status = ailiaGetBlobShape(ailia, &output_blob_shape, output_blob_idx, AILIA_SHAPE_VERSION);
+        if (status != AILIA_STATUS_SUCCESS){
+            setErrorDetail("ailiaGetBlobShape", ailiaGetErrorDetail(ailia));
+        }
+
+        if (debug){
+            PRINT_OUT("output_blob_shape %d %d %d %d dims %d\n", output_blob_shape.x, output_blob_shape.y, output_blob_shape.z, output_blob_shape.w, output_blob_shape.dim);
+        }
+
+        if (outputs.size() <= i){
+            AILIATensor tensor;
+            outputs.push_back(tensor);
+        }
+
+        AILIATensor &ref_tensor = outputs[i];
+        int new_shape = output_blob_shape.x * output_blob_shape.y * output_blob_shape.z * output_blob_shape.w;
+        if (new_shape != ref_tensor.data.size()){
+            ref_tensor.data.resize(new_shape);
+        }
+        ref_tensor.shape = output_blob_shape;
+
+        status = ailiaGetBlobData(ailia, &ref_tensor.data[0], ref_tensor.data.size() * sizeof(float), output_blob_idx);
+        if (status != AILIA_STATUS_SUCCESS) {
+            setErrorDetail("ailiaGetBlobData", ailiaGetErrorDetail(ailia));
+        }
+    }
+}
+
+std::string clean_pipeline(const std::string& txt, const std::unordered_set<char>& graphemes) {
+    std::regex RE_MULTI_SPACE(R"(\s{2,})");
+    std::string result = txt;
+
+    // Convert to uppercase
+    std::transform(result.begin(), result.end(), result.begin(), ::toupper);
+
+    // Remove characters not in graphemes
+    result.erase(
+        std::remove_if(result.begin(), result.end(), [&](char c) {
+            return graphemes.find(c) == graphemes.end();
+        }),
+        result.end()
+    );
+
+    // Replace multiple spaces with a single space
+    result = std::regex_replace(result, RE_MULTI_SPACE, " ");
+
+    return result;
+}
+
+// Special labels are assumed to be <bos>=0, <eos>=1, <unk>=2 in this order.
+std::unordered_map<std::string, int> lab2ind = {
+    {"<bos>", 0}, {"<eos>", 1}, {"<unk>", 2}, {"A", 3}, {"B", 4}, {"C", 5}, {"D", 6}, {"E", 7},
+    {"F", 8}, {"G", 9}, {"H", 10}, {"I", 11}, {"J", 12}, {"K", 13}, {"L", 14}, {"M", 15}, {"N", 16},
+    {"O", 17}, {"P", 18}, {"Q", 19}, {"R", 20}, {"S", 21}, {"T", 22}, {"U", 23}, {"V", 24},
+    {"W", 25}, {"X", 26}, {"Y", 27}, {"Z", 28}, {"'", 29}, {" ", 30}
+};
+
+std::vector<int> grapheme_pipeline(const std::string& char_seq, bool uppercase = true) {
+    std::string char_seq_upper = char_seq;
+
+    if (uppercase) {
+        std::transform(char_seq_upper.begin(), char_seq_upper.end(), char_seq_upper.begin(), ::toupper);
+    }
+
+    std::vector<std::string> grapheme_list;
+    for (const char& c : char_seq_upper) {
+        std::string grapheme(1, c); // convert char to string
+        if (lab2ind.find(grapheme) != lab2ind.end()) {
+            grapheme_list.push_back(grapheme);
+        }
+    }
+
+    auto encode_label = [](const std::string& label) -> int {
+        try {
+            return lab2ind.at(label);
+        } catch (const std::out_of_range&) {
+            std::string unk_label = "<unk>";
+            return lab2ind.at(unk_label);
+        }
+    };
+
+    std::vector<int> grapheme_encoded_list;
+    for (const auto& grapheme : grapheme_list) {
+        grapheme_encoded_list.push_back(encode_label(grapheme));
+    }
+
+    std::string bos_label = "<bos>";
+    std::vector<int> grapheme_encoded = { lab2ind[bos_label] };
+    grapheme_encoded.insert(grapheme_encoded.end(), grapheme_encoded_list.begin(), grapheme_encoded_list.end());
+
+    int grapheme_len = grapheme_encoded.size();
+
+    // Convert grapheme_list of strings to list of single characters
+    std::vector<char> grapheme_char_list;
+    for (const std::string& grapheme : grapheme_list) {
+        grapheme_char_list.push_back(grapheme[0]);
+    }
+
+    //return {grapheme_char_list, grapheme_encoded_list, grapheme_encoded, grapheme_len};
+    return grapheme_encoded;
+}
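+
+// Example: "TO BE" yields graphemes [T, O, ' ', B, E] and, with <bos>
+// prepended, the encoded sequence [0, 22, 17, 30, 4, 7].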
+
+static int is_special_token_or_continue(int token, std::vector<int> &continue_tokens){
+    const int TOKEN_ID_CLS = 101;
+    const int TOKEN_ID_SEP = 102;
+    if (token == TOKEN_ID_CLS || token == TOKEN_ID_SEP || std::count(continue_tokens.begin(), continue_tokens.end(), token) != 0){
+        return 1;
+    }
+    return 0;
+}
+
+static std::vector<float> expand_to_chars(std::vector<int> grapheme_encoded, std::vector<float> &word_emb){
+    std::vector<float> char_word_emb(grapheme_encoded.size() * BERT_EMBEDDING_SIZE);
+    int word_separator = 30; // space
+    int word_cnt = 0;
+    for (int i = 0; i < grapheme_encoded.size(); i++){
+        if (debug){
+            printf("%d ", grapheme_encoded[i]);
+        }
+
+        if (word_emb.size() < BERT_EMBEDDING_SIZE * (word_cnt + 1)){
+            throw("Word emb overflow");
+        }
+
+        for (int j = 0; j < BERT_EMBEDDING_SIZE; j++){
+            char_word_emb[BERT_EMBEDDING_SIZE * i + j] = word_emb[BERT_EMBEDDING_SIZE * word_cnt + j];
+        }
+
+        if (grapheme_encoded[i] == word_separator){
+            word_cnt++;
+        }
+    }
+    if (debug){
+        printf("word_cnt %d %d\n", word_cnt, (int)(word_emb.size() / BERT_EMBEDDING_SIZE));
+    }
+    return char_word_emb;
+}
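+
+// Example: for the encoded sequence [0(<bos>), 22(T), 17(O), 30(' '), 4(B), 7(E)],
+// positions 0..3 receive word_emb[0] (the embedding of "TO") and positions
+// 4..5 receive word_emb[1] (the embedding of "BE"); each space advances the
+// word index by one.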
+
+std::vector<AILIATensor> encode_input(AILIANetwork *bert, const std::string& input_text, std::vector<int> &continue_tokens) {
+    std::unordered_set<char> graphemes = {
+        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+        '\'', ' '
+    };
+
+    std::string txt_cleaned = clean_pipeline(input_text, graphemes);
+    if (debug){
+        printf("input %s\n", input_text.c_str());
+        printf("clean %s\n", txt_cleaned.c_str());
+    }
+
+    std::vector<int> grapheme_encoded = grapheme_pipeline(txt_cleaned);
+    if (debug){
+        printf("grapheme_encoded ");
+        for (int i = 0; i < grapheme_encoded.size(); i++){
+            printf("%d ", grapheme_encoded[i]);
+        }
+        printf("\n");
+    }
+
+    std::vector<float> input_ids_data(REF_TOKEN_SIZE);
+    std::vector<float> attention_mask_data(REF_TOKEN_SIZE);
+    std::vector<float> token_type_ids_data(REF_TOKEN_SIZE);
+
+    for (int i = 0; i < REF_TOKEN_SIZE; i++){
+        input_ids_data[i] = reference_token[i];
+        attention_mask_data[i] = 1;
+        token_type_ids_data[i] = 0;
+    }
+
+    AILIATensor input_ids;
+    input_ids.data = input_ids_data;
+    input_ids.shape.x = input_ids_data.size();
+    input_ids.shape.y = 1;
+    input_ids.shape.z = 1;
+    input_ids.shape.w = 1;
+    input_ids.shape.dim = 2;
+
+    AILIATensor attention_mask;
+    attention_mask.data = attention_mask_data;
+    attention_mask.shape.x = attention_mask_data.size();
+    attention_mask.shape.y = 1;
+    attention_mask.shape.z = 1;
+    attention_mask.shape.w = 1;
+    attention_mask.shape.dim = 2;
+
+    AILIATensor token_type_ids;
+    token_type_ids.data = token_type_ids_data;
+    token_type_ids.shape.x = token_type_ids_data.size();
+    token_type_ids.shape.y = 1;
+    token_type_ids.shape.z = 1;
+    token_type_ids.shape.w = 1;
+    token_type_ids.shape.dim = 2;
+
+    std::vector<AILIATensor*> bert_inputs;
+    bert_inputs.push_back(&input_ids);
+    bert_inputs.push_back(&attention_mask);
+    bert_inputs.push_back(&token_type_ids);
+    std::vector<AILIATensor> bert_outputs;
+    forward(bert, bert_inputs, bert_outputs);
+
+    if (debug){
+        printf("hidden_states shape %d %d %d %d\n", bert_outputs[0].shape.x, bert_outputs[0].shape.y, bert_outputs[0].shape.z, bert_outputs[0].shape.w);
+        printf("hidden_states ");
+        for (int i = 0; i < 10; i++){
+            printf("%f ", bert_outputs[0].data[i]);
+        }
+        printf("\n");
+    }
+
+    // Merge the outputs of the last 4 hidden layers
+    std::vector<float> word_emb_with_special_token(BERT_EMBEDDING_SIZE * bert_outputs[0].shape.y);
+    for (int i = 0; i < bert_outputs[0].shape.y; i++){
+        for (int j = 0; j < BERT_EMBEDDING_SIZE; j++){
+            for (int k = 0; k < BERT_HIDDEN_LAYER_N; k++){
+                word_emb_with_special_token[i * BERT_EMBEDDING_SIZE + j] += bert_outputs[0].data[(bert_outputs[0].shape.w - 1 - k) * BERT_EMBEDDING_SIZE * bert_outputs[0].shape.y + i * BERT_EMBEDDING_SIZE + j];
+            }
+        }
+    }
+
+    // Get the token positions that correspond to words
+    std::vector<int> token_ids_word;
+    for (int i = 0; i < input_ids_data.size(); i++){
+        if (!is_special_token_or_continue(input_ids_data[i], continue_tokens)){
+            token_ids_word.push_back(i);
+        }
+    }
+
+    // Gather the embeddings at those word token positions
+    std::vector<float> word_emb(BERT_EMBEDDING_SIZE * token_ids_word.size());
+    for (int i = 0; i < token_ids_word.size(); i++){
+        int id = token_ids_word[i];
+        for (int j = 0; j < BERT_EMBEDDING_SIZE; j++){
+            word_emb[i * BERT_EMBEDDING_SIZE + j] = word_emb_with_special_token[id * BERT_EMBEDDING_SIZE + j];
+        }
+    }
+
+    if (debug){
+        printf("input_ids_data ");
+        for (int i = 0; i < input_ids_data.size(); i++){
+            printf("%d ", (int)input_ids_data[i]);
+        }
+        printf("\n");
+        printf("is_special_token_or_continue ");
+        for (int i = 0; i < input_ids_data.size(); i++){
+            printf("%d ", is_special_token_or_continue(input_ids_data[i], continue_tokens));
+        }
+        printf("\n");
+        printf("token_ids_word ");
+        for (int i = 0; i < token_ids_word.size(); i++){
+            printf("%d ", token_ids_word[i]);
+        }
+        printf("\n");
+        printf("word_emb ");
+        for (int i = 0; i < 10; i++){
+            printf("%f ", word_emb[i]);
+        }
+        printf("\n");
+    }
+
+    // Expand the word embeddings to per-character embeddings
+    std::vector<float> char_emb = expand_to_chars(grapheme_encoded, word_emb);
+
+    if (debug){
+        printf("\n");
+        printf("char_emb ");
+        for (int i = 0; i < 10; i++){
+            printf("%f ", char_emb[i]);
+        }
+        printf("\n");
+    }
+
+    std::vector<float> grapheme_encoded_data(grapheme_encoded.size());
+    for (int i = 0; i < grapheme_encoded.size(); i++){
+        grapheme_encoded_data[i] = grapheme_encoded[i];
+    }
+
+    AILIATensor grapheme_encoded_tensor;
+    grapheme_encoded_tensor.data = grapheme_encoded_data;
+    grapheme_encoded_tensor.shape.x = grapheme_encoded.size();
+    grapheme_encoded_tensor.shape.y = 1;
+    grapheme_encoded_tensor.shape.z = 1;
+    grapheme_encoded_tensor.shape.w = 1;
+    grapheme_encoded_tensor.shape.dim = 2;
+
+    AILIATensor char_emb_tensor;
+    char_emb_tensor.data = char_emb;
+    char_emb_tensor.shape.x = BERT_EMBEDDING_SIZE;
+    char_emb_tensor.shape.y = grapheme_encoded.size();
+    char_emb_tensor.shape.z = 1;
+    char_emb_tensor.shape.w = 1;
+    char_emb_tensor.shape.dim = 3;
+
+    std::vector<AILIATensor> outputs;
+    outputs.push_back(grapheme_encoded_tensor);
+    outputs.push_back(char_emb_tensor);
+
+    return outputs;
+}
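+
+// Pipeline overview: encode_input() builds the grapheme id sequence and the
+// character-aligned BERT embeddings, and the attention encoder
+// (MODEL_ENCODER) turns them into p_seq (per-step phoneme probabilities)
+// plus encoder states. The RNN beam-search decoder (MODEL_DECODER) is
+// loaded but not yet invoked here; compute() currently prints the encoder
+// outputs only.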
+
+static int compute(AILIANetwork* net[MODEL_N], std::vector<int> &continue_tokens)
+{
+    int status = AILIA_STATUS_SUCCESS;
+
+    std::vector<AILIATensor> encode_outputs = encode_input(net[MODEL_BERT], reference_text, continue_tokens);
+
+    AILIATensor grapheme_encoded = encode_outputs[0];
+    AILIATensor word_emb = encode_outputs[1];
+
+    std::vector<AILIATensor*> atten_inputs;
+    atten_inputs.push_back(&grapheme_encoded);
+    atten_inputs.push_back(&word_emb);
+    std::vector<AILIATensor> atten_outputs;
+    forward(net[MODEL_ENCODER], atten_inputs, atten_outputs);
+
+    AILIATensor p_seq = atten_outputs[0];
+    AILIATensor encoder_outputs = atten_outputs[1];
+
+    if (debug){
+        printf("p_seq.shape %d %d %d %d\n", p_seq.shape.x, p_seq.shape.y, p_seq.shape.z, p_seq.shape.w);
+        printf("p_seq ");
+        for (int i = 0; i < 10; i++){
+            printf("%f ", p_seq.data[i]);
+        }
+        printf("\n");
+        printf("encoder_outputs.shape %d %d %d %d\n", encoder_outputs.shape.x, encoder_outputs.shape.y, encoder_outputs.shape.z, encoder_outputs.shape.w);
+        printf("encoder_outputs ");
+        for (int i = 0; i < 10; i++){
+            printf("%f ", encoder_outputs.data[i]);
+        }
+        printf("\n");
+    }
+
+    PRINT_OUT("Program finished successfully.\n");
+
+    return AILIA_STATUS_SUCCESS;
+}
+
+static std::vector<int> load_vocab(const char *path_a)
+{
+    FILE *fp = NULL;
+    fp = fopen(path_a, "r");
+    if (fp == NULL){
+        throw("vocab file not found");
+    }
+    std::vector<char> line;
+    std::vector<int> continue_tokens;
+    int id = 0;
+    while(!feof(fp)){
+        char c = fgetc(fp);
+        line.push_back(c);
+        if (c == '\n'){
+            line[line.size() - 1] = '\0';
+            if (line.size() >= 2){
+                //printf("%s\n", &line[0]);
+                if (line[0] == '#' && line[1] == '#'){
+                    continue_tokens.push_back(id);
+                    //printf("%d ", id);
+                }
+            }
+            line.clear();
+            id++;
+        }
+    }
+    fclose(fp);
+    return continue_tokens;
+}
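+
+// Example: in a WordPiece vocab.txt, entries prefixed with "##" are
+// continuation pieces that attach to the previous token (e.g. "questionary"
+// tokenizes as "question" plus a "##" piece). load_vocab() collects the line
+// numbers (= token ids) of all "##" entries so that
+// is_special_token_or_continue() can skip them.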
+
+int main(int argc, char **argv)
+{
+    int status = argument_parser(argc, argv);
+    if (status != AILIA_STATUS_SUCCESS) {
+        return -1;
+    }
+
+    // env list
+    unsigned int env_count;
+    status = ailiaGetEnvironmentCount(&env_count);
+    if (status != AILIA_STATUS_SUCCESS) {
+        PRINT_ERR("ailiaGetEnvironmentCount Failed %d\n", status);
+        return -1;
+    }
+
+    int env_id = AILIA_ENVIRONMENT_ID_AUTO;
+    for (unsigned int i = 0; i < env_count; i++) {
+        AILIAEnvironment* env;
+        status = ailiaGetEnvironment(&env, i, AILIA_ENVIRONMENT_VERSION);
+        //bool is_fp16 = (env->props & AILIA_ENVIRONMENT_PROPERTY_FP16) != 0;
+        PRINT_OUT("env_id : %d type : %d name : %s", env->id, env->type, env->name);
+        //if (is_fp16){
+        //    PRINT_OUT(" (Warning : FP16 backend is not worked this model)\n");
+        //    continue;
+        //}
+        PRINT_OUT("\n");
+        if (args_env_id == env->id){
+            env_id = env->id;
+        }
+        if (args_env_id == -1 && env_id == AILIA_ENVIRONMENT_ID_AUTO){
+            if (env->type == AILIA_ENVIRONMENT_TYPE_GPU) {
+                env_id = env->id;
+            }
+        }
+    }
+    if (args_env_id == -1){
+        PRINT_OUT("you can select environment using -e option\n");
+    }
+
+    // net initialize
+    AILIANetwork *ailia[MODEL_N];
+    for (int i = 0; i < MODEL_N; i++){
+        status = ailiaCreate(&ailia[i], env_id, AILIA_MULTITHREAD_AUTO);
+        if (status != AILIA_STATUS_SUCCESS) {
+            PRINT_ERR("ailiaCreate failed %d\n", status);
+            if (status == AILIA_STATUS_LICENSE_NOT_FOUND || status == AILIA_STATUS_LICENSE_EXPIRED){
+                PRINT_OUT("License file not found or expired : please place license file (AILIA.lic)\n");
+            }
+            return -1;
+        }
+
+        status = ailiaSetMemoryMode(ailia[i], AILIA_MEMORY_OPTIMAIZE_DEFAULT | AILIA_MEMORY_REUSE_INTERSTAGE);
+        if (status != AILIA_STATUS_SUCCESS) {
+            PRINT_ERR("ailiaSetMemoryMode failed %d\n", status);
+            ailiaDestroy(ailia[i]);
+            return -1;
+        }
+
+        AILIAEnvironment *env_ptr = nullptr;
+        status = ailiaGetSelectedEnvironment(ailia[i], &env_ptr, AILIA_ENVIRONMENT_VERSION);
+        if (status != AILIA_STATUS_SUCCESS) {
+            PRINT_ERR("ailiaGetSelectedEnvironment failed %d\n", status);
+            ailiaDestroy(ailia[i]);
+            return -1;
+        }
+
+        PRINT_OUT("selected env name : %s\n", env_ptr->name);
+
+        status = ailiaOpenWeightFile(ailia[i], MODEL_NAME[i]);
+        if (status != AILIA_STATUS_SUCCESS) {
+            PRINT_ERR("ailiaOpenWeightFile failed %d\n", status);
+            ailiaDestroy(ailia[i]);
+            return -1;
+        }
+    }
+
+    std::vector<int> continue_tokens = load_vocab("vocab.txt");
+
+    auto start2 = std::chrono::high_resolution_clock::now();
+    status = compute(ailia, continue_tokens);
+    auto end2 = std::chrono::high_resolution_clock::now();
+    if (benchmark){
+        PRINT_OUT("total processing time %lld ms\n", std::chrono::duration_cast<std::chrono::milliseconds>(end2 - start2).count());
+    }
+
+    for (int i = 0; i < MODEL_N; i++){
+        ailiaDestroy(ailia[i]);
+    }
+
+    return status;
+}
diff --git a/natural_language_processing/soundchoice-g2p/soundchoice-g2p.sh b/natural_language_processing/soundchoice-g2p/soundchoice-g2p.sh
new file mode 100755
index 0000000..5a1c2bd
--- /dev/null
+++ b/natural_language_processing/soundchoice-g2p/soundchoice-g2p.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+MODEL="soundchoice-g2p"
+FILE1="soundchoice-g2p_atn.onnx"
+FILE2="soundchoice-g2p_emb.onnx"
+FILE3="rnn_beam_searcher.onnx"
+FILE4="vocab.txt"
+
+#download
+if [ ! "$1" = "-h" ] && [ ! "$1" = "--help" ]; then
+    if [ ! -e ${FILE1} ]; then
+        echo "Downloading onnx file... save path: ${FILE1}"
+        curl https://storage.googleapis.com/ailia-models/${MODEL}/${FILE1} -o ${FILE1}
+    fi
+    if [ ! -e ${FILE2} ]; then
+        echo "Downloading onnx file... save path: ${FILE2}"
+        curl https://storage.googleapis.com/ailia-models/${MODEL}/${FILE2} -o ${FILE2}
+    fi
+    if [ ! -e ${FILE3} ]; then
+        echo "Downloading onnx file... save path: ${FILE3}"
+        curl https://storage.googleapis.com/ailia-models/${MODEL}/${FILE3} -o ${FILE3}
+    fi
+    if [ ! -e ${FILE4} ]; then
+        echo "Downloading onnx file... save path: ${FILE4}"
+        curl https://storage.googleapis.com/ailia-models/${MODEL}/${FILE4} -o ${FILE4}
+    fi
+    echo "ONNX files are prepared!"
+fi
+#execute
+./${MODEL} $*