Update GptSovits

NaruseMioShirakana · Jan 24, 2024 · 06d0330 · 06d0330
1 parent f1d95e6
commit 06d0330
Show file tree

Hide file tree

Showing 8 changed files with 989 additions and 79 deletions.
diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.cpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.cpp
@@ -238,7 +238,7 @@ std::wstring MVSDict::DictReplaceGetStr(const std::wstring& input, const std::ws
 
 void Tokenizer::load(const std::wstring& _Path)
 {
-	const MJson _VocabJson(to_byte_string(_Path).c_str());
+	const MJson _VocabJson(_Path);
 	if (!_VocabJson.HasMember("ContinuingSubwordPrefix") ||
 		!_VocabJson.HasMember("Type") ||
 		!_VocabJson.HasMember("Vocab") ||
@@ -251,34 +251,35 @@ void Tokenizer::load(const std::wstring& _Path)
 	if (Type == "Unigram") Model = TokenizerModel::Unigram;
 	Symbol = to_wide_string(_VocabJson["ContinuingSubwordPrefix"].GetString());
 
-	if(Model == TokenizerModel::WordPiece)
+	if (_VocabJson["Vocab"].IsArray())
 	{
-		if(_VocabJson["Vocab"].IsArray())
-		{
-			const auto _VocabArray = _VocabJson["Vocab"].GetArray();
-			int64_t Index = 0;
-			for (const auto& Object : _VocabArray)
-				Vocab[to_wide_string(Object.GetString())] = Index++;
-		}
-		else
+		const auto _VocabArray = _VocabJson["Vocab"].GetArray();
+		int64_t Index = 0;
+		for (const auto& Object : _VocabArray)
 		{
-			const auto _VocabDict = _VocabJson["Vocab"].GetMemberArray();
-			for (const auto& Pair : _VocabDict)
+			if (!(Object.IsString() || Object.IsArray()))
 			{
-				if (Pair.second.IsInt())
-					Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetInt());
-				else if (Pair.second.IsFloat())
-					Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetFloat());
+				auto Beg = Object.MemberBegin();
+				if (Beg.first.empty())
+					continue;
+				Vocab[to_wide_string(Beg.first)] = Beg.second.GetInt64();
 			}
+			else
+				Vocab[to_wide_string(Object.IsString() ? Object.GetString() : Object.GetArray()[0].GetString())] = Index++;
 		}
 	}
 	else
 	{
-		const auto _VocabArray = _VocabJson["Vocab"].GetArray();
-		int64_t Index = 0;
-		for (const auto& Object : _VocabArray)
-			Vocab[to_wide_string(Object.GetArray()[0].GetString())] = Index++;
+		const auto _VocabDict = _VocabJson["Vocab"].GetMemberArray();
+		for (const auto& Pair : _VocabDict)
+		{
+			if (Pair.second.IsInt())
+				Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetInt());
+			else if (Pair.second.IsFloat())
+				Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetFloat());
+		}
 	}
+
 	if (_VocabJson.HasMember("UseSplit") && _VocabJson["UseSplit"].IsBool())
 		UseSplit = _VocabJson["UseSplit"].GetBool();
 }
@@ -430,7 +431,7 @@ std::vector<std::wstring> Tokenizer::WordPieceMethod(const std::wstring& Seq, si
 	return Tokens;
 }
 
-std::vector<Tokenizer::TokenizerType> Tokenizer::operator()(const std::vector<std::wstring>& Seq) const
+std::vector<Tokenizer::TokenizerType> Tokenizer::operator()(const std::vector<std::wstring>& Seq, bool SkipBlank) const
 {
 	std::vector<TokenizerType> Tokens;
 	const auto UNKID = Vocab.at(L"[UNK]");
@@ -439,8 +440,17 @@ std::vector<Tokenizer::TokenizerType> Tokenizer::operator()(const std::vector<st
 		const auto res = Vocab.find(iter);
 		if (res != Vocab.end())
 			Tokens.emplace_back(res->second);
-		else if(Tokens.empty() || Tokens.back() != UNKID)
+		// Token not in Vocab: emit [UNK] when the token is empty, when nothing has
+		// been emitted yet, or when the previous id is not already [UNK].
+		// Tokens.empty() is tested before Tokens.back() because back() on an empty
+		// vector is undefined behaviour.
+		else if (iter.empty() || Tokens.empty() || Tokens.back() != UNKID)
+		{
+			// With SkipBlank set, tokens matching BlankRegex are dropped instead of mapped to [UNK].
+			if (SkipBlank && std::regex_match(iter, BlankRegex))
+				continue;
 			Tokens.emplace_back(UNKID);
+		}
 	}
 	return Tokens;
 }

diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.hpp
@@ -228,7 +228,7 @@ class Tokenizer
 	}
 	[[nodiscard]] std::vector<std::wstring> WordPieceMethod(const std::wstring& Seq, size_t MaxWordLength = 25, TokenizerMethod Method = TokenizerMethod::Left) const;
 	[[nodiscard]] std::vector<std::wstring> UnigramMethod(const std::wstring& Seq, size_t MaxWordLength = 25, TokenizerMethod Method = TokenizerMethod::Left) const;
-	std::vector<TokenizerType> operator()(const std::vector<std::wstring>& Seq) const;
+	std::vector<TokenizerType> operator()(const std::vector<std::wstring>& Seq, bool SkipBlank = false) const;
 	[[nodiscard]] std::vector<std::wstring> Tokenize(const std::wstring& Seq, size_t MaxWordLength = 25, TokenizerMethod Method = TokenizerMethod::Left) const;
 	[[nodiscard]] std::vector<std::wstring> SplitWithPlugin(const std::vector<std::wstring>& _Inputs) const;
 	static std::vector<std::wstring> SplitString(const std::wstring& _InputRef, const std::wregex& _SignRegex);

diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/GPT-SoVits.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/GPT-SoVits.hpp
@@ -0,0 +1,97 @@
+/**
+ * FileName: GPT-SoVits.hpp
+ * Note: MoeVoiceStudioCore GPT-SoVits模型类
+ *
+ * Copyright (C) 2022-2023 NaruseMioShirakana (shirakanamio@foxmail.com)
+ *
+ * This file is part of MoeVoiceStudioCore library.
+ * MoeVoiceStudioCore library is free software: you can redistribute it and/or modify it under the terms of the
+ * GNU Affero General Public License as published by the Free Software Foundation, either version 3
+ * of the License, or any later version.
+ *
+ * MoeVoiceStudioCore library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License along with MoeVoiceStudioCore library.
+ * If not, see <https://www.gnu.org/licenses/agpl-3.0.html>.
+ *
+ * date: 2023-11-9 Create
+*/
+
+#pragma once
+#include "TTS.hpp"
+
+MoeVoiceStudioCoreHeader
+class GptSoVits : public TextToSpeech // GPT-SoVits model class; holds the six Ort::Session pointers declared below.
+{
+public: // Constructors, load() and the destructor are only declared here; their definitions are not in this header.
+    GptSoVits(const MJson& _Config, const ProgressCallback& _ProgressCallback,
+        const DurationCallback& _DurationCallback,
+        ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU,
+        unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0);
+    // Overload that also takes _PathDict (string key -> wide-string value; presumably model file paths - confirm in the .cpp).
+    GptSoVits(const std::map<std::string, std::wstring>& _PathDict, const MJson& _Config, const ProgressCallback& _ProgressCallback,
+        const DurationCallback& _DurationCallback,
+        ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU,
+        unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0);
+    // Takes the same _PathDict / _Config / callback arguments as the second constructor (execution-provider arguments excluded).
+    void load(const std::map<std::string, std::wstring>& _PathDict,
+        const MJson& _Config, const ProgressCallback& _ProgressCallback,
+        const DurationCallback& _DurationCallback);
+
+    ~GptSoVits() override;
+    // Deletes all six sessions and resets the pointers to nullptr, so a repeated call only deletes null pointers. Name spelling ("destory") is left unchanged.
+    void destory()
+    {
+        delete sessionBert;
+        delete sessionVits;
+        delete sessionSSL;
+        sessionBert = nullptr;
+        sessionVits = nullptr;
+        sessionSSL = nullptr;
+
+        delete sessionEncoder;
+        delete sessionFDecoder;
+        delete sessionDecoder;
+        sessionEncoder = nullptr;
+        sessionFDecoder = nullptr;
+        sessionDecoder = nullptr;
+    }
+    // Returns a (float vector, int64 vector) tuple for one sequence; presumably BERT features and phoneme ids - confirm in the .cpp.
+    [[nodiscard]] std::tuple<std::vector<float>, std::vector<int64_t>> GetBertPhs(const MoeVSProjectSpace::MoeVSTTSSeq& Seq, const MoeVSG2P::Tokenizer& Tokenizer) const;
+    // Overrides the base-class Inference; returns a vector of int16_t vectors (presumably PCM audio per input - confirm).
+    [[nodiscard]] std::vector<std::vector<int16_t>> Inference(const std::vector<MoeVSProjectSpace::MoeVSTTSSeq>& _Input) const override;
+private: // Raw session pointers; destory() deletes them, so this class is responsible for releasing them.
+    Ort::Session* sessionBert = nullptr;
+    Ort::Session* sessionVits = nullptr;
+    Ort::Session* sessionSSL = nullptr;
+
+    Ort::Session* sessionEncoder = nullptr;
+    Ort::Session* sessionFDecoder = nullptr;
+    Ort::Session* sessionDecoder = nullptr;
+    // Default values; NOTE(review): their exact meaning and whether load() overrides them is not visible in this header.
+    int64_t NumLayers = 24;
+    int64_t EmbeddingDim = 512;
+    int64_t EOSId = 1024;
+    // Name tables below: presumably the input/output tensor names used when running the matching Ort::Session - confirm in the .cpp.
+    std::vector<const char*> VitsInputNames = { "text_seq", "pred_semantic", "ref_audio" };
+    const std::vector<const char*> VitsOutputNames = { "audio" };
+
+    std::vector<const char*> EncoderInputNames = { "ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content" };
+    const std::vector<const char*> EncoderOutputNames = { "x", "prompts" };
+    std::vector<const char*> DecoderInputNames = { "iy", "ik", "iv", "iy_emb", "ix_example" };
+    const std::vector<const char*> DecoderOutputNames = { "y", "k", "v", "y_emb", "logits", "samples" };
+    std::vector<const char*> FDecoderInputNames = { "x", "prompts" };
+    const std::vector<const char*> FDecoderOutputNames = { "y", "k", "v", "y_emb", "x_example" };
+
+    std::vector<const char*> SSLInputNames = { "audio" };
+    const std::vector<const char*> SSLOutputNames = { "last_hidden_state" };
+    // Three Bert input-name variants with 3, 2 and 1 entries; the selection logic is not visible in this header.
+    const std::vector<const char*> BertInputNames = { "input_ids", "attention_mask", "token_type_ids" };
+    const std::vector<const char*> BertInputNames2 = { "input_ids", "attention_mask" };
+    const std::vector<const char*> BertInputNames3 = { "input_ids" };
+    const std::vector<const char*> BertOutputNames = { "last_hidden_state" };
+};
+
+MoeVoiceStudioCoreEnd
diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/TTS.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/TTS.hpp
@@ -165,17 +165,21 @@ class TextToSpeech : public MoeVoiceStudioModule
 			i /= Sum;
 	}
 
-	MoeVSG2P::Tokenizer& GetTokenizer(const std::string& LanguageId)
+	[[nodiscard]] const MoeVSG2P::Tokenizer& GetTokenizer(const std::string& LanguageId) const
 	{
-		return Tokenizers[LanguageMap.at(LanguageId)];
+		return Tokenizers.at(LanguageTokenizerMap.at(LanguageId));
 	}
+
+	[[nodiscard]] static std::tuple<std::vector<std::wstring>, std::vector<int64_t>> SplitTonesFromTokens(const std::vector<std::wstring>& _Src, const std::vector<int64_t>& _ToneRef, int64_t FirstToneIdx, const std::string& LanguageSymbol);
+
 protected:
 	DurationCallback CustomDurationCallback;
 	int64_t SpeakerCount = 1;
 	std::unordered_map<std::wstring, int64_t> SpeakerMap;
+	std::map<std::string, std::wstring> LanguageTokenizerMap;
 	std::map<std::string, int64_t> LanguageMap = { {"ZH", 0}, {"JP", 1}, {"EN", 2} };
 	std::map<std::string, int64_t> LanguageTones = { {"ZH", 0}, {"JP", 0}, {"EN", 0} };
-	std::vector<MoeVSG2P::Tokenizer> Tokenizers;
+	std::map<std::wstring, MoeVSG2P::Tokenizer> Tokenizers;
 	MoeVSG2P::MVSCleaner* Cleaner = nullptr;
 	bool AddBlank = true;
 	bool Emotion = false;

diff --git a/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Vits.hpp b/MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Vits.hpp
@@ -26,6 +26,8 @@ MoeVoiceStudioCoreHeader
 
 void SetBertEnabled(bool cond);
 
+void DestoryAllBerts();
+
 class Vits : public TextToSpeech
 {
 public:
@@ -77,8 +79,10 @@ class Vits : public TextToSpeech
     bool EncoderG = false;
     std::vector<std::string> BertNames;
     std::vector<std::wstring> BertNamesIdx;
+    std::wstring ClapName;
     int64_t VQCodeBookSize = 10;
     bool UseVQ = false;
+    bool UseClap = false;
 
     std::vector<const char*> EncoderInputNames = { "x" };
     const std::vector<const char*> EncoderOutputNames = { "xout", "m_p", "logs_p", "x_mask" };