Skip to content

Commit

Permalink
Update GptSovits
Browse files Browse the repository at this point in the history
  • Loading branch information
NaruseMioShirakana committed Jan 24, 2024
1 parent f1d95e6 commit 06d0330
Show file tree
Hide file tree
Showing 8 changed files with 989 additions and 79 deletions.
49 changes: 27 additions & 22 deletions MoeVoiceStudioSvc - Core - Cmd/Modules/InferTools/G2P/MoeVSG2P.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ std::wstring MVSDict::DictReplaceGetStr(const std::wstring& input, const std::ws

void Tokenizer::load(const std::wstring& _Path)
{
const MJson _VocabJson(to_byte_string(_Path).c_str());
const MJson _VocabJson(_Path);
if (!_VocabJson.HasMember("ContinuingSubwordPrefix") ||
!_VocabJson.HasMember("Type") ||
!_VocabJson.HasMember("Vocab") ||
Expand All @@ -251,34 +251,35 @@ void Tokenizer::load(const std::wstring& _Path)
if (Type == "Unigram") Model = TokenizerModel::Unigram;
Symbol = to_wide_string(_VocabJson["ContinuingSubwordPrefix"].GetString());

if(Model == TokenizerModel::WordPiece)
if (_VocabJson["Vocab"].IsArray())
{
if(_VocabJson["Vocab"].IsArray())
{
const auto _VocabArray = _VocabJson["Vocab"].GetArray();
int64_t Index = 0;
for (const auto& Object : _VocabArray)
Vocab[to_wide_string(Object.GetString())] = Index++;
}
else
const auto _VocabArray = _VocabJson["Vocab"].GetArray();
int64_t Index = 0;
for (const auto& Object : _VocabArray)
{
const auto _VocabDict = _VocabJson["Vocab"].GetMemberArray();
for (const auto& Pair : _VocabDict)
if (!(Object.IsString() || Object.IsArray()))
{
if (Pair.second.IsInt())
Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetInt());
else if (Pair.second.IsFloat())
Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetFloat());
auto Beg = Object.MemberBegin();
if (Beg.first.empty())
continue;
Vocab[to_wide_string(Beg.first)] = Beg.second.GetInt64();
}
else
Vocab[to_wide_string(Object.IsString() ? Object.GetString() : Object.GetArray()[0].GetString())] = Index++;
}
}
else
{
const auto _VocabArray = _VocabJson["Vocab"].GetArray();
int64_t Index = 0;
for (const auto& Object : _VocabArray)
Vocab[to_wide_string(Object.GetArray()[0].GetString())] = Index++;
const auto _VocabDict = _VocabJson["Vocab"].GetMemberArray();
for (const auto& Pair : _VocabDict)
{
if (Pair.second.IsInt())
Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetInt());
else if (Pair.second.IsFloat())
Vocab[to_wide_string(Pair.first)] = TokenizerType(Pair.second.GetFloat());
}
}

if (_VocabJson.HasMember("UseSplit") && _VocabJson["UseSplit"].IsBool())
UseSplit = _VocabJson["UseSplit"].GetBool();
}
Expand Down Expand Up @@ -430,7 +431,7 @@ std::vector<std::wstring> Tokenizer::WordPieceMethod(const std::wstring& Seq, si
return Tokens;
}

std::vector<Tokenizer::TokenizerType> Tokenizer::operator()(const std::vector<std::wstring>& Seq) const
std::vector<Tokenizer::TokenizerType> Tokenizer::operator()(const std::vector<std::wstring>& Seq, bool SkipBlank) const
{
std::vector<TokenizerType> Tokens;
const auto UNKID = Vocab.at(L"[UNK]");
Expand All @@ -439,8 +440,12 @@ std::vector<Tokenizer::TokenizerType> Tokenizer::operator()(const std::vector<st
const auto res = Vocab.find(iter);
if (res != Vocab.end())
Tokens.emplace_back(res->second);
else if(Tokens.empty() || Tokens.back() != UNKID)
else if(iter.empty() || Tokens.back() != UNKID)
{
if (SkipBlank && std::regex_match(iter, BlankRegex))
continue;
Tokens.emplace_back(UNKID);
}
}
return Tokens;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ class Tokenizer
}
[[nodiscard]] std::vector<std::wstring> WordPieceMethod(const std::wstring& Seq, size_t MaxWordLength = 25, TokenizerMethod Method = TokenizerMethod::Left) const;
[[nodiscard]] std::vector<std::wstring> UnigramMethod(const std::wstring& Seq, size_t MaxWordLength = 25, TokenizerMethod Method = TokenizerMethod::Left) const;
std::vector<TokenizerType> operator()(const std::vector<std::wstring>& Seq) const;
std::vector<TokenizerType> operator()(const std::vector<std::wstring>& Seq, bool SkipBlank = false) const;
[[nodiscard]] std::vector<std::wstring> Tokenize(const std::wstring& Seq, size_t MaxWordLength = 25, TokenizerMethod Method = TokenizerMethod::Left) const;
[[nodiscard]] std::vector<std::wstring> SplitWithPlugin(const std::vector<std::wstring>& _Inputs) const;
static std::vector<std::wstring> SplitString(const std::wstring& _InputRef, const std::wregex& _SignRegex);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/**
* FileName: GPT-SoVits.hpp
* Note: MoeVoiceStudioCore GPT-SoVits模型类
*
* Copyright (C) 2022-2023 NaruseMioShirakana (shirakanamio@foxmail.com)
*
* This file is part of MoeVoiceStudioCore library.
* MoeVoiceStudioCore library is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or any later version.
*
* MoeVoiceStudioCore library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with MoeVoiceStudioCore library.
* If not, see <https://www.gnu.org/licenses/agpl-3.0.html>.
*
* date: 2023-11-9 Create
*/

#pragma once
#include "TTS.hpp"

MoeVoiceStudioCoreHeader
/**
 * \brief GPT-SoVits model class, derived from TextToSpeech.
 *
 * Owns six raw Ort::Session pointers (Bert, Vits, SSL, Encoder, FDecoder, Decoder)
 * which destory() deletes, plus the name lists declared for each of those sessions.
 * Constructors, load(), the destructor, GetBertPhs() and Inference() are only
 * declared here; their behaviour lives in the implementation file.
 */
class GptSoVits : public TextToSpeech
{
public:
    /**
     * \brief Construct from a configuration object.
     * \param _Config Model configuration
     * \param _ProgressCallback Progress callback
     * \param _DurationCallback Duration callback
     * \param ExecutionProvider_ Execution provider, CPU unless specified
     * \param DeviceID_ Device id, 0 unless specified
     * \param ThreadCount_ Thread count, 0 unless specified
     */
    GptSoVits(const MJson& _Config, const ProgressCallback& _ProgressCallback,
        const DurationCallback& _DurationCallback,
        ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU,
        unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0);

    /**
     * \brief Construct from an explicit path dictionary plus a configuration object.
     * \param _PathDict Map from a string key to a wide-string path
     *                  (NOTE(review): the expected keys are not visible here - confirm in the implementation)
     * \param _Config Model configuration
     * \param _ProgressCallback Progress callback
     * \param _DurationCallback Duration callback
     * \param ExecutionProvider_ Execution provider, CPU unless specified
     * \param DeviceID_ Device id, 0 unless specified
     * \param ThreadCount_ Thread count, 0 unless specified
     */
    GptSoVits(const std::map<std::string, std::wstring>& _PathDict, const MJson& _Config, const ProgressCallback& _ProgressCallback,
        const DurationCallback& _DurationCallback,
        ExecutionProviders ExecutionProvider_ = ExecutionProviders::CPU,
        unsigned DeviceID_ = 0, unsigned ThreadCount_ = 0);

    /**
     * \brief Load the model; takes the same path dictionary, configuration and callbacks
     *        as the second constructor. Defined out of line.
     */
    void load(const std::map<std::string, std::wstring>& _PathDict,
        const MJson& _Config, const ProgressCallback& _ProgressCallback,
        const DurationCallback& _DurationCallback);

    ~GptSoVits() override;

    /**
     * \brief Delete every session owned by this object and reset its pointer to nullptr.
     *
     * Sessions are released in the order Bert, Vits, SSL, Encoder, FDecoder, Decoder.
     * Pointers that are already null are left untouched (deleting nullptr is a no-op).
     */
    void destory()
    {
        // Release a single session and clear the owning pointer.
        const auto ReleaseSession = [](Ort::Session*& _Session)
        {
            delete _Session;
            _Session = nullptr;
        };

        ReleaseSession(sessionBert);
        ReleaseSession(sessionVits);
        ReleaseSession(sessionSSL);

        ReleaseSession(sessionEncoder);
        ReleaseSession(sessionFDecoder);
        ReleaseSession(sessionDecoder);
    }

    /**
     * \brief Compute a (float vector, int64 vector) pair for one TTS sequence using the given tokenizer.
     *        NOTE(review): judging by the name these are BERT features and phoneme ids - confirm in the implementation.
     * \param Seq Input sequence
     * \param Tokenizer Tokenizer applied to the sequence
     * \return Tuple of a float vector and an int64 vector
     */
    [[nodiscard]] std::tuple<std::vector<float>, std::vector<int64_t>> GetBertPhs(const MoeVSProjectSpace::MoeVSTTSSeq& Seq, const MoeVSG2P::Tokenizer& Tokenizer) const;

    /**
     * \brief Run inference over a batch of TTS sequences.
     * \param _Input Input sequences
     * \return One vector of 16-bit samples per result
     */
    [[nodiscard]] std::vector<std::vector<int16_t>> Inference(const std::vector<MoeVSProjectSpace::MoeVSTTSSeq>& _Input) const override;
private:
    // Owned sessions; all start out null and are deleted by destory().
    Ort::Session* sessionBert{ nullptr };
    Ort::Session* sessionVits{ nullptr };
    Ort::Session* sessionSSL{ nullptr };

    Ort::Session* sessionEncoder{ nullptr };
    Ort::Session* sessionFDecoder{ nullptr };
    Ort::Session* sessionDecoder{ nullptr };

    // Default model dimensions.
    // NOTE(review): presumably overridden from the configuration on load - confirm.
    int64_t NumLayers{ 24 };
    int64_t EmbeddingDim{ 512 };
    int64_t EOSId{ 1024 };

    // Name lists for each session.
    // NOTE(review): presumably the input/output node names handed to the matching
    // Ort::Session when it is run - confirm in the implementation.
    std::vector<const char*> VitsInputNames = { "text_seq", "pred_semantic", "ref_audio" };
    const std::vector<const char*> VitsOutputNames = { "audio" };

    std::vector<const char*> EncoderInputNames = { "ref_seq", "text_seq", "ref_bert", "text_bert", "ssl_content" };
    const std::vector<const char*> EncoderOutputNames = { "x", "prompts" };
    std::vector<const char*> DecoderInputNames = { "iy", "ik", "iv", "iy_emb", "ix_example" };
    const std::vector<const char*> DecoderOutputNames = { "y", "k", "v", "y_emb", "logits", "samples" };
    std::vector<const char*> FDecoderInputNames = { "x", "prompts" };
    const std::vector<const char*> FDecoderOutputNames = { "y", "k", "v", "y_emb", "x_example" };

    std::vector<const char*> SSLInputNames = { "audio" };
    const std::vector<const char*> SSLOutputNames = { "last_hidden_state" };

    // Three Bert input-name variants with three, two and one entries respectively.
    const std::vector<const char*> BertInputNames = { "input_ids", "attention_mask", "token_type_ids" };
    const std::vector<const char*> BertInputNames2 = { "input_ids", "attention_mask" };
    const std::vector<const char*> BertInputNames3 = { "input_ids" };
    const std::vector<const char*> BertOutputNames = { "last_hidden_state" };
};

MoeVoiceStudioCoreEnd
10 changes: 7 additions & 3 deletions MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/TTS.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,17 +165,21 @@ class TextToSpeech : public MoeVoiceStudioModule
i /= Sum;
}

MoeVSG2P::Tokenizer& GetTokenizer(const std::string& LanguageId)
[[nodiscard]] const MoeVSG2P::Tokenizer& GetTokenizer(const std::string& LanguageId) const
{
return Tokenizers[LanguageMap.at(LanguageId)];
return Tokenizers.at(LanguageTokenizerMap.at(LanguageId));
}

[[nodiscard]] static std::tuple<std::vector<std::wstring>, std::vector<int64_t>> SplitTonesFromTokens(const std::vector<std::wstring>& _Src, const std::vector<int64_t>& _ToneRef, int64_t FirstToneIdx, const std::string& LanguageSymbol);

protected:
DurationCallback CustomDurationCallback;
int64_t SpeakerCount = 1;
std::unordered_map<std::wstring, int64_t> SpeakerMap;
std::map<std::string, std::wstring> LanguageTokenizerMap;
std::map<std::string, int64_t> LanguageMap = { {"ZH", 0}, {"JP", 1}, {"EN", 2} };
std::map<std::string, int64_t> LanguageTones = { {"ZH", 0}, {"JP", 0}, {"EN", 0} };
std::vector<MoeVSG2P::Tokenizer> Tokenizers;
std::map<std::wstring, MoeVSG2P::Tokenizer> Tokenizers;
MoeVSG2P::MVSCleaner* Cleaner = nullptr;
bool AddBlank = true;
bool Emotion = false;
Expand Down
4 changes: 4 additions & 0 deletions MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Vits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ MoeVoiceStudioCoreHeader

void SetBertEnabled(bool cond);

void DestoryAllBerts();

class Vits : public TextToSpeech
{
public:
Expand Down Expand Up @@ -77,8 +79,10 @@ class Vits : public TextToSpeech
bool EncoderG = false;
std::vector<std::string> BertNames;
std::vector<std::wstring> BertNamesIdx;
std::wstring ClapName;
int64_t VQCodeBookSize = 10;
bool UseVQ = false;
bool UseClap = false;

std::vector<const char*> EncoderInputNames = { "x" };
const std::vector<const char*> EncoderOutputNames = { "xout", "m_p", "logs_p", "x_mask" };
Expand Down
Loading

0 comments on commit 06d0330

Please sign in to comment.