Skip to content

Commit

Permalink
FixChineseBertAlig
Browse files Browse the repository at this point in the history
  • Loading branch information
NaruseMioShirakana committed Dec 2, 2023
1 parent d3a7613 commit aaffced
Show file tree
Hide file tree
Showing 9 changed files with 509 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,9 @@ namespace MoeVSProjectSpace

struct MoeVSTTSSeq
{
std::wstring SeqStr;
std::wstring SeqStr, TempStr;
// Language identifier string; Serialization() writes it under the "LanguageID" key.
std::string Langstr;
// Language identifiers as strings; when non-empty, Serialization() emits these under
// the "Language" key instead of the `Language` member.
std::vector<std::string> LangstrSeq;
std::vector<std::wstring> Seq; // Phoneme sequence
std::vector<int64_t> Tones; // Tone sequence
std::vector<int64_t> Durations; // Duration sequence
Expand All @@ -179,6 +181,10 @@ namespace MoeVSProjectSpace
float RestTime = 0.5f; // Pause duration; a negative value cuts the audio off and starts a new audio
int64_t TotLang = 0; // NOTE(review): presumably the total number of languages - confirm against the users of this field
std::wstring AdditionalInfo; // Additional G2P information (serialized under "G2PAdditionalInfo")

// Renders this sequence as a JSON-style object text; returns an empty string when SeqStr is empty.
[[nodiscard]] std::wstring Serialization() const;

// Equality is defined as equality of the two objects' Serialization() output.
bool operator==(const MoeVSTTSSeq& right) const;
};

using MoeVSSvcParams = MoeVSParams;
Expand Down
13 changes: 11 additions & 2 deletions MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/TTS.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
/**
* FileName: TTS.hpp
* Note: MoeVoiceStudioCore TTS»ùÀà
* Note: MoeVoiceStudioCore TTS基类
*
* Copyright (C) 2022-2023 NaruseMioShirakana (shirakanamio@foxmail.com)
*
Expand Down Expand Up @@ -90,13 +90,20 @@ class TextToSpeech : public MoeVoiceStudioModule

// Builds the list of TTS sequences from a JSON input and the initial parameters.
// NOTE(review): definition is not visible here; details of the JSON schema must be checked there.
[[nodiscard]] std::vector<MoeVSProjectSpace::MoeVSTTSSeq> GetInputSeqs(const MJson& _Input, const MoeVSProjectSpace::MoeVSParams& _InitParams) const;

// Takes the sequence list by mutable reference and returns a reference of the same type.
// NOTE(review): presumably adjusts the sequences in place and returns `_Seq` itself - confirm.
std::vector<MoeVSProjectSpace::MoeVSTTSSeq>& SpecializeInputSeqs(std::vector<MoeVSProjectSpace::MoeVSTTSSeq>& _Seq);

// Static counterpart of GetInputSeqs with the same parameters and return type.
[[nodiscard]] static std::vector<MoeVSProjectSpace::MoeVSTTSSeq> GetInputSeqsStatic(const MJson& _Input, const MoeVSProjectSpace::MoeVSParams& _InitParams);

// Produces a 2-D boolean matrix from `durationSize` duration values and a mask size.
// NOTE(review): presumably the duration-to-frame alignment path - confirm in the definition.
static std::vector<std::vector<bool>> generatePath(float* duration, size_t durationSize, size_t maskSize);

// Returns a float vector derived from the given list of strings.
// NOTE(review): presumably resolves emotion prompts to an embedding - confirm.
[[nodiscard]] std::vector<float> GetEmotionVector(const std::vector<std::wstring>& src) const;

// Inference from a text input; returns one int16_t buffer per result.
// NOTE(review): buffers are presumably PCM audio - confirm.
[[nodiscard]] std::vector<std::vector<int16_t>> Inference(const std::wstring& _Seq,
const MoeVSProjectSpace::MoeVSParams& _InferParams = MoeVSProjectSpace::MoeVSParams()) const;

// Overload selected by the extra `bool T` argument (no defaults); returns strings instead of sample buffers.
// NOTE(review): the meaning of `T` and of the returned strings is not visible here - confirm.
[[nodiscard]] std::vector<std::wstring> Inference(const std::wstring& _Seq,
const MoeVSProjectSpace::MoeVSParams& _InferParams, bool T) const;

// Inference from an already parsed JSON input; same return type as the text overload.
[[nodiscard]] std::vector<std::vector<int16_t>> Inference(const MJson& _Inputs,
const MoeVSProjectSpace::MoeVSParams& _InferParams = MoeVSProjectSpace::MoeVSParams()) const;

Expand All @@ -122,6 +129,8 @@ class TextToSpeech : public MoeVoiceStudioModule
return 0;
}

// Computes a size_t vector from a language string, a phoneme sequence and a BERT size.
// NOTE(review): presumably aligns phonemes to BERT token positions - confirm in the definition.
[[nodiscard]] std::vector<size_t> AligPhoneAttn(const std::string& LanguageStr, const std::vector<std::wstring>& PhoneSeq, size_t BertSize) const;

static int64_t find_max_idx(const std::vector<float>& inp)
{
int64_t idx = 0;
Expand Down
7 changes: 5 additions & 2 deletions MoeVoiceStudioSvc - Core - Cmd/Modules/Models/header/Vits.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/**
/**
* FileName: Vits.hpp
* Note: MoeVoiceStudioCore VitsÄ£ÐÍÀà
* Note: MoeVoiceStudioCore Vits模型类
*
* Copyright (C) 2022-2023 NaruseMioShirakana (shirakanamio@foxmail.com)
*
Expand All @@ -24,6 +24,8 @@

MoeVoiceStudioCoreHeader

// Free function taking a single flag.
// NOTE(review): presumably switches BERT usage on or off globally - confirm in the definition.
void SetBertEnabled(bool cond);

class Vits : public TextToSpeech
{
public:
Expand Down Expand Up @@ -101,6 +103,7 @@ class Vits : public TextToSpeech
const std::vector<const char*> EmbiddingOutputNames = { "g" };

// Tensor names for a BERT model with three inputs.
const std::vector<const char*> BertInputNames = { "input_ids", "attention_mask", "token_type_ids" };
// Same list without "attention_mask".
// NOTE(review): presumably for BERT models that do not take that input - confirm at the use site.
const std::vector<const char*> BertInputNames2 = { "input_ids", "token_type_ids" };
// Name of the single BERT output tensor.
const std::vector<const char*> BertOutputNames = { "last_hidden_state" };
};

Expand Down
32 changes: 32 additions & 0 deletions MoeVoiceStudioSvc - Core - Cmd/Modules/Models/src/MoeVSProject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,4 +260,36 @@ namespace MoeVSProjectSpace
}
fclose(project_file);
}

/**
 * Render this sequence as a JSON object fragment (tab-indented, no trailing newline).
 *
 * Returns an empty string when SeqStr is empty. Free-text members (SeqStr,
 * PlaceHolderSymbol, Langstr, AdditionalInfo) are escaped so that quotes,
 * backslashes and control characters cannot break the surrounding JSON string
 * literal. Vector members are rendered by the project helpers
 * (wstring_vector_to_string / vector_to_string / string_vector_to_string).
 * NOTE(review): whether those helpers escape their elements is not visible here.
 */
std::wstring MoeVSTTSSeq::Serialization() const
{
	// Nothing to serialize: bail out before building any output.
	if (SeqStr.empty())
		return L"";

	// Escapes a value for use inside a double-quoted JSON string.
	const auto EscapeJson = [](const std::wstring& Raw) -> std::wstring
	{
		const wchar_t* const HexDigits = L"0123456789abcdef";
		std::wstring Escaped;
		Escaped.reserve(Raw.size());
		for (const wchar_t Ch : Raw)
		{
			switch (Ch)
			{
			case L'"':  Escaped += L"\\\""; break;
			case L'\\': Escaped += L"\\\\"; break;
			case L'\n': Escaped += L"\\n"; break;
			case L'\r': Escaped += L"\\r"; break;
			case L'\t': Escaped += L"\\t"; break;
			case L'\b': Escaped += L"\\b"; break;
			case L'\f': Escaped += L"\\f"; break;
			default:
				// Remaining control characters must be written as \u00XX.
				// The cast keeps the comparison correct whether wchar_t is signed or not.
				if (static_cast<unsigned long>(Ch) < 0x20UL)
				{
					Escaped += L"\\u00";
					Escaped += HexDigits[(static_cast<unsigned long>(Ch) >> 4) & 0xFUL];
					Escaped += HexDigits[static_cast<unsigned long>(Ch) & 0xFUL];
				}
				else
					Escaped += Ch;
				break;
			}
		}
		return Escaped;
	};

	std::wstring rtn = L"\t{\n";
	rtn += L"\t\t\"Tokens\": \"" + EscapeJson(SeqStr) + L"\",\n";
	rtn += L"\t\t\"Seq\": " + wstring_vector_to_string(Seq) + L",\n";
	rtn += L"\t\t\"Tones\": " + vector_to_string(Tones) + L",\n";
	rtn += L"\t\t\"Durations\": " + vector_to_string(Durations) + L",\n";
	// Prefer the textual language tags when present, otherwise fall back to `Language`.
	rtn += L"\t\t\"Language\": " + (LangstrSeq.empty() ? vector_to_string(Language) : string_vector_to_string(LangstrSeq)) + L",\n";
	rtn += L"\t\t\"SpeakerMix\": " + vector_to_string(SpeakerMix) + L",\n";
	rtn += L"\t\t\"EmotionPrompt\": " + wstring_vector_to_string(EmotionPrompt) + L",\n";
	rtn += L"\t\t\"NoiseScale\": " + std::to_wstring(NoiseScale) + L",\n";
	rtn += L"\t\t\"LengthScale\": " + std::to_wstring(LengthScale) + L",\n";
	rtn += L"\t\t\"DurationPredictorNoiseScale\": " + std::to_wstring(DurationPredictorNoiseScale) + L",\n";
	rtn += L"\t\t\"FactorDpSdp\": " + std::to_wstring(FactorDpSdp) + L",\n";
	rtn += L"\t\t\"GateThreshold\": " + std::to_wstring(GateThreshold) + L",\n";
	rtn += L"\t\t\"MaxDecodeStep\": " + std::to_wstring(MaxDecodeStep) + L",\n";
	rtn += L"\t\t\"Seed\": " + std::to_wstring(Seed) + L",\n";
	rtn += L"\t\t\"SpeakerId\": " + std::to_wstring(SpeakerId) + L",\n";
	rtn += L"\t\t\"RestTime\": " + std::to_wstring(RestTime) + L",\n";
	rtn += L"\t\t\"PlaceHolderSymbol\": \"" + EscapeJson(PlaceHolderSymbol) + L"\",\n";
	rtn += L"\t\t\"LanguageID\": \"" + EscapeJson(to_wide_string(Langstr)) + L"\",\n";
	rtn += L"\t\t\"G2PAdditionalInfo\": \"" + EscapeJson(AdditionalInfo) + L"\"\n\t}";
	return rtn;
}

/**
 * Two sequences compare equal when their serialized text is identical.
 * Because Serialization() yields an empty string whenever SeqStr is empty,
 * any two sequences with an empty SeqStr compare equal.
 */
bool MoeVSTTSSeq::operator==(const MoeVSTTSSeq& right) const
{
	const std::wstring LhsText = Serialization();
	const std::wstring RhsText = right.Serialization();
	return LhsText == RhsText;
}
}
Loading

0 comments on commit aaffced

Please sign in to comment.