tacotron2.cpp
#include "tacotron2.h"
TFTensor<float> Tacotron2::DoInferenceTFTTS(const std::vector<int32_t> &InputIDs, int32_t SpeakerID, int32_t EmotionID)
{
    if (!CurrentMdl)
        throw std::runtime_error("Tried to do inference on unloaded or invalid model!");

    // Convenience reference so that we don't have to constantly dereference pointers.
    cppflow::model& Mdl = *CurrentMdl;

    // Define the tensors.
    // This is the shape of the input IDs, our equivalent of tf.expand_dims.
    std::vector<int64_t> InputIDShape = { 1, (int64_t)InputIDs.size() };

    cppflow::tensor input_ids{ InputIDs, InputIDShape };
    cppflow::tensor speaker_ids{ SpeakerID };
    cppflow::tensor input_lengths{ (int32_t)InputIDs.size() };
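    // Illustratively: for a 5-token input, input_ids has shape {1, 5}, while
    // speaker_ids and input_lengths come out as one-element tensors via
    // cppflow's initializer-list constructor.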
    // Owned through a unique_ptr so the tensor is released even if inference throws.
    std::unique_ptr<cppflow::tensor> emotion_ids;

    // This is a multi-emotion model.
    if (EmotionID != -1)
        emotion_ids = std::make_unique<cppflow::tensor>(std::vector<int32_t>{EmotionID});
TensorVec Inputs = {{"serving_default_input_ids:0",input_ids},
{"serving_default_input_lengths:0",input_lengths},
{"serving_default_speaker_ids:0",speaker_ids}};
// Define output tensor
if (EmotionID != -1)
Inputs.push_back({"serving_default_emotion_ids:0",*emotion_ids});
// Do inference
// We only care about the after mel-after [1] and alignment history [3]
auto Outputs = Mdl(Inputs,{"StatefulPartitionedCall:0","StatefulPartitionedCall:1","StatefulPartitionedCall:2","StatefulPartitionedCall:3"});
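    // Assumed output order for TensorFlowTTS Tacotron-2 SavedModels:
    // [0] decoder output, [1] mel-after (post-net mel), [2] stop-token
    // prediction, [3] alignment history. Only [1] and [3] are used here.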
    // Define the outputs and return the mel.
    TFTensor<float> MelOut = VoxUtil::CopyTensor<float>(Outputs[1]);
    Attention = VoxUtil::CopyTensor<float>(Outputs[3]);

    // We could define MelOut straight in the return statement, but this reads better.
    return MelOut;
}
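
// Coqui TTS inference path. Coqui's exported Tacotron-2 graph only takes the
// character IDs; the mel-after output is returned and the alignment history
// is transposed to match the attention plotter before being stored.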
TFTensor<float> Tacotron2::DoInferenceCoqui(const std::vector<int32_t> &InputIDs)
{
    // Convenience reference so that we don't have to constantly dereference pointers.
    cppflow::model& Mdl = *CurrentMdl;

    // Define the tensors.
    // This is the shape of the input IDs, our equivalent of tf.expand_dims.
    std::vector<int64_t> InputIDShape = { 1, (int64_t)InputIDs.size() };

    cppflow::tensor input_ids{ InputIDs, InputIDShape };

    TensorVec Inputs = { {"serving_default_characters:0", input_ids} };
    // We only care about the mel-after output [1] and the alignment history [2].
    auto Outputs = Mdl(Inputs, {"StatefulPartitionedCall:0", "StatefulPartitionedCall:1", "StatefulPartitionedCall:2", "StatefulPartitionedCall:3"});

    // Define the outputs and return the mel.
    TFTensor<float> MelOut = VoxUtil::CopyTensor<float>(Outputs[1]);

    // Coqui's TT2 attention output has its last two axes swapped relative to what
    // our attention plotter expects, so we transpose it with permutation {0, 2, 1}.
    cppflow::tensor AttTransposed = cppflow::transpose(Outputs[2], cppflow::tensor{0, 2, 1});
    Attention = VoxUtil::CopyTensor<float>(AttTransposed);

    return MelOut;
}
Tacotron2::Tacotron2()
{
}
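
// Dispatches to the backend-specific inference routine based on the repo the
// current model was exported from. ArgsFloat and ArgsInt are accepted for
// interface compatibility and are unused by Tacotron-2.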
TFTensor<float> Tacotron2::DoInference(const std::vector<int32_t> &InputIDs, const std::vector<float> &ArgsFloat, const std::vector<int32_t> ArgsInt, int32_t SpeakerID, int32_t EmotionID)
{
    if (!CurrentMdl)
        throw std::runtime_error("Tried to do inference on unloaded or invalid model!");

    if (GetCurrentRepo() == ETTSRepo::TensorflowTTS)
        return DoInferenceTFTTS(InputIDs, SpeakerID, EmotionID);
    else if (GetCurrentRepo() == ETTSRepo::CoquiTTS)
        return DoInferenceCoqui(InputIDs);
    else
        throw std::runtime_error("Unknown/unset/unimplemented TTS repo!");
}
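
// Illustrative usage sketch (not from the original file): the model-loading
// call and the ID values below are hypothetical stand-ins for whatever the
// surrounding application actually does.
//
//   Tacotron2 Taco;
//   Taco.LoadModel("path/to/saved_model");          // hypothetical loader
//   std::vector<int32_t> IDs = { 12, 5, 33, 7 };    // phoneme/character IDs
//   TFTensor<float> Mel = Taco.DoInference(IDs, {}, {}, /*SpeakerID*/ 0,
//                                          /*EmotionID*/ -1);
//   // Mel now holds the spectrogram; Taco.Attention holds the alignment.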