-
Notifications
You must be signed in to change notification settings - Fork 110
/
Copy pathconfig.yaml
160 lines (144 loc) · 3.44 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# Training-set file lists (Kaldi-style .scp index files) and the embedding
# inputs the model consumes. Each emb_typeN pairs a feature stream with an
# optional vocabulary file.
dataset:
  train:
    wav_scp: './train/wav.scp'
    mel_scp: './train/mel.scp'
    dur_scp: './train/dur.scp'
    emb_type1:
      _name: 'pinyin'
      scp: './train/py.scp'
      vocab: 'py.vocab'
    emb_type2:
      _name: 'graphic'
      scp: './train/gp.scp'
      vocab: 'gp.vocab'
    emb_type3:
      _name: 'speaker'
      scp: './train/spk.scp'
      vocab: null  # doesn't need a vocab
# Training-loop hyperparameters and bookkeeping paths.
training:
  batch_size: 16
  epochs: 10000
  grad_clip_thresh: 1.0    # gradient-norm clipping threshold
  acc_step: 1              # gradient-accumulation steps per optimizer update
  checkpoint_path: "./checkpoints/"
  log_path: "./log/"
  checkpoint_step: 5000    # save a checkpoint every N steps
  synth_step: 5000         # synthesize a sample every N steps
  log_step: 20             # log metrics every N steps
  num_workers: 8           # dataloader worker processes
  evaluation_step: 1000    # run evaluation every N steps
# Optimizer selection plus the keyword arguments passed through to it.
# NOTE: the !!float tags are deliberate — under YAML 1.1 loaders (PyYAML)
# a plain `1e-9` would resolve to a string, not a float.
optimizer:
  type: Adam
  n_warm_up_step: 2000
  # lr_decrease_step: 10000
  # lr_decrease_factor:
  params:
    betas: [0.9, 0.98]
    eps: !!float 1e-9
    weight_decay: !!float 0.0
    lr: !!float 1e-4
# Learning-rate schedule (torch.optim.lr_scheduler-style type + kwargs).
lr_scheduler:
  type: CyclicLR
  params:
    base_lr: !!float 1e-7
    max_lr: !!float 1e-4
    step_size_up: 5000
    step_size_down: 8000
    # Canonical lowercase boolean; momentum cycling must stay off for
    # optimizers without a 'momentum' param (e.g. Adam).
    cycle_momentum: false
# Vocoder selection: `type` names one of the sub-sections below; only the
# matching section is used at synthesis time.
vocoder:
  type: VocGan  # choose one of the following
  MelGAN:
    checkpoint: ~/checkpoints/melgan/melgan_ljspeech.pth
    config: ~/checkpoints/melgan/default.yaml
    device: cpu
  VocGan:
    checkpoint: ~/checkpoints/vctk_pretrained_model_3180.pt  # ~/checkpoints/ljspeech_29de09d_4000.pt
    denoise: true
    device: cpu
  HiFiGAN:
    checkpoint: ~/checkpoints/VCTK_V3/generator_v3  # you need to download checkpoint and set the params here
    device: cpu
  Waveglow:
    checkpoint: ~/checkpoints/waveglow_256channels_universal_v5_state_dict.pt
    sigma: 1.0
    denoiser_strength: 0.0  # try 0.1
    device: cpu  # try cpu if out of memory
# Mel-filterbank / STFT feature-extraction parameters.
fbank:
  sample_rate: 22050
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  max_wav_value: 32768.0  # int16 full-scale, used to normalize waveforms
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0            # should be 11025
  mel_mean: -6.0304103    # dataset-wide mel mean (for normalization)
# Text encoder: FastSpeech2-style Transformer encoder and its kwargs.
encoder:
  encoder_type: 'FS2TransformerEncoder'
  conf:
    n_layers: 4
    n_heads: 2
    hidden_dim: 256
    dropout: 0.25
    d_inner: 1024
    max_len: 2048  # max sequence length for position-embedding pre-computation
# Mel decoder: FastSpeech2-style Transformer decoder.
decoder:
  decoder_type: 'FS2TransformerDecoder'
  input_dim: 256   # should be the same as the output of encoder
  n_layers: 4
  n_heads: 2
  hidden_dim: 256
  d_inner: 1024
  dropout: 0.25
  max_len: 2048    # max len of seq, for position embedding pre-computation
# Reference signature of the consuming class:
# (class) Decoder(input_dim: int = 256, n_layers: int = 4, n_heads: int = 2,
# hidden_dim: int = 256, d_inner: int = 1024, dropout: float = 0.5, max_len: int = 2048
# Post-network applied after the decoder to refine the mel spectrogram.
postnet:
  postnet_type: 'PostUNet'  # 'PostUNet', 'PostNet1d'
# Speaker-identity embedding (no vocabulary file; size given directly).
speaker_embedding:
  enable: true
  vocab: null      # no vocab file for speakers
  vocab_size: 1    # number of distinct speakers
  weight: 1.0
  dim: 256
# Utterance-level (reference) embedding, extracted from mel features.
# NOTE(review): key is spelled "utterence" (sic) — consuming code presumably
# reads this exact key, so it is kept as-is; rename only together with callers.
utterence_embedding:
  enable: false
  type: 'lstm'  # resnet
  feature_config:
    type: 'mel'
    n_mels: 80
    sampling_rate: 22050
    n_fft: 1024
    hop_length: 256
    win_length: 1024
  model_config:
    n_layers: 3
    bidirectional: true
# Chinese-character (graphic) token embedding.
hanzi_embedding:
  enable: true
  type: embedding
  vocab: './gp.vocab'
  dim: 256
  weight: 0.5      # loss/mixing weight relative to pinyin stream
  max_seq_len: 100
# Pinyin token embedding.
pinyin_embedding:
  enable: true
  type: embedding
  vocab: './py.vocab'
  dim: 256
  weight: 1.0
  max_seq_len: 100
# Phoneme-duration predictor (FastSpeech-style variance adaptor component).
duration_predictor:
  input_dim: 256   # should be the same as encoder hidden_dim
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  duration_mean: 21.517294924096635  # for aishell3
# Pitch (F0) predictor; disabled by default.
f0_predictor:
  enable: false
  filter_size: 256
  kernel_size: 3
  dropout: 0.5
  n_bins: 256  # number of quantization bins for F0
# Inference-time options.
synthesis:
  normalize: true  # normalize the sound volume