-
Notifications
You must be signed in to change notification settings - Fork 28
/
demo.py
89 lines (72 loc) · 3.25 KB
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import matplotlib.pylab as plt
import numpy as np
import pinyin
import soundfile as sf
import torch
from config import sampling_rate
from models.layers import STFT
from models.models import Tacotron2
from utils import text_to_sequence, ensure_folder, plot_data, HParams
class Denoiser(torch.nn.Module):
    """Removes model bias from audio produced with WaveGlow.

    At construction time WaveGlow is run once on a bias-only mel input
    (zeros or noise) with ``sigma=0.0``; the magnitude spectrum of that
    output is stored as the per-bin bias estimate, and ``forward`` subtracts
    a scaled copy of it from real audio.
    """

    def __init__(self, waveglow, filter_length=1024, n_overlap=4,
                 win_length=1024, mode='zeros'):
        super(Denoiser, self).__init__()
        # STFT used both to analyze the bias audio and to denoise in forward().
        self.stft = STFT(filter_length=filter_length,
                         hop_length=int(filter_length / n_overlap),
                         win_length=win_length).cuda()
        # (1, 80, 88): one batch, 80 mel channels, 88 frames — matches the
        # dtype/device of WaveGlow's upsample layer so infer() runs directly.
        if mode == 'zeros':
            mel_input = torch.zeros(
                (1, 80, 88),
                dtype=waveglow.upsample.weight.dtype,
                device=waveglow.upsample.weight.device)
        elif mode == 'normal':
            mel_input = torch.randn(
                (1, 80, 88),
                dtype=waveglow.upsample.weight.dtype,
                device=waveglow.upsample.weight.device)
        else:
            # ValueError is more specific than the original bare Exception
            # (still caught by `except Exception`); message typo fixed
            # ("if not" -> "is not").
            raise ValueError("Mode {} is not supported".format(mode))

        with torch.no_grad():
            # sigma=0.0 removes the stochastic component, leaving pure model bias.
            bias_audio = waveglow.infer(mel_input, sigma=0.0).float()
            bias_spec, _ = self.stft.transform(bias_audio)

        # Keep only the first frame's magnitudes as the per-frequency-bin bias.
        self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])

    def forward(self, audio, strength=0.1):
        """Return `audio` with `strength` x bias spectrum subtracted.

        The phase (angles) of the input is preserved; only magnitudes are
        adjusted and clamped at zero before the inverse STFT.
        """
        audio_spec, audio_angles = self.stft.transform(audio.cuda().float())
        audio_spec_denoised = audio_spec - self.bias_spec * strength
        audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
        audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles)
        return audio_denoised
if __name__ == '__main__':
    # Demo: Chinese text -> pinyin -> Tacotron2 mel spectrogram -> WaveGlow audio.
    config = HParams()
    checkpoint = 'tacotron2-cn.pt'
    print('loading model: {}...'.format(checkpoint))

    # map_location='cpu' lets a GPU-saved checkpoint load on any machine;
    # the model is run on CPU below.
    model = Tacotron2(config)
    model.load_state_dict(torch.load(checkpoint, map_location='cpu'))
    model = model.to('cpu')
    model.eval()

    waveglow_path = 'waveglow_256channels.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    # Keep the invertible 1x1 convolutions in fp32 while the rest runs in fp16.
    for k in waveglow.convinv:
        k.float()
    # NOTE(review): denoiser is built but never applied to `audio` below —
    # presumably `denoiser(audio_tensor)` was intended; confirm before enabling.
    denoiser = Denoiser(waveglow)

    text = "相对论直接和间接的催生了量子力学的诞生 也为研究微观世界的高速运动确立了全新的数学模型"
    text = pinyin.get(text, format="numerical", delimiter=" ")
    print(text)

    # torch.autograd.Variable is deprecated (no-op since PyTorch 0.4);
    # a plain tensor is autograd-aware already. [None, :] adds the batch dim.
    sequence = torch.from_numpy(np.array(text_to_sequence(text))[None, :]).long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    plot_data((mel_outputs.float().data.cpu().numpy()[0],
               mel_outputs_postnet.float().data.cpu().numpy()[0],
               alignments.float().data.cpu().numpy()[0].T))
    ensure_folder('images')
    plt.savefig('images/mel_spec.jpg')

    # WaveGlow runs in half precision on GPU; cast the mel input to match.
    mel_outputs_postnet = mel_outputs_postnet.type(torch.float16)
    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet.cuda(), sigma=0.666)
    audio = audio[0].data.cpu().numpy()
    audio = audio.astype(np.float32)
    print('audio.shape: ' + str(audio.shape))
    print(audio)

    # Write 24-bit PCM at the project's configured sampling rate.
    sf.write('output.wav', audio, sampling_rate, 'PCM_24')