-
Notifications
You must be signed in to change notification settings - Fork 1
/
audio_handler.py
107 lines (82 loc) · 4.06 KB
/
audio_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Audio processing follows bits from https://github.com/Louismac/MAGNet
import numpy as np
import librosa
import utils.audio_dataset_generator
import lws
class AudioHandler(object):
"""
Will handle everything around processing audio.
Audio representation conversions and reconstructions.
- raw audio 2 spectrogram
- spectrogram 2 reconstructed audio
"""
def __init__(self, griffin_iterations = 60, fft_size=2048, window_size=1024, hop_size=512, sample_rate=44100, sequence_length=40):
self.griffin_iterations = griffin_iterations
self.fft_size = fft_size
self.window_size = window_size
self.hop_size = hop_size
self.sample_rate = sample_rate
self.sequence_length = sequence_length
# Init the LWS:
lws_L = 5
self.lws_processor = lws.lws(self.window_size, self.hop_size, L=lws_L, fftsize=self.fft_size, mode="music")
def spectrogram2audio(self, spectrogram, method="Griff"):
if method == "Griff":
audio = self.griffin_lim(spectrogram.T, self.griffin_iterations)
audio = np.array(audio)
elif method == "LWS":
audio = self.lws_reconstruct(spectrogram)
audio = np.array(audio)
# sometimes gets error:
# if not np.isfinite(y).all():
# raise ParameterError('Audio buffer is not finite everywhere')
# isfinite() -> Test element-wise for finiteness (not infinity or not Not a Number).
if not np.isfinite(audio).all():
print("Problem with the signal - it's not finite (contains inf or NaN)")
print("Signal = ", audio)
audio = np.nan_to_num(audio)
print("Attempted hacky fix")
return audio
def griffin_lim(self, stftm_matrix, max_iter=100):
""""Iterative method to 'build' phases for magnitudes."""
stft_matrix = np.random.random(stftm_matrix.shape)
y = librosa.core.istft(stft_matrix, self.hop_size, self.window_size)
if not np.isfinite(y).all():
print("Problem with the signal - it's not finite (contains inf or NaN)")
print("Signal = ", y)
y = np.nan_to_num(y)
print("Attempted hacky fix")
for i in range(max_iter):
if not np.isfinite(y).all():
print("Problem with the signal - it's not finite (contains inf or NaN), in iteration",i)
print("Signal = ", y)
y = np.nan_to_num(y)
print("Attempted hacky fix inside the iterative method")
stft_matrix = librosa.core.stft(y, self.fft_size, self.hop_size, self.window_size)
stft_matrix = stftm_matrix * stft_matrix / np.abs(stft_matrix)
y = librosa.core.istft(stft_matrix, self.hop_size, self.window_size)
return y
def lws_reconstruct(self, data):
X0 = data
## HAX, doesnt influence it too much tho
if X0.dtype != "float64":
X0 = np.asarray(X0, dtype=np.float64)
X1 = self.lws_processor.run_lws(X0) # reconstruction from magnitude (in general, one can reconstruct from an initial complex spectrogram)
print('{:6}: {:5.2f} dB'.format('LWS', self.lws_processor.get_consistency(X1)))
represented = X1
# now reconstruct from:
print("\trepresented shape", represented.shape)
reconstruction = self.lws_processor.istft(represented) # where x is a single-channel waveform
reconstruction = np.asarray(reconstruction)
print("\toutput reconstruction:", reconstruction.shape) # (531968,)
print("\treconstruction data type:", reconstruction.dtype)
return reconstruction
def load_dataset(self, dataset_path):
dataset = utils.audio_dataset_generator.AudioDatasetGenerator(
fft_size = self.fft_size, window_size = self.window_size, hop_size = self.hop_size,
sequence_length = self.sequence_length, sample_rate = self.sample_rate)
try:
dataset.load(dataset_path, prevent_shuffling=True)
except:
dataset = None
return dataset