"""
Loader functions for a number of Python audio I/O libraries.

Some of the code is taken from:
https://github.com/aubio/aubio/blob/master/python/demos/demo_reading_speed.py
"""
from scipy.io import wavfile
import audioread.rawread
import audioread.ffdec
import matplotlib.pyplot as plt
import soundfile as sf
import aubio
from pydub import AudioSegment
import torchaudio
import numpy as np
import tensorflow as tf
import tensorflow_io as tfio
import librosa
import soxbindings
import sox
import stempeg
@tf.function
def load_tfio_fromffmpeg(fp):
    # not supported yet
    audio = tfio.IOTensor.graph(tf.int16).from_ffmpeg(fp)
    return tf.cast(audio.to_tensor(), tf.float32) / 32767.0


@tf.function
def load_tfio_fromaudio(fp, ext="wav"):
    if ext in ["wav", "flac", "mp4"]:
        audio = tfio.IOTensor.graph(tf.float16).from_audio(fp)
        return tf.cast(audio.to_tensor(), tf.float16)
    else:
        return tfio.IOTensor.graph(tf.float32).from_audio(fp).to_tensor()


@tf.function
def load_tf_decode_wav(fp, ext="wav", rate=44100):
    audio, rate = tf.audio.decode_wav(tf.io.read_file(fp))
    return tf.cast(audio, tf.float32)
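

# Illustrative sketch (not part of the original file): since the loaders above are
# tf.functions, they can be mapped over a tf.data pipeline of file paths.
# `paths` (a list of WAV file paths) is a placeholder introduced here.
def _tf_dataset_example(paths):
    ds = tf.data.Dataset.from_tensor_slices(paths)
    return ds.map(load_tf_decode_wav, num_parallel_calls=tf.data.AUTOTUNE)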
def load_aubio(fp):
    f = aubio.source(fp, hop_size=1024)
    sig = np.zeros(f.duration, dtype=aubio.float_type)
    total_frames = 0
    while True:
        samples, read = f()
        sig[total_frames:total_frames + read] = samples[:read]
        total_frames += read
        if read < f.hop_size:
            break
    return sig


def load_torchaudio(fp):
    sig, rate = torchaudio.load(fp)
    return sig


def load_stempeg(fp):
    sig = stempeg.read._read_ffmpeg(
        fp,
        sample_rate=44100,
        channels=1,
        start=None,
        duration=None,
        dtype=np.float32,
        ffmpeg_format='f32le',
        stem_idx=0
    )
    return sig


def load_soundfile(fp):
    sig, rate = sf.read(fp)
    return sig


def load_scipy(fp):
    rate, sig = wavfile.read(fp)
    sig = sig.astype('float32') / 32767
    return sig


def load_scipy_mmap(fp):
    rate, sig = wavfile.read(fp, mmap=True)
    sig = sig.astype('float32') / 32767
    return sig
def load_ar_ffmpeg(fp):
    with audioread.ffdec.FFmpegAudioFile(fp) as f:
        total_frames = 0
        # iterate over all decoded buffers to force a full decode;
        # note that only the last buffer is kept in `sig`
        for buf in f:
            sig = _convert_buffer_to_float(buf)
            sig = sig.reshape(f.channels, -1)
            total_frames += sig.shape[1]
        return sig
def load_soxbindings(fp):
    tfm = soxbindings.Transformer()
    array_out = tfm.build_array(input_filepath=fp)
    return array_out


def load_pydub(fp):
    song = AudioSegment.from_file(fp)
    sig = np.asarray(song.get_array_of_samples(), dtype='float32')
    sig = sig.reshape(song.channels, -1) / 32767.
    return sig


def load_librosa(fp):
    # loading with `sr=None` disables the internal resampling
    sig, rate = librosa.load(fp, sr=None)
    return sig
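

# Illustrative sketch (not part of the original file): one way to time a subset of
# the loaders above on a single file. `fp` and `number` are placeholders; the real
# benchmark in this repository may measure things differently.
def _time_loaders(fp, number=10):
    import timeit
    results = {}
    for name, fn in [
        ("soundfile", load_soundfile),
        ("scipy", load_scipy),
        ("librosa", load_librosa),
    ]:
        results[name] = timeit.timeit(lambda: fn(fp), number=number) / number
    return results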
def _convert_buffer_to_float(buf, n_bytes=2, dtype=np.float32):
    # taken from librosa.util.utils
    # Invert the scale of the data
    scale = 1. / float(1 << ((8 * n_bytes) - 1))
    # Construct the format string
    fmt = '<i{:d}'.format(n_bytes)
    # Rescale and format the data buffer
    out = scale * np.frombuffer(buf, fmt).astype(dtype)
    return out
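

# Illustrative sketch (not part of the original file): a quick sanity check of the
# buffer conversion above, assuming a little-endian int16 buffer (n_bytes=2).
# A full-scale int16 signal should end up close to the [-1.0, 1.0) range.
def _demo_buffer_conversion():
    raw = np.array([0, 16384, -16384, 32767, -32768], dtype='<i2').tobytes()
    # expected approximately: [0.0, 0.5, -0.5, 1.0, -1.0]
    return _convert_buffer_to_float(raw, n_bytes=2)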
def info_soundfile(fp):
    info = {}
    info['duration'] = sf.info(fp).duration
    info['samples'] = int(sf.info(fp).duration * sf.info(fp).samplerate)
    info['channels'] = sf.info(fp).channels
    info['sampling_rate'] = sf.info(fp).samplerate
    return info


def info_audioread(fp):
    info = {}
    with audioread.audio_open(fp) as f:
        info['duration'] = f.duration
    with audioread.audio_open(fp) as f:
        info['samples'] = int(f.duration * f.samplerate)
    with audioread.audio_open(fp) as f:
        info['channels'] = f.channels
    with audioread.audio_open(fp) as f:
        info['sampling_rate'] = f.samplerate
    return info


def info_aubio(fp):
    info = {}
    with aubio.source(fp) as f:
        info['duration'] = f.duration / f.samplerate
    with aubio.source(fp) as f:
        info['samples'] = f.duration
    with aubio.source(fp) as f:
        info['channels'] = f.channels
    with aubio.source(fp) as f:
        info['sampling_rate'] = f.samplerate
    return info


def info_sox(fp):
    info = {}
    info['duration'] = sox.file_info.duration(fp)
    info['samples'] = sox.file_info.num_samples(fp)
    info['channels'] = sox.file_info.channels(fp)
    info['sampling_rate'] = int(sox.file_info.sample_rate(fp))
    return info


def info_pydub(fp):
    info = {}
    f = AudioSegment.from_file(fp)
    info['duration'] = f.duration_seconds
    f = AudioSegment.from_file(fp)
    info['samples'] = int(f.frame_count())
    f = AudioSegment.from_file(fp)
    info['channels'] = f.channels
    f = AudioSegment.from_file(fp)
    info['sampling_rate'] = f.frame_rate
    return info


def info_torchaudio(fp):
    info = {}
    si = torchaudio.info(str(fp))
    info["sampling_rate"] = si.sample_rate
    info["samples"] = si.num_frames
    info["channels"] = si.num_channels
    info["duration"] = info["samples"] / info["sampling_rate"]
    return info


def info_stempeg(fp):
    info = {}
    si = stempeg.Info(fp)
    info["sampling_rate"] = si.sample_rate(0)
    info["samples"] = si.samples(0)
    info["channels"] = si.channels(0)
    info["duration"] = si.duration(0)
    return info
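

# Illustrative sketch (not part of the original file): cross-check the metadata
# reported by two of the info backends above. Backends may define "samples"
# slightly differently (frames vs. frames x channels), so a mismatch is not
# necessarily an error; this is only meant as a quick consistency probe.
def _check_info_consistency(fp):
    a = info_soundfile(fp)
    b = info_torchaudio(fp)
    return {
        key: (a[key], b[key], a[key] == b[key])
        for key in ("samples", "channels", "sampling_rate")
    }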