|
1 |
| -import os |
2 | 1 | import random
|
3 | 2 | from dataclasses import dataclass, field
|
4 |
| -from typing import Optional |
5 | 3 |
|
6 | 4 | import librosa
|
7 | 5 | import numpy as np
|
8 | 6 | import torch
|
9 | 7 | import torchaudio
|
10 |
| - |
11 |
| -# from qut_noise import QUTNoise |
12 | 8 | from torchaudio.utils import download_asset
|
13 | 9 |
|
14 | 10 | from tonic.audio_transforms import FixLength
|
|
18 | 14 | "RandomPitchShift",
|
19 | 15 | "RandomAmplitudeScale",
|
20 | 16 | "AddWhiteNoise",
|
21 |
| - "AddHomeNoise", |
22 |
| - "EmbeddedHomeNoise", |
23 | 17 | "RIR",
|
24 | 18 | ]
|
25 | 19 |
|
@@ -174,197 +168,6 @@ def __call__(self, audio: np.ndarray):
|
174 | 168 | return noisy_audio
|
175 | 169 |
|
176 | 170 |
|
177 |
| -# @dataclass |
178 |
| -# class AddHomeNoise: |
179 |
| -# """Add a home background noise (from QUTNoise dataset) to the audio sample with a known snr |
180 |
| -# (signal to noise ratio). |
181 |
| - |
182 |
| -# Parameters: |
183 |
| -# sample_length (int): sample length in seconds |
184 |
| -# target_sr (float): the target sample rate of the mixed final signal (defaults to the higher of the noise and data sample rates) |
185 |
| -# params_dataset (dict): containing other parameters of the noise dataset |
186 |
| -# orig_sr (float): original sample rate of data |
187 |
| -# factors (float): range of desired snrs |
188 |
| -# partition (str): partition of the QUTNoise dataset that is used for noise augmentation |
189 |
| -# aug_index (int): index of the chosen factor for snr. It will be randomly chosen from the desired range (if not passed during initialization) |
190 |
| -# caching (bool): if True, the DiskCached dataset will dynamically pass the copy index of the data item to the transform (to set aug_index). Otherwise the aug_index will be chosen randomly in every call of the transform |
191 |
| -# seed (int): a fixed seed for reproducibility |
192 |
| -# Args: |
193 |
| -# audio (np.ndarray): data sample |
194 |
| -# Returns: |
195 |
| -# np.ndarray: data sample with added noise |
196 |
| -# """ |
197 |
| - |
198 |
| -# sample_length: int |
199 |
| -# params_dataset: dict |
200 |
| -# target_sr: float = 48000 |
201 |
| -# orig_sr: float = 16000 |
202 |
| -# factors: list = field(default_factory=lambda: [0, 10, 20]) |
203 |
| -# partition: str = "test" |
204 |
| -# aug_index: int = 0 |
205 |
| -# caching: bool = False |
206 |
| -# seed: int = 123 |
207 |
| - |
208 |
| -# def __post_init__(self): |
209 |
| -# random.seed(self.seed) |
210 |
| - |
211 |
| -# noises = QUTNoise( |
212 |
| -# classes=["HOME"], |
213 |
| -# create_splits=False, |
214 |
| -# duration_split=[self.sample_length], |
215 |
| -# partition=self.partition, |
216 |
| -# **self.params_dataset, |
217 |
| -# ) |
218 |
| - |
219 |
| -# split_qutnoise_path = noises.config_path |
220 |
| - |
221 |
| -# self.wave_files_path = ( |
222 |
| -# str(split_qutnoise_path) |
223 |
| -# + "/splits_" |
224 |
| -# + str(self.sample_length) |
225 |
| -# + "s" |
226 |
| -# + "/" |
227 |
| -# + self.partition |
228 |
| -# + "/" |
229 |
| -# ) |
230 |
| - |
231 |
| -# self.home_noises = os.listdir(self.wave_files_path) |
232 |
| - |
233 |
| -# def resample(self, audio): |
234 |
| -# audio_resampled = librosa.resample( |
235 |
| -# audio, orig_sr=self.orig_sr, target_sr=self.target_sr |
236 |
| -# ) |
237 |
| -# return audio_resampled |
238 |
| - |
239 |
| -# def get_noise(self): |
240 |
| -# self.noise_wave = random.choice(self.home_noises) |
241 |
| - |
242 |
| -# noise, _ = librosa.core.load( |
243 |
| -# self.wave_files_path + self.noise_wave, sr=self.target_sr |
244 |
| -# ) |
245 |
| -# self.noise = noise[0 : int(self.target_sr) * self.sample_length] |
246 |
| -# return self.noise |
247 |
| - |
248 |
| -# def add_noise( |
249 |
| -# self, |
250 |
| -# waveform: torch.Tensor, |
251 |
| -# noise: torch.Tensor, |
252 |
| -# snr: torch.Tensor, |
253 |
| -# ) -> torch.Tensor: |
254 |
| -# """Scales and adds noise to waveform per signal-to-noise ratio. |
255 |
| - |
256 |
| -# Specifically, for each pair of waveform vector :math:`x \in \mathbb{R}^L` and noise vector |
257 |
| -# :math:`n \in \mathbb{R}^L`, the function computes output :math:`y` as |
258 |
| -# .. math:: |
259 |
| -# y = x + a n \, \text{,} |
260 |
| -# where |
261 |
| -# .. math:: |
262 |
| -# a = \sqrt{ \frac{ ||x||_{2}^{2} }{ ||n||_{2}^{2} } \cdot 10^{-\frac{\text{SNR}}{10}} } \, \text{,} |
263 |
| -# with :math:`\text{SNR}` being the desired signal-to-noise ratio between :math:`x` and :math:`n`, in dB. |
264 |
| -# Note that this function broadcasts singleton leading dimensions in its inputs in a manner that is |
265 |
| -# consistent with the above formulae and PyTorch's broadcasting semantics. |
266 |
| -# .. devices:: CPU CUDA |
267 |
| -# .. properties:: Autograd TorchScript |
268 |
| -# Args: |
269 |
| -# waveform (torch.Tensor): Input waveform, with shape `(..., L)`. |
270 |
| -# noise (torch.Tensor): Noise, with shape `(..., L)` (same shape as ``waveform``). |
271 |
| -# snr (torch.Tensor): Signal-to-noise ratios in dB, with shape `(...,)`. |
272 |
| -# Returns: |
273 |
| -# torch.Tensor: Result of scaling and adding ``noise`` to ``waveform``, with shape `(..., L)` |
274 |
| -# (same shape as ``waveform``). |
275 |
| -# """ |
276 |
| - |
277 |
| -# L = waveform.size(-1) |
278 |
| - |
279 |
| -# if L != noise.size(-1): |
280 |
| -# raise ValueError( |
281 |
| -# f"Length dimensions of waveform and noise don't match (got {L} and {noise.size(-1)})." |
282 |
| -# ) |
283 |
| - |
284 |
| -# # compute scale, second by second |
285 |
| -# noisy_audio = torch.zeros_like(waveform) |
286 |
| -# for i in range(0, self.sample_length): |
287 |
| -# start, end = int(i * self.target_sr), int((i + 1) * self.target_sr) |
288 |
| -# sig, noise_ = waveform[:, start:end], noise[:, start:end] |
289 |
| - |
290 |
| -# energy_signal = torch.linalg.vector_norm(sig, ord=2, dim=-1) ** 2 # (*,) |
291 |
| -# energy_noise = torch.linalg.vector_norm(noise_, ord=2, dim=-1) ** 2 # (*,) |
292 |
| -# original_snr_db = 10 * ( |
293 |
| -# torch.log10(energy_signal) - torch.log10(energy_noise) |
294 |
| -# ) |
295 |
| -# scale = 10 ** ((original_snr_db - snr) / 20.0) # (*,) |
296 |
| - |
297 |
| -# # scale noise |
298 |
| -# self.scaled_noise = scale.unsqueeze(-1) * noise_ # (*, 1) * (*, L) = (*, L) |
299 |
| -# noisy_audio[:, start:end] = sig + self.scaled_noise |
300 |
| - |
301 |
| -# return noisy_audio |
302 |
| - |
303 |
| -# def __call__(self, audio: np.ndarray): |
304 |
| -# if not self.caching: |
305 |
| -# self.aug_index = random.choice(range(0, len(self.factors))) |
306 |
| -# snr_db = torch.tensor([self.factors[self.aug_index]]) |
307 |
| -# self.noise = torch.from_numpy(self.get_noise()) |
308 |
| -# self.noise = torch.unsqueeze(self.noise, dim=0) |
309 |
| -# self.resampled_audio = torch.from_numpy(self.resample(audio)) |
310 |
| -# noisy_audio = self.add_noise(self.resampled_audio, self.noise, snr_db) |
311 |
| - |
312 |
| -# return noisy_audio.detach().numpy() |
313 |
| - |
314 |
| - |
315 |
| -# @dataclass |
316 |
| -# class EmbeddedHomeNoise(AddHomeNoise): |
317 |
| -# """Add a home background noise (from QUTNoise dataset) to the data sample with a known snr_db |
318 |
| -# (signal to noise ratio). |
319 |
| - |
320 |
| -# The difference with AddHomeNoise is that a leading (and optionally trailing) noise will be added to the augmented sample. |
321 |
| -# Parameters: |
322 |
| -# noise_length (int): the length of noise (in seconds) that will be added to the sample |
323 |
| -# two_sided (bool): if True the augmented signal will be encompassed between leading and trailing noises |
324 |
| -# Args: |
325 |
| -# audio (np.ndarray): data sample |
326 |
| -# Returns: |
327 |
| -# np.ndarray: data sample with added noise at the beginning |
328 |
| -# """ |
329 |
| - |
330 |
| -# noise_length: int = None |
331 |
| -# two_sided: bool = False |
332 |
| - |
333 |
| -# def __post_init__(self): |
334 |
| -# super().__post_init__() |
335 |
| - |
336 |
| -# if self.noise_length is None: |
337 |
| -# raise ValueError("noise length is not specified") |
338 |
| -# elif self.noise_length > self.sample_length: |
339 |
| -# raise ValueError( |
340 |
| -# "in the current implementation length of noise can't exceed sample length" |
341 |
| -# ) |
342 |
| - |
343 |
| -# def __call__(self, audio: np.ndarray): |
344 |
| -# if not self.caching: |
345 |
| -# self.aug_index = random.choice(range(0, len(self.factors))) |
346 |
| -# snr_db = torch.tensor([self.factors[self.aug_index]]) |
347 |
| - |
348 |
| -# self.noise = torch.from_numpy(self.get_noise()) |
349 |
| -# self.noise = torch.unsqueeze(self.noise, dim=0) |
350 |
| -# self.resampled_audio = torch.from_numpy(self.resample(audio)) |
351 |
| -# noisy_audio = ( |
352 |
| -# self.add_noise(self.resampled_audio, self.noise, snr_db).detach().numpy() |
353 |
| -# ) |
354 |
| - |
355 |
| -# initial_noise = self.scaled_noise[ |
356 |
| -# :, 0 : int(self.target_sr * self.noise_length) |
357 |
| -# ] |
358 |
| -# if self.two_sided: |
359 |
| -# noise_then_audio = np.concatenate( |
360 |
| -# (initial_noise, noisy_audio, initial_noise), axis=1 |
361 |
| -# ) |
362 |
| -# else: |
363 |
| -# noise_then_audio = np.concatenate((initial_noise, noisy_audio), axis=1) |
364 |
| - |
365 |
| -# return noise_then_audio |
366 |
| - |
367 |
| - |
368 | 171 | @dataclass
|
369 | 172 | class RIR:
|
370 | 173 | """Convolves a RIR (room impulse response) with the data sample.
|
|
0 commit comments