From c821aa081f996e48108e523ab8eedc87ee234d6f Mon Sep 17 00:00:00 2001
From: genisplaja
Date: Tue, 1 Apr 2025 18:48:58 +0200
Subject: [PATCH 01/11] add code

---
 compiam/data.py                           |  18 ++
 compiam/separation/README.md              |   7 +-
 compiam/separation/__init__.py            |   1 +
 .../singing_voice_extraction/__init__.py  |   3 +
 .../convtdf_vocal_finetune.py             | 212 ++++++++++++++++++
 5 files changed, 239 insertions(+), 2 deletions(-)
 create mode 100644 compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py

diff --git a/compiam/data.py b/compiam/data.py
index fc1c2a4b..2f38fac2 100644
--- a/compiam/data.py
+++ b/compiam/data.py
@@ -234,6 +234,24 @@
             },
         },
     },
+    "separation:convtdf-vocal-finetune": {
+        "module_name": "compiam.separation.singing_voice_separation.convtdf-vocal-finetune",
+        "class_name": "ConvTDFVocalFineTune",
+        "default_version": "v1",
+        "kwargs": {
+            "v1": {
+                "model_path": os.path.join(
+                    "models",
+                    "separation",
+                    "convtdf-vocal-finetune",
+                    "vocals",
+                    "TODO",
+                ),
+                "download_link": "TODO",
+                "download_checksum": "TODO",
+            },
+        },
+    },
 }
 
 
diff --git a/compiam/separation/README.md b/compiam/separation/README.md
index 2fc9abfe..a227f6e9 100644
--- a/compiam/separation/README.md
+++ b/compiam/separation/README.md
@@ -3,9 +3,12 @@
 
 | **Tool**                  | **Task**                         | **Paper** |
 |---------------------------|----------------------------------|-----------|
 | ColdDiffSep               | Singing voice extraction         | [1]       |
-| MDXNet w/ mixer model     | Music source separation          | [2]       |
+| ConvTDF Vocal Fine-tuned  | Singing voice extraction         | [2]       |
+| MDXNet w/ mixer model     | Music source separation          | [3]       |
 
 [1] G. Plaja-Roglans, M. Miron, A. Shankar, and X. Serra, "Carnatic Singing Voice Separation using Cold Diffusion on Training Data with Bleeding", in International Society for Music Information Retrieval Conference (ISMIR 23), 2023.
 
-[2] Work under review.
\ No newline at end of file
+[2] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Analysis and Generative Applications (WIMAGA) in ICASSP, 2025.
+
+[3] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Analysis and Generative Applications (WIMAGA) in ICASSP, 2025.
\ No newline at end of file diff --git a/compiam/separation/__init__.py b/compiam/separation/__init__.py index 66686471..512384f1 100644 --- a/compiam/separation/__init__.py +++ b/compiam/separation/__init__.py @@ -12,6 +12,7 @@ ### IMPORT HERE THE CONSIDERED TASKS from compiam.separation import singing_voice_extraction +from compiam.separation import music_source_separation # Show user the available tasks diff --git a/compiam/separation/singing_voice_extraction/__init__.py b/compiam/separation/singing_voice_extraction/__init__.py index 9f72faee..f0ee9c9d 100644 --- a/compiam/separation/singing_voice_extraction/__init__.py +++ b/compiam/separation/singing_voice_extraction/__init__.py @@ -7,6 +7,9 @@ from compiam.separation.singing_voice_extraction.cold_diff_sep import ( ColdDiffSep, ) +from compiam.separation.singing_voice_extraction.convtdf_vocal_finetune import ( + ConvTDFVocalFineTune, +) # Show user the available tools diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py new file mode 100644 index 00000000..0f6fc82a --- /dev/null +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -0,0 +1,212 @@ +import os + +import numpy as np + +from compiam.exceptions import ModelNotTrainedError +from compiam.utils import get_logger, WORKDIR +from compiam.utils.download import download_remote_model + + +logger = get_logger(__name__) + + +class ConvTDFVocalFineTune(object): + """ConvTDF net fine-tuned to separate clean Carnatic vocals training with Saraga (which has bleeding).""" + + def __init__( + self, + model_path=None, + download_link=None, + download_checksum=None, + sample_rate=24000, + gpu="-1", + ): + """Leakage-aware singing voice separation init method. + + :param model_path: path to file to the model weights. + :param download_link: link to the remote pre-trained model. + :param download_checksum: checksum of the model file. + :param sample_rate: sample rate to which the audio is sampled for extraction. + :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. + """ + ### IMPORTING OPTIONAL DEPENDENCIES + try: + global torch + import torch + + global nn + import torch.nn as nn + + global torchaudio + import torchaudio + + global ConvTDFNet + from compiam.separation.music_source_separation.mixer_model.models import ( + ConvTDFNet, + ) + + except: + raise ImportError( + "In order to use this tool you need to have torch and torchaudio installed. 
" + "Install compIAM with torch support: pip install 'compiam[torch]'" + ) + ### + + ## Setting up GPU if specified + self.gpu = gpu + self.device = None + self.select_gpu(gpu) + + self.model = self._build_model() + self.sample_rate = sample_rate + self.trained = False + + self.model_path = model_path + self.download_link = download_link + self.download_checksum = download_checksum + if self.model_path is not None: + self.load_model(self.model_path) + + self.chunk_size = self.model.chunk_size + + def forward(self, x): + """Forward pass of the mixer model""" + return self.model(x) + + def _build_model(self): + """Build the MDXNet mixer model.""" + convtdfnet = ConvTDFNet().to(self.device) + convtdfnet.eval() + return convtdfnet + + def load_model(self, model_path): + if not os.path.exists(model_path): + self.download_model(model_path) # Downloading model weights + ## Ensuring we can load the model for different torch versions + ## -- (weights only might be deprecated) + try: + weights = torch.load(model_path, weights_only=True, map_location=self.device) + except: + weights = torch.load(model_path, map_location=self.device) + self.model.load_state_dict(weights) + self.model_path = model_path + self.trained = True + + def separate( + self, + input_data, + input_sr=44100, + gpu="-1", + ): + """Separate Carnatic singing voice from mixture. + + :param input_data: Audio signal/path to separate. + :param input_sr: sampling rate of the input array of data (if any). This variable is only + relevant if the input is an array of data instead of a filepath. + :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. + :return: Singing voice and violin signals. + """ + ## Setting up GPU if specified + self.gpu = gpu + self.device = None + self.select_gpu(gpu) + + if self.trained is False: + raise ModelNotTrainedError( + """ Model is not trained. Please load model before running inference! + You can load the pre-trained instance with the load_model wrapper.""" + ) + + # Loading and resampling audio + if isinstance(input_data, str): + if not os.path.exists(input_data): + raise FileNotFoundError("Target audio not found.") + audio, input_sr = torchaudio.load(input_data) + elif isinstance(input_data, np.ndarray): + input_data = torch.from_numpy(input_data).to(torch.float32).to(self.device) + elif isinstance(input_data, torch.Tensor): + input_data = input_data.to(torch.float32).to(self.device) + else: + raise ValueError("Input must be path to audio signal or an audio array") + + if len(input_data.shape) == 1: + input_data = input_data.unsqueeze(0) + + if len(input_data.shape) == 3: + if input_data.shape[0] != 1: + raise ValueError("Batching is not supported. Please provide a single audio signal.") + input_data = input_data.squeeze(0) + + # resample audio + if input_sr != self.sample_rate: + logger.warning( + f"Resampling... (input sampling rate is assumed {input_sr}Hz, \ + make sure this is correct and change input_sr otherwise)" + ) + audio = torchaudio.transforms.Resample( + orig_freq=input_sr, new_freq=self.sample_rate + )(input_data) + + # downsampling to mono + if audio.shape[0] == 2: + audio = audio.mean(dim=0, keepdim=True) + logger.info( + f"Downsampling to mono... your audio is stereo, \ + and the model is trained on mono audio." 
+ ) + + # audio has shape B, 1, N + audio = audio.reshape(-1) + predictions = [] + pad_length = self.chunk_size - (audio.shape[-1] % self.chunk_size) + audio = torch.nn.functional.pad(audio, (0, pad_length)) + + for i in range(0, audio.shape[-1], self.chunk_size): + audio_chunk = audio[i : i + self.chunk_size].reshape( + 1, 1, -1 + ) # TODO Batching + predictions.append(self.forward(audio_chunk)) + + result = torch.cat(predictions, dim=-1) + result = result[:, :, :-pad_length] + + vocal_separation = torchaudio.transforms.Resample( + orig_freq=self.sample_rate, new_freq=input_sr + )(result) + + return vocal_separation.detach().cpu().numpy().reshape(-1) + + def download_model(self, model_path=None, force_overwrite=False): + """Download pre-trained model.""" + download_path = ( + os.sep + os.path.join(*model_path.split(os.sep)[:-2]) + if model_path is not None + else os.path.join(WORKDIR, "models", "separation", "conv-tdf-finetune") + ) + # Creating model folder to store the weights + if not os.path.exists(download_path): + os.makedirs(download_path) + download_remote_model( + self.download_link, + self.download_checksum, + download_path, + force_overwrite=force_overwrite, + ) + + def select_gpu(self, gpu="-1"): + """Select the GPU to use for inference. + + :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. + :returns: None + """ + if int(gpu) == -1: + self.device = torch.device("cpu") + else: + if torch.cuda.is_available(): + self.device = torch.device("cuda:" + str(gpu)) + elif torch.backends.mps.is_available(): + self.device = torch.device("mps:" + str(gpu)) + else: + self.device = torch.device("cpu") + logger.warning("No GPU available. Running on CPU.") + self.gpu = gpu From 1ffd61e357f0da512b0adf50ccfe17444d7c08fd Mon Sep 17 00:00:00 2001 From: genisplaja Date: Tue, 1 Apr 2025 19:00:16 +0200 Subject: [PATCH 02/11] tests, docs, ack --- ACKNOWLEDGEMENTS | 1 + docs/source/separation.rst | 10 ++++++ tests/separation/test_convtdf_finetune.py | 43 +++++++++++++++++++++++ 3 files changed, 54 insertions(+) create mode 100644 tests/separation/test_convtdf_finetune.py diff --git a/ACKNOWLEDGEMENTS b/ACKNOWLEDGEMENTS index f7588570..a238ef95 100644 --- a/ACKNOWLEDGEMENTS +++ b/ACKNOWLEDGEMENTS @@ -18,6 +18,7 @@ Jom Kuriakose Shrey Dutta Shubham Lohiya Swarada Bharadwaj +Serafin Schweinitz Project Musical AI - PID2019-111403GB-I00/AEI/10.13039/501100011033 funded by the Spanish Ministerio de Ciencia, Innovación y Universidades (MCIU) and the Agencia Estatal de diff --git a/docs/source/separation.rst b/docs/source/separation.rst index d8950094..356dc594 100644 --- a/docs/source/separation.rst +++ b/docs/source/separation.rst @@ -16,6 +16,16 @@ Leakage-aware Carnatic Singing Voice Separation :members: +Leakage-aware Carnatic Singing Voice Separation +----------------------------------------------- + +.. note:: + REQUIRES: torch + +.. 
autoclass:: compiam.separation.singing_voice_extraction.convtdf_vocal_finetune.ConvTDFVocalFineTune + :members: + + Vocals and violin separation ++++++++++++++++++++++++++++ diff --git a/tests/separation/test_convtdf_finetune.py b/tests/separation/test_convtdf_finetune.py new file mode 100644 index 00000000..552aad2e --- /dev/null +++ b/tests/separation/test_convtdf_finetune.py @@ -0,0 +1,43 @@ +import os +import pytest +import shutil + +import numpy as np + +import compiam +from compiam.data import TESTDIR +from compiam.exceptions import ModelNotTrainedError + + +def _separate(): + from compiam.separation.singing_voice_extraction import ConvTDFVocalFineTune + + convtdf_vocal = ConvTDFVocalFineTune() + with pytest.raises(ModelNotTrainedError): + convtdf_vocal.separate(os.path.join(TESTDIR, "resources", "melody", "hola.wav")) + convtdf_vocal.trained = True + with pytest.raises(FileNotFoundError): + convtdf_vocal.separate(os.path.join(TESTDIR, "resources", "melody", "hola.wav")) + + convtdf_vocal = compiam.load_model("separation:convtdf-vocal-finetune", data_home=TESTDIR) + audio_in, sr = np.array(np.ones([1, 44100]), dtype=np.float32), 44100 + separation = convtdf_vocal.separate(audio_in, input_sr=sr) + assert isinstance(separation, tuple) + assert isinstance(separation[0], np.ndarray) + assert isinstance(separation[1], np.ndarray) + shutil.rmtree(os.path.join(TESTDIR, "models")) + + +@pytest.mark.torch +def test_predict_torch(): + _separate() + + +@pytest.mark.full_ml +def test_predict_full(): + _separate() + + +@pytest.mark.all +def test_predict_all(): + _separate() From ccf796ea8c7d09a58ee4d343f55a231809736fb5 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Tue, 1 Apr 2025 20:12:22 +0200 Subject: [PATCH 03/11] add convtdf default params --- .../mixer_model/models.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/compiam/separation/music_source_separation/mixer_model/models.py b/compiam/separation/music_source_separation/mixer_model/models.py index a66404f2..6ad2989f 100644 --- a/compiam/separation/music_source_separation/mixer_model/models.py +++ b/compiam/separation/music_source_separation/mixer_model/models.py @@ -7,18 +7,18 @@ class ConvTDFNet(nn.Module): def __init__( self, - hop_length, - num_blocks, - dim_t, - n_fft, - dim_c, - dim_f, - g, - k, - l, - bn, - bias, - scale, + hop_length=558, + dim_t=256, + n_fft=6144, + dim_c=2, + dim_f=2048, + num_blocks=11, + g=32, + k=3, + l=3, + bn=4, + bias=False, + scale=2, ): super(ConvTDFNet, self).__init__() self.hop_length = hop_length From 84c487a47a5d3a6706e677ff19cc5269b0a69e1e Mon Sep 17 00:00:00 2001 From: genisplaja Date: Tue, 1 Apr 2025 20:12:35 +0200 Subject: [PATCH 04/11] add download model details --- compiam/data.py | 10 +++++----- .../singing_voice_extraction/convtdf_vocal_finetune.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/compiam/data.py b/compiam/data.py index 2f38fac2..608d7482 100644 --- a/compiam/data.py +++ b/compiam/data.py @@ -235,7 +235,7 @@ }, }, "separation:convtdf-vocal-finetune": { - "module_name": "compiam.separation.singing_voice_separation.convtdf-vocal-finetune", + "module_name": "compiam.separation.singing_voice_extraction.convtdf_vocal_finetune", "class_name": "ConvTDFVocalFineTune", "default_version": "v1", "kwargs": { @@ -243,12 +243,12 @@ "model_path": os.path.join( "models", "separation", - "convtdf-vocal-finetune", + "convtdf_vocal_finetune", "vocals", - "TODO", + "checkpoint_finetuned.pt", ), - "download_link": "TODO", - 
"download_checksum": "TODO", + "download_link": "https://zenodo.org/records/15121572/files/convtdf_vocal_finetune.zip?download=1", + "download_checksum": "170c7a25cb06911f2e4a9452ce943aed", }, }, }, diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index 0f6fc82a..1e0af483 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -88,7 +88,7 @@ def load_model(self, model_path): weights = torch.load(model_path, weights_only=True, map_location=self.device) except: weights = torch.load(model_path, map_location=self.device) - self.model.load_state_dict(weights) + self.model.load_state_dict(weights["model_state_dict"]) self.model_path = model_path self.trained = True From 4fe66352992a8a9bd5ae27be3076b72569aafcb5 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Tue, 1 Apr 2025 20:34:00 +0200 Subject: [PATCH 05/11] fix tests --- tests/separation/test_convtdf_finetune.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/separation/test_convtdf_finetune.py b/tests/separation/test_convtdf_finetune.py index 552aad2e..a93f3cf8 100644 --- a/tests/separation/test_convtdf_finetune.py +++ b/tests/separation/test_convtdf_finetune.py @@ -22,9 +22,7 @@ def _separate(): convtdf_vocal = compiam.load_model("separation:convtdf-vocal-finetune", data_home=TESTDIR) audio_in, sr = np.array(np.ones([1, 44100]), dtype=np.float32), 44100 separation = convtdf_vocal.separate(audio_in, input_sr=sr) - assert isinstance(separation, tuple) - assert isinstance(separation[0], np.ndarray) - assert isinstance(separation[1], np.ndarray) + assert isinstance(separation, np.ndarray) shutil.rmtree(os.path.join(TESTDIR, "models")) From bfc419159604f980e6186126a8c673c55f3de390 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Wed, 2 Apr 2025 18:10:06 +0200 Subject: [PATCH 06/11] fix variable names in audio loading --- .../mixer_model/__init__.py | 16 ++++++++-------- .../convtdf_vocal_finetune.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index c3fc2c59..bfeb287a 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -124,19 +124,19 @@ def separate( raise FileNotFoundError("Target audio not found.") audio, input_sr = torchaudio.load(input_data) elif isinstance(input_data, np.ndarray): - input_data = torch.from_numpy(input_data).to(torch.float32).to(self.device) + audio = torch.from_numpy(input_data).to(torch.float32).to(self.device) elif isinstance(input_data, torch.Tensor): - input_data = input_data.to(torch.float32).to(self.device) + audio = input_data.to(torch.float32).to(self.device) else: raise ValueError("Input must be path to audio signal or an audio array") - if len(input_data.shape) == 1: - input_data = input_data.unsqueeze(0) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) # Add mono channel if no audio channels - if len(input_data.shape) == 3: - if input_data.shape[0] != 1: + if len(audio.shape) == 3: + if audio.shape[0] != 1: raise ValueError("Batching is not supported. 
Please provide a single audio signal.") - input_data = input_data.squeeze(0) + audio = audio.squeeze(0) # Remove batch size 1 # resample audio if input_sr != self.sample_rate: @@ -146,7 +146,7 @@ def separate( ) audio = torchaudio.transforms.Resample( orig_freq=input_sr, new_freq=self.sample_rate - )(input_data) + )(audio) # downsampling to mono if audio.shape[0] == 2: diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index 1e0af483..528ac9f2 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -123,19 +123,19 @@ def separate( raise FileNotFoundError("Target audio not found.") audio, input_sr = torchaudio.load(input_data) elif isinstance(input_data, np.ndarray): - input_data = torch.from_numpy(input_data).to(torch.float32).to(self.device) + audio = torch.from_numpy(input_data).to(torch.float32).to(self.device) elif isinstance(input_data, torch.Tensor): - input_data = input_data.to(torch.float32).to(self.device) + audio = input_data.to(torch.float32).to(self.device) else: raise ValueError("Input must be path to audio signal or an audio array") - if len(input_data.shape) == 1: - input_data = input_data.unsqueeze(0) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) # Adding mono channel if no audio channels - if len(input_data.shape) == 3: - if input_data.shape[0] != 1: + if len(audio.shape) == 3: + if audio.shape[0] != 1: raise ValueError("Batching is not supported. Please provide a single audio signal.") - input_data = input_data.squeeze(0) + audio = audio.squeeze(0) # Removing batch dimension # resample audio if input_sr != self.sample_rate: @@ -145,7 +145,7 @@ def separate( ) audio = torchaudio.transforms.Resample( orig_freq=input_sr, new_freq=self.sample_rate - )(input_data) + )(audio) # downsampling to mono if audio.shape[0] == 2: From d4b514231c18c46fefada76b72206998bd389719 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 3 Apr 2025 20:24:10 +0200 Subject: [PATCH 07/11] overlap and add --- .../mixer_model/__init__.py | 44 ++++++++++++++----- .../convtdf_vocal_finetune.py | 44 ++++++++++++++----- 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index bfeb287a..e2de0748 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -69,6 +69,7 @@ def __init__( self.load_model(self.model_path) self.chunk_size = self.model.chunk_size + self.overlap = 0.25 def forward(self, x): """Forward pass of the mixer model""" @@ -156,27 +157,46 @@ def separate( and the model is trained on mono audio." 
) - # audio has shape B, 1, N + initial_length = audio.shape[-1] audio = audio.reshape(-1) - predictions = [] - pad_length = self.chunk_size - (audio.shape[-1] % self.chunk_size) + pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size audio = torch.nn.functional.pad(audio, (0, pad_length)) - for i in range(0, audio.shape[-1], self.chunk_size): - audio_chunk = audio[i : i + self.chunk_size].reshape( - 1, 1, -1 - ) # TODO Batching - predictions.append(self.forward(audio_chunk)) + chunk_size = audio.shape[-1] // ((audio.shape[-1] + self.chunk_size - 1) // self.chunk_size) + hop_size = int(chunk_size * (1 - self.overlap)) + num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 - result = torch.cat(predictions, dim=-1) - result = result[:, :, :-pad_length] + window = torch.hann_window(chunk_size) + out = torch.zeros((2, audio.shape[-1])) # (Channels=2, Time) + weight_sum = torch.zeros(audio.shape[-1]) # Weight accumulation for normalization + + # Process chunks + for i in range(num_chunks): + start = i * hop_size + end = start + chunk_size + + # Extract chunk (reshape for model input) + audio_chunk = audio[start:end].reshape(1, 1, -1) + + # Apply model separation (assumes 2-channel output) + separated_chunk = self.forward(audio_chunk).reshape(2, -1) # (2, chunk_size) + + # Apply windowing + separated_chunk *= window # Smooth transition + + # Overlap-Add to output + out[:, start:end] += separated_chunk + weight_sum[start:end] += window # Accumulate weights + + out /= weight_sum.unsqueeze(0).clamp(min=1e-8) # Avoid division by zero + out = out[..., :initial_length].unsqueeze(0) # (1, 2, N) vocal_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr - )(result[:, 0, :]) + )(out[:, 0, :]) violin_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr - )(result[:, 1, :]) + )(out[:, 1, :]) vocal_separation = vocal_separation.detach().cpu().numpy().reshape(-1) violin_separation = violin_separation.detach().cpu().numpy().reshape(-1) diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index 528ac9f2..d468d7da 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -68,6 +68,7 @@ def __init__( self.load_model(self.model_path) self.chunk_size = self.model.chunk_size + self.overlap = 0.25 def forward(self, x): """Forward pass of the mixer model""" @@ -155,25 +156,44 @@ def separate( and the model is trained on mono audio." 
) - # audio has shape B, 1, N + initial_length = audio.shape[-1] audio = audio.reshape(-1) - predictions = [] - pad_length = self.chunk_size - (audio.shape[-1] % self.chunk_size) + pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size audio = torch.nn.functional.pad(audio, (0, pad_length)) - for i in range(0, audio.shape[-1], self.chunk_size): - audio_chunk = audio[i : i + self.chunk_size].reshape( - 1, 1, -1 - ) # TODO Batching - predictions.append(self.forward(audio_chunk)) + chunk_size = audio.shape[-1] // ((audio.shape[-1] + self.chunk_size - 1) // self.chunk_size) + hop_size = int(chunk_size * (1 - self.overlap)) + num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 - result = torch.cat(predictions, dim=-1) - result = result[:, :, :-pad_length] + window = torch.hann_window(chunk_size) + out = torch.zeros(audio.shape[-1]) # (Time,) + weight_sum = torch.zeros(audio.shape[-1]) # Weight accumulation for normalization + + # Process chunks + for i in range(num_chunks): + start = i * hop_size + end = start + chunk_size + + # Extract chunk (reshape for model input) + audio_chunk = audio[start:end].reshape(1, 1, -1) + + # Apply model separation (now outputs 1-channel) + separated_chunk = self.forward(audio_chunk).reshape(-1) # (chunk_size,) + + # Apply windowing + separated_chunk *= window # Smooth transition + + # Overlap-Add to output + out[start:end] += separated_chunk + weight_sum[start:end] += window # Accumulate weights + + out /= weight_sum.clamp(min=1e-8) # Avoid division by zero + out = out[:initial_length].unsqueeze(0) # (1, N) vocal_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr - )(result) - + )(out) + return vocal_separation.detach().cpu().numpy().reshape(-1) def download_model(self, model_path=None, force_overwrite=False): From bfbc157060192ab554849deb3f5378e85b789745 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Wed, 23 Jul 2025 23:32:46 +0200 Subject: [PATCH 08/11] minor fixes, norm for separation models --- .../music_source_separation/mixer_model/__init__.py | 4 ++++ .../singing_voice_extraction/cold_diff_sep/__init__.py | 8 +++++--- .../singing_voice_extraction/convtdf_vocal_finetune.py | 6 +++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index e2de0748..0b889ba1 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -98,6 +98,7 @@ def separate( self, input_data, input_sr=44100, + normalize_input=True, gpu="-1", ): """Separate singing voice and violin from mixture. @@ -105,6 +106,7 @@ def separate( :param input_data: Audio signal to separate. :param input_sr: sampling rate of the input array of data (if any). This variable is only relevant if the input is an array of data instead of a filepath. + :param normalize_input: Normalize the input audio signal. :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :return: Singing voice and violin signals. """ @@ -157,6 +159,8 @@ def separate( and the model is trained on mono audio." 
) + if normalize_input: + audio = audio / audio.max() initial_length = audio.shape[-1] audio = audio.reshape(-1) pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py index 8cc92c06..6a7d9dd5 100644 --- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py +++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py @@ -159,9 +159,11 @@ def separate( if normalize_input: # Normalizing audio for better performance overall - mean = tf.reduce_mean(mixture, keepdims=True) - std = tf.math.reduce_std(mixture, keepdims=True) - mixture = (mixture - mean) / (1e-6 + std) + #mean = tf.reduce_mean(mixture, keepdims=True) + #std = tf.math.reduce_std(mixture, keepdims=True) + #mixture = (mixture - mean) / (1e-6 + std) + # For now, divide by maximum vale + mixture = mixture / mixture.max() output_voc = np.zeros(mixture.shape) hopsized_chunk = int((chunk_size * self.sample_rate) / 2) diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index d468d7da..71f277b5 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -11,7 +11,7 @@ class ConvTDFVocalFineTune(object): - """ConvTDF net fine-tuned to separate clean Carnatic vocals training with Saraga (which has bleeding).""" + """ConvTDF Net fine-tuned to separate clean Carnatic vocals training with Saraga (which has bleeding).""" def __init__( self, @@ -97,6 +97,7 @@ def separate( self, input_data, input_sr=44100, + normalize_input=True, gpu="-1", ): """Separate Carnatic singing voice from mixture. @@ -104,6 +105,7 @@ def separate( :param input_data: Audio signal/path to separate. :param input_sr: sampling rate of the input array of data (if any). This variable is only relevant if the input is an array of data instead of a filepath. + :param normalize_input: Normalize the input audio signal. :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :return: Singing voice and violin signals. """ @@ -156,6 +158,8 @@ def separate( and the model is trained on mono audio." ) + if normalize_input: + audio = audio / audio.max() initial_length = audio.shape[-1] audio = audio.reshape(-1) pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size From f14bbdeaa9f0cabd02fdfe7f0f7a69dc92473514 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Wed, 23 Jul 2025 23:39:35 +0200 Subject: [PATCH 09/11] ftanet norm --- compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py index e7115e29..14245793 100644 --- a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py @@ -270,6 +270,7 @@ def predict( hop_size=80, batch_size=5, out_step=None, + amplify_input=1.0, gpu="-1", ): """Extract melody from input_data. @@ -283,6 +284,7 @@ def predict( (defaulted to 5, increase if enough computational power, reduce if needed). 
:param out_step: particular time-step duration if needed at output + :param amplify_input: for low volume inputs, we've found that overlouding it may provide better voicing detection (e.g. x10, x50) :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :returns: a 2-D list with time-stamps and pitch values per timestamp. """ @@ -323,6 +325,10 @@ def predict( xlist = [] timestamps = [] + # Applying loudness scaling + audio = audio / audio.max() + audio = audio * amplify_input + audio_len = len(audio) batch_min = self.sample_rate * 60 * batch_size freqs = [] From c6a2e3864d3cc21fd756778992ef84d38544da6f Mon Sep 17 00:00:00 2001 From: genisplaja Date: Wed, 23 Jul 2025 23:43:09 +0200 Subject: [PATCH 10/11] black and typos --- compiam/__init__.py | 2 +- compiam/dunya/__init__.py | 12 +++---- compiam/io.py | 5 +-- .../melody/pattern/sancara_search/__init__.py | 2 +- .../ftanet_carnatic/__init__.py | 2 +- .../ftaresnet_carnatic/__init__.py | 4 ++- .../raga_recognition/deepsrgm/__init__.py | 6 ++-- .../mixer_model/__init__.py | 34 +++++++++++++------ .../cold_diff_sep/__init__.py | 18 ++++++---- .../convtdf_vocal_finetune.py | 28 ++++++++++----- .../dhrupad_bandish_segmentation/__init__.py | 4 +-- .../audio_processing.py | 1 - compiam/utils/__init__.py | 4 ++- compiam/utils/download.py | 12 +++---- compiam/version.py | 2 +- tests/melody/test_deepsrgm.py | 4 ++- tests/melody/test_essentia_extractors.py | 8 +++-- tests/separation/test_convtdf_finetune.py | 4 ++- 18 files changed, 94 insertions(+), 58 deletions(-) diff --git a/compiam/__init__.py b/compiam/__init__.py index 5a795d57..796e129e 100644 --- a/compiam/__init__.py +++ b/compiam/__init__.py @@ -71,7 +71,7 @@ def load_dataset(dataset_name, data_home=None, version="default"): """ if dataset_name not in datasets_list: raise ValueError("Invalid dataset {}".format(dataset_name)) - dataloader = mirdata.initialize( + dataloader = mirdata.initialize( dataset_name=dataset_name, data_home=data_home, version=version ) dataloader.download(["index"]) # Download index file diff --git a/compiam/dunya/__init__.py b/compiam/dunya/__init__.py index 140a39ed..4301a560 100644 --- a/compiam/dunya/__init__.py +++ b/compiam/dunya/__init__.py @@ -25,9 +25,7 @@ def __init__(self, tradition, token): dunya.set_token(self.token) if tradition not in ["carnatic", "hindustani"]: - raise ValueError( - "Please choose a valid tradition: carnatic or hindustani" - ) + raise ValueError("Please choose a valid tradition: carnatic or hindustani") self.tradition = carnatic if tradition == "carnatic" else hindustani # Functions from the compmusic API are added as a method in the Corpora class @@ -36,10 +34,12 @@ def __init__(self, tradition, token): if callable(func): setattr(self, name, func) - logger.warning(""" + logger.warning( + """ Note that a part of the collection is under restricted access. To access the full collection please request permission at https://dunya.compmusic.upf.edu/user/profile/ - """) + """ + ) def get_collection(self, recording_detail=False): """Get the documents (recordings) in a collection. @@ -54,7 +54,7 @@ def get_collection(self, recording_detail=False): + "Please note that it might take a few moments..." ) return self.tradition.get_recordings(recording_detail) - + @staticmethod def list_available_types(recording_id): """Get the available source filetypes for a Musicbrainz recording. 
diff --git a/compiam/io.py b/compiam/io.py index 7cef4e4c..51fd384d 100644 --- a/compiam/io.py +++ b/compiam/io.py @@ -94,11 +94,11 @@ def write_scalar_txt(data, output_path): def resolve_dottedname(dotted_name): """Resolve a dotted name to an actual object, similar to zope.dottedname.resolve - + :param dotted_name: a dotted name :returns: the object the dotted name refers to """ - module_name, _, attribute_name = dotted_name.rpartition('.') + module_name, _, attribute_name = dotted_name.rpartition(".") if not module_name: raise ImportError(f"Invalid dotted name: '{dotted_name}'") module = importlib.import_module(module_name) @@ -111,6 +111,7 @@ def load_yaml(path): :param path: input file :returns: loaded yaml information """ + def constructor_dottedname(loader, node): value = loader.construct_scalar(node) return resolve_dottedname(value) diff --git a/compiam/melody/pattern/sancara_search/__init__.py b/compiam/melody/pattern/sancara_search/__init__.py index b1537920..ba197638 100644 --- a/compiam/melody/pattern/sancara_search/__init__.py +++ b/compiam/melody/pattern/sancara_search/__init__.py @@ -185,7 +185,7 @@ def load_model(self, model_path, conf_path, spec_path): try: self.model.load_state_dict( torch.load(model_path, weights_only=True, map_location=self.device), - strict=False + strict=False, ) except: self.model.load_state_dict( diff --git a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py index 14245793..73b2030a 100644 --- a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py @@ -325,7 +325,7 @@ def predict( xlist = [] timestamps = [] - # Applying loudness scaling + # Applying loudness scaling audio = audio / audio.max() audio = audio * amplify_input diff --git a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py index 9a5ccac7..9852393b 100644 --- a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py @@ -83,7 +83,9 @@ def load_model(self, model_path): ## Ensuring we can load the model for different torch versions ## -- (weights only might be deprecated) try: - self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device)) + self.model.load_state_dict( + torch.load(model_path, weights_only=True, map_location=self.device) + ) except: self.model.load_state_dict(torch.load(model_path, map_location=self.device)) self.model_path = model_path diff --git a/compiam/melody/raga_recognition/deepsrgm/__init__.py b/compiam/melody/raga_recognition/deepsrgm/__init__.py index 1d79ea7c..9a100e85 100644 --- a/compiam/melody/raga_recognition/deepsrgm/__init__.py +++ b/compiam/melody/raga_recognition/deepsrgm/__init__.py @@ -124,7 +124,9 @@ def load_model(self, model_path, rnn="lstm"): self.model_path = model_path try: - weights = torch.load(model_path, weights_only=True, map_location=self.device) + weights = torch.load( + model_path, weights_only=True, map_location=self.device + ) except: weights = torch.load(model_path, map_location=self.device) new_weights = weights.copy() @@ -168,7 +170,7 @@ def load_raga_dataset(self, data_home=None, download=False): "compmusic_raga", data_home=data_home, version="default" ) if download: - self.dataset.download() # Downloads index and features + self.dataset.download() # Downloads index and features logger.warning( f""" The 
features are downloaded, but the audio of this dataset is private. diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index 0b889ba1..3faa9cef 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -87,7 +87,9 @@ def load_model(self, model_path): ## Ensuring we can load the model for different torch versions ## -- (weights only might be deprecated) try: - weights = torch.load(model_path, weights_only=True, map_location=self.device) + weights = torch.load( + model_path, weights_only=True, map_location=self.device + ) except: weights = torch.load(model_path, map_location=self.device) self.model.load_state_dict(weights) @@ -132,15 +134,17 @@ def separate( audio = input_data.to(torch.float32).to(self.device) else: raise ValueError("Input must be path to audio signal or an audio array") - + if len(audio.shape) == 1: - audio = audio.unsqueeze(0) # Add mono channel if no audio channels + audio = audio.unsqueeze(0) # Add mono channel if no audio channels if len(audio.shape) == 3: if audio.shape[0] != 1: - raise ValueError("Batching is not supported. Please provide a single audio signal.") + raise ValueError( + "Batching is not supported. Please provide a single audio signal." + ) audio = audio.squeeze(0) # Remove batch size 1 - + # resample audio if input_sr != self.sample_rate: logger.warning( @@ -151,7 +155,7 @@ def separate( orig_freq=input_sr, new_freq=self.sample_rate )(audio) - # downsampling to mono + # downsampling to mono if audio.shape[0] == 2: audio = audio.mean(dim=0, keepdim=True) logger.info( @@ -163,16 +167,22 @@ def separate( audio = audio / audio.max() initial_length = audio.shape[-1] audio = audio.reshape(-1) - pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size + pad_length = ( + self.chunk_size - (audio.shape[-1] % self.chunk_size) + ) % self.chunk_size audio = torch.nn.functional.pad(audio, (0, pad_length)) - chunk_size = audio.shape[-1] // ((audio.shape[-1] + self.chunk_size - 1) // self.chunk_size) + chunk_size = audio.shape[-1] // ( + (audio.shape[-1] + self.chunk_size - 1) // self.chunk_size + ) hop_size = int(chunk_size * (1 - self.overlap)) num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 window = torch.hann_window(chunk_size) out = torch.zeros((2, audio.shape[-1])) # (Channels=2, Time) - weight_sum = torch.zeros(audio.shape[-1]) # Weight accumulation for normalization + weight_sum = torch.zeros( + audio.shape[-1] + ) # Weight accumulation for normalization # Process chunks for i in range(num_chunks): @@ -183,7 +193,9 @@ def separate( audio_chunk = audio[start:end].reshape(1, 1, -1) # Apply model separation (assumes 2-channel output) - separated_chunk = self.forward(audio_chunk).reshape(2, -1) # (2, chunk_size) + separated_chunk = self.forward(audio_chunk).reshape( + 2, -1 + ) # (2, chunk_size) # Apply windowing separated_chunk *= window # Smooth transition @@ -201,7 +213,7 @@ def separate( violin_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr )(out[:, 1, :]) - + vocal_separation = vocal_separation.detach().cpu().numpy().reshape(-1) violin_separation = violin_separation.detach().cpu().numpy().reshape(-1) return (vocal_separation, violin_separation) diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py 
b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py index 6a7d9dd5..ec93270b 100644 --- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py +++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py @@ -139,10 +139,14 @@ def separate( if len(mixture.shape) == 3: if mixture.shape[0] != 1: - raise ValueError("Batching is not supported. Please provide a single audio signal.") + raise ValueError( + "Batching is not supported. Please provide a single audio signal." + ) else: mixture = mixture.squeeze(0) - mixture = tf.reduce_mean(mixture, axis=0, keepdims=False) # Removing dimension + mixture = tf.reduce_mean( + mixture, axis=0, keepdims=False + ) # Removing dimension logger.info( f"Downsampling to mono... your audio is stereo, \ and the model is trained on mono audio." @@ -159,12 +163,12 @@ def separate( if normalize_input: # Normalizing audio for better performance overall - #mean = tf.reduce_mean(mixture, keepdims=True) - #std = tf.math.reduce_std(mixture, keepdims=True) - #mixture = (mixture - mean) / (1e-6 + std) - # For now, divide by maximum vale + # mean = tf.reduce_mean(mixture, keepdims=True) + # std = tf.math.reduce_std(mixture, keepdims=True) + # mixture = (mixture - mean) / (1e-6 + std) + # For now, divide by maximum value mixture = mixture / mixture.max() - + output_voc = np.zeros(mixture.shape) hopsized_chunk = int((chunk_size * self.sample_rate) / 2) runs = math.floor(mixture.shape[0] / hopsized_chunk) diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index 71f277b5..ebd7def3 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -1,6 +1,6 @@ import os -import numpy as np +import numpy as np from compiam.exceptions import ModelNotTrainedError from compiam.utils import get_logger, WORKDIR @@ -86,7 +86,9 @@ def load_model(self, model_path): ## Ensuring we can load the model for different torch versions ## -- (weights only might be deprecated) try: - weights = torch.load(model_path, weights_only=True, map_location=self.device) + weights = torch.load( + model_path, weights_only=True, map_location=self.device + ) except: weights = torch.load(model_path, map_location=self.device) self.model.load_state_dict(weights["model_state_dict"]) @@ -131,15 +133,17 @@ def separate( audio = input_data.to(torch.float32).to(self.device) else: raise ValueError("Input must be path to audio signal or an audio array") - + if len(audio.shape) == 1: audio = audio.unsqueeze(0) # Adding mono channel if no audio channels if len(audio.shape) == 3: if audio.shape[0] != 1: - raise ValueError("Batching is not supported. Please provide a single audio signal.") + raise ValueError( + "Batching is not supported. Please provide a single audio signal." 
+ ) audio = audio.squeeze(0) # Removing batch dimension - + # resample audio if input_sr != self.sample_rate: logger.warning( @@ -150,7 +154,7 @@ def separate( orig_freq=input_sr, new_freq=self.sample_rate )(audio) - # downsampling to mono + # downsampling to mono if audio.shape[0] == 2: audio = audio.mean(dim=0, keepdim=True) logger.info( @@ -162,16 +166,22 @@ def separate( audio = audio / audio.max() initial_length = audio.shape[-1] audio = audio.reshape(-1) - pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size + pad_length = ( + self.chunk_size - (audio.shape[-1] % self.chunk_size) + ) % self.chunk_size audio = torch.nn.functional.pad(audio, (0, pad_length)) - chunk_size = audio.shape[-1] // ((audio.shape[-1] + self.chunk_size - 1) // self.chunk_size) + chunk_size = audio.shape[-1] // ( + (audio.shape[-1] + self.chunk_size - 1) // self.chunk_size + ) hop_size = int(chunk_size * (1 - self.overlap)) num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 window = torch.hann_window(chunk_size) out = torch.zeros(audio.shape[-1]) # (Time,) - weight_sum = torch.zeros(audio.shape[-1]) # Weight accumulation for normalization + weight_sum = torch.zeros( + audio.shape[-1] + ) # Weight accumulation for normalization # Process chunks for i in range(num_chunks): diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py index 81cd7d9c..6a101c8f 100644 --- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py +++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py @@ -183,9 +183,7 @@ def load_model(self, model_path): torch.load(model_path, weights_only=True, map_location=self.device) ) except: - self.model.load_state_dict( - torch.load(model_path, map_location=self.device) - ) + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) self.model.eval() self.loaded_model_path = model_path self.trained = True diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py index 07c68bb5..ef99b1c8 100644 --- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py +++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py @@ -10,7 +10,6 @@ logger = get_logger(__name__) - def split_audios(save_dir=None, annotations_path=None, audios_path=None): """Split audio of Dhrupad dataset diff --git a/compiam/utils/__init__.py b/compiam/utils/__init__.py index ef971b90..436cd314 100644 --- a/compiam/utils/__init__.py +++ b/compiam/utils/__init__.py @@ -180,7 +180,9 @@ def stereo_to_mono(audio): if audio.shape[0] > audio.shape[1]: audio = audio.T if audio.shape[0] > 2: - raise ValueError("Expected mono or stereo audio, got multi-channel audio") + raise ValueError( + "Expected mono or stereo audio, got multi-channel audio" + ) # If stereo, average the channels if audio.shape[0] == 2: audio = np.mean(audio, axis=0) diff --git a/compiam/utils/download.py b/compiam/utils/download.py index 3cc2ee60..9c4a3046 100644 --- a/compiam/utils/download.py +++ b/compiam/utils/download.py @@ -75,19 +75,17 @@ def download_remote_model( def download_zip(url, root_path): """Download a ZIP file from a URL.""" # Get the file name from the URL - local_filename = os.path.join( - root_path, - url.split("/")[-1].split("?")[0] - ) + local_filename = os.path.join(root_path, 
url.split("/")[-1].split("?")[0]) # Stream the download and save the file with requests.get(url, stream=True) as r: r.raise_for_status() total_size = int(r.headers.get("content-length", 0)) chunk_size = 8192 - with open(local_filename, "wb") as f, tqdm( - total=total_size, unit="iB", unit_scale=True - ) as pbar: + with ( + open(local_filename, "wb") as f, + tqdm(total=total_size, unit="iB", unit_scale=True) as pbar, + ): for chunk in r.iter_content(chunk_size=chunk_size): if chunk: # filter out keep-alive new chunks f.write(chunk) diff --git a/compiam/version.py b/compiam/version.py index 8afe466f..a1c2a810 100644 --- a/compiam/version.py +++ b/compiam/version.py @@ -1,4 +1,4 @@ """Version info""" short_version = "0.4" -version = "0.4.1" \ No newline at end of file +version = "0.4.1" diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py index 51f9a523..98c57dbb 100644 --- a/tests/melody/test_deepsrgm.py +++ b/tests/melody/test_deepsrgm.py @@ -44,7 +44,9 @@ def _get_features(): feat = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + audio = librosa.load( + os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100 + )[0] audio = np.tile(audio, 9) feat_1 = deepsrgm.get_features(audio) feat_2 = deepsrgm.get_features(np.stack([audio, audio])) diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py index 622c969a..e8c634cb 100644 --- a/tests/melody/test_essentia_extractors.py +++ b/tests/melody/test_essentia_extractors.py @@ -71,10 +71,14 @@ def _predict_normalized_pitch(): tonic = tonic_multipitch.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + audio = librosa.load( + os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100 + )[0] tonic_2 = tonic_multipitch.extract(audio) # Testing input array tonic_3 = tonic_multipitch.extract(np.stack([audio, audio])) # Testing input array - tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T) # Testing input array + tonic_4 = tonic_multipitch.extract( + np.stack([audio, audio]).T + ) # Testing input array assert isinstance(tonic, float) assert tonic == 157.64892578125 diff --git a/tests/separation/test_convtdf_finetune.py b/tests/separation/test_convtdf_finetune.py index a93f3cf8..dabcc157 100644 --- a/tests/separation/test_convtdf_finetune.py +++ b/tests/separation/test_convtdf_finetune.py @@ -19,7 +19,9 @@ def _separate(): with pytest.raises(FileNotFoundError): convtdf_vocal.separate(os.path.join(TESTDIR, "resources", "melody", "hola.wav")) - convtdf_vocal = compiam.load_model("separation:convtdf-vocal-finetune", data_home=TESTDIR) + convtdf_vocal = compiam.load_model( + "separation:convtdf-vocal-finetune", data_home=TESTDIR + ) audio_in, sr = np.array(np.ones([1, 44100]), dtype=np.float32), 44100 separation = convtdf_vocal.separate(audio_in, input_sr=sr) assert isinstance(separation, np.ndarray) From 38ce53e0603b57c462e6e0a0ee2888dd2833c201 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 24 Jul 2025 00:42:03 +0200 Subject: [PATCH 11/11] correct TF max function --- .../singing_voice_extraction/cold_diff_sep/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py 
b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py index ec93270b..d76c1327 100644 --- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py +++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py @@ -167,7 +167,8 @@ def separate( # std = tf.math.reduce_std(mixture, keepdims=True) # mixture = (mixture - mean) / (1e-6 + std) # For now, divide by maximum value - mixture = mixture / mixture.max() + mixture = mixture / tf.reduce_max(mixture) + output_voc = np.zeros(mixture.shape) hopsized_chunk = int((chunk_size * self.sample_rate) / 2)
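For reference, a minimal usage sketch of the wrapper added in this patch series, following the API introduced above. The model key and the `separate()` signature come from `compiam/data.py` and the new tests; the audio path, the `librosa`/`soundfile` calls, and the output filename are illustrative assumptions, not part of the patch.

```python
import librosa
import soundfile as sf

import compiam

# Load the fine-tuned ConvTDF vocal model registered in compiam/data.py.
# Weights are downloaded on first use; requires torch support: pip install 'compiam[torch]'
model = compiam.load_model("separation:convtdf-vocal-finetune")

# "mixture.wav" is a placeholder path; separate() also accepts a file path directly.
audio, sr = librosa.load("mixture.wav", sr=44100, mono=True)
vocals = model.separate(audio, input_sr=sr)  # 1-D numpy array, resampled back to input_sr

sf.write("vocals_estimate.wav", vocals, sr)
```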