diff --git a/ACKNOWLEDGEMENTS b/ACKNOWLEDGEMENTS index f7588570..a238ef95 100644 --- a/ACKNOWLEDGEMENTS +++ b/ACKNOWLEDGEMENTS @@ -18,6 +18,7 @@ Jom Kuriakose Shrey Dutta Shubham Lohiya Swarada Bharadwaj +Serafin Schweinitz Project Musical AI - PID2019-111403GB-I00/AEI/10.13039/501100011033 funded by the Spanish Ministerio de Ciencia, Innovación y Universidades (MCIU) and the Agencia Estatal de diff --git a/compiam/__init__.py b/compiam/__init__.py index 5a795d57..796e129e 100644 --- a/compiam/__init__.py +++ b/compiam/__init__.py @@ -71,7 +71,7 @@ def load_dataset(dataset_name, data_home=None, version="default"): """ if dataset_name not in datasets_list: raise ValueError("Invalid dataset {}".format(dataset_name)) - dataloader = mirdata.initialize( + dataloader = mirdata.initialize( dataset_name=dataset_name, data_home=data_home, version=version ) dataloader.download(["index"]) # Download index file diff --git a/compiam/data.py b/compiam/data.py index fc1c2a4b..608d7482 100644 --- a/compiam/data.py +++ b/compiam/data.py @@ -234,6 +234,24 @@ }, }, }, + "separation:convtdf-vocal-finetune": { + "module_name": "compiam.separation.singing_voice_extraction.convtdf_vocal_finetune", + "class_name": "ConvTDFVocalFineTune", + "default_version": "v1", + "kwargs": { + "v1": { + "model_path": os.path.join( + "models", + "separation", + "convtdf_vocal_finetune", + "vocals", + "checkpoint_finetuned.pt", + ), + "download_link": "https://zenodo.org/records/15121572/files/convtdf_vocal_finetune.zip?download=1", + "download_checksum": "170c7a25cb06911f2e4a9452ce943aed", + }, + }, + }, } diff --git a/compiam/dunya/__init__.py b/compiam/dunya/__init__.py index 140a39ed..4301a560 100644 --- a/compiam/dunya/__init__.py +++ b/compiam/dunya/__init__.py @@ -25,9 +25,7 @@ def __init__(self, tradition, token): dunya.set_token(self.token) if tradition not in ["carnatic", "hindustani"]: - raise ValueError( - "Please choose a valid tradition: carnatic or hindustani" - ) + raise ValueError("Please choose a valid tradition: carnatic or hindustani") self.tradition = carnatic if tradition == "carnatic" else hindustani # Functions from the compmusic API are added as a method in the Corpora class @@ -36,10 +34,12 @@ def __init__(self, tradition, token): if callable(func): setattr(self, name, func) - logger.warning(""" + logger.warning( + """ Note that a part of the collection is under restricted access. To access the full collection please request permission at https://dunya.compmusic.upf.edu/user/profile/ - """) + """ + ) def get_collection(self, recording_detail=False): """Get the documents (recordings) in a collection. @@ -54,7 +54,7 @@ def get_collection(self, recording_detail=False): + "Please note that it might take a few moments..." ) return self.tradition.get_recordings(recording_detail) - + @staticmethod def list_available_types(recording_id): """Get the available source filetypes for a Musicbrainz recording. 
diff --git a/compiam/io.py b/compiam/io.py index 7cef4e4c..51fd384d 100644 --- a/compiam/io.py +++ b/compiam/io.py @@ -94,11 +94,11 @@ def write_scalar_txt(data, output_path): def resolve_dottedname(dotted_name): """Resolve a dotted name to an actual object, similar to zope.dottedname.resolve - + :param dotted_name: a dotted name :returns: the object the dotted name refers to """ - module_name, _, attribute_name = dotted_name.rpartition('.') + module_name, _, attribute_name = dotted_name.rpartition(".") if not module_name: raise ImportError(f"Invalid dotted name: '{dotted_name}'") module = importlib.import_module(module_name) @@ -111,6 +111,7 @@ def load_yaml(path): :param path: input file :returns: loaded yaml information """ + def constructor_dottedname(loader, node): value = loader.construct_scalar(node) return resolve_dottedname(value) diff --git a/compiam/melody/pattern/sancara_search/__init__.py b/compiam/melody/pattern/sancara_search/__init__.py index b1537920..ba197638 100644 --- a/compiam/melody/pattern/sancara_search/__init__.py +++ b/compiam/melody/pattern/sancara_search/__init__.py @@ -185,7 +185,7 @@ def load_model(self, model_path, conf_path, spec_path): try: self.model.load_state_dict( torch.load(model_path, weights_only=True, map_location=self.device), - strict=False + strict=False, ) except: self.model.load_state_dict( diff --git a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py index e7115e29..73b2030a 100644 --- a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py @@ -270,6 +270,7 @@ def predict( hop_size=80, batch_size=5, out_step=None, + amplify_input=1.0, gpu="-1", ): """Extract melody from input_data. @@ -283,6 +284,7 @@ def predict( (defaulted to 5, increase if enough computational power, reduce if needed). :param out_step: particular time-step duration if needed at output + :param amplify_input: for low-volume inputs, we have found that amplifying the signal (e.g. x10, x50) may improve voicing detection :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :returns: a 2-D list with time-stamps and pitch values per timestamp. 
""" @@ -323,6 +325,10 @@ def predict( xlist = [] timestamps = [] + # Applying loudness scaling + audio = audio / audio.max() + audio = audio * amplify_input + audio_len = len(audio) batch_min = self.sample_rate * 60 * batch_size freqs = [] diff --git a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py index 9a5ccac7..9852393b 100644 --- a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py @@ -83,7 +83,9 @@ def load_model(self, model_path): ## Ensuring we can load the model for different torch versions ## -- (weights only might be deprecated) try: - self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device)) + self.model.load_state_dict( + torch.load(model_path, weights_only=True, map_location=self.device) + ) except: self.model.load_state_dict(torch.load(model_path, map_location=self.device)) self.model_path = model_path diff --git a/compiam/melody/raga_recognition/deepsrgm/__init__.py b/compiam/melody/raga_recognition/deepsrgm/__init__.py index 1d79ea7c..9a100e85 100644 --- a/compiam/melody/raga_recognition/deepsrgm/__init__.py +++ b/compiam/melody/raga_recognition/deepsrgm/__init__.py @@ -124,7 +124,9 @@ def load_model(self, model_path, rnn="lstm"): self.model_path = model_path try: - weights = torch.load(model_path, weights_only=True, map_location=self.device) + weights = torch.load( + model_path, weights_only=True, map_location=self.device + ) except: weights = torch.load(model_path, map_location=self.device) new_weights = weights.copy() @@ -168,7 +170,7 @@ def load_raga_dataset(self, data_home=None, download=False): "compmusic_raga", data_home=data_home, version="default" ) if download: - self.dataset.download() # Downloads index and features + self.dataset.download() # Downloads index and features logger.warning( f""" The features are downloaded, but the audio of this dataset is private. diff --git a/compiam/separation/README.md b/compiam/separation/README.md index 2fc9abfe..a227f6e9 100644 --- a/compiam/separation/README.md +++ b/compiam/separation/README.md @@ -3,9 +3,12 @@ | **Tool** | **Task** | **Paper** | |---------------------------|----------------------------------|-----------| | ColdDiffSep | Singing voice extraction | [1] | -| MDXNet w/ mixer model | Music source separation | [2] | +| ConvTDF Vocal Fine-tuned | Singing voice extraction | [2] | +| MDXNet w/ mixer model | Music source separation | [3] | [1] G. Plaja-Roglans, M. Miron, A. Shankar, and X. Serra, "Carnatic Singing Voice Separation using Cold Diffusion on Training Data with Bleeding", in International Society for Music Information Retrieval Conference (ISMIR 23), 2023. -[2] Work under review. \ No newline at end of file +[2] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Snalysis and Generative Applications (WIMAGA) in ICASSP, 2025. + +[3] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Snalysis and Generative Applications (WIMAGA) in ICASSP, 2025. 
\ No newline at end of file diff --git a/compiam/separation/__init__.py b/compiam/separation/__init__.py index 66686471..512384f1 100644 --- a/compiam/separation/__init__.py +++ b/compiam/separation/__init__.py @@ -12,6 +12,7 @@ ### IMPORT HERE THE CONSIDERED TASKS from compiam.separation import singing_voice_extraction +from compiam.separation import music_source_separation # Show user the available tasks diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index c3fc2c59..3faa9cef 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -69,6 +69,7 @@ def __init__( self.load_model(self.model_path) self.chunk_size = self.model.chunk_size + self.overlap = 0.25 def forward(self, x): """Forward pass of the mixer model""" @@ -86,7 +87,9 @@ def load_model(self, model_path): ## Ensuring we can load the model for different torch versions ## -- (weights only might be deprecated) try: - weights = torch.load(model_path, weights_only=True, map_location=self.device) + weights = torch.load( + model_path, weights_only=True, map_location=self.device + ) except: weights = torch.load(model_path, map_location=self.device) self.model.load_state_dict(weights) @@ -97,6 +100,7 @@ def separate( self, input_data, input_sr=44100, + normalize_input=True, gpu="-1", ): """Separate singing voice and violin from mixture. @@ -104,6 +108,7 @@ def separate( :param input_data: Audio signal to separate. :param input_sr: sampling rate of the input array of data (if any). This variable is only relevant if the input is an array of data instead of a filepath. + :param normalize_input: Normalize the input audio signal. :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :return: Singing voice and violin signals. """ @@ -124,20 +129,22 @@ def separate( raise FileNotFoundError("Target audio not found.") audio, input_sr = torchaudio.load(input_data) elif isinstance(input_data, np.ndarray): - input_data = torch.from_numpy(input_data).to(torch.float32).to(self.device) + audio = torch.from_numpy(input_data).to(torch.float32).to(self.device) elif isinstance(input_data, torch.Tensor): - input_data = input_data.to(torch.float32).to(self.device) + audio = input_data.to(torch.float32).to(self.device) else: raise ValueError("Input must be path to audio signal or an audio array") - - if len(input_data.shape) == 1: - input_data = input_data.unsqueeze(0) - - if len(input_data.shape) == 3: - if input_data.shape[0] != 1: - raise ValueError("Batching is not supported. Please provide a single audio signal.") - input_data = input_data.squeeze(0) - + + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) # Add mono channel if no audio channels + + if len(audio.shape) == 3: + if audio.shape[0] != 1: + raise ValueError( + "Batching is not supported. Please provide a single audio signal." + ) + audio = audio.squeeze(0) # Remove batch size 1 + # resample audio if input_sr != self.sample_rate: logger.warning( @@ -146,9 +153,9 @@ def separate( ) audio = torchaudio.transforms.Resample( orig_freq=input_sr, new_freq=self.sample_rate - )(input_data) + )(audio) - # downsampling to mono + # downsampling to mono if audio.shape[0] == 2: audio = audio.mean(dim=0, keepdim=True) logger.info( @@ -156,28 +163,57 @@ def separate( and the model is trained on mono audio." 
) - # audio has shape B, 1, N + if normalize_input: + audio = audio / audio.max() + initial_length = audio.shape[-1] audio = audio.reshape(-1) - predictions = [] - pad_length = self.chunk_size - (audio.shape[-1] % self.chunk_size) + pad_length = ( + self.chunk_size - (audio.shape[-1] % self.chunk_size) + ) % self.chunk_size audio = torch.nn.functional.pad(audio, (0, pad_length)) - for i in range(0, audio.shape[-1], self.chunk_size): - audio_chunk = audio[i : i + self.chunk_size].reshape( - 1, 1, -1 - ) # TODO Batching - predictions.append(self.forward(audio_chunk)) + chunk_size = audio.shape[-1] // ( + (audio.shape[-1] + self.chunk_size - 1) // self.chunk_size + ) + hop_size = int(chunk_size * (1 - self.overlap)) + num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 + + window = torch.hann_window(chunk_size) + out = torch.zeros((2, audio.shape[-1])) # (Channels=2, Time) + weight_sum = torch.zeros( + audio.shape[-1] + ) # Weight accumulation for normalization - result = torch.cat(predictions, dim=-1) - result = result[:, :, :-pad_length] + # Process chunks + for i in range(num_chunks): + start = i * hop_size + end = start + chunk_size + + # Extract chunk (reshape for model input) + audio_chunk = audio[start:end].reshape(1, 1, -1) + + # Apply model separation (assumes 2-channel output) + separated_chunk = self.forward(audio_chunk).reshape( + 2, -1 + ) # (2, chunk_size) + + # Apply windowing + separated_chunk *= window # Smooth transition + + # Overlap-Add to output + out[:, start:end] += separated_chunk + weight_sum[start:end] += window # Accumulate weights + + out /= weight_sum.unsqueeze(0).clamp(min=1e-8) # Avoid division by zero + out = out[..., :initial_length].unsqueeze(0) # (1, 2, N) vocal_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr - )(result[:, 0, :]) + )(out[:, 0, :]) violin_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr - )(result[:, 1, :]) - + )(out[:, 1, :]) + vocal_separation = vocal_separation.detach().cpu().numpy().reshape(-1) violin_separation = violin_separation.detach().cpu().numpy().reshape(-1) return (vocal_separation, violin_separation) diff --git a/compiam/separation/music_source_separation/mixer_model/models.py b/compiam/separation/music_source_separation/mixer_model/models.py index a66404f2..6ad2989f 100644 --- a/compiam/separation/music_source_separation/mixer_model/models.py +++ b/compiam/separation/music_source_separation/mixer_model/models.py @@ -7,18 +7,18 @@ class ConvTDFNet(nn.Module): def __init__( self, - hop_length, - num_blocks, - dim_t, - n_fft, - dim_c, - dim_f, - g, - k, - l, - bn, - bias, - scale, + hop_length=558, + dim_t=256, + n_fft=6144, + dim_c=2, + dim_f=2048, + num_blocks=11, + g=32, + k=3, + l=3, + bn=4, + bias=False, + scale=2, ): super(ConvTDFNet, self).__init__() self.hop_length = hop_length diff --git a/compiam/separation/singing_voice_extraction/__init__.py b/compiam/separation/singing_voice_extraction/__init__.py index 9f72faee..f0ee9c9d 100644 --- a/compiam/separation/singing_voice_extraction/__init__.py +++ b/compiam/separation/singing_voice_extraction/__init__.py @@ -7,6 +7,9 @@ from compiam.separation.singing_voice_extraction.cold_diff_sep import ( ColdDiffSep, ) +from compiam.separation.singing_voice_extraction.convtdf_vocal_finetune import ( + ConvTDFVocalFineTune, +) # Show user the available tools diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py 
b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py index 8cc92c06..d76c1327 100644 --- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py +++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py @@ -139,10 +139,14 @@ def separate( if len(mixture.shape) == 3: if mixture.shape[0] != 1: - raise ValueError("Batching is not supported. Please provide a single audio signal.") + raise ValueError( + "Batching is not supported. Please provide a single audio signal." + ) else: mixture = mixture.squeeze(0) - mixture = tf.reduce_mean(mixture, axis=0, keepdims=False) # Removing dimension + mixture = tf.reduce_mean( + mixture, axis=0, keepdims=False + ) # Removing dimension logger.info( f"Downsampling to mono... your audio is stereo, \ and the model is trained on mono audio." ) @@ -159,10 +163,13 @@ def separate( if normalize_input: # Normalizing audio for better performance overall - mean = tf.reduce_mean(mixture, keepdims=True) - std = tf.math.reduce_std(mixture, keepdims=True) - mixture = (mixture - mean) / (1e-6 + std) - + # mean = tf.reduce_mean(mixture, keepdims=True) + # std = tf.math.reduce_std(mixture, keepdims=True) + # mixture = (mixture - mean) / (1e-6 + std) + # For now, divide by maximum value + mixture = mixture / tf.reduce_max(mixture) + + output_voc = np.zeros(mixture.shape) hopsized_chunk = int((chunk_size * self.sample_rate) / 2) runs = math.floor(mixture.shape[0] / hopsized_chunk) diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py new file mode 100644 index 00000000..ebd7def3 --- /dev/null +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -0,0 +1,246 @@ +import os + +import numpy as np + +from compiam.exceptions import ModelNotTrainedError +from compiam.utils import get_logger, WORKDIR +from compiam.utils.download import download_remote_model + + +logger = get_logger(__name__) + + +class ConvTDFVocalFineTune(object): + """ConvTDF Net fine-tuned to separate clean Carnatic vocals, trained on Saraga (which has bleeding in its stems).""" + + def __init__( + self, + model_path=None, + download_link=None, + download_checksum=None, + sample_rate=24000, + gpu="-1", + ): + """Leakage-aware singing voice separation init method. + + :param model_path: path to the file with the model weights. + :param download_link: link to the remote pre-trained model. + :param download_checksum: checksum of the model file. + :param sample_rate: sample rate to which the input audio is resampled before separation. + :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. + """ + ### IMPORTING OPTIONAL DEPENDENCIES + try: + global torch + import torch + + global nn + import torch.nn as nn + + global torchaudio + import torchaudio + + global ConvTDFNet + from compiam.separation.music_source_separation.mixer_model.models import ( + ConvTDFNet, + ) + + except: + raise ImportError( + "In order to use this tool you need to have torch and torchaudio installed. 
" + "Install compIAM with torch support: pip install 'compiam[torch]'" + ) + ### + + ## Setting up GPU if specified + self.gpu = gpu + self.device = None + self.select_gpu(gpu) + + self.model = self._build_model() + self.sample_rate = sample_rate + self.trained = False + + self.model_path = model_path + self.download_link = download_link + self.download_checksum = download_checksum + if self.model_path is not None: + self.load_model(self.model_path) + + self.chunk_size = self.model.chunk_size + self.overlap = 0.25 + + def forward(self, x): + """Forward pass of the mixer model""" + return self.model(x) + + def _build_model(self): + """Build the MDXNet mixer model.""" + convtdfnet = ConvTDFNet().to(self.device) + convtdfnet.eval() + return convtdfnet + + def load_model(self, model_path): + if not os.path.exists(model_path): + self.download_model(model_path) # Downloading model weights + ## Ensuring we can load the model for different torch versions + ## -- (weights only might be deprecated) + try: + weights = torch.load( + model_path, weights_only=True, map_location=self.device + ) + except: + weights = torch.load(model_path, map_location=self.device) + self.model.load_state_dict(weights["model_state_dict"]) + self.model_path = model_path + self.trained = True + + def separate( + self, + input_data, + input_sr=44100, + normalize_input=True, + gpu="-1", + ): + """Separate Carnatic singing voice from mixture. + + :param input_data: Audio signal/path to separate. + :param input_sr: sampling rate of the input array of data (if any). This variable is only + relevant if the input is an array of data instead of a filepath. + :param normalize_input: Normalize the input audio signal. + :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. + :return: Singing voice and violin signals. + """ + ## Setting up GPU if specified + self.gpu = gpu + self.device = None + self.select_gpu(gpu) + + if self.trained is False: + raise ModelNotTrainedError( + """ Model is not trained. Please load model before running inference! + You can load the pre-trained instance with the load_model wrapper.""" + ) + + # Loading and resampling audio + if isinstance(input_data, str): + if not os.path.exists(input_data): + raise FileNotFoundError("Target audio not found.") + audio, input_sr = torchaudio.load(input_data) + elif isinstance(input_data, np.ndarray): + audio = torch.from_numpy(input_data).to(torch.float32).to(self.device) + elif isinstance(input_data, torch.Tensor): + audio = input_data.to(torch.float32).to(self.device) + else: + raise ValueError("Input must be path to audio signal or an audio array") + + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) # Adding mono channel if no audio channels + + if len(audio.shape) == 3: + if audio.shape[0] != 1: + raise ValueError( + "Batching is not supported. Please provide a single audio signal." + ) + audio = audio.squeeze(0) # Removing batch dimension + + # resample audio + if input_sr != self.sample_rate: + logger.warning( + f"Resampling... (input sampling rate is assumed {input_sr}Hz, \ + make sure this is correct and change input_sr otherwise)" + ) + audio = torchaudio.transforms.Resample( + orig_freq=input_sr, new_freq=self.sample_rate + )(audio) + + # downsampling to mono + if audio.shape[0] == 2: + audio = audio.mean(dim=0, keepdim=True) + logger.info( + f"Downsampling to mono... your audio is stereo, \ + and the model is trained on mono audio." 
+ ) + + if normalize_input: + audio = audio / audio.max() + initial_length = audio.shape[-1] + audio = audio.reshape(-1) + pad_length = ( + self.chunk_size - (audio.shape[-1] % self.chunk_size) + ) % self.chunk_size + audio = torch.nn.functional.pad(audio, (0, pad_length)) + + chunk_size = audio.shape[-1] // ( + (audio.shape[-1] + self.chunk_size - 1) // self.chunk_size + ) + hop_size = int(chunk_size * (1 - self.overlap)) + num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 + + window = torch.hann_window(chunk_size) + out = torch.zeros(audio.shape[-1]) # (Time,) + weight_sum = torch.zeros( + audio.shape[-1] + ) # Weight accumulation for normalization + + # Process chunks + for i in range(num_chunks): + start = i * hop_size + end = start + chunk_size + + # Extract chunk (reshape for model input) + audio_chunk = audio[start:end].reshape(1, 1, -1) + + # Apply model separation (now outputs 1-channel) + separated_chunk = self.forward(audio_chunk).reshape(-1) # (chunk_size,) + + # Apply windowing + separated_chunk *= window # Smooth transition + + # Overlap-Add to output + out[start:end] += separated_chunk + weight_sum[start:end] += window # Accumulate weights + + out /= weight_sum.clamp(min=1e-8) # Avoid division by zero + out = out[:initial_length].unsqueeze(0) # (1, N) + + vocal_separation = torchaudio.transforms.Resample( + orig_freq=self.sample_rate, new_freq=input_sr + )(out) + + return vocal_separation.detach().cpu().numpy().reshape(-1) + + def download_model(self, model_path=None, force_overwrite=False): + """Download pre-trained model.""" + download_path = ( + os.sep + os.path.join(*model_path.split(os.sep)[:-2]) + if model_path is not None + else os.path.join(WORKDIR, "models", "separation", "conv-tdf-finetune") + ) + # Creating model folder to store the weights + if not os.path.exists(download_path): + os.makedirs(download_path) + download_remote_model( + self.download_link, + self.download_checksum, + download_path, + force_overwrite=force_overwrite, + ) + + def select_gpu(self, gpu="-1"): + """Select the GPU to use for inference. + + :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. + :returns: None + """ + if int(gpu) == -1: + self.device = torch.device("cpu") + else: + if torch.cuda.is_available(): + self.device = torch.device("cuda:" + str(gpu)) + elif torch.backends.mps.is_available(): + self.device = torch.device("mps:" + str(gpu)) + else: + self.device = torch.device("cpu") + logger.warning("No GPU available. 
Running on CPU.") + self.gpu = gpu diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py index 81cd7d9c..6a101c8f 100644 --- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py +++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py @@ -183,9 +183,7 @@ def load_model(self, model_path): torch.load(model_path, weights_only=True, map_location=self.device) ) except: - self.model.load_state_dict( - torch.load(model_path, map_location=self.device) - ) + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) self.model.eval() self.loaded_model_path = model_path self.trained = True diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py index 07c68bb5..ef99b1c8 100644 --- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py +++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py @@ -10,7 +10,6 @@ logger = get_logger(__name__) - def split_audios(save_dir=None, annotations_path=None, audios_path=None): """Split audio of Dhrupad dataset diff --git a/compiam/utils/__init__.py b/compiam/utils/__init__.py index ef971b90..436cd314 100644 --- a/compiam/utils/__init__.py +++ b/compiam/utils/__init__.py @@ -180,7 +180,9 @@ def stereo_to_mono(audio): if audio.shape[0] > audio.shape[1]: audio = audio.T if audio.shape[0] > 2: - raise ValueError("Expected mono or stereo audio, got multi-channel audio") + raise ValueError( + "Expected mono or stereo audio, got multi-channel audio" + ) # If stereo, average the channels if audio.shape[0] == 2: audio = np.mean(audio, axis=0) diff --git a/compiam/utils/download.py b/compiam/utils/download.py index 3cc2ee60..9c4a3046 100644 --- a/compiam/utils/download.py +++ b/compiam/utils/download.py @@ -75,19 +75,17 @@ def download_remote_model( def download_zip(url, root_path): """Download a ZIP file from a URL.""" # Get the file name from the URL - local_filename = os.path.join( - root_path, - url.split("/")[-1].split("?")[0] - ) + local_filename = os.path.join(root_path, url.split("/")[-1].split("?")[0]) # Stream the download and save the file with requests.get(url, stream=True) as r: r.raise_for_status() total_size = int(r.headers.get("content-length", 0)) chunk_size = 8192 - with open(local_filename, "wb") as f, tqdm( - total=total_size, unit="iB", unit_scale=True - ) as pbar: + with ( + open(local_filename, "wb") as f, + tqdm(total=total_size, unit="iB", unit_scale=True) as pbar, + ): for chunk in r.iter_content(chunk_size=chunk_size): if chunk: # filter out keep-alive new chunks f.write(chunk) diff --git a/compiam/version.py b/compiam/version.py index 8afe466f..a1c2a810 100644 --- a/compiam/version.py +++ b/compiam/version.py @@ -1,4 +1,4 @@ """Version info""" short_version = "0.4" -version = "0.4.1" \ No newline at end of file +version = "0.4.1" diff --git a/docs/source/separation.rst b/docs/source/separation.rst index d8950094..356dc594 100644 --- a/docs/source/separation.rst +++ b/docs/source/separation.rst @@ -16,6 +16,16 @@ Leakage-aware Carnatic Singing Voice Separation :members: + +Fine-tuned ConvTDF Carnatic Singing Voice Separation +---------------------------------------------------- + +.. note:: REQUIRES: torch + +.. 
autoclass:: compiam.separation.singing_voice_extraction.convtdf_vocal_finetune.ConvTDFVocalFineTune + :members: + + Vocals and violin separation ++++++++++++++++++++++++++++ diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py index 51f9a523..98c57dbb 100644 --- a/tests/melody/test_deepsrgm.py +++ b/tests/melody/test_deepsrgm.py @@ -44,7 +44,9 @@ def _get_features(): feat = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + audio = librosa.load( + os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100 + )[0] audio = np.tile(audio, 9) feat_1 = deepsrgm.get_features(audio) feat_2 = deepsrgm.get_features(np.stack([audio, audio])) diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py index 622c969a..e8c634cb 100644 --- a/tests/melody/test_essentia_extractors.py +++ b/tests/melody/test_essentia_extractors.py @@ -71,10 +71,14 @@ def _predict_normalized_pitch(): tonic = tonic_multipitch.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + audio = librosa.load( + os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100 + )[0] tonic_2 = tonic_multipitch.extract(audio) # Testing input array tonic_3 = tonic_multipitch.extract(np.stack([audio, audio])) # Testing input array - tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T) # Testing input array + tonic_4 = tonic_multipitch.extract( + np.stack([audio, audio]).T + ) # Testing input array assert isinstance(tonic, float) assert tonic == 157.64892578125 diff --git a/tests/separation/test_convtdf_finetune.py b/tests/separation/test_convtdf_finetune.py new file mode 100644 index 00000000..dabcc157 --- /dev/null +++ b/tests/separation/test_convtdf_finetune.py @@ -0,0 +1,43 @@ +import os +import pytest +import shutil + +import numpy as np + +import compiam +from compiam.data import TESTDIR +from compiam.exceptions import ModelNotTrainedError + + +def _separate(): + from compiam.separation.singing_voice_extraction import ConvTDFVocalFineTune + + convtdf_vocal = ConvTDFVocalFineTune() + with pytest.raises(ModelNotTrainedError): + convtdf_vocal.separate(os.path.join(TESTDIR, "resources", "melody", "hola.wav")) + convtdf_vocal.trained = True + with pytest.raises(FileNotFoundError): + convtdf_vocal.separate(os.path.join(TESTDIR, "resources", "melody", "hola.wav")) + + convtdf_vocal = compiam.load_model( + "separation:convtdf-vocal-finetune", data_home=TESTDIR + ) + audio_in, sr = np.array(np.ones([1, 44100]), dtype=np.float32), 44100 + separation = convtdf_vocal.separate(audio_in, input_sr=sr) + assert isinstance(separation, np.ndarray) + shutil.rmtree(os.path.join(TESTDIR, "models")) + + +@pytest.mark.torch +def test_predict_torch(): + _separate() + + +@pytest.mark.full_ml +def test_predict_full(): + _separate() + + +@pytest.mark.all +def test_predict_all(): + _separate()
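For reference, the new separation wrapper added in this diff can be exercised much like tests/separation/test_convtdf_finetune.py does. Below is a minimal usage sketch, not part of the patch: the audio file name and the use of librosa for loading are illustrative assumptions; the registry key, the separate() signature, and the return type mirror the test and the compiam/data.py entry above.

import librosa
import compiam

# Load the registered wrapper; on first use the weights are fetched from the
# Zenodo link declared in compiam/data.py and cached locally.
model = compiam.load_model("separation:convtdf-vocal-finetune")

# Any mono mixture works; input_sr tells the wrapper how to resample to the
# model's 24 kHz rate, and the output is resampled back to input_sr.
audio, sr = librosa.load("my_carnatic_mix.wav", sr=44100)  # hypothetical file
vocals = model.separate(audio, input_sr=sr)  # 1-D numpy array with the vocals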