From c821aa081f996e48108e523ab8eedc87ee234d6f Mon Sep 17 00:00:00 2001
From: genisplaja
Date: Tue, 1 Apr 2025 18:48:58 +0200
Subject: [PATCH 01/11] add code

---
 compiam/data.py                           |  18 ++
 compiam/separation/README.md              |   7 +-
 compiam/separation/__init__.py            |   1 +
 .../singing_voice_extraction/__init__.py  |   3 +
 .../convtdf_vocal_finetune.py             | 212 ++++++++++++++++++
 5 files changed, 239 insertions(+), 2 deletions(-)
 create mode 100644 compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py

diff --git a/compiam/data.py b/compiam/data.py
index fc1c2a4b..2f38fac2 100644
--- a/compiam/data.py
+++ b/compiam/data.py
@@ -234,6 +234,24 @@
             },
         },
     },
+    "separation:convtdf-vocal-finetune": {
+        "module_name": "compiam.separation.singing_voice_separation.convtdf-vocal-finetune",
+        "class_name": "ConvTDFVocalFineTune",
+        "default_version": "v1",
+        "kwargs": {
+            "v1": {
+                "model_path": os.path.join(
+                    "models",
+                    "separation",
+                    "convtdf-vocal-finetune",
+                    "vocals",
+                    "TODO",
+                ),
+                "download_link": "TODO",
+                "download_checksum": "TODO",
+            },
+        },
+    },
 }
 
 
diff --git a/compiam/separation/README.md b/compiam/separation/README.md
index 2fc9abfe..a227f6e9 100644
--- a/compiam/separation/README.md
+++ b/compiam/separation/README.md
@@ -3,9 +3,12 @@
 
 | **Tool**                  | **Task**                         | **Paper** |
 |---------------------------|----------------------------------|-----------|
 | ColdDiffSep               | Singing voice extraction         | [1]       |
-| MDXNet w/ mixer model     | Music source separation          | [2]       |
+| ConvTDF Vocal Fine-tuned  | Singing voice extraction         | [2]       |
+| MDXNet w/ mixer model     | Music source separation          | [3]       |
 
 [1] G. Plaja-Roglans, M. Miron, A. Shankar, and X. Serra, "Carnatic Singing Voice Separation using Cold Diffusion on Training Data with Bleeding", in International Society for Music Information Retrieval Conference (ISMIR 23), 2023.
 
-[2] Work under review.
\ No newline at end of file
+[2] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Analysis and Generative Applications (WIMAGA) in ICASSP, 2025.
+
+[3] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Analysis and Generative Applications (WIMAGA) in ICASSP, 2025.
\ No newline at end of file diff --git a/compiam/separation/__init__.py b/compiam/separation/__init__.py index 66686471..512384f1 100644 --- a/compiam/separation/__init__.py +++ b/compiam/separation/__init__.py @@ -12,6 +12,7 @@ ### IMPORT HERE THE CONSIDERED TASKS from compiam.separation import singing_voice_extraction +from compiam.separation import music_source_separation # Show user the available tasks diff --git a/compiam/separation/singing_voice_extraction/__init__.py b/compiam/separation/singing_voice_extraction/__init__.py index 9f72faee..f0ee9c9d 100644 --- a/compiam/separation/singing_voice_extraction/__init__.py +++ b/compiam/separation/singing_voice_extraction/__init__.py @@ -7,6 +7,9 @@ from compiam.separation.singing_voice_extraction.cold_diff_sep import ( ColdDiffSep, ) +from compiam.separation.singing_voice_extraction.convtdf_vocal_finetune import ( + ConvTDFVocalFineTune, +) # Show user the available tools diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py new file mode 100644 index 00000000..0f6fc82a --- /dev/null +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -0,0 +1,212 @@ +import os + +import numpy as np + +from compiam.exceptions import ModelNotTrainedError +from compiam.utils import get_logger, WORKDIR +from compiam.utils.download import download_remote_model + + +logger = get_logger(__name__) + + +class ConvTDFVocalFineTune(object): + """ConvTDF net fine-tuned to separate clean Carnatic vocals training with Saraga (which has bleeding).""" + + def __init__( + self, + model_path=None, + download_link=None, + download_checksum=None, + sample_rate=24000, + gpu="-1", + ): + """Leakage-aware singing voice separation init method. + + :param model_path: path to file to the model weights. + :param download_link: link to the remote pre-trained model. + :param download_checksum: checksum of the model file. + :param sample_rate: sample rate to which the audio is sampled for extraction. + :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. + """ + ### IMPORTING OPTIONAL DEPENDENCIES + try: + global torch + import torch + + global nn + import torch.nn as nn + + global torchaudio + import torchaudio + + global ConvTDFNet + from compiam.separation.music_source_separation.mixer_model.models import ( + ConvTDFNet, + ) + + except: + raise ImportError( + "In order to use this tool you need to have torch and torchaudio installed. 
" + "Install compIAM with torch support: pip install 'compiam[torch]'" + ) + ### + + ## Setting up GPU if specified + self.gpu = gpu + self.device = None + self.select_gpu(gpu) + + self.model = self._build_model() + self.sample_rate = sample_rate + self.trained = False + + self.model_path = model_path + self.download_link = download_link + self.download_checksum = download_checksum + if self.model_path is not None: + self.load_model(self.model_path) + + self.chunk_size = self.model.chunk_size + + def forward(self, x): + """Forward pass of the mixer model""" + return self.model(x) + + def _build_model(self): + """Build the MDXNet mixer model.""" + convtdfnet = ConvTDFNet().to(self.device) + convtdfnet.eval() + return convtdfnet + + def load_model(self, model_path): + if not os.path.exists(model_path): + self.download_model(model_path) # Downloading model weights + ## Ensuring we can load the model for different torch versions + ## -- (weights only might be deprecated) + try: + weights = torch.load(model_path, weights_only=True, map_location=self.device) + except: + weights = torch.load(model_path, map_location=self.device) + self.model.load_state_dict(weights) + self.model_path = model_path + self.trained = True + + def separate( + self, + input_data, + input_sr=44100, + gpu="-1", + ): + """Separate Carnatic singing voice from mixture. + + :param input_data: Audio signal/path to separate. + :param input_sr: sampling rate of the input array of data (if any). This variable is only + relevant if the input is an array of data instead of a filepath. + :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. + :return: Singing voice and violin signals. + """ + ## Setting up GPU if specified + self.gpu = gpu + self.device = None + self.select_gpu(gpu) + + if self.trained is False: + raise ModelNotTrainedError( + """ Model is not trained. Please load model before running inference! + You can load the pre-trained instance with the load_model wrapper.""" + ) + + # Loading and resampling audio + if isinstance(input_data, str): + if not os.path.exists(input_data): + raise FileNotFoundError("Target audio not found.") + audio, input_sr = torchaudio.load(input_data) + elif isinstance(input_data, np.ndarray): + input_data = torch.from_numpy(input_data).to(torch.float32).to(self.device) + elif isinstance(input_data, torch.Tensor): + input_data = input_data.to(torch.float32).to(self.device) + else: + raise ValueError("Input must be path to audio signal or an audio array") + + if len(input_data.shape) == 1: + input_data = input_data.unsqueeze(0) + + if len(input_data.shape) == 3: + if input_data.shape[0] != 1: + raise ValueError("Batching is not supported. Please provide a single audio signal.") + input_data = input_data.squeeze(0) + + # resample audio + if input_sr != self.sample_rate: + logger.warning( + f"Resampling... (input sampling rate is assumed {input_sr}Hz, \ + make sure this is correct and change input_sr otherwise)" + ) + audio = torchaudio.transforms.Resample( + orig_freq=input_sr, new_freq=self.sample_rate + )(input_data) + + # downsampling to mono + if audio.shape[0] == 2: + audio = audio.mean(dim=0, keepdim=True) + logger.info( + f"Downsampling to mono... your audio is stereo, \ + and the model is trained on mono audio." 
+ ) + + # audio has shape B, 1, N + audio = audio.reshape(-1) + predictions = [] + pad_length = self.chunk_size - (audio.shape[-1] % self.chunk_size) + audio = torch.nn.functional.pad(audio, (0, pad_length)) + + for i in range(0, audio.shape[-1], self.chunk_size): + audio_chunk = audio[i : i + self.chunk_size].reshape( + 1, 1, -1 + ) # TODO Batching + predictions.append(self.forward(audio_chunk)) + + result = torch.cat(predictions, dim=-1) + result = result[:, :, :-pad_length] + + vocal_separation = torchaudio.transforms.Resample( + orig_freq=self.sample_rate, new_freq=input_sr + )(result) + + return vocal_separation.detach().cpu().numpy().reshape(-1) + + def download_model(self, model_path=None, force_overwrite=False): + """Download pre-trained model.""" + download_path = ( + os.sep + os.path.join(*model_path.split(os.sep)[:-2]) + if model_path is not None + else os.path.join(WORKDIR, "models", "separation", "conv-tdf-finetune") + ) + # Creating model folder to store the weights + if not os.path.exists(download_path): + os.makedirs(download_path) + download_remote_model( + self.download_link, + self.download_checksum, + download_path, + force_overwrite=force_overwrite, + ) + + def select_gpu(self, gpu="-1"): + """Select the GPU to use for inference. + + :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. + :returns: None + """ + if int(gpu) == -1: + self.device = torch.device("cpu") + else: + if torch.cuda.is_available(): + self.device = torch.device("cuda:" + str(gpu)) + elif torch.backends.mps.is_available(): + self.device = torch.device("mps:" + str(gpu)) + else: + self.device = torch.device("cpu") + logger.warning("No GPU available. Running on CPU.") + self.gpu = gpu From 1ffd61e357f0da512b0adf50ccfe17444d7c08fd Mon Sep 17 00:00:00 2001 From: genisplaja Date: Tue, 1 Apr 2025 19:00:16 +0200 Subject: [PATCH 02/11] tests, docs, ack --- ACKNOWLEDGEMENTS | 1 + docs/source/separation.rst | 10 ++++++ tests/separation/test_convtdf_finetune.py | 43 +++++++++++++++++++++++ 3 files changed, 54 insertions(+) create mode 100644 tests/separation/test_convtdf_finetune.py diff --git a/ACKNOWLEDGEMENTS b/ACKNOWLEDGEMENTS index f7588570..a238ef95 100644 --- a/ACKNOWLEDGEMENTS +++ b/ACKNOWLEDGEMENTS @@ -18,6 +18,7 @@ Jom Kuriakose Shrey Dutta Shubham Lohiya Swarada Bharadwaj +Serafin Schweinitz Project Musical AI - PID2019-111403GB-I00/AEI/10.13039/501100011033 funded by the Spanish Ministerio de Ciencia, Innovación y Universidades (MCIU) and the Agencia Estatal de diff --git a/docs/source/separation.rst b/docs/source/separation.rst index d8950094..356dc594 100644 --- a/docs/source/separation.rst +++ b/docs/source/separation.rst @@ -16,6 +16,16 @@ Leakage-aware Carnatic Singing Voice Separation :members: +Leakage-aware Carnatic Singing Voice Separation +----------------------------------------------- + +.. note:: + REQUIRES: torch + +.. 
autoclass:: compiam.separation.singing_voice_extraction.convtdf_vocal_finetune.ConvTDFVocalFineTune + :members: + + Vocals and violin separation ++++++++++++++++++++++++++++ diff --git a/tests/separation/test_convtdf_finetune.py b/tests/separation/test_convtdf_finetune.py new file mode 100644 index 00000000..552aad2e --- /dev/null +++ b/tests/separation/test_convtdf_finetune.py @@ -0,0 +1,43 @@ +import os +import pytest +import shutil + +import numpy as np + +import compiam +from compiam.data import TESTDIR +from compiam.exceptions import ModelNotTrainedError + + +def _separate(): + from compiam.separation.singing_voice_extraction import ConvTDFVocalFineTune + + convtdf_vocal = ConvTDFVocalFineTune() + with pytest.raises(ModelNotTrainedError): + convtdf_vocal.separate(os.path.join(TESTDIR, "resources", "melody", "hola.wav")) + convtdf_vocal.trained = True + with pytest.raises(FileNotFoundError): + convtdf_vocal.separate(os.path.join(TESTDIR, "resources", "melody", "hola.wav")) + + convtdf_vocal = compiam.load_model("separation:convtdf-vocal-finetune", data_home=TESTDIR) + audio_in, sr = np.array(np.ones([1, 44100]), dtype=np.float32), 44100 + separation = convtdf_vocal.separate(audio_in, input_sr=sr) + assert isinstance(separation, tuple) + assert isinstance(separation[0], np.ndarray) + assert isinstance(separation[1], np.ndarray) + shutil.rmtree(os.path.join(TESTDIR, "models")) + + +@pytest.mark.torch +def test_predict_torch(): + _separate() + + +@pytest.mark.full_ml +def test_predict_full(): + _separate() + + +@pytest.mark.all +def test_predict_all(): + _separate() From ccf796ea8c7d09a58ee4d343f55a231809736fb5 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Tue, 1 Apr 2025 20:12:22 +0200 Subject: [PATCH 03/11] add convtdf default params --- .../mixer_model/models.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/compiam/separation/music_source_separation/mixer_model/models.py b/compiam/separation/music_source_separation/mixer_model/models.py index a66404f2..6ad2989f 100644 --- a/compiam/separation/music_source_separation/mixer_model/models.py +++ b/compiam/separation/music_source_separation/mixer_model/models.py @@ -7,18 +7,18 @@ class ConvTDFNet(nn.Module): def __init__( self, - hop_length, - num_blocks, - dim_t, - n_fft, - dim_c, - dim_f, - g, - k, - l, - bn, - bias, - scale, + hop_length=558, + dim_t=256, + n_fft=6144, + dim_c=2, + dim_f=2048, + num_blocks=11, + g=32, + k=3, + l=3, + bn=4, + bias=False, + scale=2, ): super(ConvTDFNet, self).__init__() self.hop_length = hop_length From 84c487a47a5d3a6706e677ff19cc5269b0a69e1e Mon Sep 17 00:00:00 2001 From: genisplaja Date: Tue, 1 Apr 2025 20:12:35 +0200 Subject: [PATCH 04/11] add download model details --- compiam/data.py | 10 +++++----- .../singing_voice_extraction/convtdf_vocal_finetune.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/compiam/data.py b/compiam/data.py index 2f38fac2..608d7482 100644 --- a/compiam/data.py +++ b/compiam/data.py @@ -235,7 +235,7 @@ }, }, "separation:convtdf-vocal-finetune": { - "module_name": "compiam.separation.singing_voice_separation.convtdf-vocal-finetune", + "module_name": "compiam.separation.singing_voice_extraction.convtdf_vocal_finetune", "class_name": "ConvTDFVocalFineTune", "default_version": "v1", "kwargs": { @@ -243,12 +243,12 @@ "model_path": os.path.join( "models", "separation", - "convtdf-vocal-finetune", + "convtdf_vocal_finetune", "vocals", - "TODO", + "checkpoint_finetuned.pt", ), - "download_link": "TODO", - 
"download_checksum": "TODO", + "download_link": "https://zenodo.org/records/15121572/files/convtdf_vocal_finetune.zip?download=1", + "download_checksum": "170c7a25cb06911f2e4a9452ce943aed", }, }, }, diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index 0f6fc82a..1e0af483 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -88,7 +88,7 @@ def load_model(self, model_path): weights = torch.load(model_path, weights_only=True, map_location=self.device) except: weights = torch.load(model_path, map_location=self.device) - self.model.load_state_dict(weights) + self.model.load_state_dict(weights["model_state_dict"]) self.model_path = model_path self.trained = True From 4fe66352992a8a9bd5ae27be3076b72569aafcb5 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Tue, 1 Apr 2025 20:34:00 +0200 Subject: [PATCH 05/11] fix tests --- tests/separation/test_convtdf_finetune.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/separation/test_convtdf_finetune.py b/tests/separation/test_convtdf_finetune.py index 552aad2e..a93f3cf8 100644 --- a/tests/separation/test_convtdf_finetune.py +++ b/tests/separation/test_convtdf_finetune.py @@ -22,9 +22,7 @@ def _separate(): convtdf_vocal = compiam.load_model("separation:convtdf-vocal-finetune", data_home=TESTDIR) audio_in, sr = np.array(np.ones([1, 44100]), dtype=np.float32), 44100 separation = convtdf_vocal.separate(audio_in, input_sr=sr) - assert isinstance(separation, tuple) - assert isinstance(separation[0], np.ndarray) - assert isinstance(separation[1], np.ndarray) + assert isinstance(separation, np.ndarray) shutil.rmtree(os.path.join(TESTDIR, "models")) From bfc419159604f980e6186126a8c673c55f3de390 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Wed, 2 Apr 2025 18:10:06 +0200 Subject: [PATCH 06/11] fix variable names in audio loading --- .../mixer_model/__init__.py | 16 ++++++++-------- .../convtdf_vocal_finetune.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index c3fc2c59..bfeb287a 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -124,19 +124,19 @@ def separate( raise FileNotFoundError("Target audio not found.") audio, input_sr = torchaudio.load(input_data) elif isinstance(input_data, np.ndarray): - input_data = torch.from_numpy(input_data).to(torch.float32).to(self.device) + audio = torch.from_numpy(input_data).to(torch.float32).to(self.device) elif isinstance(input_data, torch.Tensor): - input_data = input_data.to(torch.float32).to(self.device) + audio = input_data.to(torch.float32).to(self.device) else: raise ValueError("Input must be path to audio signal or an audio array") - if len(input_data.shape) == 1: - input_data = input_data.unsqueeze(0) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) # Add mono channel if no audio channels - if len(input_data.shape) == 3: - if input_data.shape[0] != 1: + if len(audio.shape) == 3: + if audio.shape[0] != 1: raise ValueError("Batching is not supported. 
Please provide a single audio signal.") - input_data = input_data.squeeze(0) + audio = audio.squeeze(0) # Remove batch size 1 # resample audio if input_sr != self.sample_rate: @@ -146,7 +146,7 @@ def separate( ) audio = torchaudio.transforms.Resample( orig_freq=input_sr, new_freq=self.sample_rate - )(input_data) + )(audio) # downsampling to mono if audio.shape[0] == 2: diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index 1e0af483..528ac9f2 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -123,19 +123,19 @@ def separate( raise FileNotFoundError("Target audio not found.") audio, input_sr = torchaudio.load(input_data) elif isinstance(input_data, np.ndarray): - input_data = torch.from_numpy(input_data).to(torch.float32).to(self.device) + audio = torch.from_numpy(input_data).to(torch.float32).to(self.device) elif isinstance(input_data, torch.Tensor): - input_data = input_data.to(torch.float32).to(self.device) + audio = input_data.to(torch.float32).to(self.device) else: raise ValueError("Input must be path to audio signal or an audio array") - if len(input_data.shape) == 1: - input_data = input_data.unsqueeze(0) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) # Adding mono channel if no audio channels - if len(input_data.shape) == 3: - if input_data.shape[0] != 1: + if len(audio.shape) == 3: + if audio.shape[0] != 1: raise ValueError("Batching is not supported. Please provide a single audio signal.") - input_data = input_data.squeeze(0) + audio = audio.squeeze(0) # Removing batch dimension # resample audio if input_sr != self.sample_rate: @@ -145,7 +145,7 @@ def separate( ) audio = torchaudio.transforms.Resample( orig_freq=input_sr, new_freq=self.sample_rate - )(input_data) + )(audio) # downsampling to mono if audio.shape[0] == 2: From d4b514231c18c46fefada76b72206998bd389719 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 3 Apr 2025 20:24:10 +0200 Subject: [PATCH 07/11] overlap and add --- .../mixer_model/__init__.py | 44 ++++++++++++++----- .../convtdf_vocal_finetune.py | 44 ++++++++++++++----- 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index bfeb287a..e2de0748 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -69,6 +69,7 @@ def __init__( self.load_model(self.model_path) self.chunk_size = self.model.chunk_size + self.overlap = 0.25 def forward(self, x): """Forward pass of the mixer model""" @@ -156,27 +157,46 @@ def separate( and the model is trained on mono audio." 
) - # audio has shape B, 1, N + initial_length = audio.shape[-1] audio = audio.reshape(-1) - predictions = [] - pad_length = self.chunk_size - (audio.shape[-1] % self.chunk_size) + pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size audio = torch.nn.functional.pad(audio, (0, pad_length)) - for i in range(0, audio.shape[-1], self.chunk_size): - audio_chunk = audio[i : i + self.chunk_size].reshape( - 1, 1, -1 - ) # TODO Batching - predictions.append(self.forward(audio_chunk)) + chunk_size = audio.shape[-1] // ((audio.shape[-1] + self.chunk_size - 1) // self.chunk_size) + hop_size = int(chunk_size * (1 - self.overlap)) + num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 - result = torch.cat(predictions, dim=-1) - result = result[:, :, :-pad_length] + window = torch.hann_window(chunk_size) + out = torch.zeros((2, audio.shape[-1])) # (Channels=2, Time) + weight_sum = torch.zeros(audio.shape[-1]) # Weight accumulation for normalization + + # Process chunks + for i in range(num_chunks): + start = i * hop_size + end = start + chunk_size + + # Extract chunk (reshape for model input) + audio_chunk = audio[start:end].reshape(1, 1, -1) + + # Apply model separation (assumes 2-channel output) + separated_chunk = self.forward(audio_chunk).reshape(2, -1) # (2, chunk_size) + + # Apply windowing + separated_chunk *= window # Smooth transition + + # Overlap-Add to output + out[:, start:end] += separated_chunk + weight_sum[start:end] += window # Accumulate weights + + out /= weight_sum.unsqueeze(0).clamp(min=1e-8) # Avoid division by zero + out = out[..., :initial_length].unsqueeze(0) # (1, 2, N) vocal_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr - )(result[:, 0, :]) + )(out[:, 0, :]) violin_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr - )(result[:, 1, :]) + )(out[:, 1, :]) vocal_separation = vocal_separation.detach().cpu().numpy().reshape(-1) violin_separation = violin_separation.detach().cpu().numpy().reshape(-1) diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index 528ac9f2..d468d7da 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -68,6 +68,7 @@ def __init__( self.load_model(self.model_path) self.chunk_size = self.model.chunk_size + self.overlap = 0.25 def forward(self, x): """Forward pass of the mixer model""" @@ -155,25 +156,44 @@ def separate( and the model is trained on mono audio." 
) - # audio has shape B, 1, N + initial_length = audio.shape[-1] audio = audio.reshape(-1) - predictions = [] - pad_length = self.chunk_size - (audio.shape[-1] % self.chunk_size) + pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size audio = torch.nn.functional.pad(audio, (0, pad_length)) - for i in range(0, audio.shape[-1], self.chunk_size): - audio_chunk = audio[i : i + self.chunk_size].reshape( - 1, 1, -1 - ) # TODO Batching - predictions.append(self.forward(audio_chunk)) + chunk_size = audio.shape[-1] // ((audio.shape[-1] + self.chunk_size - 1) // self.chunk_size) + hop_size = int(chunk_size * (1 - self.overlap)) + num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 - result = torch.cat(predictions, dim=-1) - result = result[:, :, :-pad_length] + window = torch.hann_window(chunk_size) + out = torch.zeros(audio.shape[-1]) # (Time,) + weight_sum = torch.zeros(audio.shape[-1]) # Weight accumulation for normalization + + # Process chunks + for i in range(num_chunks): + start = i * hop_size + end = start + chunk_size + + # Extract chunk (reshape for model input) + audio_chunk = audio[start:end].reshape(1, 1, -1) + + # Apply model separation (now outputs 1-channel) + separated_chunk = self.forward(audio_chunk).reshape(-1) # (chunk_size,) + + # Apply windowing + separated_chunk *= window # Smooth transition + + # Overlap-Add to output + out[start:end] += separated_chunk + weight_sum[start:end] += window # Accumulate weights + + out /= weight_sum.clamp(min=1e-8) # Avoid division by zero + out = out[:initial_length].unsqueeze(0) # (1, N) vocal_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr - )(result) - + )(out) + return vocal_separation.detach().cpu().numpy().reshape(-1) def download_model(self, model_path=None, force_overwrite=False): From bfbc157060192ab554849deb3f5378e85b789745 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Wed, 23 Jul 2025 23:32:46 +0200 Subject: [PATCH 08/11] minor fixes, norm for separation models --- .../music_source_separation/mixer_model/__init__.py | 4 ++++ .../singing_voice_extraction/cold_diff_sep/__init__.py | 8 +++++--- .../singing_voice_extraction/convtdf_vocal_finetune.py | 6 +++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index e2de0748..0b889ba1 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -98,6 +98,7 @@ def separate( self, input_data, input_sr=44100, + normalize_input=True, gpu="-1", ): """Separate singing voice and violin from mixture. @@ -105,6 +106,7 @@ def separate( :param input_data: Audio signal to separate. :param input_sr: sampling rate of the input array of data (if any). This variable is only relevant if the input is an array of data instead of a filepath. + :param normalize_input: Normalize the input audio signal. :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :return: Singing voice and violin signals. """ @@ -157,6 +159,8 @@ def separate( and the model is trained on mono audio." 
) + if normalize_input: + audio = audio / audio.max() initial_length = audio.shape[-1] audio = audio.reshape(-1) pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py index 8cc92c06..6a7d9dd5 100644 --- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py +++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py @@ -159,9 +159,11 @@ def separate( if normalize_input: # Normalizing audio for better performance overall - mean = tf.reduce_mean(mixture, keepdims=True) - std = tf.math.reduce_std(mixture, keepdims=True) - mixture = (mixture - mean) / (1e-6 + std) + #mean = tf.reduce_mean(mixture, keepdims=True) + #std = tf.math.reduce_std(mixture, keepdims=True) + #mixture = (mixture - mean) / (1e-6 + std) + # For now, divide by maximum vale + mixture = mixture / mixture.max() output_voc = np.zeros(mixture.shape) hopsized_chunk = int((chunk_size * self.sample_rate) / 2) diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index d468d7da..71f277b5 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -11,7 +11,7 @@ class ConvTDFVocalFineTune(object): - """ConvTDF net fine-tuned to separate clean Carnatic vocals training with Saraga (which has bleeding).""" + """ConvTDF Net fine-tuned to separate clean Carnatic vocals training with Saraga (which has bleeding).""" def __init__( self, @@ -97,6 +97,7 @@ def separate( self, input_data, input_sr=44100, + normalize_input=True, gpu="-1", ): """Separate Carnatic singing voice from mixture. @@ -104,6 +105,7 @@ def separate( :param input_data: Audio signal/path to separate. :param input_sr: sampling rate of the input array of data (if any). This variable is only relevant if the input is an array of data instead of a filepath. + :param normalize_input: Normalize the input audio signal. :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :return: Singing voice and violin signals. """ @@ -156,6 +158,8 @@ def separate( and the model is trained on mono audio." ) + if normalize_input: + audio = audio / audio.max() initial_length = audio.shape[-1] audio = audio.reshape(-1) pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size From f14bbdeaa9f0cabd02fdfe7f0f7a69dc92473514 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Wed, 23 Jul 2025 23:39:35 +0200 Subject: [PATCH 09/11] ftanet norm --- compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py index e7115e29..14245793 100644 --- a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py @@ -270,6 +270,7 @@ def predict( hop_size=80, batch_size=5, out_step=None, + amplify_input=1.0, gpu="-1", ): """Extract melody from input_data. @@ -283,6 +284,7 @@ def predict( (defaulted to 5, increase if enough computational power, reduce if needed). 
:param out_step: particular time-step duration if needed at output + :param amplify_input: for low volume inputs, we've found that overlouding it may provide better voicing detection (e.g. x10, x50) :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :returns: a 2-D list with time-stamps and pitch values per timestamp. """ @@ -323,6 +325,10 @@ def predict( xlist = [] timestamps = [] + # Applying loudness scaling + audio = audio / audio.max() + audio = audio * amplify_input + audio_len = len(audio) batch_min = self.sample_rate * 60 * batch_size freqs = [] From c6a2e3864d3cc21fd756778992ef84d38544da6f Mon Sep 17 00:00:00 2001 From: genisplaja Date: Wed, 23 Jul 2025 23:43:09 +0200 Subject: [PATCH 10/11] black and typos --- compiam/__init__.py | 2 +- compiam/dunya/__init__.py | 12 +++---- compiam/io.py | 5 +-- .../melody/pattern/sancara_search/__init__.py | 2 +- .../ftanet_carnatic/__init__.py | 2 +- .../ftaresnet_carnatic/__init__.py | 4 ++- .../raga_recognition/deepsrgm/__init__.py | 6 ++-- .../mixer_model/__init__.py | 34 +++++++++++++------ .../cold_diff_sep/__init__.py | 18 ++++++---- .../convtdf_vocal_finetune.py | 28 ++++++++++----- .../dhrupad_bandish_segmentation/__init__.py | 4 +-- .../audio_processing.py | 1 - compiam/utils/__init__.py | 4 ++- compiam/utils/download.py | 12 +++---- compiam/version.py | 2 +- tests/melody/test_deepsrgm.py | 4 ++- tests/melody/test_essentia_extractors.py | 8 +++-- tests/separation/test_convtdf_finetune.py | 4 ++- 18 files changed, 94 insertions(+), 58 deletions(-) diff --git a/compiam/__init__.py b/compiam/__init__.py index 5a795d57..796e129e 100644 --- a/compiam/__init__.py +++ b/compiam/__init__.py @@ -71,7 +71,7 @@ def load_dataset(dataset_name, data_home=None, version="default"): """ if dataset_name not in datasets_list: raise ValueError("Invalid dataset {}".format(dataset_name)) - dataloader = mirdata.initialize( + dataloader = mirdata.initialize( dataset_name=dataset_name, data_home=data_home, version=version ) dataloader.download(["index"]) # Download index file diff --git a/compiam/dunya/__init__.py b/compiam/dunya/__init__.py index 140a39ed..4301a560 100644 --- a/compiam/dunya/__init__.py +++ b/compiam/dunya/__init__.py @@ -25,9 +25,7 @@ def __init__(self, tradition, token): dunya.set_token(self.token) if tradition not in ["carnatic", "hindustani"]: - raise ValueError( - "Please choose a valid tradition: carnatic or hindustani" - ) + raise ValueError("Please choose a valid tradition: carnatic or hindustani") self.tradition = carnatic if tradition == "carnatic" else hindustani # Functions from the compmusic API are added as a method in the Corpora class @@ -36,10 +34,12 @@ def __init__(self, tradition, token): if callable(func): setattr(self, name, func) - logger.warning(""" + logger.warning( + """ Note that a part of the collection is under restricted access. To access the full collection please request permission at https://dunya.compmusic.upf.edu/user/profile/ - """) + """ + ) def get_collection(self, recording_detail=False): """Get the documents (recordings) in a collection. @@ -54,7 +54,7 @@ def get_collection(self, recording_detail=False): + "Please note that it might take a few moments..." ) return self.tradition.get_recordings(recording_detail) - + @staticmethod def list_available_types(recording_id): """Get the available source filetypes for a Musicbrainz recording. 
diff --git a/compiam/io.py b/compiam/io.py index 7cef4e4c..51fd384d 100644 --- a/compiam/io.py +++ b/compiam/io.py @@ -94,11 +94,11 @@ def write_scalar_txt(data, output_path): def resolve_dottedname(dotted_name): """Resolve a dotted name to an actual object, similar to zope.dottedname.resolve - + :param dotted_name: a dotted name :returns: the object the dotted name refers to """ - module_name, _, attribute_name = dotted_name.rpartition('.') + module_name, _, attribute_name = dotted_name.rpartition(".") if not module_name: raise ImportError(f"Invalid dotted name: '{dotted_name}'") module = importlib.import_module(module_name) @@ -111,6 +111,7 @@ def load_yaml(path): :param path: input file :returns: loaded yaml information """ + def constructor_dottedname(loader, node): value = loader.construct_scalar(node) return resolve_dottedname(value) diff --git a/compiam/melody/pattern/sancara_search/__init__.py b/compiam/melody/pattern/sancara_search/__init__.py index b1537920..ba197638 100644 --- a/compiam/melody/pattern/sancara_search/__init__.py +++ b/compiam/melody/pattern/sancara_search/__init__.py @@ -185,7 +185,7 @@ def load_model(self, model_path, conf_path, spec_path): try: self.model.load_state_dict( torch.load(model_path, weights_only=True, map_location=self.device), - strict=False + strict=False, ) except: self.model.load_state_dict( diff --git a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py index 14245793..73b2030a 100644 --- a/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py @@ -325,7 +325,7 @@ def predict( xlist = [] timestamps = [] - # Applying loudness scaling + # Applying loudness scaling audio = audio / audio.max() audio = audio * amplify_input diff --git a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py index 9a5ccac7..9852393b 100644 --- a/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py +++ b/compiam/melody/pitch_extraction/ftaresnet_carnatic/__init__.py @@ -83,7 +83,9 @@ def load_model(self, model_path): ## Ensuring we can load the model for different torch versions ## -- (weights only might be deprecated) try: - self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device)) + self.model.load_state_dict( + torch.load(model_path, weights_only=True, map_location=self.device) + ) except: self.model.load_state_dict(torch.load(model_path, map_location=self.device)) self.model_path = model_path diff --git a/compiam/melody/raga_recognition/deepsrgm/__init__.py b/compiam/melody/raga_recognition/deepsrgm/__init__.py index 1d79ea7c..9a100e85 100644 --- a/compiam/melody/raga_recognition/deepsrgm/__init__.py +++ b/compiam/melody/raga_recognition/deepsrgm/__init__.py @@ -124,7 +124,9 @@ def load_model(self, model_path, rnn="lstm"): self.model_path = model_path try: - weights = torch.load(model_path, weights_only=True, map_location=self.device) + weights = torch.load( + model_path, weights_only=True, map_location=self.device + ) except: weights = torch.load(model_path, map_location=self.device) new_weights = weights.copy() @@ -168,7 +170,7 @@ def load_raga_dataset(self, data_home=None, download=False): "compmusic_raga", data_home=data_home, version="default" ) if download: - self.dataset.download() # Downloads index and features + self.dataset.download() # Downloads index and features logger.warning( f""" The 
features are downloaded, but the audio of this dataset is private. diff --git a/compiam/separation/music_source_separation/mixer_model/__init__.py b/compiam/separation/music_source_separation/mixer_model/__init__.py index 0b889ba1..3faa9cef 100644 --- a/compiam/separation/music_source_separation/mixer_model/__init__.py +++ b/compiam/separation/music_source_separation/mixer_model/__init__.py @@ -87,7 +87,9 @@ def load_model(self, model_path): ## Ensuring we can load the model for different torch versions ## -- (weights only might be deprecated) try: - weights = torch.load(model_path, weights_only=True, map_location=self.device) + weights = torch.load( + model_path, weights_only=True, map_location=self.device + ) except: weights = torch.load(model_path, map_location=self.device) self.model.load_state_dict(weights) @@ -132,15 +134,17 @@ def separate( audio = input_data.to(torch.float32).to(self.device) else: raise ValueError("Input must be path to audio signal or an audio array") - + if len(audio.shape) == 1: - audio = audio.unsqueeze(0) # Add mono channel if no audio channels + audio = audio.unsqueeze(0) # Add mono channel if no audio channels if len(audio.shape) == 3: if audio.shape[0] != 1: - raise ValueError("Batching is not supported. Please provide a single audio signal.") + raise ValueError( + "Batching is not supported. Please provide a single audio signal." + ) audio = audio.squeeze(0) # Remove batch size 1 - + # resample audio if input_sr != self.sample_rate: logger.warning( @@ -151,7 +155,7 @@ def separate( orig_freq=input_sr, new_freq=self.sample_rate )(audio) - # downsampling to mono + # downsampling to mono if audio.shape[0] == 2: audio = audio.mean(dim=0, keepdim=True) logger.info( @@ -163,16 +167,22 @@ def separate( audio = audio / audio.max() initial_length = audio.shape[-1] audio = audio.reshape(-1) - pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size + pad_length = ( + self.chunk_size - (audio.shape[-1] % self.chunk_size) + ) % self.chunk_size audio = torch.nn.functional.pad(audio, (0, pad_length)) - chunk_size = audio.shape[-1] // ((audio.shape[-1] + self.chunk_size - 1) // self.chunk_size) + chunk_size = audio.shape[-1] // ( + (audio.shape[-1] + self.chunk_size - 1) // self.chunk_size + ) hop_size = int(chunk_size * (1 - self.overlap)) num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 window = torch.hann_window(chunk_size) out = torch.zeros((2, audio.shape[-1])) # (Channels=2, Time) - weight_sum = torch.zeros(audio.shape[-1]) # Weight accumulation for normalization + weight_sum = torch.zeros( + audio.shape[-1] + ) # Weight accumulation for normalization # Process chunks for i in range(num_chunks): @@ -183,7 +193,9 @@ def separate( audio_chunk = audio[start:end].reshape(1, 1, -1) # Apply model separation (assumes 2-channel output) - separated_chunk = self.forward(audio_chunk).reshape(2, -1) # (2, chunk_size) + separated_chunk = self.forward(audio_chunk).reshape( + 2, -1 + ) # (2, chunk_size) # Apply windowing separated_chunk *= window # Smooth transition @@ -201,7 +213,7 @@ def separate( violin_separation = torchaudio.transforms.Resample( orig_freq=self.sample_rate, new_freq=input_sr )(out[:, 1, :]) - + vocal_separation = vocal_separation.detach().cpu().numpy().reshape(-1) violin_separation = violin_separation.detach().cpu().numpy().reshape(-1) return (vocal_separation, violin_separation) diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py 
b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py index 6a7d9dd5..ec93270b 100644 --- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py +++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py @@ -139,10 +139,14 @@ def separate( if len(mixture.shape) == 3: if mixture.shape[0] != 1: - raise ValueError("Batching is not supported. Please provide a single audio signal.") + raise ValueError( + "Batching is not supported. Please provide a single audio signal." + ) else: mixture = mixture.squeeze(0) - mixture = tf.reduce_mean(mixture, axis=0, keepdims=False) # Removing dimension + mixture = tf.reduce_mean( + mixture, axis=0, keepdims=False + ) # Removing dimension logger.info( f"Downsampling to mono... your audio is stereo, \ and the model is trained on mono audio." @@ -159,12 +163,12 @@ def separate( if normalize_input: # Normalizing audio for better performance overall - #mean = tf.reduce_mean(mixture, keepdims=True) - #std = tf.math.reduce_std(mixture, keepdims=True) - #mixture = (mixture - mean) / (1e-6 + std) - # For now, divide by maximum vale + # mean = tf.reduce_mean(mixture, keepdims=True) + # std = tf.math.reduce_std(mixture, keepdims=True) + # mixture = (mixture - mean) / (1e-6 + std) + # For now, divide by maximum value mixture = mixture / mixture.max() - + output_voc = np.zeros(mixture.shape) hopsized_chunk = int((chunk_size * self.sample_rate) / 2) runs = math.floor(mixture.shape[0] / hopsized_chunk) diff --git a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py index 71f277b5..ebd7def3 100644 --- a/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py +++ b/compiam/separation/singing_voice_extraction/convtdf_vocal_finetune.py @@ -1,6 +1,6 @@ import os -import numpy as np +import numpy as np from compiam.exceptions import ModelNotTrainedError from compiam.utils import get_logger, WORKDIR @@ -86,7 +86,9 @@ def load_model(self, model_path): ## Ensuring we can load the model for different torch versions ## -- (weights only might be deprecated) try: - weights = torch.load(model_path, weights_only=True, map_location=self.device) + weights = torch.load( + model_path, weights_only=True, map_location=self.device + ) except: weights = torch.load(model_path, map_location=self.device) self.model.load_state_dict(weights["model_state_dict"]) @@ -131,15 +133,17 @@ def separate( audio = input_data.to(torch.float32).to(self.device) else: raise ValueError("Input must be path to audio signal or an audio array") - + if len(audio.shape) == 1: audio = audio.unsqueeze(0) # Adding mono channel if no audio channels if len(audio.shape) == 3: if audio.shape[0] != 1: - raise ValueError("Batching is not supported. Please provide a single audio signal.") + raise ValueError( + "Batching is not supported. Please provide a single audio signal." 
+ ) audio = audio.squeeze(0) # Removing batch dimension - + # resample audio if input_sr != self.sample_rate: logger.warning( @@ -150,7 +154,7 @@ def separate( orig_freq=input_sr, new_freq=self.sample_rate )(audio) - # downsampling to mono + # downsampling to mono if audio.shape[0] == 2: audio = audio.mean(dim=0, keepdim=True) logger.info( @@ -162,16 +166,22 @@ def separate( audio = audio / audio.max() initial_length = audio.shape[-1] audio = audio.reshape(-1) - pad_length = (self.chunk_size - (audio.shape[-1] % self.chunk_size)) % self.chunk_size + pad_length = ( + self.chunk_size - (audio.shape[-1] % self.chunk_size) + ) % self.chunk_size audio = torch.nn.functional.pad(audio, (0, pad_length)) - chunk_size = audio.shape[-1] // ((audio.shape[-1] + self.chunk_size - 1) // self.chunk_size) + chunk_size = audio.shape[-1] // ( + (audio.shape[-1] + self.chunk_size - 1) // self.chunk_size + ) hop_size = int(chunk_size * (1 - self.overlap)) num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1 window = torch.hann_window(chunk_size) out = torch.zeros(audio.shape[-1]) # (Time,) - weight_sum = torch.zeros(audio.shape[-1]) # Weight accumulation for normalization + weight_sum = torch.zeros( + audio.shape[-1] + ) # Weight accumulation for normalization # Process chunks for i in range(num_chunks): diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py index 81cd7d9c..6a101c8f 100644 --- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py +++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/__init__.py @@ -183,9 +183,7 @@ def load_model(self, model_path): torch.load(model_path, weights_only=True, map_location=self.device) ) except: - self.model.load_state_dict( - torch.load(model_path, map_location=self.device) - ) + self.model.load_state_dict(torch.load(model_path, map_location=self.device)) self.model.eval() self.loaded_model_path = model_path self.trained = True diff --git a/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py b/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py index 07c68bb5..ef99b1c8 100644 --- a/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py +++ b/compiam/structure/segmentation/dhrupad_bandish_segmentation/audio_processing.py @@ -10,7 +10,6 @@ logger = get_logger(__name__) - def split_audios(save_dir=None, annotations_path=None, audios_path=None): """Split audio of Dhrupad dataset diff --git a/compiam/utils/__init__.py b/compiam/utils/__init__.py index ef971b90..436cd314 100644 --- a/compiam/utils/__init__.py +++ b/compiam/utils/__init__.py @@ -180,7 +180,9 @@ def stereo_to_mono(audio): if audio.shape[0] > audio.shape[1]: audio = audio.T if audio.shape[0] > 2: - raise ValueError("Expected mono or stereo audio, got multi-channel audio") + raise ValueError( + "Expected mono or stereo audio, got multi-channel audio" + ) # If stereo, average the channels if audio.shape[0] == 2: audio = np.mean(audio, axis=0) diff --git a/compiam/utils/download.py b/compiam/utils/download.py index 3cc2ee60..9c4a3046 100644 --- a/compiam/utils/download.py +++ b/compiam/utils/download.py @@ -75,19 +75,17 @@ def download_remote_model( def download_zip(url, root_path): """Download a ZIP file from a URL.""" # Get the file name from the URL - local_filename = os.path.join( - root_path, - url.split("/")[-1].split("?")[0] - ) + local_filename = os.path.join(root_path, 
url.split("/")[-1].split("?")[0]) # Stream the download and save the file with requests.get(url, stream=True) as r: r.raise_for_status() total_size = int(r.headers.get("content-length", 0)) chunk_size = 8192 - with open(local_filename, "wb") as f, tqdm( - total=total_size, unit="iB", unit_scale=True - ) as pbar: + with ( + open(local_filename, "wb") as f, + tqdm(total=total_size, unit="iB", unit_scale=True) as pbar, + ): for chunk in r.iter_content(chunk_size=chunk_size): if chunk: # filter out keep-alive new chunks f.write(chunk) diff --git a/compiam/version.py b/compiam/version.py index 8afe466f..a1c2a810 100644 --- a/compiam/version.py +++ b/compiam/version.py @@ -1,4 +1,4 @@ """Version info""" short_version = "0.4" -version = "0.4.1" \ No newline at end of file +version = "0.4.1" diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py index 51f9a523..98c57dbb 100644 --- a/tests/melody/test_deepsrgm.py +++ b/tests/melody/test_deepsrgm.py @@ -44,7 +44,9 @@ def _get_features(): feat = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + audio = librosa.load( + os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100 + )[0] audio = np.tile(audio, 9) feat_1 = deepsrgm.get_features(audio) feat_2 = deepsrgm.get_features(np.stack([audio, audio])) diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py index 622c969a..e8c634cb 100644 --- a/tests/melody/test_essentia_extractors.py +++ b/tests/melody/test_essentia_extractors.py @@ -71,10 +71,14 @@ def _predict_normalized_pitch(): tonic = tonic_multipitch.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + audio = librosa.load( + os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100 + )[0] tonic_2 = tonic_multipitch.extract(audio) # Testing input array tonic_3 = tonic_multipitch.extract(np.stack([audio, audio])) # Testing input array - tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T) # Testing input array + tonic_4 = tonic_multipitch.extract( + np.stack([audio, audio]).T + ) # Testing input array assert isinstance(tonic, float) assert tonic == 157.64892578125 diff --git a/tests/separation/test_convtdf_finetune.py b/tests/separation/test_convtdf_finetune.py index a93f3cf8..dabcc157 100644 --- a/tests/separation/test_convtdf_finetune.py +++ b/tests/separation/test_convtdf_finetune.py @@ -19,7 +19,9 @@ def _separate(): with pytest.raises(FileNotFoundError): convtdf_vocal.separate(os.path.join(TESTDIR, "resources", "melody", "hola.wav")) - convtdf_vocal = compiam.load_model("separation:convtdf-vocal-finetune", data_home=TESTDIR) + convtdf_vocal = compiam.load_model( + "separation:convtdf-vocal-finetune", data_home=TESTDIR + ) audio_in, sr = np.array(np.ones([1, 44100]), dtype=np.float32), 44100 separation = convtdf_vocal.separate(audio_in, input_sr=sr) assert isinstance(separation, np.ndarray) From 38ce53e0603b57c462e6e0a0ee2888dd2833c201 Mon Sep 17 00:00:00 2001 From: genisplaja Date: Thu, 24 Jul 2025 00:42:03 +0200 Subject: [PATCH 11/11] correct TF max function --- .../singing_voice_extraction/cold_diff_sep/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py 
b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py index ec93270b..d76c1327 100644 --- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py +++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py @@ -167,7 +167,8 @@ def separate( # std = tf.math.reduce_std(mixture, keepdims=True) # mixture = (mixture - mean) / (1e-6 + std) # For now, divide by maximum value - mixture = mixture / mixture.max() + mixture = mixture / tf.reduce_max(mixture) + output_voc = np.zeros(mixture.shape) hopsized_chunk = int((chunk_size * self.sample_rate) / 2)
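For reference, a minimal usage sketch of the wrapper added in this patch series, following the API introduced above. The model key and the `separate()` signature come from `compiam/data.py` and the new tests; the audio path, the `librosa`/`soundfile` calls, and the output filename are illustrative assumptions, not part of the patch.

```python
import librosa
import soundfile as sf

import compiam

# Load the fine-tuned ConvTDF vocal model registered in compiam/data.py.
# Weights are downloaded on first use; requires torch support: pip install 'compiam[torch]'
model = compiam.load_model("separation:convtdf-vocal-finetune")

# "mixture.wav" is a placeholder path; separate() also accepts a file path directly.
audio, sr = librosa.load("mixture.wav", sr=44100, mono=True)
vocals = model.separate(audio, input_sr=sr)  # 1-D numpy array, resampled back to input_sr

sf.write("vocals_estimate.wav", vocals, sr)
```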