Updated model data fetching to support VR models, added a method to list available models, and clarified which properties belong in the common class vs. the architecture-specific classes

beveradb · beveradb · commit ae7e422efe9c · 2024-02-04T00:54:46.000-05:00
diff --git a/README.md b/README.md
@@ -168,10 +168,10 @@ separator = Separator()
 separator.load_model()
 
 # Perform the separation on specific audio files without reloading the model
-primary_stem_path, secondary_stem_path = separator.separate('audio1.wav')
+primary_stem_output_path, secondary_stem_output_path = separator.separate('audio1.wav')
 
-print(f'Primary stem saved at {primary_stem_path}')
-print(f'Secondary stem saved at {secondary_stem_path}')
+print(f'Primary stem saved at {primary_stem_output_path}')
+print(f'Secondary stem saved at {secondary_stem_output_path}')
 ```
 
 #### Batch processing, or processing with multiple models
diff --git a/audio_separator/separator/architectures/mdx_separator.py b/audio_separator/separator/architectures/mdx_separator.py
@@ -26,6 +26,14 @@ def __init__(self, common_config, arch_config):
         self.overlap = arch_config.get("overlap")
         self.batch_size = arch_config.get("batch_size")
 
+        # Initializing model parameters
+        self.compensate = self.model_data["compensate"]
+        self.dim_f = self.model_data["mdx_dim_f_set"]
+        self.dim_t = 2 ** self.model_data["mdx_dim_t_set"]
+        self.n_fft = self.model_data["mdx_n_fft_scale_set"]
+        
+        self.config_yaml = self.model_data.get("config_yaml", None)
+
         self.logger.debug(f"Model params: primary_stem={self.primary_stem_name}, secondary_stem={self.secondary_stem_name}")
         self.logger.debug(f"Model params: batch_size={self.batch_size}, compensate={self.compensate}, segment_size={self.segment_size}, dim_f={self.dim_f}, dim_t={self.dim_t}")
         self.logger.debug(f"Model params: n_fft={self.n_fft}, hop={self.hop_length}")
@@ -107,20 +115,20 @@ def separate(self, audio_file_path):
         # Save and process the secondary stem if needed
         if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
             self.logger.info(f"Saving {self.secondary_stem_name} stem...")
-            if not self.secondary_stem_path:
-                self.secondary_stem_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")
-            self.secondary_source_map = self.final_process(self.secondary_stem_path, self.secondary_source, self.secondary_stem_name)
-            output_files.append(self.secondary_stem_path)
+            if not self.secondary_stem_output_path:
+                self.secondary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.secondary_stem_name})_{self.model_name}.{self.output_format.lower()}")
+            self.secondary_source_map = self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
+            output_files.append(self.secondary_stem_output_path)
 
         # Save and process the primary stem if needed
         if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
             self.logger.info(f"Saving {self.primary_stem_name} stem...")
-            if not self.primary_stem_path:
-                self.primary_stem_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}")
+            if not self.primary_stem_output_path:
+                self.primary_stem_output_path = os.path.join(f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}")
             if not isinstance(self.primary_source, np.ndarray):
                 self.primary_source = source.T
-            self.primary_source_map = self.final_process(self.primary_stem_path, self.primary_source, self.primary_stem_name)
-            output_files.append(self.primary_stem_path)
+            self.primary_source_map = self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
+            output_files.append(self.primary_stem_output_path)
 
         # TODO: In UVR, this is where the vocal split chain gets processed - see process_vocal_split_chain()
 
diff --git a/audio_separator/separator/architectures/vr_separator.py b/audio_separator/separator/architectures/vr_separator.py
@@ -1,13 +1,14 @@
 """Module for separating audio sources using VR architecture models."""
 
 import os
+import sys
+import math
+
 import torch
 import librosa
-import onnxruntime as ort
 import numpy as np
-import onnx2torch
+
 from audio_separator.separator import spec_utils
-from audio_separator.separator.stft import STFT
 from audio_separator.separator.common_separator import CommonSeparator
 
 
@@ -20,31 +21,10 @@ class VRSeparator(CommonSeparator):
     def __init__(self, common_config, arch_config):
         super().__init__(config=common_config)
 
-        self.hop_length = arch_config.get("hop_length")
-        self.segment_size = arch_config.get("segment_size")
-        self.overlap = arch_config.get("overlap")
-        self.batch_size = arch_config.get("batch_size")
+        self.logger.debug(f"Model data: ", self.model_data)
 
         self.logger.debug(f"Model params: primary_stem={self.primary_stem_name}, secondary_stem={self.secondary_stem_name}")
-        self.logger.debug(f"Model params: batch_size={self.batch_size}, compensate={self.compensate}, segment_size={self.segment_size}, dim_f={self.dim_f}, dim_t={self.dim_t}")
-        self.logger.debug(f"Model params: n_fft={self.n_fft}, hop={self.hop_length}")
-
-        # Loading the model for inference
-        self.logger.debug("Loading ONNX model for inference...")
-        if self.segment_size == self.dim_t:
-            ort_ = ort.InferenceSession(self.model_path, providers=self.onnx_execution_provider)
-            self.model_run = lambda spek: ort_.run(None, {"input": spek.cpu().numpy()})[0]
-            self.logger.debug("Model loaded successfully using ONNXruntime inferencing session.")
-        else:
-            self.model_run = onnx2torch.convert(self.model_path)
-            self.model_run.to(self.torch_device).eval()
-            self.logger.warning("Model converted from onnx to pytorch due to segment size not matching dim_t, processing may be slower.")
 
-        self.n_bins = None
-        self.trim = None
-        self.chunk_size = None
-        self.gen_size = None
-        self.stft = None
 
         self.primary_source = None
         self.secondary_source = None
@@ -53,49 +33,52 @@ def __init__(self, common_config, arch_config):
         self.secondary_source_map = None
         self.primary_source_map = None
 
+        self.is_vr_51_model = model_data.is_vr_51_model
+
+    def separate(self, audio_file_path):
+        """
+        Separates the audio file into primary and secondary sources based on the model's configuration.
+        It processes the mix, demixes it into sources, normalizes the sources, and saves the output files.
 
+        Args:
+            audio_file_path (str): The path to the audio file to be processed.
 
+        Returns:
+            list: A list of paths to the output files generated by the separation process.
+        """
+        self.primary_source = None
+        self.secondary_source = None
 
+        self.audio_file_path = audio_file_path
+        self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]
 
+        self.logger.debug("Starting inference...")
 
-    def seperate(self):
-        self.logger.debug("Starting separation process in SeperateVR...")
-        if self.primary_model_name == self.model_basename and isinstance(self.primary_sources, tuple):
-            self.logger.debug("Using cached primary sources...")
-            y_spec, v_spec = self.primary_sources
-            self.load_cached_sources()
+        nn_arch_sizes = [31191, 33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227]  # default
+        vr_5_1_models = [56817, 218409]
+        model_size = math.ceil(os.stat(self.model_path).st_size / 1024)
+        nn_arch_size = min(nn_arch_sizes, key=lambda x: abs(x - model_size))
+        self.logger.debug(f"Model size determined: {model_size}, NN architecture size: {nn_arch_size}")
+
+        if nn_arch_size in vr_5_1_models or self.is_vr_51_model:
+            self.logger.debug("Using CascadedNet for VR 5.1 model...")
+            self.model_run = nets_new.CascadedNet(self.mp.param["bins"] * 2, nn_arch_size, nout=self.model_capacity[0], nout_lstm=self.model_capacity[1])
+            self.is_vr_51_model = True
         else:
-            self.logger.debug("Starting inference...")
-            self.start_inference_console_write()
-
-            device = self.device
-            self.logger.debug(f"Device set to: {device}")
-
-            nn_arch_sizes = [31191, 33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227]  # default
-            vr_5_1_models = [56817, 218409]
-            model_size = math.ceil(os.stat(self.model_path).st_size / 1024)
-            nn_arch_size = min(nn_arch_sizes, key=lambda x: abs(x - model_size))
-            self.logger.debug(f"Model size determined: {model_size}, NN architecture size: {nn_arch_size}")
-
-            if nn_arch_size in vr_5_1_models or self.is_vr_51_model:
-                self.logger.debug("Using CascadedNet for VR 5.1 model...")
-                self.model_run = nets_new.CascadedNet(self.mp.param["bins"] * 2, nn_arch_size, nout=self.model_capacity[0], nout_lstm=self.model_capacity[1])
-                self.is_vr_51_model = True
-            else:
-                self.logger.debug("Determining model capacity...")
-                self.model_run = nets.determine_model_capacity(self.mp.param["bins"] * 2, nn_arch_size)
+            self.logger.debug("Determining model capacity...")
+            self.model_run = nets.determine_model_capacity(self.mp.param["bins"] * 2, nn_arch_size)
 
-            self.model_run.load_state_dict(torch.load(self.model_path, map_location=cpu))
-            self.model_run.to(device)
-            self.logger.debug("Model loaded and moved to device.")
+        self.model_run.load_state_dict(torch.load(self.model_path, map_location=cpu))
+        self.model_run.to(device)
+        self.logger.debug("Model loaded and moved to device.")
 
-            self.running_inference_console_write()
+        self.running_inference_console_write()
 
-            y_spec, v_spec = self.inference_vr(self.loading_mix(), device, self.aggressiveness)
-            self.logger.debug("Inference completed.")
-            if not self.is_vocal_split_model:
-                self.cache_source((y_spec, v_spec))
-            self.write_to_console(DONE, base_text="")
+        y_spec, v_spec = self.inference_vr(self.loading_mix(), device, self.aggressiveness)
+        self.logger.debug("Inference completed.")
+        if not self.is_vocal_split_model:
+            self.cache_source((y_spec, v_spec))
+        self.write_to_console(DONE, base_text="")
 
         if self.is_secondary_model_activated and self.secondary_model:
             self.logger.debug("Processing secondary model...")
@@ -104,7 +87,7 @@ def seperate(self):
             )
 
         if not self.is_secondary_stem_only:
-            primary_stem_path = os.path.join(self.export_path, f"{self.audio_file_base}_({self.primary_stem}).wav")
+            primary_stem_output_path = os.path.join(self.export_path, f"{self.audio_file_base}_({self.primary_stem}).wav")
             self.logger.debug(f"Processing primary stem: {self.primary_stem}")
             if not isinstance(self.primary_source, np.ndarray):
                 self.primary_source = self.spec_to_wav(y_spec).T
@@ -113,11 +96,11 @@ def seperate(self):
                     self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T
                     self.logger.debug("Resampling primary source to 44100Hz.")
 
-            self.primary_source_map = self.final_process(primary_stem_path, self.primary_source, self.secondary_source_primary, self.primary_stem, 44100)
+            self.primary_source_map = self.final_process(primary_stem_output_path, self.primary_source, self.secondary_source_primary, self.primary_stem, 44100)
             self.logger.debug("Primary stem processed.")
 
         if not self.is_primary_stem_only:
-            secondary_stem_path = os.path.join(self.export_path, f"{self.audio_file_base}_({self.secondary_stem}).wav")
+            secondary_stem_output_path = os.path.join(self.export_path, f"{self.audio_file_base}_({self.secondary_stem}).wav")
             self.logger.debug(f"Processing secondary stem: {self.secondary_stem}")
             if not isinstance(self.secondary_source, np.ndarray):
                 self.secondary_source = self.spec_to_wav(v_spec).T
@@ -126,7 +109,7 @@ def seperate(self):
                     self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T
                     self.logger.debug("Resampling secondary source to 44100Hz.")
 
-            self.secondary_source_map = self.final_process(secondary_stem_path, self.secondary_source, self.secondary_source_secondary, self.secondary_stem, 44100)
+            self.secondary_source_map = self.final_process(secondary_stem_output_path, self.secondary_source, self.secondary_source_secondary, self.secondary_stem, 44100)
             self.logger.debug("Secondary stem processed.")
 
         clear_gpu_cache()
diff --git a/audio_separator/separator/common_separator.py b/audio_separator/separator/common_separator.py
@@ -15,31 +15,38 @@ class CommonSeparator:
     def __init__(self, config):
 
         self.logger: Logger = config.get("logger")
+
+        # Inferencing device / acceleration config
         self.torch_device = config.get("torch_device")
         self.onnx_execution_provider = config.get("onnx_execution_provider")
+
+        # Model data
         self.model_name = config.get("model_name")
         self.model_path = config.get("model_path")
         self.model_data = config.get("model_data")
-        self.primary_stem_path = config.get("primary_stem_path")
-        self.secondary_stem_path = config.get("secondary_stem_path")
-        self.output_format = config.get("output_format")
-        self.output_subtype = config.get("output_subtype")
+
+        # Optional custom output paths for the primary and secondary stems
+        # If left as None, the arch-specific class decides the output filename, e.g. something like:
+        # f"{self.audio_file_base}_({self.primary_stem_name})_{self.model_name}.{self.output_format.lower()}"
+        self.primary_stem_output_path = config.get("primary_stem_output_path")
+        self.secondary_stem_output_path = config.get("secondary_stem_output_path")
+
+        # Output directory and format
         self.output_dir = config.get("output_dir")
+        self.output_format = config.get("output_format")
+
+        # Functional options which are applicable to all architectures and the user may tweak to affect the output
         self.normalization_threshold = config.get("normalization_threshold")
         self.denoise_enabled = config.get("denoise_enabled")
         self.output_single_stem = config.get("output_single_stem")
         self.invert_using_spec = config.get("invert_using_spec")
         self.sample_rate = config.get("sample_rate")
 
-        # Initializing model parameters
-        self.compensate, self.dim_f, self.dim_t, self.n_fft, self.primary_stem_name = (
-            self.model_data["compensate"],
-            self.model_data["mdx_dim_f_set"],
-            2 ** self.model_data["mdx_dim_t_set"],
-            self.model_data["mdx_n_fft_scale_set"],
-            self.model_data["primary_stem"],
-        )
+        # Model specific properties
+        self.primary_stem_name = self.model_data["primary_stem"]
         self.secondary_stem_name = "Vocals" if self.primary_stem_name == "Instrumental" else "Instrumental"
+        self.is_karaoke = self.model_data.get("is_karaoke", False)
+        self.is_bv_model = self.model_data.get("is_bv_model", False)
 
         # In UVR, these variables are set but either aren't useful or are better handled in audio-separator.
         # Leaving these comments explaining to help myself or future developers understand why these aren't in audio-separator.
@@ -62,12 +69,6 @@ def __init__(self, config):
 
         self.cached_sources_map = {}
 
-    def prepare_mix(self, mix):
-        """
-        Placeholder method for preparing the mix. Should be overridden by subclasses.
-        """
-        raise NotImplementedError("This method should be overridden by subclasses.")
-
     def separate(self, audio_file_path):
         """
         Placeholder method for separating audio sources. Should be overridden by subclasses.
@@ -79,7 +80,7 @@ def final_process(self, stem_path, source, stem_name):
         Finalizes the processing of a stem by writing the audio to a file and returning the processed source.
         """
         self.logger.debug(f"Finalizing {stem_name} stem processing and writing audio...")
-        self.write_audio(stem_path, source, stem_name=stem_name)
+        self.write_audio(stem_path, source)
 
         return {stem_name: source}
 
@@ -126,11 +127,11 @@ def cached_model_source_holder(self, model_architecture, sources, model_name=Non
         """
         self.cached_sources_map[model_architecture] = {**self.cached_sources_map.get(model_architecture, {}), **{model_name: sources}}
 
-    def write_audio(self, stem_path: str, stem_source, stem_name=None):
+    def write_audio(self, stem_path: str, stem_source):
         """
         Writes the separated audio source to a file.
         """
-        self.logger.debug(f"Entering write_audio with stem_name: {stem_name} and stem_path: {stem_path}")
+        self.logger.debug(f"Entering write_audio with stem_path: {stem_path}")
 
         stem_source = spec_utils.normalize(self.logger, wave=stem_source, max_peak=self.normalization_threshold)
 
diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py
diff --git a/audio_separator/utils/cli.py b/audio_separator/utils/cli.py