Refactored STFT class and added unit tests

beveradb · beveradb · commit 03103f12b7e3 · 2024-01-30T01:34:56.000-05:00
diff --git a/audio_separator/separator/stft.py b/audio_separator/separator/stft.py
@@ -9,50 +9,117 @@ def __init__(self, logger, n_fft, hop_length, dim_f, device):
         self.logger = logger
         self.n_fft = n_fft
         self.hop_length = hop_length
-        self.window = torch.hann_window(window_length=self.n_fft, periodic=True)
         self.dim_f = dim_f
         self.device = device
+        # Create a Hann window tensor for use in the STFT.
+        self.hann_window = torch.hann_window(window_length=self.n_fft, periodic=True)
 
-    def __call__(self, x):
-        x_is_mps = not x.device.type in ["cuda", "cpu"]
-        if x_is_mps:
-            x = x.cpu()
-
-        initial_shape = x.shape
-        window = self.window.to(x.device)
-        batch_dims = x.shape[:-2]
-        c, t = x.shape[-2:]
-        x = x.reshape([-1, t])
-        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True, return_complex=False)
-        x = x.permute([0, 3, 1, 2])
-        x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape([*batch_dims, c * 2, -1, x.shape[-1]])
-
-        if x_is_mps:
-            x = x.to(self.device)
-
-        # self.logger.debug(f"STFT applied. Initial shape: {initial_shape} Resulting shape: {x.shape}")
-        return x[..., : self.dim_f, :]
-
-    def inverse(self, x):
-        x_is_mps = not x.device.type in ["cuda", "cpu"]
-        if x_is_mps:
-            x = x.cpu()
-
-        initial_shape = x.shape
-        window = self.window.to(x.device)
-        batch_dims = x.shape[:-3]
-        c, f, t = x.shape[-3:]
-        n = self.n_fft // 2 + 1
-        f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device)
-        x = torch.cat([x, f_pad], -2)
-        x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t])
-        x = x.permute([0, 2, 3, 1])
-        x = x[..., 0] + x[..., 1] * 1.0j
-        x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True)
-        x = x.reshape([*batch_dims, 2, -1])
-
-        if x_is_mps:
-            x = x.to(self.device)
-
-        # self.logger.debug(f"Inverse STFT applied. Initial shape: {initial_shape} Resulting shape: {x.shape}")
-        return x
+    def __call__(self, input_tensor):
+        # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA).
+        is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"]
+
+        # If on a non-standard device, temporarily move the tensor to CPU for processing.
+        if is_non_standard_device:
+            input_tensor = input_tensor.cpu()
+
+        # Transfer the pre-defined window tensor to the same device as the input tensor.
+        stft_window = self.hann_window.to(input_tensor.device)
+
+        # Extract batch dimensions (all dimensions except the last two which are channel and time).
+        batch_dimensions = input_tensor.shape[:-2]
+
+        # Extract channel and time dimensions (last two dimensions of the tensor).
+        channel_dim, time_dim = input_tensor.shape[-2:]
+
+        # Reshape the tensor to merge batch and channel dimensions for STFT processing.
+        reshaped_tensor = input_tensor.reshape([-1, time_dim])
+
+        # Perform the Short-Time Fourier Transform (STFT) on the reshaped tensor.
+        stft_output = torch.stft(
+            reshaped_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True, return_complex=False
+        )
+
+        # Move the trailing real/imaginary axis forward: (batch, freq, time, 2) -> (batch, 2, freq, time).
+        permuted_stft_output = stft_output.permute([0, 3, 1, 2])
+
+        # Restore the original batch dimensions and fold the real/imaginary axis into the channel axis (channel_dim * 2), keeping frequency and time as the last two dimensions.
+        final_output = permuted_stft_output.reshape([*batch_dimensions, channel_dim, 2, -1, permuted_stft_output.shape[-1]]).reshape(
+            [*batch_dimensions, channel_dim * 2, -1, permuted_stft_output.shape[-1]]
+        )
+
+        # If the input tensor was on a non-standard device, move the processed tensor to the configured device (self.device).
+        if is_non_standard_device:
+            final_output = final_output.to(self.device)
+
+        # Return the transformed tensor, sliced to keep only the first `dim_f` frequency bins.
+        return final_output[..., : self.dim_f, :]
+
+    def pad_frequency_dimension(self, input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins):
+        """
+        Adds zero padding to the frequency dimension of the input tensor.
+        """
+        # Create a padding tensor for the frequency dimension
+        freq_padding = torch.zeros([*batch_dimensions, channel_dim, num_freq_bins - freq_dim, time_dim]).to(input_tensor.device)
+
+        # Concatenate the padding to the input tensor along the frequency dimension.
+        padded_tensor = torch.cat([input_tensor, freq_padding], -2)
+
+        return padded_tensor
+
+    def calculate_inverse_dimensions(self, input_tensor):
+        # Extract batch dimensions and frequency-time dimensions.
+        batch_dimensions = input_tensor.shape[:-3]
+        channel_dim, freq_dim, time_dim = input_tensor.shape[-3:]
+
+        # Calculate the number of frequency bins for the inverse STFT.
+        num_freq_bins = self.n_fft // 2 + 1
+
+        return batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins
+
+    def prepare_for_istft(self, padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim):
+        """
+        Prepares the tensor for Inverse Short-Time Fourier Transform (ISTFT) by reshaping
+        and creating a complex tensor from the real and imaginary parts.
+        """
+        # Reshape the tensor to separate real and imaginary parts and prepare for ISTFT.
+        reshaped_tensor = padded_tensor.reshape([*batch_dimensions, channel_dim // 2, 2, num_freq_bins, time_dim])
+
+        # Flatten batch dimensions and rearrange for ISTFT.
+        flattened_tensor = reshaped_tensor.reshape([-1, 2, num_freq_bins, time_dim])
+
+        # Move the real/imaginary axis to the end: (batch, 2, freq, time) -> (batch, freq, time, 2).
+        permuted_tensor = flattened_tensor.permute([0, 2, 3, 1])
+
+        # Combine real and imaginary parts into a complex tensor.
+        complex_tensor = permuted_tensor[..., 0] + permuted_tensor[..., 1] * 1.0j
+
+        return complex_tensor
+
+    def inverse(self, input_tensor):
+        # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA).
+        is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"]
+
+        # If on a non-standard device, temporarily move the tensor to CPU for processing.
+        if is_non_standard_device:
+            input_tensor = input_tensor.cpu()
+
+        # Transfer the pre-defined Hann window tensor to the same device as the input tensor.
+        stft_window = self.hann_window.to(input_tensor.device)
+
+        batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins = self.calculate_inverse_dimensions(input_tensor)
+
+        padded_tensor = self.pad_frequency_dimension(input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins)
+
+        complex_tensor = self.prepare_for_istft(padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim)
+
+        # Perform the Inverse Short-Time Fourier Transform (ISTFT).
+        istft_result = torch.istft(complex_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True)
+
+        # Reshape ISTFT result to restore the original batch dimensions; the channel axis is hard-coded to 2.
+        final_output = istft_result.reshape([*batch_dimensions, 2, -1])
+
+        # If the input tensor was on a non-standard device, move the processed tensor to the configured device (self.device).
+        if is_non_standard_device:
+            final_output = final_output.to(self.device)
+
+        return final_output
diff --git a/pytest.ini b/pytest.ini
@@ -1,4 +1,5 @@
 # Used by PyDub, which uses a pure-python fallback when needed already, not an issue
 [pytest]
 filterwarnings =
+    ignore:stft with return_complex=False is deprecated:UserWarning
     ignore:'audioop' is deprecated:DeprecationWarning
diff --git a/tests/unit/test_stft.py b/tests/unit/test_stft.py
@@ -0,0 +1,177 @@
+import unittest
+import numpy as np
+import torch
+from unittest.mock import Mock, patch
+from audio_separator.separator.stft import STFT
+
+# Short-Time Fourier Transform (STFT) Process Overview:
+#
+# STFT transforms a time-domain signal into a frequency-domain representation.
+#   This transformation is achieved by dividing the signal into short frames (or segments) and applying the Fourier Transform to each frame.
+#
+# n_fft: The number of points used in the Fourier Transform, which determines the resolution of the frequency domain representation.
+#   Essentially, it dictates how many frequency bins we get in our STFT.
+#
+# hop_length: The number of samples by which we shift each frame of the signal.
+#   It affects the overlap between consecutive frames. If the hop_length is less than n_fft, we get overlapping frames.
+#
+# Windowing: Each frame of the signal is multiplied by a window function (e.g. Hann window) before applying the Fourier Transform.
+#   This is done to minimize discontinuities at the borders of each frame.
+
+
+class TestSTFT(unittest.TestCase):
+    def setUp(self):
+        self.n_fft = 2048
+        self.hop_length = 512
+        self.dim_f = 1025
+        self.device = torch.device("cpu")
+        self.stft = STFT(logger=Mock(), n_fft=self.n_fft, hop_length=self.hop_length, dim_f=self.dim_f, device=self.device)
+
+    def create_mock_tensor(self, shape, device=None):
+        tensor = torch.rand(shape)
+        if device:
+            tensor = tensor.to(device)
+        return tensor
+
+    def test_stft_initialization(self):
+        self.assertEqual(self.stft.n_fft, self.n_fft)
+        self.assertEqual(self.stft.hop_length, self.hop_length)
+        self.assertEqual(self.stft.dim_f, self.dim_f)
+        self.assertEqual(self.stft.device.type, "cpu")
+        self.assertIsInstance(self.stft.hann_window, torch.Tensor)
+
+    def test_stft_call(self):
+        input_tensor = self.create_mock_tensor((1, 16000))
+
+        # Apply STFT
+        stft_result = self.stft(input_tensor)
+
+        # Test conditions
+        self.assertIsNotNone(stft_result)
+        self.assertIsInstance(stft_result, torch.Tensor)
+
+        # Calculate the expected shape based on input parameters:
+
+        # Frequency Dimension (dim_f): This corresponds to the number of frequency bins in the STFT output.
+        #   In the case of a real-valued input signal (like audio), the Fourier Transform produces a symmetric output, so only one half is kept.
+        #   Hence, for an n_fft of 2048, we get n_fft // 2 + 1 = 1025 frequency bins (from 0 Hz to the Nyquist frequency).
+        #   However, we often don't need all of those bins.
+        #   So, dim_f is used to specify how many of the lowest frequency bins we keep.
+        #   In this test, it's set to 1025, which is exactly n_fft // 2 + 1, so no bins are discarded.
+
+        # Time Dimension: This corresponds to how many frames (or segments) the input signal has been divided into.
+        #   It depends on the length of the input signal and the hop_length.
+        #   The formula for calculating the number of frames is derived from how we stride the window across the signal:
+        #     Length of Input Signal: Let's denote it as L. In this test, the input tensor has a shape of [1, 16000], so L is 16000 (ignoring the batch dimension for simplicity).
+        #     Number of Frames: The number of frames depends on how we stride the window across the signal. For each frame, we move the window by hop_length samples.
+        #     Therefore, the number of frames N_frames can be roughly estimated by dividing the length of the signal by the hop_length.
+        #     However, because the STFT is computed with center=True, the signal is padded by n_fft // 2 samples on both sides, which yields one extra frame. This gives us N_frames = (L // hop_length) + 1.
+
+        # Putting It All Together
+        #   expected_shape thus becomes (dim_f, N_frames), which is (1025, (16000 // 512) + 1) in this test case.
+
+        expected_shape = (self.dim_f, (input_tensor.shape[1] // self.hop_length) + 1)
+
+        self.assertEqual(stft_result.shape[-2:], expected_shape)
+
+    def test_calculate_inverse_dimensions(self):
+        # Create a sample input tensor
+        sample_input = torch.randn(1, 2, 500, 32)  # Batch, Channel, Frequency, Time dimensions
+        batch_dims, channel_dim, freq_dim, time_dim, num_freq_bins = self.stft.calculate_inverse_dimensions(sample_input)
+
+        # Expected values
+        expected_num_freq_bins = self.n_fft // 2 + 1
+
+        # Assertions
+        self.assertEqual(batch_dims, sample_input.shape[:-3])
+        self.assertEqual(channel_dim, 2)
+        self.assertEqual(freq_dim, 500)
+        self.assertEqual(time_dim, 32)
+        self.assertEqual(num_freq_bins, expected_num_freq_bins)
+
+    def test_pad_frequency_dimension(self):
+        # Create a sample input tensor
+        sample_input = torch.randn(1, 2, 500, 32)  # Batch, Channel, Frequency, Time dimensions
+        batch_dims, channel_dim, freq_dim, time_dim, num_freq_bins = self.stft.calculate_inverse_dimensions(sample_input)
+
+        # Apply padding
+        padded_output = self.stft.pad_frequency_dimension(sample_input, batch_dims, channel_dim, freq_dim, time_dim, num_freq_bins)
+
+        # Expected frequency dimension after padding
+        expected_freq_dim = num_freq_bins
+
+        # Assertions
+        self.assertEqual(padded_output.shape[-2], expected_freq_dim)
+
+    def test_prepare_for_istft(self):
+        # Create a sample input tensor
+        sample_input = torch.randn(1, 2, 500, 32)  # Batch, Channel, Frequency, Time dimensions
+        batch_dims, channel_dim, freq_dim, time_dim, num_freq_bins = self.stft.calculate_inverse_dimensions(sample_input)
+        padded_output = self.stft.pad_frequency_dimension(sample_input, batch_dims, channel_dim, freq_dim, time_dim, num_freq_bins)
+
+        # Apply prepare_for_istft
+        complex_tensor = self.stft.prepare_for_istft(padded_output, batch_dims, channel_dim, num_freq_bins, time_dim)
+
+        # Calculate the expected flattened batch size (flattening batch and channel dimensions)
+        expected_flattened_batch_size = batch_dims[0] * (channel_dim // 2)
+
+        # Expected shape of the complex tensor
+        expected_shape = (expected_flattened_batch_size, num_freq_bins, time_dim)
+
+        # Assertions
+        self.assertEqual(complex_tensor.shape, expected_shape)
+
+    def test_inverse_device_handling(self):
+        # Create a mock tensor with the correct input shape
+        input_tensor = torch.rand(1, 2, 1025, 32)  # shape matching output of STFT
+
+        # Initialize STFT
+        stft = STFT(logger=MockLogger(), n_fft=2048, hop_length=512, dim_f=1025, device="cpu")
+
+        # Apply inverse STFT
+        output_tensor = stft.inverse(input_tensor)
+
+        # Check if the output tensor is on the CPU
+        self.assertEqual(output_tensor.device.type, "cpu")
+
+    def test_inverse_output_shape(self):
+        # Create a mock tensor
+        input_tensor = torch.rand(1, 2, 1025, 32)  # shape matching output of STFT
+
+        # Initialize STFT
+        stft = STFT(logger=MockLogger(), n_fft=2048, hop_length=512, dim_f=1025, device="cpu")
+
+        # Apply inverse STFT
+        output_tensor = stft.inverse(input_tensor)
+
+        # Expected output shape: (Batch size, Channel dimension, Time dimension)
+        expected_shape = (1, 2, 7936)  # ISTFT yields hop_length * (frames - 1) = 512 * 31 = 15872 samples, which inverse() reshapes into 2 rows of 7936
+
+        # Check if the output tensor has the expected shape
+        self.assertEqual(output_tensor.shape, expected_shape)
+
+    def test_stft_with_mps_device(self):
+        mps_device = torch.device("mps")
+        self.stft.device = mps_device
+        input_tensor = self.create_mock_tensor((1, 16000), device=mps_device)
+        stft_result = self.stft(input_tensor)
+        self.assertIsNotNone(stft_result)
+        self.assertIsInstance(stft_result, torch.Tensor)
+
+    def test_inverse_with_mps_device(self):
+        mps_device = torch.device("mps")
+        self.stft.device = mps_device
+        input_tensor = self.create_mock_tensor((1, 2, 1025, 32), device=mps_device)
+        istft_result = self.stft.inverse(input_tensor)
+        self.assertIsNotNone(istft_result)
+        self.assertIsInstance(istft_result, torch.Tensor)
+
+
+# Mock logger to use in tests
+class MockLogger:
+    def debug(self, message):
+        pass
+
+
+if __name__ == "__main__":
+    unittest.main()