feat: add first working implementation

flavioschneider · flavioschneider · commit 4c8bb6fb3d5a · 2022-12-06T09:31:01.000+01:00
diff --git a/README.md b/README.md
@@ -1,10 +1,10 @@
 
-# CQT - PyTorch 
+# CQT - PyTorch
 
-An invertible and differentiable implementation of the Constant-Q Transform (CQT), in PyTorch. 
+An invertible and differentiable implementation of the Constant-Q Transform (CQT) using the Non-stationary Gabor Transform (NSGT), in PyTorch.
 
 ```bash
-pip install cqt-pytorch 
+pip install cqt-pytorch
 ```
 [![PyPI - Python Version](https://img.shields.io/pypi/v/cqt-pytorch?style=flat&colorA=black&colorB=black)](https://pypi.org/project/cqt-pytorch/)
 
@@ -15,7 +15,33 @@ pip install cqt-pytorch
 from cqt_pytorch import CQT
 
 transform = CQT(
-    ...
+    num_octaves = 7,
+    num_bins_per_octave = 65,
+    sample_rate = 48000,
+    block_length = 2 ** 18
 )
 
+# (Random) audio waveform tensor x
+x = torch.randn(1, 2, 2**18) # [1, 2, 262144] = [batch_size, channels, timesteps]
+z = transform.encode(x) # [1, 2, 455, 2796] = [batch_size, channels, frequencies, time]
+y = transform.decode(z) # [1, 2, 262144]
+```
+
+## TODO
+- [ ] Understand why/if inverse window is necessary.
+- [ ] Allow variable audio lengths by chunking.
+
+## Appreciation
+Special thanks to [Eloi Moliner](https://github.com/eloimoliner) for taking the time to help me understand how CQT works. Check out his own implementation with interesting features at [eloimoliner/CQT_pytorch](https://github.com/eloimoliner/CQT_pytorch).
+
+## Citations
+
+```bibtex
+@article{1210.0084,
+Author = {Nicki Holighaus and Monika Dörfler and Gino Angelo Velasco and Thomas Grill},
+Title = {A framework for invertible, real-time constant-Q transforms},
+Year = {2012},
+Eprint = {arXiv:1210.0084},
+Doi = {10.1109/TASL.2012.2234114},
+}
 ```
diff --git a/cqt_pytorch/__init__.py b/cqt_pytorch/__init__.py
@@ -0,0 +1 @@
+from .cqt import CQT
diff --git a/cqt_pytorch/cqt.py b/cqt_pytorch/cqt.py
@@ -1,35 +1,142 @@
-from typing import Optional, TypeVar
+from math import floor
 
 import torch
-from typing_extensions import TypeGuard
+import torch.nn.functional as F
+from torch import Tensor, nn
 
-T = TypeVar("T")
 
+def get_center_frequencies(
+    num_octaves: int,  # C
+    num_bins_per_octave: int,  # B
+    sample_rate: int,  # Xi_s
+) -> Tensor:
+    """Build the log-spaced center frequencies followed by the Nyquist frequency.
+
+    The lowest center frequency is the Nyquist frequency divided by
+    ``2**num_octaves``; every following bin is a factor of
+    ``2**(1 / num_bins_per_octave)`` above the previous one.
+
+    Args:
+        num_octaves: Number of octaves spanned below the Nyquist frequency.
+        num_bins_per_octave: Number of bins inside every octave.
+        sample_rate: Sampling rate; half of it is used as Nyquist frequency.
+
+    Returns:
+        1-D tensor with ``num_octaves * num_bins_per_octave + 1`` entries:
+        the log-spaced bin frequencies, then the Nyquist frequency last.
+        The mirrored (negative-frequency) half is not included.
+    """
+    nyquist = sample_rate / 2
+    lowest = nyquist / (2**num_octaves)
+    bin_count = num_octaves * num_bins_per_octave  # K
+    # Fractional octave offset of every bin above the lowest frequency.
+    octave_offsets = torch.arange(bin_count) / num_bins_per_octave
+    log_spaced = lowest * (2**octave_offsets)
+    return torch.cat([log_spaced, torch.tensor([nyquist])], dim=0)
 
-"""
-Utils
-"""
 
+def get_bandwidths(
+    num_octaves: int,  # C
+    num_bins_per_octave: int,  # B
+    sample_rate: int,  # Xi_s
+    frequencies: Tensor,  # Xi_k
+) -> Tensor:  # Omega_k
+    """Derive the per-bin bandwidths from the center frequencies.
+
+    With ``K = num_octaves * num_bins_per_octave`` the result is laid out as
+    ``[K constant-Q bandwidths, 1 middle entry, the same K bandwidths reversed]``.
+
+    Args:
+        num_octaves: Number of octaves covered by the bins.
+        num_bins_per_octave: Number of bins inside every octave.
+        sample_rate: Sampling rate, used for the middle entry.
+        frequencies: 1-D tensor of center frequencies; it is indexed up to
+            position ``K``, so it needs at least ``K + 1`` entries.
+
+    Returns:
+        1-D tensor with ``2 * K + 1`` bandwidths.
+
+    NOTE(review): the slice starts at index 1, so if ``frequencies`` has no
+    leading DC entry the first center frequency is skipped and entry ``K``
+    (presumably Nyquist) is included; the middle entry then evaluates
+    ``sample_rate - 2 * frequencies[K]``. Confirm this indexing is intended.
+    """
+    bin_count = num_octaves * num_bins_per_octave  # K
+    step_up = 2 ** (1.0 / num_bins_per_octave)
+    step_down = 2 ** (-1.0 / num_bins_per_octave)
+    q_factor = 1.0 / (step_up - step_down)
+
+    selected = frequencies[1 : bin_count + 1]
+    lower_half = selected / q_factor
+    upper_half = torch.flip(selected, dims=[0]) / q_factor
+    middle = torch.tensor([sample_rate - 2 * frequencies[bin_count]])
+    return torch.cat([lower_half, middle, upper_half], dim=0)
 
-def exists(val: Optional[T]) -> TypeGuard[T]:
-    return val is not None
 
+def get_windows_range_indices(lengths: Tensor, positions: Tensor) -> Tensor:
+    """Build one row of consecutive indices per bin.
+
+    Only the first ``lengths.shape[0] // 2`` bins are used. Every row has
+    ``lengths.max()`` entries and covers ``[position - max_length, position)``,
+    so rows may contain negative indices.
+
+    Args:
+        lengths: 1-D tensor of window lengths for all bins.
+        positions: 1-D tensor with the position of every bin.
+
+    Returns:
+        Integer (``long``) tensor of shape ``[num_bins, max_length]``.
+
+    NOTE(review): each range ends at the position instead of being centered
+    on it -- confirm this offset is intended.
+    """
+    bin_count = lengths.shape[0] // 2
+    span = lengths.max()
+    starts = [positions[k] - span for k in range(bin_count)]
+    rows = [torch.arange(start=first, end=first + span) for first in starts]  # type: ignore
+    return torch.stack(rows, dim=0).long()
 
-"""
-CQT
-"""
 
-class CQT(nn.Module):
+def get_windows(lengths: Tensor) -> Tensor:
+    """Stack one zero-padded Hann window per bin.
+
+    Only the first ``lengths.shape[0] // 2`` lengths are used. Each window is
+    a ``torch.hann_window`` of its own length, padded with zeros on both
+    sides up to ``lengths.max()`` so that it sits in the middle of its row.
+
+    Args:
+        lengths: 1-D tensor of window lengths for all bins.
+
+    Returns:
+        Tensor of shape ``[num_bins, max_length]``.
+    """
+    bin_count = lengths.shape[0] // 2
+    widest = lengths.max()
+
+    def centered(length: Tensor) -> Tensor:
+        # Any odd leftover sample of padding goes to the right-hand side.
+        left = floor(widest / 2 - length / 2)
+        right = int(widest - length - left)
+        return F.pad(torch.hann_window(int(length)), pad=(left, right))
+
+    return torch.stack([centered(length) for length in lengths[:bin_count]], dim=0)
+
+
+def get_windows_inverse(windows: Tensor, lengths: Tensor) -> Tensor:
+    """Scale the squared windows by their bin's window length.
+
+    Args:
+        windows: Tensor of shape ``[num_bins, max_length]``.
+        lengths: 1-D tensor of window lengths; only the first ``num_bins``
+            entries are used.
+
+    Returns:
+        Tensor of the same shape as ``windows`` holding
+        ``windows[k, m] ** 2 * lengths[k]``.
+
+    NOTE(review): despite the name, no reciprocal is taken here -- confirm
+    how the result is meant to be applied when decoding.
+    """
+    bin_count = windows.shape[0]
+    squared = windows**2
+    per_bin_length = lengths[:bin_count]
+    return torch.einsum("k m, k -> k m", squared, per_bin_length)
 
+
+class CQT(nn.Module):
     def __init__(
         self,
+        num_octaves: int,
+        num_bins_per_octave: int,
+        sample_rate: int,
+        block_length: int,
     ):
+        """Precompute the windows and index ranges used by encode and decode.
+
+        Args:
+            num_octaves: Number of octaves covered by the log-spaced bins.
+            num_bins_per_octave: Number of bins inside every octave.
+            sample_rate: Sampling rate of the audio.
+            block_length: Number of samples per block; `decode` allocates a
+                spectrum of this length. Presumably `encode` expects inputs
+                of this length as well -- TODO confirm.
+        """
-        super().__init__() 
+        super().__init__()
+        self.block_length = block_length
+
+        # Log-spaced center frequencies followed by the Nyquist frequency.
+        frequencies = get_center_frequencies(
+            num_octaves=num_octaves,
+            num_bins_per_octave=num_bins_per_octave,
+            sample_rate=sample_rate,
+        )
+
+        bandwidths = get_bandwidths(
+            num_octaves=num_octaves,
+            num_bins_per_octave=num_bins_per_octave,
+            sample_rate=sample_rate,
+            frequencies=frequencies,
+        )
+
+        # Rescale bandwidths by block_length / sample_rate and round; the
+        # center frequencies below get the same rescaling. Presumably this
+        # converts them to FFT-bin units of a block_length-sample block.
+        window_lengths = torch.round(bandwidths * block_length / sample_rate)
+
+        # Registered as buffers (not parameters): one row of spectrum indices
+        # per bin, built from the rounded center-frequency positions.
+        self.register_buffer(
+            "windows_range_indices",
+            get_windows_range_indices(
+                lengths=window_lengths,
+                positions=torch.round(frequencies * block_length / sample_rate),
+            ),
+        )
 
+        self.register_buffer("windows", get_windows(lengths=window_lengths))
 
-    def encode(self, x: Tensor) -> Tensor:
-        pass 
+        # NOTE(review): `windows_inverse` is registered here, but `decode`
+        # currently leaves its use as a TODO.
+        self.register_buffer(
+            "windows_inverse",
+            get_windows_inverse(windows=self.windows, lengths=window_lengths),  # type: ignore # noqa
+        )
 
+    def encode(self, waveform: Tensor) -> Tensor:
+        """Transform a waveform into per-bin coefficients.
+
+        The waveform is taken to the frequency domain, one crop of spectrum
+        bins is gathered per row of ``windows_range_indices``, each crop is
+        weighted by its window, and every crop is transformed back with an
+        inverse FFT.
+
+        Args:
+            waveform: Tensor indexed as ``[:, :, indices]``, i.e. with two
+                leading dimensions before the time axis (the README uses
+                ``[batch_size, channels, timesteps]``).
+
+        Returns:
+            Complex tensor of shape ``[batch, channels, num_bins, max_length]``.
+        """
+        spectrum = torch.fft.fft(waveform)
+        # Index rows may be negative; tensor indexing wraps them from the end.
+        band_crops = spectrum[:, :, self.windows_range_indices]
+        weighted = torch.einsum("... t k, t k -> ... t k", band_crops, self.windows)
+        return torch.fft.ifft(weighted)
 
-    def decode(self, x: Tensor) -> Tensor:
-        pass 
+    def decode(self, transform: Tensor) -> Tensor:
+        """Reconstruct a waveform from the coefficients produced by `encode`.
+
+        Every bin is taken back to the frequency domain with an FFT, the
+        crops are summed into a spectrum of ``block_length`` bins at the
+        positions given by ``windows_range_indices``, and the spectrum is
+        transformed back with an inverse FFT.
+
+        Args:
+            transform: Tensor of shape
+                ``[batch, channels, num_bins, max_length]``.
+
+        Returns:
+            Tensor of shape ``[batch, channels, block_length]`` with the same
+            dtype and device as ``transform``.
+        """
+        b, c, length = *transform.shape[0:2], self.block_length
+        # Undo the per-bin inverse FFT applied at the end of `encode`.
+        crops_windowed = torch.fft.fft(transform)
+        # TODO: decide whether the crops must be un-windowed here using
+        # `self.windows_inverse`; for now they are accumulated as they are.
+        crops_unwindowed = crops_windowed
+        frequencies = torch.zeros(b, c, length).to(transform)
+        frequencies.scatter_add_(
+            dim=-1,
+            # Wrap indices into [0, length) so that ranges reaching below
+            # bin 0 land on valid positions. (The modulus was previously the
+            # undefined name `l`, which raised NameError on every call.)
+            index=self.windows_range_indices.view(-1).expand(b, c, -1) % length,  # type: ignore # noqa
+            src=crops_unwindowed.view(b, c, -1),
+        )
+        return torch.fft.ifft(frequencies)