maintenance

sp-nitech · Nov 12, 2024 · d5420df · d5420df
1 parent 3dbad2d
commit d5420df
Show file tree

Hide file tree

Showing 8 changed files with 57 additions and 14 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -21,8 +21,8 @@ jobs:
             torch: 2.0.0
             torchaudio: 2.0.1
           - python: 3.12
-            torch: 2.5.0
-            torchaudio: 2.5.0
+            torch: 2.5.1
+            torchaudio: 2.5.1
 
     steps:
       - name: Clone

diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 [![Stable Manual](https://img.shields.io/badge/docs-stable-blue.svg)](https://sp-nitech.github.io/diffsptk/2.2.0/)
 [![Downloads](https://static.pepy.tech/badge/diffsptk)](https://pepy.tech/project/diffsptk)
 [![Python Version](https://img.shields.io/pypi/pyversions/diffsptk.svg)](https://pypi.python.org/pypi/diffsptk)
-[![PyTorch Version](https://img.shields.io/badge/pytorch-2.0.0%20%7C%202.5.0-orange.svg)](https://pypi.python.org/pypi/diffsptk)
+[![PyTorch Version](https://img.shields.io/badge/pytorch-2.0.0%20%7C%202.5.1-orange.svg)](https://pypi.python.org/pypi/diffsptk)
 [![PyPI Version](https://img.shields.io/pypi/v/diffsptk.svg)](https://pypi.python.org/pypi/diffsptk)
 [![Codecov](https://codecov.io/gh/sp-nitech/diffsptk/branch/master/graph/badge.svg)](https://app.codecov.io/gh/sp-nitech/diffsptk)
 [![License](https://img.shields.io/github/license/sp-nitech/diffsptk.svg)](https://github.com/sp-nitech/diffsptk/blob/master/LICENSE)

diff --git a/diffsptk/misc/utils.py b/diffsptk/misc/utils.py
@@ -134,6 +134,27 @@ def remove_gain(a, value=1, return_gain=False):
     return ret
 
 
+def get_resample_params(mode="kaiser_best"):  # mode: "kaiser_best" or "kaiser_fast"
+    # Return a preset dict of resampler keyword arguments; values from https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html
+    if mode == "kaiser_best":
+        params = {  # wider low-pass filter (64) and higher rolloff than "kaiser_fast"
+            "lowpass_filter_width": 64,
+            "rolloff": 0.9475937167399596,
+            "resampling_method": "sinc_interp_kaiser",
+            "beta": 14.769656459379492,
+        }
+    elif mode == "kaiser_fast":
+        params = {  # narrower low-pass filter (16) and lower rolloff than "kaiser_best"
+            "lowpass_filter_width": 16,
+            "rolloff": 0.85,
+            "resampling_method": "sinc_interp_kaiser",
+            "beta": 8.555504641634386,
+        }
+    else:
+        raise ValueError("Only kaiser_best and kaiser_fast are supported.")  # any other mode (including None) is rejected
+    return params  # a new dict on every call; callers in this diff merge it with kwargs.update(...)
+
+
 def get_alpha(sr, mode="hts", n_freq=10, n_alpha=100):
     """Compute an appropriate frequency warping factor under given sample rate.
 

diff --git a/diffsptk/modules/cqt.py b/diffsptk/modules/cqt.py
@@ -37,6 +37,7 @@
 
 from ..misc.utils import Lambda
 from ..misc.utils import delayed_import
+from ..misc.utils import get_resample_params
 from ..misc.utils import numpy_to_torch
 from .stft import ShortTimeFourierTransform as STFT
 
@@ -79,6 +80,9 @@ class ConstantQTransform(nn.Module):
     scale : bool
         If True, scale the CQT responce by the length of filter.
 
+    res_type : ['kaiser_best', 'kaiser_fast'] or None
+        Resampling type.
+
     **kwargs : additional keyword arguments
         See `torchaudio.transforms.Resample
         <https://pytorch.org/audio/main/generated/torchaudio.transforms.Resample.html>`_.
@@ -99,6 +103,7 @@ def __init__(
         sparsity=1e-2,
         window="hann",
         scale=True,
+        res_type="kaiser_best",
         **kwargs,
     ):
         super().__init__()
@@ -144,6 +149,8 @@ def __init__(
         downsample_count = early_downsample_count(
             sample_rate * 0.5, filter_cutoff, frame_period, n_octave
         )
+        if res_type is not None:
+            kwargs.update(get_resample_params(res_type))
         if 0 < downsample_count:
             downsample_factor = 2**downsample_count
             early_downsample.append(

diff --git a/diffsptk/modules/excite.py b/diffsptk/modules/excite.py
@@ -135,13 +135,20 @@ def _forward(p, frame_period, voiced_region, unvoiced_region, polarity):
             unipolar = polarity == "unipolar"
         e = torch.zeros_like(p)
         if voiced_region == "pulse":
-            r = torch.ceil(phase)
-            r = F.pad(r, (1, 0))
-            pulse_pos = torch.ge(torch.diff(r), 1)
+
+            def get_pulse_pos(p):
+                r = torch.ceil(p)
+                r = F.pad(r, (1, 0))
+                return torch.ge(torch.diff(r), 1)
+
             if unipolar:
+                pulse_pos = get_pulse_pos(phase)
                 e[pulse_pos] = torch.sqrt(p[pulse_pos])
             else:
-                raise RuntimeError
+                pulse_pos1 = get_pulse_pos(phase)
+                pulse_pos2 = get_pulse_pos(0.5 * phase)
+                e[pulse_pos1] = torch.sqrt(p[pulse_pos1])
+                e[pulse_pos1 & ~pulse_pos2] *= -1
         elif voiced_region == "sinusoidal":
             if unipolar:
                 e[mask] = 0.5 * (1 - torch.cos(TWO_PI * phase[mask]))

diff --git a/diffsptk/modules/icqt.py b/diffsptk/modules/icqt.py
@@ -36,6 +36,7 @@
 import torchaudio
 
 from ..misc.utils import delayed_import
+from ..misc.utils import get_resample_params
 from ..misc.utils import numpy_to_torch
 from .istft import InverseShortTimeFourierTransform as ISTFT
 
@@ -78,6 +79,9 @@ class InverseConstantQTransform(nn.Module):
     scale : bool
         If True, scale the CQT responce by the length of filter.
 
+    res_type : ['kaiser_best', 'kaiser_fast'] or None
+        Resampling type.
+
     **kwargs : additional keyword arguments
         See `torchaudio.transforms.Resample
         <https://pytorch.org/audio/main/generated/torchaudio.transforms.Resample.html>`_.
@@ -98,6 +102,7 @@ def __init__(
         sparsity=1e-2,
         window="hann",
         scale=True,
+        res_type="kaiser_best",
         **kwargs,
     ):
         super().__init__()
@@ -156,6 +161,9 @@ def __init__(
         transforms = []
         resamplers = []
 
+        if res_type is not None:
+            kwargs.update(get_resample_params(res_type))
+
         for i in range(n_octave):
             n_filter = min(B, K - B * i)
             sl = slice(B * i, B * i + n_filter)

diff --git a/tests/test_cqt.py b/tests/test_cqt.py
@@ -27,7 +27,8 @@
 @pytest.mark.parametrize("fp", [511, 512])
 @pytest.mark.parametrize("K", [1, 24])
 @pytest.mark.parametrize("scale", [False, True])
-def test_compatibility(device, fp, K, scale, B=12, f_min=32.7):
+@pytest.mark.parametrize("res_type", ["kaiser_best", "kaiser_fast"])
+def test_compatibility(device, fp, K, scale, res_type, B=12, f_min=32.7):
     if device == "cuda" and not torch.cuda.is_available():
         return
 
@@ -45,12 +46,12 @@ def test_compatibility(device, fp, K, scale, B=12, f_min=32.7):
         bins_per_octave=B,
         hop_length=fp,
         scale=scale,
-        res_type="kaiser_best",
+        res_type=res_type,
         dtype=None,
     ).T
 
     cqt = diffsptk.CQT(
-        fp, sr, f_min=f_min, n_bin=K, n_bin_per_octave=B, scale=scale
+        fp, sr, f_min=f_min, n_bin=K, n_bin_per_octave=B, scale=scale, res_type=res_type
     ).to(device)
     c2 = cqt(x).cpu().numpy()
 

diff --git a/tests/test_excite.py b/tests/test_excite.py
@@ -91,15 +91,14 @@ def compute_error(infile):
 )
 @pytest.mark.parametrize("polarity", ["unipolar", "bipolar"])
 def test_waveform(voiced_region, polarity, P=80, verbose=False):
-    if voiced_region == "pulse" and polarity == "bipolar":
-        return
-
     excite = diffsptk.ExcitationGeneration(
         P, voiced_region=voiced_region, unvoiced_region="zeros", polarity=polarity
     )
     pitch = torch.from_numpy(
-        U.call("x2x +sd tools/SPTK/asset/data.short | " f"pitch -s 16 -p {P} -o 0 -a 2")
+        U.call(f"x2x +sd tools/SPTK/asset/data.short | pitch -s 16 -p {P} -o 0 -a 2")
     )
     e = excite(pitch)
+    if voiced_region == "pulse":
+        e = e / e.abs().max()
     if verbose:
         sf.write(f"excite_{voiced_region}_{polarity}.wav", e, 16000)