diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 046b31c3..51c26dd5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,8 +21,8 @@ jobs: torch: 2.0.0 torchaudio: 2.0.1 - python: 3.12 - torch: 2.5.0 - torchaudio: 2.5.0 + torch: 2.5.1 + torchaudio: 2.5.1 steps: - name: Clone diff --git a/README.md b/README.md index f9b38a1a..d88e4270 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Stable Manual](https://img.shields.io/badge/docs-stable-blue.svg)](https://sp-nitech.github.io/diffsptk/2.2.0/) [![Downloads](https://static.pepy.tech/badge/diffsptk)](https://pepy.tech/project/diffsptk) [![Python Version](https://img.shields.io/pypi/pyversions/diffsptk.svg)](https://pypi.python.org/pypi/diffsptk) -[![PyTorch Version](https://img.shields.io/badge/pytorch-2.0.0%20%7C%202.5.0-orange.svg)](https://pypi.python.org/pypi/diffsptk) +[![PyTorch Version](https://img.shields.io/badge/pytorch-2.0.0%20%7C%202.5.1-orange.svg)](https://pypi.python.org/pypi/diffsptk) [![PyPI Version](https://img.shields.io/pypi/v/diffsptk.svg)](https://pypi.python.org/pypi/diffsptk) [![Codecov](https://codecov.io/gh/sp-nitech/diffsptk/branch/master/graph/badge.svg)](https://app.codecov.io/gh/sp-nitech/diffsptk) [![License](https://img.shields.io/github/license/sp-nitech/diffsptk.svg)](https://github.com/sp-nitech/diffsptk/blob/master/LICENSE) diff --git a/diffsptk/misc/utils.py b/diffsptk/misc/utils.py index 3adf79e5..da12c173 100644 --- a/diffsptk/misc/utils.py +++ b/diffsptk/misc/utils.py @@ -134,6 +134,27 @@ def remove_gain(a, value=1, return_gain=False): return ret +def get_resample_params(mode="kaiser_best"): + # From https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html + if mode == "kaiser_best": + params = { + "lowpass_filter_width": 64, + "rolloff": 0.9475937167399596, + "resampling_method": "sinc_interp_kaiser", + "beta": 14.769656459379492, + } + elif mode == "kaiser_fast": + params = { + "lowpass_filter_width": 16, + "rolloff": 0.85, + "resampling_method": "sinc_interp_kaiser", + "beta": 8.555504641634386, + } + else: + raise ValueError("Only kaiser_best and kaiser_fast are supported.") + return params + + def get_alpha(sr, mode="hts", n_freq=10, n_alpha=100): """Compute an appropriate frequency warping factor under given sample rate. diff --git a/diffsptk/modules/cqt.py b/diffsptk/modules/cqt.py index 1b8e7148..b8634c55 100644 --- a/diffsptk/modules/cqt.py +++ b/diffsptk/modules/cqt.py @@ -37,6 +37,7 @@ from ..misc.utils import Lambda from ..misc.utils import delayed_import +from ..misc.utils import get_resample_params from ..misc.utils import numpy_to_torch from .stft import ShortTimeFourierTransform as STFT @@ -79,6 +80,9 @@ class ConstantQTransform(nn.Module): scale : bool If True, scale the CQT responce by the length of filter. + res_type : ['kaiser_best', 'kaiser_fast'] or None + Resampling type. + **kwargs : additional keyword arguments See `torchaudio.transforms.Resample `_. @@ -99,6 +103,7 @@ def __init__( sparsity=1e-2, window="hann", scale=True, + res_type="kaiser_best", **kwargs, ): super().__init__() @@ -144,6 +149,8 @@ def __init__( downsample_count = early_downsample_count( sample_rate * 0.5, filter_cutoff, frame_period, n_octave ) + if res_type is not None: + kwargs.update(get_resample_params(res_type)) if 0 < downsample_count: downsample_factor = 2**downsample_count early_downsample.append( diff --git a/diffsptk/modules/excite.py b/diffsptk/modules/excite.py index f7098877..fc599cd5 100644 --- a/diffsptk/modules/excite.py +++ b/diffsptk/modules/excite.py @@ -135,13 +135,20 @@ def _forward(p, frame_period, voiced_region, unvoiced_region, polarity): unipolar = polarity == "unipolar" e = torch.zeros_like(p) if voiced_region == "pulse": - r = torch.ceil(phase) - r = F.pad(r, (1, 0)) - pulse_pos = torch.ge(torch.diff(r), 1) + + def get_pulse_pos(p): + r = torch.ceil(p) + r = F.pad(r, (1, 0)) + return torch.ge(torch.diff(r), 1) + if unipolar: + pulse_pos = get_pulse_pos(phase) e[pulse_pos] = torch.sqrt(p[pulse_pos]) else: - raise RuntimeError + pulse_pos1 = get_pulse_pos(phase) + pulse_pos2 = get_pulse_pos(0.5 * phase) + e[pulse_pos1] = torch.sqrt(p[pulse_pos1]) + e[pulse_pos1 & ~pulse_pos2] *= -1 elif voiced_region == "sinusoidal": if unipolar: e[mask] = 0.5 * (1 - torch.cos(TWO_PI * phase[mask])) diff --git a/diffsptk/modules/icqt.py b/diffsptk/modules/icqt.py index a6e20bd1..545ba691 100644 --- a/diffsptk/modules/icqt.py +++ b/diffsptk/modules/icqt.py @@ -36,6 +36,7 @@ import torchaudio from ..misc.utils import delayed_import +from ..misc.utils import get_resample_params from ..misc.utils import numpy_to_torch from .istft import InverseShortTimeFourierTransform as ISTFT @@ -78,6 +79,9 @@ class InverseConstantQTransform(nn.Module): scale : bool If True, scale the CQT responce by the length of filter. + res_type : ['kaiser_best', 'kaiser_fast'] or None + Resampling type. + **kwargs : additional keyword arguments See `torchaudio.transforms.Resample `_. @@ -98,6 +102,7 @@ def __init__( sparsity=1e-2, window="hann", scale=True, + res_type="kaiser_best", **kwargs, ): super().__init__() @@ -156,6 +161,9 @@ def __init__( transforms = [] resamplers = [] + if res_type is not None: + kwargs.update(get_resample_params(res_type)) + for i in range(n_octave): n_filter = min(B, K - B * i) sl = slice(B * i, B * i + n_filter) diff --git a/tests/test_cqt.py b/tests/test_cqt.py index 1ef0d682..c990aa6a 100644 --- a/tests/test_cqt.py +++ b/tests/test_cqt.py @@ -27,7 +27,8 @@ @pytest.mark.parametrize("fp", [511, 512]) @pytest.mark.parametrize("K", [1, 24]) @pytest.mark.parametrize("scale", [False, True]) -def test_compatibility(device, fp, K, scale, B=12, f_min=32.7): +@pytest.mark.parametrize("res_type", ["kaiser_best", "kaiser_fast"]) +def test_compatibility(device, fp, K, scale, res_type, B=12, f_min=32.7): if device == "cuda" and not torch.cuda.is_available(): return @@ -45,12 +46,12 @@ def test_compatibility(device, fp, K, scale, B=12, f_min=32.7): bins_per_octave=B, hop_length=fp, scale=scale, - res_type="kaiser_best", + res_type=res_type, dtype=None, ).T cqt = diffsptk.CQT( - fp, sr, f_min=f_min, n_bin=K, n_bin_per_octave=B, scale=scale + fp, sr, f_min=f_min, n_bin=K, n_bin_per_octave=B, scale=scale, res_type=res_type ).to(device) c2 = cqt(x).cpu().numpy() diff --git a/tests/test_excite.py b/tests/test_excite.py index 707cbd39..abd61c0d 100644 --- a/tests/test_excite.py +++ b/tests/test_excite.py @@ -91,15 +91,14 @@ def compute_error(infile): ) @pytest.mark.parametrize("polarity", ["unipolar", "bipolar"]) def test_waveform(voiced_region, polarity, P=80, verbose=False): - if voiced_region == "pulse" and polarity == "bipolar": - return - excite = diffsptk.ExcitationGeneration( P, voiced_region=voiced_region, unvoiced_region="zeros", polarity=polarity ) pitch = torch.from_numpy( - U.call("x2x +sd tools/SPTK/asset/data.short | " f"pitch -s 16 -p {P} -o 0 -a 2") + U.call(f"x2x +sd tools/SPTK/asset/data.short | pitch -s 16 -p {P} -o 0 -a 2") ) e = excite(pitch) + if voiced_region == "pulse": + e = e / e.abs().max() if verbose: sf.write(f"excite_{voiced_region}_{polarity}.wav", e, 16000)