diff --git a/README.md b/README.md index c68e466..601f9c9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ *diffsptk* is a differentiable version of [SPTK](https://github.com/sp-nitech/SPTK) based on the PyTorch framework. [![Latest Manual](https://img.shields.io/badge/docs-latest-blue.svg)](https://sp-nitech.github.io/diffsptk/latest/) -[![Stable Manual](https://img.shields.io/badge/docs-stable-blue.svg)](https://sp-nitech.github.io/diffsptk/2.3.0/) +[![Stable Manual](https://img.shields.io/badge/docs-stable-blue.svg)](https://sp-nitech.github.io/diffsptk/2.4.0/) [![Downloads](https://static.pepy.tech/badge/diffsptk)](https://pepy.tech/project/diffsptk) [![Python Version](https://img.shields.io/pypi/pyversions/diffsptk.svg)](https://pypi.python.org/pypi/diffsptk) [![PyTorch Version](https://img.shields.io/badge/pytorch-2.0.0%20%7C%202.6.0-orange.svg)](https://pypi.python.org/pypi/diffsptk) diff --git a/diffsptk/modules/c2mpir.py b/diffsptk/modules/c2mpir.py index e756f64..d77ec57 100644 --- a/diffsptk/modules/c2mpir.py +++ b/diffsptk/modules/c2mpir.py @@ -77,7 +77,7 @@ def forward(self, c): @staticmethod def _forward(c, ir_length, n_fft): C = torch.fft.fft(c, n=n_fft) - h = torch.fft.ifft(cexp(C))[..., :ir_length].real + h = torch.fft.ifft(cexp(C)).real[..., :ir_length] return h _func = _forward diff --git a/diffsptk/modules/imglsadf.py b/diffsptk/modules/imglsadf.py index b62cf05..5d94a16 100644 --- a/diffsptk/modules/imglsadf.py +++ b/diffsptk/modules/imglsadf.py @@ -35,8 +35,11 @@ def forward(self, y, mc): y : Tensor [shape=(..., T)] Audio signal. - mc : Tensor [shape=(..., T/P, M+1)] - Mel-generalized cepstrum, not MLSA digital filter coefficients. + mc : Tensor [shape=(..., T/P, M+1)] or [shape=(..., T/P, N+M+1)] + Mel-generalized cepstrum, not MLSA digital filter coefficients. Note that + the mixed-phase case assumes that the coefficients are of the form + c_{-N}, ..., c_{0}, ..., c_{M}, where M is the order of the minimum-phase + part and N is the order of the maximum-phase part. Returns ------- diff --git a/diffsptk/modules/mglsadf.py b/diffsptk/modules/mglsadf.py index 4eaf76b..8cb5074 100644 --- a/diffsptk/modules/mglsadf.py +++ b/diffsptk/modules/mglsadf.py @@ -15,6 +15,7 @@ # ------------------------------------------------------------------------ # import torch +import torch.nn.functional as F from torch import nn from ..misc.utils import Lambda @@ -22,6 +23,7 @@ from ..misc.utils import get_gamma from ..misc.utils import remove_gain from .b2mc import MLSADigitalFilterCoefficientsToMelCepstrum +from .c2mpir import CepstrumToMinimumPhaseImpulseResponse from .gnorm import GeneralizedCepstrumGainNormalization from .istft import InverseShortTimeFourierTransform from .linear_intpl import LinearInterpolation @@ -31,6 +33,23 @@ from .stft import ShortTimeFourierTransform +def is_array_like(x): + """Return True if the input is array-like. + + Parameters + ---------- + x : object + Any object. + + Returns + ------- + out : bool + True if the input is array-like. + + """ + return isinstance(x, (tuple, list)) + + def mirror(x, half=False): """Mirror the input tensor. @@ -61,8 +80,9 @@ class PseudoMGLSADigitalFilter(nn.Module): Parameters ---------- - filter_order : int >= 0 - Order of filter coefficients, :math:`M`. + filter_order : int >= 0 or tuple[int, int] + Order of filter coefficients, :math:`M` or :math:`(N, M)`. A tuple input is + allowed only if **phase** is 'mixed'. frame_period : int >= 1 Frame period, :math:`P`. @@ -79,7 +99,7 @@ class PseudoMGLSADigitalFilter(nn.Module): ignore_gain : bool If True, perform filtering without gain. - phase : ['minimum', 'maximum', 'zero'] + phase : ['minimum', 'maximum', 'zero', 'mixed'] Filter type. mode : ['multi-stage', 'single-stage', 'freq-domain'] @@ -95,10 +115,10 @@ class PseudoMGLSADigitalFilter(nn.Module): taylor_order : int >= 0 Order of Taylor series expansion (valid only if **mode** is 'multi-stage'). - cep_order : int >= 0 + cep_order : int >= 0 or tuple[int, int] Order of linear cepstrum (valid only if **mode** is 'multi-stage'). - ir_length : int >= 1 + ir_length : int >= 1 or tuple[int, int] Length of impulse response (valid only if **mode** is 'single-stage'). **kwargs : additional keyword arguments @@ -127,40 +147,59 @@ def __init__( ): super().__init__() - self.filter_order = filter_order self.frame_period = frame_period + # Format parameters. + if phase == "mixed" and not is_array_like(filter_order): + filter_order = (filter_order, filter_order) gamma = get_gamma(gamma, c) + if phase == "mixed": + self.split_sections = (filter_order[0], filter_order[1] + 1) + else: + self.split_sections = (filter_order + 1,) + + def flip(x): + if is_array_like(x): + return x[1], x[0] + return x + + flip_keys = ("cep_order", "ir_length") + modified_kwargs = kwargs.copy() + for key in flip_keys: + if key in kwargs: + modified_kwargs[key] = flip(kwargs[key]) + flipped_filter_order = flip(filter_order) + if mode == "multi-stage": self.mglsadf = MultiStageFIRFilter( - filter_order, + flipped_filter_order, frame_period, alpha=alpha, gamma=gamma, ignore_gain=ignore_gain, phase=phase, - **kwargs, + **modified_kwargs, ) elif mode == "single-stage": self.mglsadf = SingleStageFIRFilter( - filter_order, + flipped_filter_order, frame_period, alpha=alpha, gamma=gamma, ignore_gain=ignore_gain, phase=phase, - **kwargs, + **modified_kwargs, ) elif mode == "freq-domain": self.mglsadf = FrequencyDomainFIRFilter( - filter_order, + flipped_filter_order, frame_period, alpha=alpha, gamma=gamma, ignore_gain=ignore_gain, phase=phase, - **kwargs, + **modified_kwargs, ) else: raise ValueError(f"mode {mode} is not supported.") @@ -173,8 +212,11 @@ def forward(self, x, mc): x : Tensor [shape=(..., T)] Excitation signal. - mc : Tensor [shape=(..., T/P, M+1)] - Mel-generalized cepstrum, not MLSA digital filter coefficients. + mc : Tensor [shape=(..., T/P, M+1)] or [shape=(..., T/P, N+M+1)] + Mel-generalized cepstrum, not MLSA digital filter coefficients. Note that + the mixed-phase case assumes that the coefficients are of the form + c_{-N}, ..., c_{0}, ..., c_{M}, where M is the order of the minimum-phase + part and N is the order of the maximum-phase part. Returns ------- @@ -195,9 +237,12 @@ def forward(self, x, mc): tensor([[0.4011, 0.8760, 3.5677, 4.8725]]) """ - check_size(mc.size(-1), self.filter_order + 1, "dimension of mel-cepstrum") + check_size(mc.size(-1), sum(self.split_sections), "dimension of mel-cepstrum") check_size(x.size(-1), mc.size(-2) * self.frame_period, "sequence length") - + if len(self.split_sections) != 1: + mc_max, mc_min = torch.split(mc, self.split_sections, dim=-1) + mc_max = F.pad(mc_max.flip(-1), (1, 0)) + mc = (mc_min, mc_max) # (c0, c1, ..., cM), (0, c-1, ..., c-N) y = self.mglsadf(x, mc) return y @@ -227,36 +272,64 @@ def __init__( if alpha == 0 and gamma == 0: cep_order = filter_order + # Prepare padding module. if self.phase == "minimum": - self.pad = nn.ConstantPad1d((cep_order, 0), 0) + padding = (cep_order, 0) elif self.phase == "maximum": - self.pad = nn.ConstantPad1d((0, cep_order), 0) + padding = (0, cep_order) elif self.phase == "zero": - self.pad = nn.ConstantPad1d((cep_order, cep_order), 0) + padding = (cep_order, cep_order) + elif self.phase == "mixed": + padding = cep_order if is_array_like(cep_order) else (cep_order, cep_order) else: raise ValueError(f"phase {phase} is not supported.") + self.pad = nn.ConstantPad1d(padding, 0) + + # Prepare frequency transformation module. + if self.phase == "mixed": + self.mgc2c = nn.ModuleList() + for i in range(2): + self.mgc2c.append( + MelGeneralizedCepstrumToMelGeneralizedCepstrum( + filter_order[i], + padding[i], + in_alpha=alpha, + in_gamma=gamma, + n_fft=n_fft, + ) + ) + else: + self.mgc2c = MelGeneralizedCepstrumToMelGeneralizedCepstrum( + filter_order, + cep_order, + in_alpha=alpha, + in_gamma=gamma, + n_fft=n_fft, + ) - self.mgc2c = MelGeneralizedCepstrumToMelGeneralizedCepstrum( - filter_order, - cep_order, - in_alpha=alpha, - in_gamma=gamma, - n_fft=n_fft, - ) self.linear_intpl = LinearInterpolation(frame_period) def forward(self, x, mc): - c = self.mgc2c(mc) - c0, c = remove_gain(c, value=0, return_gain=True) - - if self.phase == "minimum": - c = c.flip(-1) - elif self.phase == "maximum": - pass - elif self.phase == "zero": - c = mirror(c, half=True) + if self.phase == "mixed": + mc_min, mc_max = mc + c_min = self.mgc2c[0](mc_min) + c_max = self.mgc2c[1](mc_max) + c0 = c_min[..., :1] + c_max[..., :1] + c1_min = c_min[..., 1:].flip(-1) + c0_dummy = torch.zeros_like(c0) + c1_max = c_max[..., 1:] + c = torch.cat([c1_min, c0_dummy, c1_max], dim=-1) else: - raise RuntimeError + c = self.mgc2c(mc) + c0, c = remove_gain(c, value=0, return_gain=True) + if self.phase == "minimum": + c = c.flip(-1) + elif self.phase == "maximum": + pass + elif self.phase == "zero": + c = mirror(c, half=True) + else: + raise RuntimeError c = self.linear_intpl(c) @@ -290,18 +363,28 @@ def __init__( self.ignore_gain = ignore_gain self.phase = phase + self.n_fft = n_fft + # Prepare padding module. taps = ir_length - 1 if self.phase == "minimum": - self.pad = nn.ConstantPad1d((taps, 0), 0) + padding = (taps, 0) elif self.phase == "maximum": - self.pad = nn.ConstantPad1d((0, taps), 0) + padding = (0, taps) elif self.phase == "zero": - self.pad = nn.ConstantPad1d((taps, taps), 0) + padding = (taps, taps) + elif self.phase == "mixed": + padding = ( + (ir_length[0] - 1, ir_length[1] - 1) + if is_array_like(ir_length) + else (taps, taps) + ) else: raise ValueError(f"phase {phase} is not supported.") + self.pad = nn.ConstantPad1d(padding, 0) + self.padding = padding - if self.phase in ["minimum", "maximum"]: + if self.phase in ("minimum", "maximum"): self.mgc2ir = MelGeneralizedCepstrumToMelGeneralizedCepstrum( filter_order, ir_length - 1, @@ -311,7 +394,7 @@ def __init__( out_mul=True, n_fft=n_fft, ) - else: + elif self.phase == "zero": self.mgc2c = MelGeneralizedCepstrumToMelGeneralizedCepstrum( filter_order, ir_length - 1, @@ -323,24 +406,52 @@ def __init__( Lambda(lambda x: torch.fft.hfft(x, n=n_fft)), Lambda(lambda x: torch.fft.ifft(torch.exp(x)).real[..., :ir_length]), ) + elif self.phase == "mixed": + self.mgc2c = nn.ModuleList() + for i in range(2): + self.mgc2c.append( + MelGeneralizedCepstrumToMelGeneralizedCepstrum( + filter_order[i], + padding[i], + in_alpha=alpha, + in_gamma=gamma, + n_fft=n_fft, + ) + ) + self.c2ir = CepstrumToMinimumPhaseImpulseResponse( + n_fft - 1, n_fft, n_fft=n_fft + ) + else: + raise ValueError(f"phase {phase} is not supported.") + self.linear_intpl = LinearInterpolation(frame_period) def forward(self, x, mc): - if self.phase == "zero": + if self.phase == "minimum": + h = self.mgc2ir(mc) + h = h.flip(-1) + elif self.phase == "maximum": + h = self.mgc2ir(mc) + elif self.phase == "zero": c = self.mgc2c(mc) c[..., 1:] *= 0.5 if self.ignore_gain: c = remove_gain(c, value=0) h = self.c2ir(c) - else: - h = self.mgc2ir(mc) - - if self.phase == "minimum": - h = h.flip(-1) - elif self.phase == "maximum": - pass - elif self.phase == "zero": h = mirror(h) + elif self.phase == "mixed": + mc_min, mc_max = mc + c_min = self.mgc2c[0](mc_min) + c_max = self.mgc2c[1](mc_max) + if self.ignore_gain: + c0 = torch.zeros_like(c_min[..., :1]) + else: + c0 = c_min[..., :1] + c_max[..., :1] + c = torch.cat([c_min[..., 1:].flip(-1), c0, c_max[..., 1:]], dim=-1) + c = F.pad(c, (0, self.n_fft - c.size(-1))) + c = torch.roll(c, -self.padding[0], dims=-1) + h = self.c2ir(c) + h = torch.roll(h, self.padding[0], dims=-1)[..., : sum(self.padding) + 1] else: raise RuntimeError @@ -351,10 +462,6 @@ def forward(self, x, mc): h = h / h[..., -1:] elif self.phase == "maximum": h = h / h[..., :1] - elif self.phase == "zero": - pass - else: - raise RuntimeError x = self.pad(x) x = x.unfold(-1, h.size(-1), 1) @@ -382,14 +489,42 @@ def __init__( assert 2 * frame_period < frame_length self.ignore_gain = ignore_gain + self.phase = phase if self.ignore_gain: - self.gnorm = GeneralizedCepstrumGainNormalization(filter_order, gamma=gamma) - self.mc2b = MelCepstrumToMLSADigitalFilterCoefficients( - filter_order, alpha=alpha - ) - self.b2mc = MLSADigitalFilterCoefficientsToMelCepstrum( - filter_order, alpha=alpha + self.gnorm = nn.ModuleList() + self.mc2b = nn.ModuleList() + self.b2mc = nn.ModuleList() + self.mgc2sp = nn.ModuleList() + + if not is_array_like(filter_order): + filter_order = (filter_order, filter_order) + + n = 2 if phase == "mixed" else 1 + for i in range(n): + if self.ignore_gain: + self.gnorm.append( + GeneralizedCepstrumGainNormalization(filter_order[i], gamma=gamma) + ) + self.mc2b.append( + MelCepstrumToMLSADigitalFilterCoefficients( + filter_order[i], alpha=alpha + ) + ) + self.b2mc.append( + MLSADigitalFilterCoefficientsToMelCepstrum( + filter_order[i], alpha=alpha + ) + ) + self.mgc2sp.append( + MelGeneralizedCepstrumToSpectrum( + filter_order[i], + fft_length, + alpha=alpha, + gamma=gamma, + out_format="complex", + n_fft=n_fft, + ) ) self.stft = ShortTimeFourierTransform( @@ -398,23 +533,31 @@ def __init__( self.istft = InverseShortTimeFourierTransform( frame_length, frame_period, fft_length, **stft_kwargs ) - self.mgc2sp = MelGeneralizedCepstrumToSpectrum( - filter_order, - fft_length, - alpha=alpha, - gamma=gamma, - out_format="magnitude" if phase == "zero" else "complex", - n_fft=n_fft, - ) def forward(self, x, mc): - if self.ignore_gain: - b = self.mc2b(mc) - b = self.gnorm(b) - b[..., 0] = 0 - mc = self.b2mc(b) + if torch.is_tensor(mc): + mc = [mc] + + Hs = [] + for i, c in enumerate(mc): + if self.ignore_gain: + b = self.mc2b[i](c) + b = self.gnorm[i](b) + b[..., 0] = 0 + c = self.b2mc[i](b) + Hs.append(self.mgc2sp[i](c)) + + if self.phase == "minimum": + H = Hs[0] + elif self.phase == "maximum": + H = Hs[0].conj() + elif self.phase == "zero": + H = Hs[0].abs() + elif self.phase == "mixed": + H = Hs[0] * Hs[1].conj() + else: + raise RuntimeError - H = self.mgc2sp(mc) X = self.stft(x) Y = H * X y = self.istft(Y, out_length=x.size(-1)) diff --git a/diffsptk/version.py b/diffsptk/version.py index 55e4709..3d67cd6 100644 --- a/diffsptk/version.py +++ b/diffsptk/version.py @@ -1 +1 @@ -__version__ = "2.3.0" +__version__ = "2.4.0" diff --git a/tests/test_mglsadf.py b/tests/test_mglsadf.py index 434cf8a..a1215fb 100644 --- a/tests/test_mglsadf.py +++ b/tests/test_mglsadf.py @@ -18,6 +18,7 @@ import numpy as np import pytest +import torch import diffsptk import tests.utils as U @@ -28,7 +29,16 @@ @pytest.mark.parametrize("mode", ["multi-stage", "single-stage", "freq-domain"]) @pytest.mark.parametrize("c", [0, 10]) def test_compatibility( - device, ignore_gain, mode, c, alpha=0.42, M=24, P=80, L=400, fft_length=512 + device, + ignore_gain, + mode, + c, + alpha=0.42, + M=24, + P=80, + L=400, + fft_length=512, + B=2, ): if mode == "multi-stage": params = {"taylor_order": 7, "cep_order": 100} @@ -70,20 +80,119 @@ def test_compatibility( eq=lambda a, b: np.corrcoef(a, b)[0, 1] > 0.98, ) + S = T // 10 + U.check_differentiability(device, mglsadf, [(B, S), (B, S // P, M + 1)]) -@pytest.mark.parametrize("device", ["cpu", "cuda"]) + +@pytest.mark.parametrize("phase", ["zero", "maximum"]) @pytest.mark.parametrize("ignore_gain", [False, True]) -@pytest.mark.parametrize("phase", ["minimum", "maximum", "zero"]) -@pytest.mark.parametrize("mode", ["multi-stage", "single-stage", "freq-domain"]) -def test_differentiable(device, ignore_gain, phase, mode, B=4, T=20, P=2, M=4): - if mode == "multi-stage": - params = {"cep_order": 10} - elif mode == "single-stage": - params = {"ir_length": 20, "n_fft": 32} - elif mode == "freq-domain": - params = {"frame_length": 6, "fft_length": 16} +def test_zero_and_maximum_phase( + phase, + ignore_gain, + device="cpu", + alpha=0.42, + M=24, + P=80, + L=400, + fft_length=512, + B=2, +): + T = os.path.getsize("tools/SPTK/asset/data.short") // 2 + cmd_x = f"nrand -l {T}" + x = torch.from_numpy(U.call(cmd_x)) - mglsadf = diffsptk.MLSA( - M, P, ignore_gain=ignore_gain, phase=phase, mode=mode, **params + cmd_mc = ( + f"x2x +sd tools/SPTK/asset/data.short | " + f"frame -p {P} -l {L} | " + f"window -w 1 -n 1 -l {L} -L {fft_length} | " + f"mgcep -a {alpha} -m {M} -l {fft_length} -E -60" + ) + mc = torch.from_numpy(U.call(cmd_mc).reshape(-1, M + 1)) + + params1 = {"mode": "multi-stage", "cep_order": 200} + mglsadf1 = diffsptk.MLSA( + M, P, ignore_gain=ignore_gain, alpha=alpha, phase=phase, **params1 + ) + y1 = mglsadf1(x, mc).cpu().numpy() + + params2 = {"mode": "single-stage", "ir_length": 200, "n_fft": 512} + mglsadf2 = diffsptk.MLSA( + M, P, ignore_gain=ignore_gain, alpha=alpha, phase=phase, **params2 + ) + y2 = mglsadf2(x, mc).cpu().numpy() + assert np.corrcoef(y1, y2)[0, 1] > 0.99 + + params3 = {"mode": "freq-domain", "frame_length": L, "fft_length": fft_length} + mglsadf3 = diffsptk.MLSA( + M, P, ignore_gain=ignore_gain, alpha=alpha, phase=phase, **params3 + ) + y3 = mglsadf3(x, mc).cpu().numpy() + assert np.corrcoef(y1, y3)[0, 1] > 0.98 + + S = T // 10 + U.check_differentiability(device, mglsadf1, [(B, S), (B, S // P, M + 1)]) + U.check_differentiability(device, mglsadf2, [(B, S), (B, S // P, M + 1)]) + U.check_differentiability(device, mglsadf3, [(B, S), (B, S // P, M + 1)]) + + +@pytest.mark.parametrize("phase", ["zero", "maximum"]) +@pytest.mark.parametrize("ignore_gain", [False, True]) +def test_mixed_phase( + phase, + ignore_gain, + device="cpu", + alpha=0.42, + M=24, + P=80, + L=400, + fft_length=512, + B=2, +): + T = os.path.getsize("tools/SPTK/asset/data.short") // 2 + cmd_x = f"nrand -l {T}" + x = torch.from_numpy(U.call(cmd_x)) + + cmd_mc = ( + f"x2x +sd tools/SPTK/asset/data.short | " + f"frame -p {P} -l {L} | " + f"window -w 1 -n 1 -l {L} -L {fft_length} | " + f"mgcep -a {alpha} -m {M} -l {fft_length} -E -60" ) - U.check_differentiability(device, mglsadf, [(B, T), (B, T // P, M + 1)]) + mc = torch.from_numpy(U.call(cmd_mc).reshape(-1, M + 1)) + if phase == "zero": + half_mc = mc[..., 1:] * 0.5 + mc_mix = torch.cat([half_mc.flip(-1), mc[..., :1], half_mc], dim=-1) + elif phase == "maximum": + mc_mix = torch.cat([mc.flip(-1), 0 * mc[..., 1:]], dim=-1) + + params0 = {"mode": "multi-stage", "cep_order": 200} + mglsadf0 = diffsptk.MLSA( + M, P, ignore_gain=ignore_gain, alpha=alpha, phase=phase, **params0 + ) + y0 = mglsadf0(x, mc).cpu().numpy() + + params1 = params0 + mglsadf1 = diffsptk.MLSA( + M, P, ignore_gain=ignore_gain, alpha=alpha, phase="mixed", **params1 + ) + y1 = mglsadf1(x, mc_mix).cpu().numpy() + assert U.allclose(y0, y1) + + params2 = {"mode": "single-stage", "ir_length": 200, "n_fft": 512} + mglsadf2 = diffsptk.MLSA( + M, P, ignore_gain=ignore_gain, alpha=alpha, phase="mixed", **params2 + ) + y2 = mglsadf2(x, mc_mix).cpu().numpy() + assert np.corrcoef(y1, y2)[0, 1] > 0.99 + + params3 = {"mode": "freq-domain", "frame_length": L, "fft_length": fft_length} + mglsadf3 = diffsptk.MLSA( + M, P, ignore_gain=ignore_gain, alpha=alpha, phase="mixed", **params3 + ) + y3 = mglsadf3(x, mc_mix).cpu().numpy() + assert np.corrcoef(y1, y3)[0, 1] > 0.98 + + S = T // 10 + U.check_differentiability(device, mglsadf1, [(B, S), (B, S // P, 2 * M + 1)]) + U.check_differentiability(device, mglsadf2, [(B, S), (B, S // P, 2 * M + 1)]) + U.check_differentiability(device, mglsadf3, [(B, S), (B, S // P, 2 * M + 1)])