From 2bdf26b82ff9d7df59aed303c4d8bfdac82ecae4 Mon Sep 17 00:00:00 2001 From: Max Date: Wed, 20 Dec 2023 15:45:19 -0600 Subject: [PATCH 1/5] Multiprocess API fix --- README.md | 5 ++++- penn/__main__.py | 5 +++++ penn/core.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa531bc..9473151 100644 --- a/README.md +++ b/README.md @@ -245,6 +245,7 @@ python -m penn [--batch_size BATCH_SIZE] [--center {half-window,half-hop,zero}] [--interp_unvoiced_at INTERP_UNVOICED_AT] + [--num_workers NUM_WORKERS] [--gpu GPU] required arguments: @@ -271,8 +272,10 @@ optional arguments: The number of frames per batch. Defaults to 2048. --center {half-window,half-hop,zero} Padding options - --interp_unvoiced_at INTERP_UNVOICED_AT + --interp_unvoiced_at INTERP_UNVOICED_AT Specifies voicing threshold for interpolation. Defaults to 0.1625. + --num_workers + Number of CPU threads for async data I/O --gpu GPU The index of the gpu to perform inference on. Defaults to CPU. ``` diff --git a/penn/__main__.py b/penn/__main__.py index b287f54..bd0c4c7 100644 --- a/penn/__main__.py +++ b/penn/__main__.py @@ -68,6 +68,11 @@ def parse_args(): '--interp_unvoiced_at', type=float, help='Specifies voicing threshold for interpolation') + parser.add_argument( + '--num_workers', + type=int, + default=0, + help='Number of CPU threads for async data I/O') parser.add_argument( '--gpu', type=int, diff --git a/penn/core.py b/penn/core.py index 9c9fffd..51ae247 100644 --- a/penn/core.py +++ b/penn/core.py @@ -198,7 +198,7 @@ def from_files_to_files( batch_size: Optional[int] = None, center: str = 'half-window', interp_unvoiced_at: Optional[float] = None, - num_workers: int = penn.NUM_WORKERS, + num_workers: int = 0, gpu: Optional[int] = None ) -> None: """Perform pitch and periodicity estimation from files on disk and save From 2a9dead51f18f64aa6f1a109487f68478c201293 Mon Sep 17 00:00:00 2001 From: Max Date: Wed, 20 Dec 2023 15:46:58 -0600 Subject: [PATCH 2/5] Viterbi decoding experiments --- config/fcnf0++-ablate-chunkviterbi-normal.py | 10 +++ config/fcnf0++-ablate-chunkviterbi.py | 10 +++ penn/config/defaults.py | 5 +- penn/decode.py | 67 ++++++++++---------- setup.py | 3 +- 5 files changed, 60 insertions(+), 35 deletions(-) create mode 100644 config/fcnf0++-ablate-chunkviterbi-normal.py create mode 100644 config/fcnf0++-ablate-chunkviterbi.py diff --git a/config/fcnf0++-ablate-chunkviterbi-normal.py b/config/fcnf0++-ablate-chunkviterbi-normal.py new file mode 100644 index 0000000..8cd0fa4 --- /dev/null +++ b/config/fcnf0++-ablate-chunkviterbi-normal.py @@ -0,0 +1,10 @@ +MODULE = 'penn' + +# Configuration name +CONFIG = 'fcnf0++-ablate-chunkviterbi-normal' + +# The decoder to use for postprocessing +DECODER = 'viterbi-normal' + +# Maximum chunk size for chunked Viterbi decoding +VITERBI_MAX_CHUNK_SIZE = 64 diff --git a/config/fcnf0++-ablate-chunkviterbi.py b/config/fcnf0++-ablate-chunkviterbi.py new file mode 100644 index 0000000..2c01205 --- /dev/null +++ b/config/fcnf0++-ablate-chunkviterbi.py @@ -0,0 +1,10 @@ +MODULE = 'penn' + +# Configuration name +CONFIG = 'fcnf0++-ablate-chunkviterbi' + +# The decoder to use for postprocessing +DECODER = 'viterbi' + +# Maximum chunk size for chunked Viterbi decoding +VITERBI_MAX_CHUNK_SIZE = 8 diff --git a/penn/config/defaults.py b/penn/config/defaults.py index eb33231..dfe916c 100644 --- a/penn/config/defaults.py +++ b/penn/config/defaults.py @@ -32,7 +32,7 @@ LOCAL_PITCH_WINDOW_SIZE = 19 # Pitch velocity constraint for viterbi decoding -MAX_OCTAVES_PER_SECOND = 35.92 +MAX_OCTAVES_PER_SECOND = 6. # Whether to normalize input audio to mean zero and variance one NORMALIZE_INPUT = False @@ -49,6 +49,9 @@ # Audio sample rate SAMPLE_RATE = 8000 # hz +# Maximum chunk size for chunked Viterbi decoding +VITERBI_MAX_CHUNK_SIZE = None + # Size of the analysis window WINDOW_SIZE = 1024 # samples diff --git a/penn/decode.py b/penn/decode.py index bc787eb..20202c1 100644 --- a/penn/decode.py +++ b/penn/decode.py @@ -1,4 +1,5 @@ import numpy as np +import torbi import torch import penn @@ -22,8 +23,6 @@ def argmax(logits): def viterbi(logits): """Decode pitch using viterbi decoding (from librosa)""" - import librosa - # Normalize and convert to numpy if penn.METHOD == 'pyin': periodicity = penn.periodicity.sum(logits).T @@ -31,74 +30,76 @@ def viterbi(logits): (1 - periodicity) / penn.PITCH_BINS).repeat(penn.PITCH_BINS, 1) distributions = torch.cat( (torch.exp(logits.permute(2, 1, 0)), unvoiced[None]), - dim=1).numpy() + dim=1) + else: # Viterbi REQUIRES a categorical distribution, even if the loss was BCE distributions = torch.nn.functional.softmax(logits, dim=1) distributions = distributions.permute(2, 1, 0) - distributions = distributions.to( - device=torch.device('cpu'), - dtype=torch.float32 - ).numpy() + distributions = distributions.to(device='cpu', dtype=torch.float32) # Cache viterbi probabilities if not hasattr(viterbi, 'transition'): - # Get number of bins per frame + xx, yy = torch.meshgrid( + torch.arange(penn.PITCH_BINS), + torch.arange(penn.PITCH_BINS), + indexing='ij') bins_per_octave = penn.OCTAVE / penn.CENTS_PER_BIN max_octaves_per_frame = \ penn.MAX_OCTAVES_PER_SECOND * penn.HOPSIZE / penn.SAMPLE_RATE max_bins_per_frame = max_octaves_per_frame * bins_per_octave + 1 - - # Construct the within voicing transition probabilities - viterbi.transition = librosa.sequence.transition_local( - penn.PITCH_BINS, - max_bins_per_frame, - window='triangle', - wrap=False) + transition = torch.clip(max_bins_per_frame - (xx - yy).abs(), 0) + transition = transition / transition.sum(dim=1, keepdims=True) + viterbi.transition = transition if penn.METHOD == 'pyin': # Add unvoiced probabilities - viterbi.transition = np.kron( - librosa.sequence.transition_loop(2, .99), + viterbi.transition = torch.kron( + torch.tensor([[.99, .01], [.01, .99]]), viterbi.transition) # Uniform initial probabilities - viterbi.initial = np.zeros(2 * penn.PITCH_BINS) + viterbi.initial = torch.zeros(2 * penn.PITCH_BINS) viterbi.initial[penn.PITCH_BINS:] = 1 / penn.PITCH_BINS else: # Uniform initial probabilities - viterbi.initial = np.full(penn.PITCH_BINS, 1 / penn.PITCH_BINS) + viterbi.initial = torch.full( + (penn.PITCH_BINS,), + 1 / penn.PITCH_BINS) # Viterbi decoding - bins = librosa.sequence.viterbi( - distributions, + bins = torbi.decode( + distributions[0].T, viterbi.transition, - p_init=viterbi.initial) - bins = torch.from_numpy(bins.astype(np.int32)) + viterbi.initial, + penn.VITERBI_MAX_CHUNK_SIZE + )[:, None] # Convert to frequency in Hz if penn.DECODER.endswith('normal'): # Decode using an assumption of normality around to the viterbi path - pitch = local_expected_value_from_bins( - bins.T.to(logits.device), - logits).T - - else: + bins = bins.to(logits.device).T + pitch = local_expected_value_from_bins(bins, logits).T - # Argmax decoding - pitch = penn.convert.bins_to_frequency(bins) - - if penn.METHOD == 'pyin': + elif penn.METHOD == 'pyin': # Linearly interpolate unvoiced regions + pitch = penn.convert.bins_to_frequency(bins) pitch[bins >= penn.PITCH_BINS] = 0 pitch = penn.data.preprocess.interpolate_unvoiced(pitch.numpy())[0] - pitch = torch.from_numpy(pitch) + pitch = torch.from_numpy(pitch).to(logits.device) + bins = bins.to(logits.device) + + else: + + # Argmax decoding + bins = bins.to(logits.device).T + pitch = penn.convert.bins_to_frequency(bins) return bins.T, pitch.T diff --git a/setup.py b/setup.py index 9be23e1..0abe787 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name='penn', description='Pitch Estimating Neural Networks (PENN)', - version='0.0.13', + version='0.0.14', author='Max Morrison, Caedon Hsieh, Nathan Pruyne, and Bryan Pardo', author_email='interactiveaudiolab@gmail.com', url='https://github.com/interactiveaudiolab/penn', @@ -24,6 +24,7 @@ install_requires=[ 'huggingface_hub', # 0.11.1 'numpy', # 1.23.4 + 'torbi', # 0.0.1 'torch', # 1.12.1+cu113 'torchaudio', # 0.12.1+cu113 'torchutil', # 0.0.7 From 0fe93b9470d5e1b8cd7d64b6f4a80c3ed72fa5bf Mon Sep 17 00:00:00 2001 From: Max Date: Sun, 14 Jan 2024 14:25:07 -0600 Subject: [PATCH 3/5] Update decoding --- README.md | 69 +++--- config/crepe++.py | 3 + config/deepf0++.py | 3 + config/fcnf0++-ablate-batchsize.py | 3 + config/fcnf0++-ablate-chunkviterbi-normal.py | 4 +- config/fcnf0++-ablate-chunkviterbi.py | 5 +- config/fcnf0++-ablate-decoder.py | 3 + config/fcnf0++-ablate-earlystop.py | 3 + config/fcnf0++-ablate-inputnorm.py | 3 + config/fcnf0++-ablate-layernorm.py | 3 + config/fcnf0++-ablate-loss.py | 3 + config/fcnf0++-ablate-quantization.py | 3 + config/fcnf0++-ablate-unvoiced.py | 3 + config/fcnf0++-mdb.py | 3 + config/fcnf0++-ptdb.py | 3 + config/fcnf0.py | 3 + config/pyin.py | 2 +- config/torchcrepe.py | 3 + penn/__main__.py | 5 + penn/config/defaults.py | 33 ++- penn/core.py | 221 +++++++++++++++---- penn/decode.py | 207 +++++++++++------ setup.py | 2 +- 23 files changed, 425 insertions(+), 165 deletions(-) diff --git a/README.md b/README.md index 9473151..a1281be 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,9 @@ center = 'half-hop' # (Optional) Linearly interpolate unvoiced regions below periodicity threshold interp_unvoiced_at = .065 +# (Optional) Select a decoding method. One of ['argmax', 'pyin', 'viterbi']. +decoder = 'viterbi' + # Infer pitch and periodicity pitch, periodicity = penn.from_audio( audio, @@ -85,6 +88,7 @@ pitch, periodicity = penn.from_audio( checkpoint=checkpoint, batch_size=batch_size, center=center, + decoder=decoder, interp_unvoiced_at=interp_unvoiced_at, gpu=gpu) ``` @@ -96,16 +100,17 @@ pitch, periodicity = penn.from_audio( ``` def from_audio( - audio: torch.Tensor, - sample_rate: int = penn.SAMPLE_RATE, - hopsize: float = penn.HOPSIZE_SECONDS, - fmin: float = penn.FMIN, - fmax: float = penn.FMAX, - checkpoint: Optional[Path] = None, - batch_size: Optional[int] = None, - center: str = 'half-window', - interp_unvoiced_at: Optional[float] = None, - gpu: Optional[int] = None + audio: torch.Tensor, + sample_rate: int = penn.SAMPLE_RATE, + hopsize: float = penn.HOPSIZE_SECONDS, + fmin: float = penn.FMIN, + fmax: float = penn.FMAX, + checkpoint: Optional[Path] = None, + batch_size: Optional[int] = None, + center: str = 'half-window', + decoder: str = penn.DECODER, + interp_unvoiced_at: Optional[float] = None, + gpu: Optional[int] = None ) -> Tuple[torch.Tensor, torch.Tensor]: """Perform pitch and periodicity estimation @@ -134,15 +139,16 @@ Returns: ``` def from_file( - file: Path, - hopsize: float = penn.HOPSIZE_SECONDS, - fmin: float = penn.FMIN, - fmax: float = penn.FMAX, - checkpoint: Optional[Path] = None, - batch_size: Optional[int] = None, - center: str = 'half-window', - interp_unvoiced_at: Optional[float] = None, - gpu: Optional[int] = None + file: Path, + hopsize: float = penn.HOPSIZE_SECONDS, + fmin: float = penn.FMIN, + fmax: float = penn.FMAX, + checkpoint: Optional[Path] = None, + batch_size: Optional[int] = None, + center: str = 'half-window', + decoder: str = penn.DECODER, + interp_unvoiced_at: Optional[float] = None, + gpu: Optional[int] = None ) -> Tuple[torch.Tensor, torch.Tensor]: """Perform pitch and periodicity estimation from audio on disk @@ -168,16 +174,17 @@ Returns: ``` def from_file_to_file( - file: Path, - output_prefix: Optional[Path] = None, - hopsize: float = penn.HOPSIZE_SECONDS, - fmin: float = penn.FMIN, - fmax: float = penn.FMAX, - checkpoint: Optional[Path] = None, - batch_size: Optional[int] = None, - center: str = 'half-window', - interp_unvoiced_at: Optional[float] = None, - gpu: Optional[int] = None + file: Path, + output_prefix: Optional[Path] = None, + hopsize: float = penn.HOPSIZE_SECONDS, + fmin: float = penn.FMIN, + fmax: float = penn.FMAX, + checkpoint: Optional[Path] = None, + batch_size: Optional[int] = None, + center: str = 'half-window', + decoder: str = penn.DECODER, + interp_unvoiced_at: Optional[float] = None, + gpu: Optional[int] = None ) -> None: """Perform pitch and periodicity estimation from audio on disk and save @@ -208,6 +215,7 @@ def from_files_to_files( checkpoint: Optional[Path] = None, batch_size: Optional[int] = None, center: str = 'half-window', + decoder: str = penn.DECODER, interp_unvoiced_at: Optional[float] = None, num_workers: int = penn.NUM_WORKERS, gpu: Optional[int] = None @@ -244,6 +252,7 @@ python -m penn [--checkpoint CHECKPOINT] [--batch_size BATCH_SIZE] [--center {half-window,half-hop,zero}] + [--decoder {argmax,pyin,viterbi}] [--interp_unvoiced_at INTERP_UNVOICED_AT] [--num_workers NUM_WORKERS] [--gpu GPU] @@ -272,6 +281,8 @@ optional arguments: The number of frames per batch. Defaults to 2048. --center {half-window,half-hop,zero} Padding options + --decoder {argmax,pyin,viterbi} + Posteriorgram decoder --interp_unvoiced_at INTERP_UNVOICED_AT Specifies voicing threshold for interpolation. Defaults to 0.1625. --num_workers diff --git a/config/crepe++.py b/config/crepe++.py index 79efec6..b162399 100644 --- a/config/crepe++.py +++ b/config/crepe++.py @@ -6,5 +6,8 @@ # The decoder to use for postprocessing DECODER = 'argmax' +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # The name of the model to use for training MODEL = 'crepe' diff --git a/config/deepf0++.py b/config/deepf0++.py index aede086..ebb6d1a 100644 --- a/config/deepf0++.py +++ b/config/deepf0++.py @@ -6,5 +6,8 @@ # The decoder to use for postprocessing DECODER = 'argmax' +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # The name of the model to use for training MODEL = 'deepf0' diff --git a/config/fcnf0++-ablate-batchsize.py b/config/fcnf0++-ablate-batchsize.py index a003d38..507cb86 100644 --- a/config/fcnf0++-ablate-batchsize.py +++ b/config/fcnf0++-ablate-batchsize.py @@ -8,3 +8,6 @@ # The decoder to use for postprocessing DECODER = 'argmax' + +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False diff --git a/config/fcnf0++-ablate-chunkviterbi-normal.py b/config/fcnf0++-ablate-chunkviterbi-normal.py index 8cd0fa4..87e9a95 100644 --- a/config/fcnf0++-ablate-chunkviterbi-normal.py +++ b/config/fcnf0++-ablate-chunkviterbi-normal.py @@ -4,7 +4,7 @@ CONFIG = 'fcnf0++-ablate-chunkviterbi-normal' # The decoder to use for postprocessing -DECODER = 'viterbi-normal' +DECODER = 'viterbi' # Maximum chunk size for chunked Viterbi decoding -VITERBI_MAX_CHUNK_SIZE = 64 +VITERBI_MIN_CHUNK_SIZE = 64 diff --git a/config/fcnf0++-ablate-chunkviterbi.py b/config/fcnf0++-ablate-chunkviterbi.py index 2c01205..e563a5e 100644 --- a/config/fcnf0++-ablate-chunkviterbi.py +++ b/config/fcnf0++-ablate-chunkviterbi.py @@ -6,5 +6,8 @@ # The decoder to use for postprocessing DECODER = 'viterbi' +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # Maximum chunk size for chunked Viterbi decoding -VITERBI_MAX_CHUNK_SIZE = 8 +VITERBI_MIN_CHUNK_SIZE = 8 diff --git a/config/fcnf0++-ablate-decoder.py b/config/fcnf0++-ablate-decoder.py index fc2950f..8e2920a 100644 --- a/config/fcnf0++-ablate-decoder.py +++ b/config/fcnf0++-ablate-decoder.py @@ -3,5 +3,8 @@ # Configuration name CONFIG = 'fcnf0++-ablate-decoder' +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # The decoder to use for postprocessing DECODER = 'argmax' diff --git a/config/fcnf0++-ablate-earlystop.py b/config/fcnf0++-ablate-earlystop.py index 34ad2f8..14e67be 100644 --- a/config/fcnf0++-ablate-earlystop.py +++ b/config/fcnf0++-ablate-earlystop.py @@ -9,5 +9,8 @@ # Whether to stop training when validation loss stops improving EARLY_STOPPING = True +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # Number of steps between logging to Tensorboard LOG_INTERVAL = 500 # steps diff --git a/config/fcnf0++-ablate-inputnorm.py b/config/fcnf0++-ablate-inputnorm.py index 135cfba..c97e8be 100644 --- a/config/fcnf0++-ablate-inputnorm.py +++ b/config/fcnf0++-ablate-inputnorm.py @@ -6,5 +6,8 @@ # The decoder to use for postprocessing DECODER = 'argmax' +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # Whether to normalize input audio to mean zero and variance one NORMALIZE_INPUT = True diff --git a/config/fcnf0++-ablate-layernorm.py b/config/fcnf0++-ablate-layernorm.py index 25c27f4..493ccd3 100644 --- a/config/fcnf0++-ablate-layernorm.py +++ b/config/fcnf0++-ablate-layernorm.py @@ -6,5 +6,8 @@ # The decoder to use for postprocessing DECODER = 'argmax' +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # Type of model normalization NORMALIZATION = 'batch' diff --git a/config/fcnf0++-ablate-loss.py b/config/fcnf0++-ablate-loss.py index 9375956..679437e 100644 --- a/config/fcnf0++-ablate-loss.py +++ b/config/fcnf0++-ablate-loss.py @@ -6,5 +6,8 @@ # The decoder to use for postprocessing DECODER = 'argmax' +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # Loss function LOSS = 'binary_cross_entropy' diff --git a/config/fcnf0++-ablate-quantization.py b/config/fcnf0++-ablate-quantization.py index 82fd558..13c25c2 100644 --- a/config/fcnf0++-ablate-quantization.py +++ b/config/fcnf0++-ablate-quantization.py @@ -9,5 +9,8 @@ # The decoder to use for postprocessing DECODER = 'argmax' +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # Number of pitch bins to predict PITCH_BINS = 486 diff --git a/config/fcnf0++-ablate-unvoiced.py b/config/fcnf0++-ablate-unvoiced.py index e372a8d..563eec0 100644 --- a/config/fcnf0++-ablate-unvoiced.py +++ b/config/fcnf0++-ablate-unvoiced.py @@ -6,6 +6,9 @@ # The decoder to use for postprocessing DECODER = 'argmax' +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # Whether to only use voiced start frames VOICED_ONLY = True diff --git a/config/fcnf0++-mdb.py b/config/fcnf0++-mdb.py index e54f54c..40676b2 100644 --- a/config/fcnf0++-mdb.py +++ b/config/fcnf0++-mdb.py @@ -5,3 +5,6 @@ # The decoder to use for postprocessing DECODER = 'argmax' + +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False diff --git a/config/fcnf0++-ptdb.py b/config/fcnf0++-ptdb.py index 0431c26..0882ed6 100644 --- a/config/fcnf0++-ptdb.py +++ b/config/fcnf0++-ptdb.py @@ -5,3 +5,6 @@ # The decoder to use for postprocessing DECODER = 'argmax' + +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False diff --git a/config/fcnf0.py b/config/fcnf0.py index 8954cf6..5f75d97 100644 --- a/config/fcnf0.py +++ b/config/fcnf0.py @@ -18,6 +18,9 @@ # Minimum representable frequency FMIN = 30. # Hz +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # Number of steps between logging to Tensorboard LOG_INTERVAL = 500 # steps diff --git a/config/pyin.py b/config/pyin.py index 5429fff..827edbc 100644 --- a/config/pyin.py +++ b/config/pyin.py @@ -4,7 +4,7 @@ CONFIG = 'pyin' # The decoder to use for postprocessing -DECODER = 'argmax' +DECODER = 'pyin' # Distance between adjacent frames HOPSIZE = 160 # samples diff --git a/config/torchcrepe.py b/config/torchcrepe.py index 8b743a8..fe824b4 100644 --- a/config/torchcrepe.py +++ b/config/torchcrepe.py @@ -32,6 +32,9 @@ # Distance between adjacent frames HOPSIZE = 160 # samples +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = False + # Number of steps between logging to Tensorboard LOG_INTERVAL = 500 # steps diff --git a/penn/__main__.py b/penn/__main__.py index bd0c4c7..1ddf30e 100644 --- a/penn/__main__.py +++ b/penn/__main__.py @@ -64,6 +64,11 @@ def parse_args(): choices=['half-window', 'half-hop', 'zero'], default='half-window', help='Padding options') + parser.add_argument( + '--decoder', + choices=['argmax', 'pyin', 'viterbi'], + default=penn.DECODER, + help='Posteriorgram decoder') parser.add_argument( '--interp_unvoiced_at', type=float, diff --git a/penn/config/defaults.py b/penn/config/defaults.py index dfe916c..87c720c 100644 --- a/penn/config/defaults.py +++ b/penn/config/defaults.py @@ -28,12 +28,6 @@ # Distance between adjacent frames HOPSIZE = 80 # samples -# The size of the window used for locally normal pitch decoding -LOCAL_PITCH_WINDOW_SIZE = 19 - -# Pitch velocity constraint for viterbi decoding -MAX_OCTAVES_PER_SECOND = 6. - # Whether to normalize input audio to mean zero and variance one NORMALIZE_INPUT = False @@ -49,13 +43,31 @@ # Audio sample rate SAMPLE_RATE = 8000 # hz -# Maximum chunk size for chunked Viterbi decoding -VITERBI_MAX_CHUNK_SIZE = None - # Size of the analysis window WINDOW_SIZE = 1024 # samples +############################################################################### +# Decoder parameters +############################################################################### + + +# The decoder to use for postprocessing. One of ['argmax', 'pyin', 'viterbi']. +DECODER = 'viterbi' + +# Whether to perform local expected value decoding of pitch +LOCAL_EXPECTED_VALUE = True + +# The size of the window used for local expected value pitch decoding +LOCAL_PITCH_WINDOW_SIZE = 19 + +# Pitch velocity constraint for viterbi decoding +MAX_OCTAVES_PER_SECOND = 6. + +# Maximum chunk size for chunked Viterbi decoding +VITERBI_MIN_CHUNK_SIZE = None + + ############################################################################### # Directories ############################################################################### @@ -118,9 +130,6 @@ ############################################################################### -# The decoder to use for postprocessing -DECODER = 'local_expected_value' - # The dropout rate. Set to None to turn off dropout. DROPOUT = None diff --git a/penn/core.py b/penn/core.py index 51ae247..6cd9c56 100644 --- a/penn/core.py +++ b/penn/core.py @@ -28,6 +28,7 @@ def from_audio( checkpoint: Optional[Path] = None, batch_size: Optional[int] = None, center: str = 'half-window', + decoder: str = penn.DECODER, interp_unvoiced_at: Optional[float] = None, gpu: Optional[int] = None ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -42,6 +43,7 @@ def from_audio( checkpoint: The checkpoint file batch_size: The number of frames per batch center: Padding options. One of ['half-window', 'half-hop', 'zero']. + decoder: Posteriorgram decoder. One of ['argmax', 'pyin', 'viterbi']. interp_unvoiced_at: Specifies voicing threshold for interpolation gpu: The index of the gpu to run inference on @@ -51,7 +53,14 @@ def from_audio( periodicity: torch.tensor( shape=(1, int(samples // penn.seconds_to_sample(hopsize)))) """ - pitch, periodicity = [], [] + device = 'cpu' if gpu is None else f'cuda:{gpu}' + + # Storage for batching + if batch_size is not None: + if decoder == 'argmax': + pitch, periodicity = [], [] + else: + logits = [] # Preprocess audio for frames in preprocess( @@ -64,19 +73,50 @@ def from_audio( # Copy to device with torchutil.time.context('copy-to'): - frames = frames.to('cpu' if gpu is None else f'cuda:{gpu}') + frames = frames.to(device) # Infer - logits = infer(frames, checkpoint).detach() + inferred = infer(frames, checkpoint).detach() + + if batch_size is None: + + # Postprocess full file + with torchutil.time.context('postprocess'): + _, pitch, periodicity = postprocess( + inferred, + fmin, + fmax, + decoder) - # Postprocess - with torchutil.time.context('postprocess'): - result = postprocess(logits, fmin, fmax) - pitch.append(result[1]) - periodicity.append(result[2]) + elif decoder == 'argmax': + + # Postprocess partial file + with torchutil.time.context('postprocess'): + result = postprocess(inferred, fmin, fmax, decoder) + pitch.append(result[1]) + periodicity.append(result[2]) + + else: + + # Save logits off GPU for later decoding + logits.append(inferred.cpu()) + + if batch_size is not None: + + if decoder == 'argmax': + + # Concatenate results + pitch = torch.cat(pitch, 1) + periodicity = torch.cat(periodicity, 1) + + else: - # Concatenate results - pitch, periodicity = torch.cat(pitch, 1), torch.cat(periodicity, 1) + # Postprocess full file + _, pitch, periodicity = postprocess( + torch.cat(logits, 0).to(device), + fmin, + fmax, + decoder) # Maybe interpolate unvoiced regions if interp_unvoiced_at is not None: @@ -96,6 +136,7 @@ def from_file( checkpoint: Optional[Path] = None, batch_size: Optional[int] = None, center: str = 'half-window', + decoder: str = penn.DECODER, interp_unvoiced_at: Optional[float] = None, gpu: Optional[int] = None ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -109,6 +150,7 @@ def from_file( checkpoint: The checkpoint file batch_size: The number of frames per batch center: Padding options. One of ['half-window', 'half-hop', 'zero']. + decoder: Posteriorgram decoder. One of ['argmax', 'pyin', 'viterbi']. interp_unvoiced_at: Specifies voicing threshold for interpolation gpu: The index of the gpu to run inference on @@ -130,6 +172,7 @@ def from_file( checkpoint, batch_size, center, + decoder, interp_unvoiced_at, gpu) @@ -143,6 +186,7 @@ def from_file_to_file( checkpoint: Optional[Path] = None, batch_size: Optional[int] = None, center: str = 'half-window', + decoder: str = penn.DECODER, interp_unvoiced_at: Optional[float] = None, gpu: Optional[int] = None ) -> None: @@ -157,6 +201,7 @@ def from_file_to_file( checkpoint: The checkpoint file batch_size: The number of frames per batch center: Padding options. One of ['half-window', 'half-hop', 'zero']. + decoder: Posteriorgram decoder. One of ['argmax', 'pyin', 'viterbi']. interp_unvoiced_at: Specifies voicing threshold for interpolation gpu: The index of the gpu to run inference on """ @@ -169,6 +214,7 @@ def from_file_to_file( checkpoint, batch_size, center, + decoder, interp_unvoiced_at, gpu) @@ -197,6 +243,7 @@ def from_files_to_files( checkpoint: Optional[Path] = None, batch_size: Optional[int] = None, center: str = 'half-window', + decoder: str = penn.DECODER, interp_unvoiced_at: Optional[float] = None, num_workers: int = 0, gpu: Optional[int] = None @@ -212,6 +259,7 @@ def from_files_to_files( checkpoint: The checkpoint file batch_size: The number of frames per batch center: Padding options. One of ['half-window', 'half-hop', 'zero']. + decoder: Posteriorgram decoder. One of ['argmax', 'pyin', 'viterbi']. interp_unvoiced_at: Specifies voicing threshold for interpolation num_workers: Number of CPU threads for async data I/O gpu: The index of the gpu to run inference on @@ -240,6 +288,7 @@ def from_files_to_files( checkpoint, batch_size, center, + decoder, interp_unvoiced_at, gpu) @@ -271,14 +320,22 @@ def from_files_to_files( try: + device = 'cpu' if gpu is None else f'cuda:{gpu}' + # Track residual to fill batch residual_files = [] residual_frames = torch.zeros((0, 1, 1024)) residual_lengths = torch.zeros((0,), dtype=torch.long) - # Iterate over data - pitch, periodicity = torch.zeros((1, 0)), torch.zeros((1, 0)) + # Storage for batching within files + if batch_size is not None: + if decoder == 'argmax': + pitch, periodicity = torch.zeros((1, 0)), torch.zeros((1, 0)) + else: + logits = torch.zeros((1, 0, 0)) + # Iterate over data + num_inferred_unsaved = 0 for frames, lengths, input_files in loader: # Prepend residual @@ -292,40 +349,77 @@ def from_files_to_files( # Copy to device size = len(frames) if batch_size is None else batch_size - batch_frames = frames[i:i + size].to( - 'cpu' if gpu is None else f'cuda:{gpu}') + batch_frames = frames[i:i + size].to(device) # Infer - logits = infer(batch_frames, checkpoint).detach() + inferred = infer(batch_frames, checkpoint).detach() + i += len(batch_frames) + num_inferred_unsaved += len(batch_frames) - # Postprocess - results = postprocess(logits, fmin, fmax) + if batch_size is None: - # Append to residual - pitch = torch.cat((pitch, results[1].cpu()), dim=1) - periodicity = torch.cat( - (periodicity, results[2].cpu()), - dim=1) + # Postprocess full file + _, pitch, periodicity = postprocess( + inferred, + fmin, + fmax, + decoder) + break - i += len(batch_frames) + elif decoder == 'argmax': + + # Postprocess partial file + results = postprocess(inferred, fmin, fmax, decoder) + pitch = torch.cat((pitch, results[1].cpu()), dim=1) + periodicity = torch.cat( + (periodicity, results[2].cpu()), + dim=1) + + else: + + # Save logits for later decoding + # NOTE - This differs from from_audio and does not + # handle large files that do not fit on GPU. + # However, it saves a GPU -> CPU -> GPU copy. + logits = torch.cat((logits, inferred), dim=0) # Save to disk j, k = 0, 0 for length, file in zip(lengths, input_files): # Slice and save in another process - if j + length <= pitch.shape[-1]: + if j + length <= num_inferred_unsaved: + + if batch_size is not None: + + if decoder == 'argmax': + + # Slice results + save_pitch = pitch[:, j:j + length] + save_periodicity = periodicity[:, j:j + length] + + else: + + # Postprocess full file + _, save_pitch, save_periodicity = postprocess( + logits[j:j + length], + fmin, + fmax, + decoder) + + # Async save futures.append( pool.apply_async( save_worker, args=( output_prefixes[file], - pitch[:, j:j + length], - periodicity[:, j:j + length], + save_pitch, + save_periodicity, interp_unvoiced_at))) while len(futures) > 100: futures = [f for f in futures if not f.ready()] time.sleep(.1) + j += length k += 1 progress.update() @@ -333,8 +427,10 @@ def from_files_to_files( break # Setup residual for next iteration + num_inferred_unsaved -= j pitch = pitch[:, j:] periodicity = periodicity[:, j:] + logits = logits[j:] residual_files = input_files[k:] residual_lengths = lengths[k:] residual_frames = frames[i:] @@ -343,32 +439,57 @@ def from_files_to_files( if residual_frames.numel(): # Copy to device - batch_frames = residual_frames.to( - 'cpu' if gpu is None else f'cuda:{gpu}') + batch_frames = residual_frames.to(device) # Infer - logits = infer(batch_frames, checkpoint).detach() + inferred = infer(batch_frames, checkpoint).detach() + num_inferred_unsaved += len(batch_frames) + + if decoder == 'argmax': + + # Postprocess partial file + results = postprocess(inferred, fmin, fmax, decoder) + pitch = torch.cat((pitch, results[1].cpu()), dim=1) + periodicity = torch.cat( + (periodicity, results[2].cpu()), + dim=1) - # Postprocess - results = postprocess(logits, fmin, fmax) + else: - # Append to residual - pitch = torch.cat((pitch, results[1].cpu()), dim=1) - periodicity = torch.cat((periodicity, results[2].cpu()), dim=1) + # Save logits for later decoding + # NOTE - This differs from from_audio and does not + # handle large files that do not fit on GPU. + # However, it saves a GPU -> CPU -> GPU copy. + logits = torch.cat((logits, inferred), dim=0) # Save i = 0 for length, file in zip(residual_lengths, residual_files): + if decoder == 'argmax': + + # Slice results + save_pitch = pitch[:, i:i + length] + save_periodicity = periodicity[:, i:i + length] + + else: + + # Postprocess full file + _, save_pitch, save_periodicity = postprocess( + logits[i:i + length], + fmin, + fmax, + decoder) + # Slice and save in another process - if i + length <= pitch.shape[-1]: + if i + length <= num_inferred_unsaved: futures.append( pool.apply_async( save_worker, args=( output_prefixes[file], - pitch[:, i:i + length], - periodicity[:, i:i + length], + save_pitch, + save_periodicity, interp_unvoiced_at))) while len(futures) > 100: futures = [f for f in futures if not f.ready()] @@ -443,8 +564,23 @@ def infer(frames, checkpoint=None): return logits -def postprocess(logits, fmin=penn.FMIN, fmax=penn.FMAX): +def postprocess(logits, fmin=penn.FMIN, fmax=penn.FMAX, decoder=penn.DECODER): """Convert model output to pitch and periodicity""" + # Cache decoder + if ( + not hasattr(postprocess, 'decoder') or + postprocess.decoder_name != decoder + ): + if decoder == 'argmax': + postprocess.decoder = penn.decode.Argmax() + elif decoder == 'pyin': + postprocess.decoder = penn.decode.PYIN() + elif decoder == 'viterbi': + postprocess.decoder = penn.decode.Viterbi() + else: + raise ValueError(f'Decoder method {decoder} is not defined') + postprocess.decoder_name = decoder + # Turn off gradients with torch.inference_mode(): @@ -459,14 +595,7 @@ def postprocess(logits, fmin=penn.FMIN, fmax=penn.FMAX): logits[:, maxidx:] = -float('inf') # Decode pitch from logits - if penn.DECODER == 'argmax': - bins, pitch = penn.decode.argmax(logits) - elif penn.DECODER.startswith('viterbi'): - bins, pitch = penn.decode.viterbi(logits) - elif penn.DECODER == 'local_expected_value': - bins, pitch = penn.decode.local_expected_value(logits) - else: - raise ValueError(f'Decoder method {penn.DECODER} is not defined') + bins, pitch = postprocess.decoder(logits) # Decode periodicity from logits if penn.PERIODICITY == 'entropy': diff --git a/penn/decode.py b/penn/decode.py index 20202c1..16dbf6d 100644 --- a/penn/decode.py +++ b/penn/decode.py @@ -1,3 +1,6 @@ +import abc +import functools + import numpy as np import torbi import torch @@ -6,25 +9,59 @@ ############################################################################### -# Decode pitch contour from logits of pitch posteriorgram +# Base pitch posteriorgram decoder ############################################################################### -def argmax(logits): +class Decoder(abc.ABC): + """Base decoder""" + + def __init__(self, local_expected_value=True): + self.local_expected_value = local_expected_value + + @abc.abstractmethod + def __call__(self, logits): + """Perform decoding""" + pass + + +############################################################################### +# Derived pitch posteriorgram decoders +############################################################################### + + +class Argmax(Decoder): """Decode pitch using argmax""" - # Get pitch bins - bins = logits.argmax(dim=1) - # Convert to hz - pitch = penn.convert.bins_to_frequency(bins) + def __init__(self, local_expected_value=penn.LOCAL_EXPECTED_VALUE): + super().__init__(local_expected_value) + + def __call__(self, logits): + # Get pitch bins + bins = logits.argmax(dim=1) - return bins, pitch + # Convert to frequency in Hz + if self.local_expected_value: + # Decode using an assumption of normality around the argmax path + pitch = local_expected_value_from_bins(bins, logits) -def viterbi(logits): - """Decode pitch using viterbi decoding (from librosa)""" - # Normalize and convert to numpy - if penn.METHOD == 'pyin': + else: + + # Linearly interpolate unvoiced regions + pitch = penn.convert.bins_to_frequency(bins) + + return bins, pitch + + +class PYIN(Decoder): + """Decode pitch via peak picking + Viterbi. Used by PYIN.""" + + def __init__(self, local_expected_value=False): + super().__init__(local_expected_value) + + def __call__(self, logits): + """PYIN decoding""" periodicity = penn.periodicity.sum(logits).T unvoiced = ( (1 - periodicity) / penn.PITCH_BINS).repeat(penn.PITCH_BINS, 1) @@ -32,84 +69,94 @@ def viterbi(logits): (torch.exp(logits.permute(2, 1, 0)), unvoiced[None]), dim=1) - else: - - # Viterbi REQUIRES a categorical distribution, even if the loss was BCE - distributions = torch.nn.functional.softmax(logits, dim=1) - distributions = distributions.permute(2, 1, 0) - distributions = distributions.to(device='cpu', dtype=torch.float32) - - # Cache viterbi probabilities - if not hasattr(viterbi, 'transition'): - xx, yy = torch.meshgrid( - torch.arange(penn.PITCH_BINS), - torch.arange(penn.PITCH_BINS), - indexing='ij') - bins_per_octave = penn.OCTAVE / penn.CENTS_PER_BIN - max_octaves_per_frame = \ - penn.MAX_OCTAVES_PER_SECOND * penn.HOPSIZE / penn.SAMPLE_RATE - max_bins_per_frame = max_octaves_per_frame * bins_per_octave + 1 - transition = torch.clip(max_bins_per_frame - (xx - yy).abs(), 0) - transition = transition / transition.sum(dim=1, keepdims=True) - viterbi.transition = transition - - if penn.METHOD == 'pyin': - - # Add unvoiced probabilities - viterbi.transition = torch.kron( - torch.tensor([[.99, .01], [.01, .99]]), - viterbi.transition) - - # Uniform initial probabilities - viterbi.initial = torch.zeros(2 * penn.PITCH_BINS) - viterbi.initial[penn.PITCH_BINS:] = 1 / penn.PITCH_BINS - - else: - - # Uniform initial probabilities - viterbi.initial = torch.full( - (penn.PITCH_BINS,), - 1 / penn.PITCH_BINS) + # Viterbi decoding + gpu = ( + None if distributions.device.type == 'cpu' + else distributions.device.index) + bins = torbi.decode( + distributions[0].T, + self.transition(), + self.initial(), + gpu) - # Viterbi decoding - bins = torbi.decode( - distributions[0].T, - viterbi.transition, - viterbi.initial, - penn.VITERBI_MAX_CHUNK_SIZE - )[:, None] + # Convert to frequency in Hz + if self.local_expected_value: - # Convert to frequency in Hz - if penn.DECODER.endswith('normal'): + # Decode using an assumption of normality around the viterbi path + pitch = local_expected_value_from_bins(bins.T, logits).T - # Decode using an assumption of normality around to the viterbi path - bins = bins.to(logits.device).T - pitch = local_expected_value_from_bins(bins, logits).T + else: - elif penn.METHOD == 'pyin': + # Argmax decoding + pitch = penn.convert.bins_to_frequency(bins) # Linearly interpolate unvoiced regions - pitch = penn.convert.bins_to_frequency(bins) pitch[bins >= penn.PITCH_BINS] = 0 pitch = penn.data.preprocess.interpolate_unvoiced(pitch.numpy())[0] pitch = torch.from_numpy(pitch).to(logits.device) bins = bins.to(logits.device) - else: + return bins.T, pitch.T - # Argmax decoding - bins = bins.to(logits.device).T - pitch = penn.convert.bins_to_frequency(bins) + @functools.cached_property + def initial(self): + """Create initial probability matrix for PYIN""" + initial = torch.zeros(2 * penn.PITCH_BINS) + initial[penn.PITCH_BINS:] = 1 / penn.PITCH_BINS - return bins.T, pitch.T + @functools.cached_property + def transition(self): + """Create the Viterbi transition matrix for PYIN""" + transition = triangular_transition_matrix() + # Add unvoiced probabilities + transition = torch.kron( + torch.tensor([[.99, .01], [.01, .99]]), + transition) -def local_expected_value(logits, window=penn.LOCAL_PITCH_WINDOW_SIZE): - """Decode pitch using a normal assumption around the argmax""" - # Get center bins - bins = logits.argmax(dim=1) - return bins, local_expected_value_from_bins(bins, logits, window) +class Viterbi(Decoder): + + def __init__(self, local_expected_value=True): + super().__init__(local_expected_value) + + def __call__(self, logits): + """Decode pitch using viterbi decoding (from librosa)""" + distributions = torch.nn.functional.softmax(logits, dim=1) + distributions = distributions.permute(2, 1, 0) + + # Viterbi decoding + gpu = ( + None if distributions.device.type == 'cpu' + else distributions.device.index) + bins = torbi.decode( + distributions[0].T, + self.transition(), + self.initial(), + gpu) + + # Convert to frequency in Hz + if self.local_expected_value: + + # Decode using an assumption of normality around the viterbi path + pitch = local_expected_value_from_bins(bins.T, logits).T + + else: + + # Argmax decoding + pitch = penn.convert.bins_to_frequency(bins) + + return bins.T, pitch.T + + @functools.cached_property + def transition(self): + """Create uniform initial probabilities""" + return torch.full((penn.PITCH_BINS,), 1 / penn.PITCH_BINS) + + @functools.cached_property + def transition(self): + """Create Viterbi transition probability matrix""" + return triangular_transition_matrix() ############################################################################### @@ -158,3 +205,17 @@ def local_expected_value_from_bins( # Decode using local expected value return expected_value(torch.gather(padded, 1, indices), cents) + + +def triangular_transition_matrix(): + """Create a triangular distribution transition matrix""" + xx, yy = torch.meshgrid( + torch.arange(penn.PITCH_BINS), + torch.arange(penn.PITCH_BINS), + indexing='ij') + bins_per_octave = penn.OCTAVE / penn.CENTS_PER_BIN + max_octaves_per_frame = \ + penn.MAX_OCTAVES_PER_SECOND * penn.HOPSIZE / penn.SAMPLE_RATE + max_bins_per_frame = max_octaves_per_frame * bins_per_octave + 1 + transition = torch.clip(max_bins_per_frame - (xx - yy).abs(), 0) + return transition / transition.sum(dim=1, keepdims=True) diff --git a/setup.py b/setup.py index 0abe787..962cdcb 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name='penn', description='Pitch Estimating Neural Networks (PENN)', - version='0.0.14', + version='0.2.0', author='Max Morrison, Caedon Hsieh, Nathan Pruyne, and Bryan Pardo', author_email='interactiveaudiolab@gmail.com', url='https://github.com/interactiveaudiolab/penn', From 61e7b4bef6ba66a17a059ff7ace5730e6d8e1f12 Mon Sep 17 00:00:00 2001 From: Cameron Date: Sun, 14 Jan 2024 17:31:03 -0600 Subject: [PATCH 4/5] torbi --- penn/decode.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/penn/decode.py b/penn/decode.py index 16dbf6d..f9e22b3 100644 --- a/penn/decode.py +++ b/penn/decode.py @@ -104,6 +104,8 @@ def initial(self): initial = torch.zeros(2 * penn.PITCH_BINS) initial[penn.PITCH_BINS:] = 1 / penn.PITCH_BINS + return initial + @functools.cached_property def transition(self): """Create the Viterbi transition matrix for PYIN""" @@ -114,6 +116,8 @@ def transition(self): torch.tensor([[.99, .01], [.01, .99]]), transition) + return transition + class Viterbi(Decoder): @@ -129,11 +133,11 @@ def __call__(self, logits): gpu = ( None if distributions.device.type == 'cpu' else distributions.device.index) - bins = torbi.decode( - distributions[0].T, - self.transition(), - self.initial(), - gpu) + bins = torbi.from_probabilities( + observation=distributions[0].T.unsqueeze(dim=0), + transition=self.transition, + initial=self.initial, + gpu=gpu) # Convert to frequency in Hz if self.local_expected_value: @@ -149,7 +153,7 @@ def __call__(self, logits): return bins.T, pitch.T @functools.cached_property - def transition(self): + def initial(self): """Create uniform initial probabilities""" return torch.full((penn.PITCH_BINS,), 1 / penn.PITCH_BINS) From 9e515d25125bebac8b0e6c0a3448f061a85428ec Mon Sep 17 00:00:00 2001 From: Max Date: Sun, 14 Jan 2024 17:38:48 -0600 Subject: [PATCH 5/5] Decoding fixes --- penn/decode.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/penn/decode.py b/penn/decode.py index 16dbf6d..2722293 100644 --- a/penn/decode.py +++ b/penn/decode.py @@ -75,8 +75,8 @@ def __call__(self, logits): else distributions.device.index) bins = torbi.decode( distributions[0].T, - self.transition(), - self.initial(), + self.transition, + self.initial, gpu) # Convert to frequency in Hz @@ -103,6 +103,7 @@ def initial(self): """Create initial probability matrix for PYIN""" initial = torch.zeros(2 * penn.PITCH_BINS) initial[penn.PITCH_BINS:] = 1 / penn.PITCH_BINS + return initial @functools.cached_property def transition(self): @@ -113,6 +114,7 @@ def transition(self): transition = torch.kron( torch.tensor([[.99, .01], [.01, .99]]), transition) + return transition class Viterbi(Decoder): @@ -123,7 +125,7 @@ def __init__(self, local_expected_value=True): def __call__(self, logits): """Decode pitch using viterbi decoding (from librosa)""" distributions = torch.nn.functional.softmax(logits, dim=1) - distributions = distributions.permute(2, 1, 0) + distributions = distributions.permute(2, 1, 0) # F x C x 1 -> 1 x C x F # Viterbi decoding gpu = ( @@ -131,8 +133,8 @@ def __call__(self, logits): else distributions.device.index) bins = torbi.decode( distributions[0].T, - self.transition(), - self.initial(), + self.transition, + self.initial, gpu) # Convert to frequency in Hz @@ -149,7 +151,7 @@ def __call__(self, logits): return bins.T, pitch.T @functools.cached_property - def transition(self): + def initial(self): """Create uniform initial probabilities""" return torch.full((penn.PITCH_BINS,), 1 / penn.PITCH_BINS)