Version merge

interactiveaudiolab · Jul 15, 2024 · 6445b76 · 6445b76
2 parents 6fe69eb + 7b39a6e
commit 6445b76
Show file tree

Hide file tree

Showing 23 changed files with 458 additions and 163 deletions.
diff --git a/README.md b/README.md
@@ -75,6 +75,9 @@ center = 'half-hop'
 # (Optional) Linearly interpolate unvoiced regions below periodicity threshold
 interp_unvoiced_at = .065
 
+# (Optional) Select a decoding method. One of ['argmax', 'pyin', 'viterbi'].
+decoder = 'viterbi'
+
 # Infer pitch and periodicity
 pitch, periodicity = penn.from_audio(
     audio,
@@ -85,6 +88,7 @@ pitch, periodicity = penn.from_audio(
     checkpoint=checkpoint,
     batch_size=batch_size,
     center=center,
+    decoder=decoder,
     interp_unvoiced_at=interp_unvoiced_at,
     gpu=gpu)
 ```
@@ -96,16 +100,17 @@ pitch, periodicity = penn.from_audio(
 
 ```
 def from_audio(
-        audio: torch.Tensor,
-        sample_rate: int = penn.SAMPLE_RATE,
-        hopsize: float = penn.HOPSIZE_SECONDS,
-        fmin: float = penn.FMIN,
-        fmax: float = penn.FMAX,
-        checkpoint: Optional[Path] = None,
-        batch_size: Optional[int] = None,
-        center: str = 'half-window',
-        interp_unvoiced_at: Optional[float] = None,
-        gpu: Optional[int] = None
+    audio: torch.Tensor,
+    sample_rate: int = penn.SAMPLE_RATE,
+    hopsize: float = penn.HOPSIZE_SECONDS,
+    fmin: float = penn.FMIN,
+    fmax: float = penn.FMAX,
+    checkpoint: Optional[Path] = None,
+    batch_size: Optional[int] = None,
+    center: str = 'half-window',
+    decoder: str = penn.DECODER,
+    interp_unvoiced_at: Optional[float] = None,
+    gpu: Optional[int] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
 """Perform pitch and periodicity estimation
 
@@ -134,15 +139,16 @@ Returns:
 
 ```
 def from_file(
-        file: Path,
-        hopsize: float = penn.HOPSIZE_SECONDS,
-        fmin: float = penn.FMIN,
-        fmax: float = penn.FMAX,
-        checkpoint: Optional[Path] = None,
-        batch_size: Optional[int] = None,
-        center: str = 'half-window',
-        interp_unvoiced_at: Optional[float] = None,
-        gpu: Optional[int] = None
+    file: Path,
+    hopsize: float = penn.HOPSIZE_SECONDS,
+    fmin: float = penn.FMIN,
+    fmax: float = penn.FMAX,
+    checkpoint: Optional[Path] = None,
+    batch_size: Optional[int] = None,
+    center: str = 'half-window',
+    decoder: str = penn.DECODER,
+    interp_unvoiced_at: Optional[float] = None,
+    gpu: Optional[int] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
 """Perform pitch and periodicity estimation from audio on disk
 
@@ -168,16 +174,17 @@ Returns:
 
 ```
 def from_file_to_file(
-        file: Path,
-        output_prefix: Optional[Path] = None,
-        hopsize: float = penn.HOPSIZE_SECONDS,
-        fmin: float = penn.FMIN,
-        fmax: float = penn.FMAX,
-        checkpoint: Optional[Path] = None,
-        batch_size: Optional[int] = None,
-        center: str = 'half-window',
-        interp_unvoiced_at: Optional[float] = None,
-        gpu: Optional[int] = None
+    file: Path,
+    output_prefix: Optional[Path] = None,
+    hopsize: float = penn.HOPSIZE_SECONDS,
+    fmin: float = penn.FMIN,
+    fmax: float = penn.FMAX,
+    checkpoint: Optional[Path] = None,
+    batch_size: Optional[int] = None,
+    center: str = 'half-window',
+    decoder: str = penn.DECODER,
+    interp_unvoiced_at: Optional[float] = None,
+    gpu: Optional[int] = None
 ) -> None:
 """Perform pitch and periodicity estimation from audio on disk and save
 
@@ -208,6 +215,7 @@ def from_files_to_files(
     checkpoint: Optional[Path] = None,
     batch_size: Optional[int] = None,
     center: str = 'half-window',
+    decoder: str = penn.DECODER,
     interp_unvoiced_at: Optional[float] = None,
     num_workers: int = penn.NUM_WORKERS,
     gpu: Optional[int] = None
@@ -244,7 +252,9 @@ python -m penn
     [--checkpoint CHECKPOINT]
     [--batch_size BATCH_SIZE]
     [--center {half-window,half-hop,zero}]
+    [--decoder {argmax,pyin,viterbi}]
     [--interp_unvoiced_at INTERP_UNVOICED_AT]
+    [--num_workers NUM_WORKERS]
     [--gpu GPU]
 
 required arguments:
@@ -271,8 +281,12 @@ optional arguments:
         The number of frames per batch. Defaults to 2048.
     --center {half-window,half-hop,zero}
         Padding options
-  --interp_unvoiced_at INTERP_UNVOICED_AT
+    --decoder {argmax,pyin,viterbi}
+        Posteriorgram decoder
+    --interp_unvoiced_at INTERP_UNVOICED_AT
         Specifies voicing threshold for interpolation. Defaults to 0.1625.
+    --num_workers NUM_WORKERS
+        Number of CPU threads for async data I/O. Defaults to 0.
     --gpu GPU
         The index of the gpu to perform inference on. Defaults to CPU.
 ```

diff --git a/config/crepe++.py b/config/crepe++.py
@@ -6,5 +6,8 @@
 # The decoder to use for postprocessing
 DECODER = 'argmax'
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # The name of the model to use for training
 MODEL = 'crepe'
diff --git a/config/deepf0++.py b/config/deepf0++.py
@@ -6,5 +6,8 @@
 # The decoder to use for postprocessing
 DECODER = 'argmax'
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # The name of the model to use for training
 MODEL = 'deepf0'
diff --git a/config/fcnf0++-ablate-batchsize.py b/config/fcnf0++-ablate-batchsize.py
@@ -8,3 +8,6 @@
 
 # The decoder to use for postprocessing
 DECODER = 'argmax'
+
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
diff --git a/config/fcnf0++-ablate-chunkviterbi-normal.py b/config/fcnf0++-ablate-chunkviterbi-normal.py
@@ -0,0 +1,10 @@
+MODULE = 'penn'
+
+# Configuration name
+CONFIG = 'fcnf0++-ablate-chunkviterbi-normal'
+
+# The decoder to use for postprocessing
+DECODER = 'viterbi'
+
+# Minimum chunk size for chunked Viterbi decoding
+VITERBI_MIN_CHUNK_SIZE = 64
diff --git a/config/fcnf0++-ablate-chunkviterbi.py b/config/fcnf0++-ablate-chunkviterbi.py
@@ -0,0 +1,13 @@
+MODULE = 'penn'
+
+# Configuration name
+CONFIG = 'fcnf0++-ablate-chunkviterbi'
+
+# The decoder to use for postprocessing
+DECODER = 'viterbi'
+
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
+# Minimum chunk size for chunked Viterbi decoding
+VITERBI_MIN_CHUNK_SIZE = 8
diff --git a/config/fcnf0++-ablate-decoder.py b/config/fcnf0++-ablate-decoder.py
@@ -3,5 +3,8 @@
 # Configuration name
 CONFIG = 'fcnf0++-ablate-decoder'
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # The decoder to use for postprocessing
 DECODER = 'argmax'
diff --git a/config/fcnf0++-ablate-earlystop.py b/config/fcnf0++-ablate-earlystop.py
@@ -9,5 +9,8 @@
 # Whether to stop training when validation loss stops improving
 EARLY_STOPPING = True
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # Number of steps between logging to Tensorboard
 LOG_INTERVAL = 500  # steps
diff --git a/config/fcnf0++-ablate-inputnorm.py b/config/fcnf0++-ablate-inputnorm.py
@@ -6,5 +6,8 @@
 # The decoder to use for postprocessing
 DECODER = 'argmax'
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # Whether to normalize input audio to mean zero and variance one
 NORMALIZE_INPUT = True
diff --git a/config/fcnf0++-ablate-layernorm.py b/config/fcnf0++-ablate-layernorm.py
@@ -6,5 +6,8 @@
 # The decoder to use for postprocessing
 DECODER = 'argmax'
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # Type of model normalization
 NORMALIZATION = 'batch'
diff --git a/config/fcnf0++-ablate-loss.py b/config/fcnf0++-ablate-loss.py
@@ -6,5 +6,8 @@
 # The decoder to use for postprocessing
 DECODER = 'argmax'
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # Loss function
 LOSS = 'binary_cross_entropy'
diff --git a/config/fcnf0++-ablate-quantization.py b/config/fcnf0++-ablate-quantization.py
@@ -9,5 +9,8 @@
 # The decoder to use for postprocessing
 DECODER = 'argmax'
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # Number of pitch bins to predict
 PITCH_BINS = 486
diff --git a/config/fcnf0++-ablate-unvoiced.py b/config/fcnf0++-ablate-unvoiced.py
@@ -6,6 +6,9 @@
 # The decoder to use for postprocessing
 DECODER = 'argmax'
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # Whether to only use voiced start frames
 VOICED_ONLY = True
 
diff --git a/config/fcnf0++-mdb.py b/config/fcnf0++-mdb.py
@@ -5,3 +5,6 @@
 
 # The decoder to use for postprocessing
 DECODER = 'argmax'
+
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
diff --git a/config/fcnf0++-ptdb.py b/config/fcnf0++-ptdb.py
@@ -5,3 +5,6 @@
 
 # The decoder to use for postprocessing
 DECODER = 'argmax'
+
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
diff --git a/config/fcnf0.py b/config/fcnf0.py
@@ -18,6 +18,9 @@
 # Minimum representable frequency
 FMIN = 30.  # Hz
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # Number of steps between logging to Tensorboard
 LOG_INTERVAL = 500  # steps
 

diff --git a/config/pyin.py b/config/pyin.py
@@ -4,7 +4,7 @@
 CONFIG = 'pyin'
 
 # The decoder to use for postprocessing
-DECODER = 'argmax'
+DECODER = 'pyin'
 
 # Distance between adjacent frames
 HOPSIZE = 160  # samples

diff --git a/config/torchcrepe.py b/config/torchcrepe.py
@@ -32,6 +32,9 @@
 # Distance between adjacent frames
 HOPSIZE = 160  # samples
 
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = False
+
 # Number of steps between logging to Tensorboard
 LOG_INTERVAL = 500  # steps
 

diff --git a/penn/__main__.py b/penn/__main__.py
@@ -64,10 +64,20 @@ def parse_args():
         choices=['half-window', 'half-hop', 'zero'],
         default='half-window',
         help='Padding options')
+    parser.add_argument(
+        '--decoder',
+        choices=['argmax', 'pyin', 'viterbi'],
+        default=penn.DECODER,
+        help='Posteriorgram decoder')
     parser.add_argument(
         '--interp_unvoiced_at',
         type=float,
         help='Specifies voicing threshold for interpolation')
+    parser.add_argument(
+        '--num_workers',
+        type=int,
+        default=0,
+        help='Number of CPU threads for async data I/O')
     parser.add_argument(
         '--gpu',
         type=int,

diff --git a/penn/config/defaults.py b/penn/config/defaults.py
@@ -28,12 +28,6 @@
 # Distance between adjacent frames
 HOPSIZE = 80  # samples
 
-# The size of the window used for locally normal pitch decoding
-LOCAL_PITCH_WINDOW_SIZE = 19
-
-# Pitch velocity constraint for viterbi decoding
-MAX_OCTAVES_PER_SECOND = 35.92
-
 # Whether to normalize input audio to mean zero and variance one
 NORMALIZE_INPUT = False
 
@@ -53,6 +47,27 @@
 WINDOW_SIZE = 1024  # samples
 
 
+###############################################################################
+# Decoder parameters
+###############################################################################
+
+
+# The decoder to use for postprocessing. One of ['argmax', 'pyin', 'viterbi'].
+DECODER = 'viterbi'
+
+# Whether to perform local expected value decoding of pitch
+LOCAL_EXPECTED_VALUE = True
+
+# The size of the window used for local expected value pitch decoding
+LOCAL_PITCH_WINDOW_SIZE = 19
+
+# Pitch velocity constraint for viterbi decoding
+MAX_OCTAVES_PER_SECOND = 6.
+
+# Minimum chunk size for chunked Viterbi decoding
+VITERBI_MIN_CHUNK_SIZE = None
+
+
 ###############################################################################
 # Directories
 ###############################################################################
@@ -115,9 +130,6 @@
 ###############################################################################
 
 
-# The decoder to use for postprocessing
-DECODER = 'local_expected_value'
-
 # The dropout rate. Set to None to turn off dropout.
 DROPOUT = None