Skip to content

Commit

Permalink
Version merge
Browse files Browse the repository at this point in the history
  • Loading branch information
maxrmorrison committed Jul 15, 2024
2 parents 6fe69eb + 7b39a6e commit 6445b76
Show file tree
Hide file tree
Showing 23 changed files with 458 additions and 163 deletions.
74 changes: 44 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ center = 'half-hop'
# (Optional) Linearly interpolate unvoiced regions below periodicity threshold
interp_unvoiced_at = .065
# (Optional) Select a decoding method. One of ['argmax', 'pyin', 'viterbi'].
decoder = 'viterbi'
# Infer pitch and periodicity
pitch, periodicity = penn.from_audio(
audio,
Expand All @@ -85,6 +88,7 @@ pitch, periodicity = penn.from_audio(
checkpoint=checkpoint,
batch_size=batch_size,
center=center,
decoder=decoder,
interp_unvoiced_at=interp_unvoiced_at,
gpu=gpu)
```
Expand All @@ -96,16 +100,17 @@ pitch, periodicity = penn.from_audio(

```
def from_audio(
audio: torch.Tensor,
sample_rate: int = penn.SAMPLE_RATE,
hopsize: float = penn.HOPSIZE_SECONDS,
fmin: float = penn.FMIN,
fmax: float = penn.FMAX,
checkpoint: Optional[Path] = None,
batch_size: Optional[int] = None,
center: str = 'half-window',
interp_unvoiced_at: Optional[float] = None,
gpu: Optional[int] = None
audio: torch.Tensor,
sample_rate: int = penn.SAMPLE_RATE,
hopsize: float = penn.HOPSIZE_SECONDS,
fmin: float = penn.FMIN,
fmax: float = penn.FMAX,
checkpoint: Optional[Path] = None,
batch_size: Optional[int] = None,
center: str = 'half-window',
decoder: str = penn.DECODER,
interp_unvoiced_at: Optional[float] = None,
gpu: Optional[int] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Perform pitch and periodicity estimation
Expand Down Expand Up @@ -134,15 +139,16 @@ Returns:

```
def from_file(
file: Path,
hopsize: float = penn.HOPSIZE_SECONDS,
fmin: float = penn.FMIN,
fmax: float = penn.FMAX,
checkpoint: Optional[Path] = None,
batch_size: Optional[int] = None,
center: str = 'half-window',
interp_unvoiced_at: Optional[float] = None,
gpu: Optional[int] = None
file: Path,
hopsize: float = penn.HOPSIZE_SECONDS,
fmin: float = penn.FMIN,
fmax: float = penn.FMAX,
checkpoint: Optional[Path] = None,
batch_size: Optional[int] = None,
center: str = 'half-window',
decoder: str = penn.DECODER,
interp_unvoiced_at: Optional[float] = None,
gpu: Optional[int] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Perform pitch and periodicity estimation from audio on disk
Expand All @@ -168,16 +174,17 @@ Returns:

```
def from_file_to_file(
file: Path,
output_prefix: Optional[Path] = None,
hopsize: float = penn.HOPSIZE_SECONDS,
fmin: float = penn.FMIN,
fmax: float = penn.FMAX,
checkpoint: Optional[Path] = None,
batch_size: Optional[int] = None,
center: str = 'half-window',
interp_unvoiced_at: Optional[float] = None,
gpu: Optional[int] = None
file: Path,
output_prefix: Optional[Path] = None,
hopsize: float = penn.HOPSIZE_SECONDS,
fmin: float = penn.FMIN,
fmax: float = penn.FMAX,
checkpoint: Optional[Path] = None,
batch_size: Optional[int] = None,
center: str = 'half-window',
decoder: str = penn.DECODER,
interp_unvoiced_at: Optional[float] = None,
gpu: Optional[int] = None
) -> None:
"""Perform pitch and periodicity estimation from audio on disk and save
Expand Down Expand Up @@ -208,6 +215,7 @@ def from_files_to_files(
checkpoint: Optional[Path] = None,
batch_size: Optional[int] = None,
center: str = 'half-window',
decoder: str = penn.DECODER,
interp_unvoiced_at: Optional[float] = None,
num_workers: int = penn.NUM_WORKERS,
gpu: Optional[int] = None
Expand Down Expand Up @@ -244,7 +252,9 @@ python -m penn
[--checkpoint CHECKPOINT]
[--batch_size BATCH_SIZE]
[--center {half-window,half-hop,zero}]
[--decoder {argmax,pyin,viterbi}]
[--interp_unvoiced_at INTERP_UNVOICED_AT]
[--num_workers NUM_WORKERS]
[--gpu GPU]
required arguments:
Expand All @@ -271,8 +281,12 @@ optional arguments:
The number of frames per batch. Defaults to 2048.
--center {half-window,half-hop,zero}
Padding options
--interp_unvoiced_at INTERP_UNVOICED_AT
--decoder {argmax,pyin,viterbi}
Posteriorgram decoder
--interp_unvoiced_at INTERP_UNVOICED_AT
Specifies voicing threshold for interpolation. Defaults to 0.1625.
--num_workers NUM_WORKERS
Number of CPU threads for async data I/O
--gpu GPU
The index of the gpu to perform inference on. Defaults to CPU.
```
Expand Down
3 changes: 3 additions & 0 deletions config/crepe++.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,8 @@
# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# The name of the model to use for training
MODEL = 'crepe'
3 changes: 3 additions & 0 deletions config/deepf0++.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,8 @@
# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# The name of the model to use for training
MODEL = 'deepf0'
3 changes: 3 additions & 0 deletions config/fcnf0++-ablate-batchsize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@

# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False
10 changes: 10 additions & 0 deletions config/fcnf0++-ablate-chunkviterbi-normal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
MODULE = 'penn'

# Configuration name
CONFIG = 'fcnf0++-ablate-chunkviterbi-normal'

# The decoder to use for postprocessing
DECODER = 'viterbi'

# Minimum chunk size for chunked Viterbi decoding
VITERBI_MIN_CHUNK_SIZE = 64
13 changes: 13 additions & 0 deletions config/fcnf0++-ablate-chunkviterbi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
MODULE = 'penn'

# Configuration name
CONFIG = 'fcnf0++-ablate-chunkviterbi'

# The decoder to use for postprocessing
DECODER = 'viterbi'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# Minimum chunk size for chunked Viterbi decoding
VITERBI_MIN_CHUNK_SIZE = 8
3 changes: 3 additions & 0 deletions config/fcnf0++-ablate-decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,8 @@
# Configuration name
CONFIG = 'fcnf0++-ablate-decoder'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# The decoder to use for postprocessing
DECODER = 'argmax'
3 changes: 3 additions & 0 deletions config/fcnf0++-ablate-earlystop.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,8 @@
# Whether to stop training when validation loss stops improving
EARLY_STOPPING = True

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# Number of steps between logging to Tensorboard
LOG_INTERVAL = 500 # steps
3 changes: 3 additions & 0 deletions config/fcnf0++-ablate-inputnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,8 @@
# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# Whether to normalize input audio to mean zero and variance one
NORMALIZE_INPUT = True
3 changes: 3 additions & 0 deletions config/fcnf0++-ablate-layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,8 @@
# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# Type of model normalization
NORMALIZATION = 'batch'
3 changes: 3 additions & 0 deletions config/fcnf0++-ablate-loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,8 @@
# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# Loss function
LOSS = 'binary_cross_entropy'
3 changes: 3 additions & 0 deletions config/fcnf0++-ablate-quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,8 @@
# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# Number of pitch bins to predict
PITCH_BINS = 486
3 changes: 3 additions & 0 deletions config/fcnf0++-ablate-unvoiced.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# Whether to only use voiced start frames
VOICED_ONLY = True

3 changes: 3 additions & 0 deletions config/fcnf0++-mdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@

# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False
3 changes: 3 additions & 0 deletions config/fcnf0++-ptdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@

# The decoder to use for postprocessing
DECODER = 'argmax'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False
3 changes: 3 additions & 0 deletions config/fcnf0.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
# Minimum representable frequency
FMIN = 30. # Hz

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# Number of steps between logging to Tensorboard
LOG_INTERVAL = 500 # steps

Expand Down
2 changes: 1 addition & 1 deletion config/pyin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
CONFIG = 'pyin'

# The decoder to use for postprocessing
DECODER = 'argmax'
DECODER = 'pyin'

# Distance between adjacent frames
HOPSIZE = 160 # samples
Expand Down
3 changes: 3 additions & 0 deletions config/torchcrepe.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
# Distance between adjacent frames
HOPSIZE = 160 # samples

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = False

# Number of steps between logging to Tensorboard
LOG_INTERVAL = 500 # steps

Expand Down
10 changes: 10 additions & 0 deletions penn/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,20 @@ def parse_args():
choices=['half-window', 'half-hop', 'zero'],
default='half-window',
help='Padding options')
parser.add_argument(
'--decoder',
choices=['argmax', 'pyin', 'viterbi'],
default=penn.DECODER,
help='Posteriorgram decoder')
parser.add_argument(
'--interp_unvoiced_at',
type=float,
help='Specifies voicing threshold for interpolation')
parser.add_argument(
'--num_workers',
type=int,
default=0,
help='Number of CPU threads for async data I/O')
parser.add_argument(
'--gpu',
type=int,
Expand Down
30 changes: 21 additions & 9 deletions penn/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,6 @@
# Distance between adjacent frames
HOPSIZE = 80 # samples

# The size of the window used for locally normal pitch decoding
LOCAL_PITCH_WINDOW_SIZE = 19

# Pitch velocity constraint for viterbi decoding
MAX_OCTAVES_PER_SECOND = 35.92

# Whether to normalize input audio to mean zero and variance one
NORMALIZE_INPUT = False

Expand All @@ -53,6 +47,27 @@
WINDOW_SIZE = 1024 # samples


###############################################################################
# Decoder parameters
###############################################################################


# The decoder to use for postprocessing. One of ['argmax', 'pyin', 'viterbi'].
DECODER = 'viterbi'

# Whether to perform local expected value decoding of pitch
LOCAL_EXPECTED_VALUE = True

# The size of the window used for local expected value pitch decoding
LOCAL_PITCH_WINDOW_SIZE = 19

# Pitch velocity constraint for viterbi decoding
MAX_OCTAVES_PER_SECOND = 6.

# Minimum chunk size for chunked Viterbi decoding
VITERBI_MIN_CHUNK_SIZE = None


###############################################################################
# Directories
###############################################################################
Expand Down Expand Up @@ -115,9 +130,6 @@
###############################################################################


# The decoder to use for postprocessing
DECODER = 'local_expected_value'

# The dropout rate. Set to None to turn off dropout.
DROPOUT = None

Expand Down
Loading

0 comments on commit 6445b76

Please sign in to comment.