DASB refactor - hyperparam tuning #46

Closed
wants to merge 9 commits
234 changes: 234 additions & 0 deletions benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml
@@ -0,0 +1,234 @@
# ############################################################################
# Model: E2E ASR with CTC
# Audio Tokenizer: DAC
# Encoder: LSTM Encoder
# Decoder: CTC beam searcher and greedy searcher
# Tokens: character
# Training: Librispeech 960h
# Authors: Pooneh Mousavi 2024
# ############################################################################
# Seed needs to be set at top of yaml, before objects with parameters are made

seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/dac/LSTM/<seed>
output_wer_folder: !ref <output_folder>/wer.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
cached_data_folder: !PLACEHOLDER #'path/to/cache'
testing: True
# Data files
data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech
# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES
# then data_folder_rirs should be /localscratch/xxx_corpus
# otherwise the dataset will automatically be downloaded
# data_folder_rirs: !ref <data_folder>
train_splits: ["train-clean-100"]
dev_splits: ["dev-clean"]
test_splits: ["test-clean"]
skip_prep: False
train_csv: !ref <cached_data_folder>/train.csv
valid_csv: !ref <cached_data_folder>/dev-clean.csv
test_csv:
    - !ref <cached_data_folder>/dev-clean.csv
    - !ref <cached_data_folder>/test-clean.csv

####################### Training Parameters ####################################
# number_of_epochs: 20
number_of_epochs: 200 # @orion_step1: --number_of_epochs~"fidelity(1, 2, base=4)"
batch_size_exponent: 6 # @orion_step1: --batch_size_exponent~"uniform(1, 2,discrete=True)"
batch_size: !ref 2 ** <batch_size_exponent>
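# Note: the @orion_step1 annotations above and below define the search space used by
# the Orion-based hyperparameter tuning; with the default exponent of 6,
# batch_size = 2 ** 6 = 64.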
test_batch_size: 1
grad_accumulation_factor: 2
max_grad_norm: 5.0
sorting: descending # choose between ascending, descending and random
num_workers: 8
loss_reduction: batchmean
precision: fp32 # bf16, fp16 or fp32
valid_search_interval: 1
avg_checkpoints: 10 # Number of checkpoints to average for evaluation
cache_size: 1.e+10

lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)"
weight_decay: 0.0005


# Dynamic batching parameters
# To make the model converge, the global batch size should be large enough.
# The global batch size is max_batch_len * n_gpus * gradient_accumulation.
# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2.
# Please, set your parameters accordingly.
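# For reference, both empirical settings give the same effective global batch:
# 850 * 8 * 2 = 13600 and 1700 * 4 * 2 = 13600.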
dynamic_batching: True
max_batch_length_train: 850
max_batch_len_val: 100
num_bucket: 200
shuffle: False # if True, batches are re-created at each epoch with shuffled examples.
max_batch_ex: 128
batch_ordering: random

dynamic_batch_sampler_train:
    max_batch_length: !ref <max_batch_length_train>
    num_buckets: !ref <num_bucket>
    shuffle: !ref <shuffle>
    batch_ordering: !ref <batch_ordering>
    max_batch_ex: !ref <max_batch_ex>

dynamic_batch_sampler_val:
    max_batch_length: !ref <max_batch_len_val>
    num_buckets: !ref <num_bucket>
    shuffle: !ref <shuffle>
    batch_ordering: !ref <batch_ordering>
    max_batch_ex: !ref <max_batch_ex>

# Dataloader options
train_dataloader_opts:
    batch_size: !ref <batch_size>
    shuffle: True
    num_workers: !ref <num_workers>

valid_dataloader_opts:
    batch_size: !ref <test_batch_size>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>


####################### Model parameters ###########################
# Tokenizer parameters
# DAC parameters
# model_type: [16khz, 24khz, 44khz, 44khz]
# vocab_size: [1024, 1024, 1024, 1024]
# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps]
# max_num_codebooks: [12, 32, 9, 18]
# embedding_dim: [1024, 1024, 1024, 128]
model_type: 24khz
vocab_size: 1024
model_bitrate: 8kbps
num_codebooks: 2
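# Per the table above, the 24khz / 8kbps DAC model exposes up to 32 codebooks;
# num_codebooks: 2 presumably keeps only the first two token streams.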
sample_rate: 24000
# Feature parameters
encoder_dim: 1024
# If pretrain_embeddings is set to True, encoder_dim should match the tokenizer's embedding dim (e.g., 128 for EnCodec).
pretrain_embeddings: False
freeze_embedding: False


# LSTM
activation: !name:torch.nn.Sigmoid
dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 2,discrete=True)"
dnn_neurons: 1024
dropout: 0.2
output_neurons: 31

# BPE parameters
token_type: char # ["unigram", "bpe", "char"]
character_coverage: 1.0
blank_index: 0
bos_index: 1
eos_index: 2

# Decoding parameters
beam_size: 100
beam_prune_logp: -12.0
token_prune_min_logp: -1.2
prune_history: False

############################## models ################################
# DAC tokenizer (Descript Audio Codec)
tokenizer: !new:model.tokenizer_interface.DACTokenizer
    model_type: !ref <model_type>
    model_bitrate: !ref <model_bitrate>
    load_pretrained: True
    tag: latest

discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer
    num_codebooks: !ref <num_codebooks>
    vocab_size: !ref <vocab_size>
    emb_dim: !ref <encoder_dim>
    # hidden_dim: !ref <encoder_dim>
    freeze: !ref <freeze_embedding>
    init: !ref <pretrain_embeddings>

attention_mlp: !new:model.custom_model.AttentionMLP
    input_dim: !ref <encoder_dim>
    hidden_dim: !ref <encoder_dim>
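# The attention MLP presumably learns weights for combining the per-codebook
# embeddings into a single encoder_dim-sized input for the LSTM encoder
# (see model/custom_model.py in this benchmark).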

encoder: !new:speechbrain.nnet.RNN.LSTM
    input_shape: [Null, Null, !ref <encoder_dim>]
    num_layers: !ref <dnn_layers>
    bidirectional: True
    dropout: !ref <dropout>
    hidden_size: !ref <dnn_neurons>

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: 2048 # 2 * dnn_neurons, since the LSTM encoder is bidirectional
    n_neurons: !ref <output_neurons>

modules:
    encoder: !ref <encoder>
    ctc_lin: !ref <ctc_lin>
    attention_mlp: !ref <attention_mlp>
    tokenizer: !ref <tokenizer>
    discrete_embedding_layer: !ref <discrete_embedding_layer>


model: !new:torch.nn.ModuleList
    - [!ref <encoder>, !ref <ctc_lin>, !ref <discrete_embedding_layer>, !ref <attention_mlp>]

####################### Decoding & optimiser ###########################
# Decoding parameters
test_beam_search:
    blank_index: !ref <blank_index>
    beam_size: !ref <beam_size>
    beam_prune_logp: !ref <beam_prune_logp>
    token_prune_min_logp: !ref <token_prune_min_logp>
    prune_history: !ref <prune_history>
    alpha: 0.8
    beta: 1.2

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>


log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_model>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0
# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler
# lr_initial: !ref <lr_model>
# n_warmup_steps: 7500
# n_keep_steps: 36000

model_opt_class: !name:torch.optim.AdamW
    lr: !ref <lr_model>
    betas: (0.9, 0.98)
    eps: 0.000000001
    weight_decay: !ref <weight_decay>

############################## Logging and Pretrainer ##########################
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        scheduler: !ref <scheduler>
        counter: !ref <epoch_counter>


# Functions and classes
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !new:int
        - !ref <number_of_epochs>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True
wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
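
For context, here is a minimal sketch (not part of this diff) of how a SpeechBrain recipe script typically loads an hparams file like this one and applies command-line overrides, for example the values Orion proposes for lr_model or dnn_layers; the script name and paths below are hypothetical:

import sys

import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

# Example invocation (hypothetical):
#   python train.py hparams/LSTM/dac.yaml --data_folder /path/to/LibriSpeech --lr_model 0.001
hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
with open(hparams_file) as fin:
    # Command-line overrides replace the defaults defined in the YAML above.
    hparams = load_hyperpyyaml(fin, overrides)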