From a4f38bd2a9cbb76e8bd56c944c66a8fccf0a7c04 Mon Sep 17 00:00:00 2001
From: poonehmousavi
Date: Tue, 5 Nov 2024 09:31:52 -0500
Subject: [PATCH 1/9] add tokenizer_interface

---
 benchmarks/DASB/model/tokenizer_interface.py | 164 +++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 benchmarks/DASB/model/tokenizer_interface.py

diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py
new file mode 100644
index 000000000..892bef6b3
--- /dev/null
+++ b/benchmarks/DASB/model/tokenizer_interface.py
@@ -0,0 +1,164 @@
+
+"""
+Unified interface for tokenizers, standardizing the output shape of encode and decode functions.
+
+This class reshapes the outputs of various tokenizers to ensure consistency, simplifying integration with recipes and workflows.
+
+Authors
+---------
+* Pooneh Mousavi, 2024
+"""
+
+import torch
+
+from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec
+from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL
+from speechbrain.lobes.models.discrete.dac import DAC
+from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface
+
+
+class Tokenizer_Encodec(Encodec):
+    @torch.no_grad()
+    def sig_to_toks(self, sig, lens, **kwargs):
+        # sig: [B, T]
+        self.eval()
+        toks, _ = self.encode(sig, lens)  # [B, N, K]
+        return toks
+
+    @torch.no_grad()
+    def toks_to_sig(self, toks, **kwargs):
+        # toks: [B, N, K]
+        self.eval()
+        sig = self.decode(toks)[:, 0]  # [B, T]
+        return sig
+
+class Tokenizer_DAC(DAC):
+    @torch.no_grad()
+    def sig_to_toks(self, sig, lens, **kwargs):
+        # sig: [B, T]
+        self.eval()
+        toks, _ = self(
+            sig[:, None], n_quantizers=kwargs['num_codebooks']
+        )  # [B, K, N]
+        toks = toks.movedim(-1, -2)  # [B, N, K]
+        return toks
+
+    @torch.no_grad()
+    def toks_to_sig(self, toks, **kwargs):
+        # toks: [B, N, K]
+        self.eval()
+        qfeats, _, _ = self.quantizer.from_codes(
+            toks.movedim(-1, -2)  # [B, K, N]
+        )
+        sig = self.decode(qfeats)[:, 0]  # [B, T]
+        return sig
+
+class Tokenizer_SpeechTokenizer(SpeechTokenizer_interface):
+    @torch.no_grad()
+    def sig_to_toks(self, sig, lens, **kwargs):
+        # sig: [B, T]
+        self.eval()
+        toks = self(sig)[
+            : kwargs['num_codebooks']
+        ]  # [K, B, N]
+        toks = toks.movedim(-3, -1)  # [B, N, K]
+        return toks
+
+    @torch.no_grad()
+    def toks_to_sig(self, toks, **kwargs):
+        # toks: [B, N, K]
+        self.eval()
+        toks = toks.movedim(-1, -3)  # [K, B, N]
+        sig = self.decode(toks)  # [B, T]
+        return sig
+
+class Tokenizer_DiscreteSSL(DiscreteSSL):
+    @torch.no_grad()
+    def sig_to_toks(self, sig, lens):
+        # sig: [B, T]
+        self.hparams.codec_quantizer.to(self.device).eval()
+        toks, _, _ = self.hparams.codec_quantizer(
+            sig,
+            lens,
+            SSL_layers=self.hparams.SSL_layers,
+            deduplicates=[False] * len(self.hparams.SSL_layers),
+            bpe_tokenizers=[None] * len(self.hparams.SSL_layers),
+        )  # [B, N, K]
+        return toks
+
+    @torch.no_grad()
+    def toks_to_sig(self, toks):
+        # toks: [B, N, K]
+        self.hparams.codec_vocoder.device = self.device
+        self.hparams.codec_vocoder.to(self.device).eval()
+
+        # Add offset for embedding layer
+        all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids
+        # TODO: remove after testing
+        assert tuple(all_layer_ids) == (1, 3, 7, 12, 18, 23)
+        offsets = torch.arange(
+            0,
+            len(all_layer_ids) * self.hparams.vocab_size,
+            self.hparams.vocab_size,
+            device=self.device,
+        )
+        offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers]
+        offsets = offsets[offset_idxes]
+        toks = toks + offsets + 1
+
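+        # The shift above maps each selected layer's token IDs into one flat
+        # vocabulary (layer_position * vocab_size); the extra +1 keeps index 0
+        # free, which the zero-filled entries below use to mark codebooks that
+        # were not extracted.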
+        # Handle missing codebooks
+        if len(self.hparams.SSL_layers) < len(all_layer_ids):
+            full_toks = torch.zeros(
+                *toks.shape[:2],
+                len(all_layer_ids),
+                dtype=toks.dtype,
+                device=self.device,
+            )
+            for i, idx in enumerate(offset_idxes):
+                full_toks[..., idx] = toks[..., i]
+            toks = full_toks
+
+        self.hparams.codec_vocoder.tokenize = False
+        sig = self.hparams.codec_vocoder(toks)[:, 0]  # [B, T]
+        return sig
+
+class Tokenizer:
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+
+    @torch.no_grad()
+    def encode(self, sig, lens, **kwargs):
+        toks = self.tokenizer.sig_to_toks(sig, lens, **kwargs)
+        return toks
+
+    @torch.no_grad()
+    def decode(self, sig, **kwargs):
+        sig = self.tokenizer.toks_to_sig(sig, **kwargs)
+        return sig
+
+
+# model_hub = "facebook/encodec_24khz"
+# save_path = "savedir"
+# model = Tokenizer_Encodec(model_hub, save_path)
+# from speechbrain.lobes.models.huggingface_transformers.hubert import (HuBERT)
+# inputs = torch.rand([3, 2000])
+# model_hub = "facebook/hubert-large-ll60k"
+# save_path = "savedir"
+# ssl_layer_num = [7, 23]
+# deduplicate = [False, True]
+# bpe_tokenizers = [None, None]
+# kmeans_repo_id = "speechbrain/SSL_Quantization"
+# kmeans_dataset = "LJSpeech"
+# num_clusters = 1000
+# ssl_model = HuBERT(model_hub, save_path, output_all_hiddens=True)
+# model = DiscreteSSL(save_path, ssl_model, kmeans_repo_id=kmeans_repo_id, kmeans_dataset=kmeans_dataset, num_clusters=num_clusters)
+model_hub = "fnlp/SpeechTokenizer"
+save_path = "savedir"
+model = Tokenizer_SpeechTokenizer(model_hub, save_path)  # doctest: +SKIP
+tokenizer = Tokenizer(model)
+audio = torch.randn(4, 1000)
+length = torch.tensor([1.0, 0.5, 0.75, 1.0])
+tokens = tokenizer.encode(audio, length, num_codebooks=2)
+print(tokens.shape)
+rec = tokenizer.decode(tokens)
+print(rec.shape)
\ No newline at end of file

From 0c2b751c595c9a63a2bd66b14f32e2faa13478d8 Mon Sep 17 00:00:00 2001
From: poonehmousavi
Date: Wed, 6 Nov 2024 17:53:55 -0500
Subject: [PATCH 2/9] add refactored version of ASR

---
 .../ASR-refactor/hparams/LSTM/dac.yaml        | 232 +++++++++
 .../ASR-refactor/hparams/LSTM/encodec.yaml    | 232 +++++++++
 .../hparams/LSTM/speech_tokenizer.yaml        | 222 +++++++++
 .../ASR-refactor/hparams/contextnet/dac.yaml  | 225 +++++++++
 .../hparams/contextnet/encodec.yaml           | 223 +++++++++
 .../hparams/contextnet/speech_tokenizer.yaml  | 213 +++++++++
 .../ASR-refactor/librispeech_prepare.py       |   1 +
 .../DASB/LibriSpeech/ASR-refactor/train.py    | 447 ++++++++++++++++++
 benchmarks/DASB/model/ __init__.py            |   1 +
 benchmarks/DASB/model/custom_model.py         |  17 +-
 benchmarks/DASB/model/tokenizer_interface.py  | 231 ++++-----
 11 files changed, 1933 insertions(+), 111 deletions(-)
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml
 create mode 120000 benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/train.py
 create mode 100644 benchmarks/DASB/model/ __init__.py

diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml
b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml new file mode 100644 index 000000000..4accc2241 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
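+# pretrain_embeddings: when True, train.py calls tokenizer.get_pretrained_embeddings()
+# and copies the codec's own codebook vectors into the embedding layer via
+# init_embedding (see model/custom_model.py), so encoder_dim must match the codec's embedding size.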
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml new file mode 100644 index 000000000..03c29ddbb --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml new file mode 100644 index 000000000..8105204a5 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -0,0 +1,222 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: LSTM Encoder +# Decoder: CTC 
beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml new file mode 100644 index 000000000..eabeef113 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml @@ -0,0 +1,225 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml new file mode 100644 index 000000000..c0411bd76 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml @@ -0,0 +1,223 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of 
yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/Contexnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml new file mode 100644 index 000000000..77ef2c540 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml @@ -0,0 +1,213 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: 
character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py new file mode 100644 index 000000000..61b6c56f4 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -0,0 +1,447 @@ +#!/usr/bin/env/python3 +"""Recipe for training an discrete tokens ctc ASR system with librispeech. 
+ +Decoding is performed with greedy decoding at validation time. +At test time, beamsearch is used with an optional external language model. + +Authors + * Pooneh Mousavi 2024 +""" + +import os +import sys +import torch +import torchaudio +import logging +import speechbrain as sb +from speechbrain.utils.distributed import run_on_main, if_main_process +from speechbrain.tokenizers.SentencePiece import SentencePiece +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + + +logger = logging.getLogger(__name__) + +_CACHE = {"size": 0} + +# Define training procedure +class ASR(sb.Brain): + def compute_forward(self, batch, stage): + """Forward computations from the waveform batches to the output probabilities.""" + batch = batch.to(self.device) + wavs, wav_lens = batch.sig + + + # Add waveform augmentation if specified. + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] + + current_epoch = self.hparams.epoch_counter.current + + # compute features + # Extract tokens (cache them at first epoch if augmentation is disabled) + key = tuple(sorted(batch.id)) + try: + in_toks = _CACHE[key] + in_toks = in_toks.to(self.device) + except KeyError: + with torch.no_grad(): + self.hparams.tokenizer.eval().to(self.device) + in_toks = self.hparams.tokenizer.sig_to_tokens(wavs, wav_lens,num_codebooks=hparams['num_codebooks']) #[B, T, N-Q] + if stage != sb.Stage.TRAIN or ( + stage == sb.Stage.TRAIN and (not hasattr(self.hparams, "wav_augment")) + ): + if _CACHE["size"] < self.hparams.cache_size: + _CACHE[key] = in_toks.cpu() + _CACHE["size"] += in_toks.numel() + + # Extract embeddings + in_embs = self.modules.discrete_embedding_layer(in_toks) #[B, T, N-Q, D] + + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) #[B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze(-2) #[B, T, D] + + # forward modules + if type(self.modules.encoder).__name__ == "ContextNet": + enc_out = self.modules.encoder(in_embs) + + elif type(self.modules.encoder).__name__ == "LSTM": + enc_out, _ = self.modules.encoder( + in_embs + ) + + else: + raise NotImplementedError + + # output layer for ctc log-probabilities + logits = self.modules.ctc_lin(enc_out) + p_ctc = self.hparams.log_softmax(logits) + + p_tokens = None + if stage == sb.Stage.VALID: + p_tokens = sb.decoders.ctc_greedy_decode( + p_ctc, wav_lens, blank_id=self.hparams.blank_index + ) + elif stage == sb.Stage.TEST: + p_tokens = test_searcher(p_ctc, wav_lens) + + return p_ctc, wav_lens, p_tokens + + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (CTC+NLL) given predictions and targets.""" + + p_ctc, wav_lens, predicted_tokens = predictions + ids = batch.id + tokens, tokens_lens = batch.tokens + + + # Label Augmentation + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) + elif stage == sb.Stage.TEST: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + target_words = [wrd.split(" 
") for wrd in batch.wrd] + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.wer_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of a epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + current_epoch = self.hparams.epoch_counter.current + valid_search_interval = self.hparams.valid_search_interval + if ( + current_epoch % valid_search_interval == 0 + or stage == sb.Stage.TEST + ): + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # log stats and save checkpoint at end-of-epoch + if stage == sb.Stage.VALID: + if type(self.hparams.scheduler).__name__ == "NewBobScheduler": + lr, new_lr = self.hparams.scheduler( + stage_stats["loss"] + ) + sb.nnet.schedulers.update_learning_rate( + self.optimizer, new_lr + ) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr = self.hparams.scheduler.current_lr + steps = self.optimizer_step + + else: + raise NotImplementedError + + optimizer = self.optimizer.__class__.__name__ + epoch_stats = { + "epoch": epoch, + "lr": lr, + "optimizer": optimizer, + } + self.hparams.train_logger.log_stats( + stats_meta=epoch_stats, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"], "epoch": epoch}, + min_keys=["WER"], + num_to_keep=self.hparams.avg_checkpoints, + ) + + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open(self.hparams.output_wer_folder, "w", encoding="utf-8") as w: + self.wer_metric.write_stats(w) + + def on_fit_batch_end(self, batch, outputs, loss, should_step): + if should_step and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + self.hparams.scheduler(self.optimizer) + + + +def dataio_prepare(hparams, tokenizer): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions.""" + data_folder = hparams["data_folder"] + + train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, + ) + + if hparams["sorting"] == "ascending": + # we sort training data to speed up training and get better results. + train_data = train_data.filtered_sorted(sort_key="duration") + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + train_data = train_data.filtered_sorted( + sort_key="duration", reverse=True + ) + # when sorting do not shuffle in dataloader ! 
otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, + ) + valid_data = valid_data.filtered_sorted(sort_key="duration") + + # test is separate + test_datasets = {} + for csv_file in hparams["test_csv"]: + name = Path(csv_file).stem + test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_file, replacements={"data_root": data_folder} + ) + test_datasets[name] = test_datasets[name].filtered_sorted( + sort_key="duration" + ) + + datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] + + # 2. Define audio pipeline: + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + info = torchaudio.info(wav) + resampled = torchaudio.transforms.Resample( + info.sample_rate, hparams["sample_rate"], + )(sig) + #resampled = resampled.unsqueeze(0) + return resampled + + sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) + + # 3. Define text pipeline: + @sb.utils.data_pipeline.takes("wrd") + @sb.utils.data_pipeline.provides( + "wrd", "char_list", "tokens_list", "tokens" + ) + def text_pipeline(wrd): + yield wrd + char_list = list(wrd) + yield char_list + tokens_list = tokenizer.sp.encode_as_ids(wrd) + yield tokens_list + tokens = torch.LongTensor(tokens_list) + yield tokens + + sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) + + + # 4. Set output: + sb.dataio.dataset.set_output_keys( + datasets, ["id", "sig", "wrd", "char_list", "tokens"], + ) + + # 5. If Dynamic Batching is used, we instantiate the needed samplers. 
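+    # DynamicBatchSampler groups utterances of similar duration into buckets and
+    # fills each batch up to max_batch_length; since length_func returns the
+    # duration, that limit is expressed in seconds of audio, which keeps the
+    # amount of padding per batch small.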
+ train_batch_sampler = None + valid_batch_sampler = None + if hparams["dynamic_batching"]: + from speechbrain.dataio.sampler import DynamicBatchSampler # noqa + + dynamic_hparams_train = hparams["dynamic_batch_sampler_train"] + dynamic_hparams_val = hparams["dynamic_batch_sampler_val"] + + train_batch_sampler = DynamicBatchSampler( + train_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_train, + ) + + valid_batch_sampler = DynamicBatchSampler( + valid_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_val, + ) + + return ( + train_data, + valid_data, + test_datasets, + train_batch_sampler, + valid_batch_sampler, + ) + + +if __name__ == "__main__": + + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # If distributed_launch=True then + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["output_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + # Defining tokenizer and loading it + tokenizer = SentencePiece( + model_dir=hparams["save_folder"], + vocab_size=hparams["output_neurons"], + annotation_train=hparams["train_csv"], + annotation_read="wrd", + model_type=hparams["token_type"], + character_coverage=hparams["character_coverage"], + bos_id=hparams["bos_index"], + eos_id=hparams["eos_index"], + ) + + # here we create the datasets objects as well as tokenization and encoding + ( + train_data, + valid_data, + test_datasets, + train_bsampler, + valid_bsampler, + ) = dataio_prepare(hparams, tokenizer) + + # Use pretrained embeddings + if hparams["pretrain_embeddings"]: + embs= hparams["tokenizer"].get_pretrained_embeddings(device=run_opts["device"],num_codebooks=hparams['num_codebooks'], vocab_size=hparams["vocab_size"]) + hparams["discrete_embedding_layer"].init_embedding(embs) + + + # Log number of parameters/buffers + codec_params = sum([x.numel() for x in hparams["tokenizer"].state_dict().values()]) + model_params = sum( + [ + x.numel() + for module in hparams["modules"].values() + for x in module.state_dict().values() + ] + ) + hparams["train_logger"].log_stats( + stats_meta={ + f"Codec parameters/buffers (M)": f"{codec_params / 1e6:.2f}", + "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}", + }, + ) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["model_opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # Adding objects to trainer. 
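+    # The SentencePiece tokenizer is attached to the Brain so compute_objectives
+    # can decode hypotheses back into words, and vocab_list provides the symbols
+    # scored by the CTC beam searcher at test time.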
+ asr_brain.tokenizer = tokenizer + vocab_list = [ + tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) + ] + + from speechbrain.decoders.ctc import CTCBeamSearcher + + test_searcher = CTCBeamSearcher( + **hparams["test_beam_search"], + vocab_list=vocab_list, + ) + + train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + + if train_bsampler is not None: + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if valid_bsampler is not None: + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py new file mode 100644 index 000000000..e7db8766a --- /dev/null +++ b/benchmarks/DASB/model/ __init__.py @@ -0,0 +1 @@ +from model.tokenizer_interface import EncodecTokenizer \ No newline at end of file diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index b6e11a0d2..d3bf3cc9f 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -57,9 +57,9 @@ def __init__( num_codebooks, vocab_size, emb_dim, - pad_index=0, init=False, freeze=False, + hidden_dim =None, ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size @@ -70,10 +70,17 @@ def __init__( ).requires_grad_(not self.freeze) self.init = init - def init_embedding(self, weights): - with torch.no_grad(): - self.embedding.weight = torch.nn.Parameter(weights) + # Add a linear layer to match dimensions if necessary + if hidden_dim is not None and hidden_dim != emb_dim: + self.proj_layer = torch.nn.Linear(emb_dim, hidden_dim) + else: + self.proj_layer = None + + def init_embedding(self, weights): + self.embedding.weight.data.copy_(weights) + + def forward(self, in_tokens): """Computes the embedding for discrete tokens. a sample. 
@@ -97,4 +104,6 @@ def forward(self, in_tokens): ) # Forward Pass to embedding and in_embs = self.embedding(in_tokens) + if self.proj_layer is not None: + in_embs = self.proj_layer(in_embs) return in_embs diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 892bef6b3..351652a57 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -10,92 +10,152 @@ """ import torch +from abc import ABC, abstractmethod +from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.discrete.dac import DAC +from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface -from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec -from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL -from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface +class BaseTokenizer(ABC): + @abstractmethod + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, **kwargs): + """Abstract method to encode a signal into tokens.""" + pass + + @abstractmethod + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + """Abstract method to decode tokens into a signal.""" + pass + + @abstractmethod + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + pass -class Tokenizer_Encodec(Encodec): +class EncodecTokenizer(Encodec, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks, _ = self.encode(sig, lens) # [B, N, K] - return toks + tokens, _ = self.encode(signal, lengths) # [B, T, N_Q] + return tokens @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - sig = self.decode(toks)[:, 0] # [B, T] - return sig - -class Tokenizer_DAC(DAC): + signal = self.decode(tokens)[:, 0] # [B, T] + return signal + @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + embeddings = self.vocabulary + return embeddings.reshape(-1, embeddings.shape[-1]) + +class DACTokenizer(DAC, BaseTokenizer): + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks, _ = self( - sig[:, None], n_quantizers=kwargs['num_codebooks'] - ) # [B, K, N] - toks = toks.movedim(-1, -2) # [B, N, K] - return toks + tokens, _ = self( + signal[:, None], n_quantizers=kwargs['num_codebooks'] + ) # [B, N_Q, T] + return tokens.movedim(-1, -2) # [B, T, N_Q] @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - qfeats, _, _ = self.quantizer.from_codes( - toks.movedim(-1, -2) # [B, K, N] + quantized_feats, _, _ = self.quantizer.from_codes( + tokens.movedim(-1, -2) # [B, N_Q, T] ) - sig = self.decode(qfeats)[:, 0] # [B, T] - return sig - -class Tokenizer_SpeechTokenizer(SpeechTokenizer_interface): + signal = self.decode(quantized_feats)[:, 0] # [B, T] + return signal + + @torch.no_grad() + def 
get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + # See https://github.com/descriptinc/descript-audio-codec/blob/c7cfc5d2647e26471dc394f95846a0830e7bec34/dac/nn/quantize.py#L200 + toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) + toks = ( + toks[:, None, None].expand(-1, kwargs["num_codebooks"], -1).clone() + ) # [C, K, 1] + self.to(kwargs["device"]).eval() + with torch.no_grad(): + z_q, z_p, _ = self.quantizer.from_codes(toks) + z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) # [C, D, 1] * K + z_qs = [] + for i, z_p_i in enumerate(z_ps): + with torch.no_grad(): + z_q_i = ( + self.quantizer.quantizers[i].out_proj(z_p_i) + ) # [C, H, 1] + z_qs.append(z_q_i) + assert (z_q == sum(z_qs)).all() + embeddings = torch.cat(z_qs)[:, :, 0] # [CK, H] + return embeddings + +class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks = self(sig)[ - : kwargs['num_codebooks'] - ] # [K, B, N] - toks = toks.movedim(-3, -1) # [B, N, K] - return toks + tokens = self(signal)[: kwargs['num_codebooks']] # [N_Q, B, T] + return tokens.movedim(-3, -1) # [B, T, N_Q] @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - toks = toks.movedim(-1, -3) # [K, B, N] - sig = self.decode(toks) # [B, T] - return sig - -class Tokenizer_DiscreteSSL(DiscreteSSL): + tokens = tokens.movedim(-1, -3) # [N_Q, B, T] + return self.decode(tokens) # [B, T] + + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + # See https://github.com/ZhangXInFD/SpeechTokenizer/blob/a9f88dc72642b600654a62861e34342babae6c71/speechtokenizer/quantization/core_vq.py#L360 + toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) + toks = ( + toks[None, :, None].expand(kwargs["num_codebooks"], -1, -1).clone() + ) # [K, C, 1] + self.to(kwargs["device"]).eval() + embs = [] + for i, indices in enumerate(toks): + layer = self.model.quantizer.vq.layers[i] + with torch.no_grad(): + quantized = layer.decode(indices) # [C, H, 1] + embs.append(quantized) + assert ( + self.model.quantizer.decode(toks) == sum(embs) + ).all() + embeddings = torch.cat(embs)[:, :, 0] # [CK, H] + return embeddings + +class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths): + # signal: [B, T] self.hparams.codec_quantizer.to(self.device).eval() - toks, _, _ = self.hparams.codec_quantizer( - sig, - lens, + tokens, _, _ = self.hparams.codec_quantizer( + signal, + lengths, SSL_layers=self.hparams.SSL_layers, deduplicates=[False] * len(self.hparams.SSL_layers), bpe_tokenizers=[None] * len(self.hparams.SSL_layers), - ) # [B, N, K] - return toks + ) # [B, T, N_Q] + return tokens @torch.no_grad() - def toks_to_sig(self, toks): - # toks: [B, N, K] - self.hparams.codec_vocoder.device = self.device + def tokens_to_sig(self, tokens): + # tokens: [B, T, N_Q] self.hparams.codec_vocoder.to(self.device).eval() - # Add offset for embedding layer all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids - # TODO: remove after testing - assert tuple(all_layer_ids) == (1, 3, 7, 12, 18, 23) offsets = torch.arange( 0, len(all_layer_ids) * self.hparams.vocab_size, @@ -104,61 +164,18 @@ def 
toks_to_sig(self, toks): ) offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] offsets = offsets[offset_idxes] - toks = toks + offsets + 1 + tokens += offsets + 1 - # Handle missing codebooks if len(self.hparams.SSL_layers) < len(all_layer_ids): - full_toks = torch.zeros( - *toks.shape[:2], + full_tokens = torch.zeros( + *tokens.shape[:2], len(all_layer_ids), - dtype=toks.dtype, + dtype=tokens.dtype, device=self.device, ) for i, idx in enumerate(offset_idxes): - full_toks[..., idx] = toks[..., i] - toks = full_toks + full_tokens[..., idx] = tokens[..., i] + tokens = full_tokens self.hparams.codec_vocoder.tokenize = False - sig = self.hparams.codec_vocoder(toks)[:, 0] # [B, T] - return sig - -class Tokenizer: - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - @torch.no_grad() - def encode(self,sig, lens,**kwargs): - toks = self.tokenizer.sig_to_toks(sig, lens,**kwargs) - return toks - - @torch.no_grad() - def decode(self,sig,**kwargs): - sig = self.tokenizer.toks_to_sig(sig,**kwargs) - return sig - - -# model_hub = "facebook/encodec_24khz" -# save_path = "savedir" -# model = Tokenizer_Encodec(model_hub, save_path) -# from speechbrain.lobes.models.huggingface_transformers.hubert import (HuBERT) -# inputs = torch.rand([3, 2000]) -# model_hub = "facebook/hubert-large-ll60k" -# save_path = "savedir" -# ssl_layer_num = [7,23] -# deduplicate =[False, True] -# bpe_tokenizers=[None, None] -# kmeans_repo_id = "speechbrain/SSL_Quantization" -# kmeans_dataset = "LJSpeech" -# num_clusters = 1000 -# ssl_model = HuBERT(model_hub, save_path,output_all_hiddens=True) -# model = DiscreteSSL(save_path, ssl_model, kmeans_repo_id=kmeans_repo_id, kmeans_dataset=kmeans_dataset,num_clusters=num_clusters) -model_hub = "fnlp/SpeechTokenizer" -save_path = "savedir" -model =Tokenizer_SpeechTokenizer(model_hub, save_path) # doctest: +SKIP -tokenizer= Tokenizer(model) -audio = torch.randn(4, 1000) -length = torch.tensor([1.0, .5, .75, 1.0]) -tokens = tokenizer.encode(audio, length,num_codebooks=2) -print(tokens.shape) -rec = tokenizer.decode(tokens) -print(rec.shape) \ No newline at end of file + return self.hparams.codec_vocoder(tokens)[:, 0] # [B, T] From 17898c3472ec45ae2173b5894f7c7e550918d9d4 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:40:14 -0500 Subject: [PATCH 3/9] fix precommit --- .../ASR-refactor/hparams/LSTM/dac.yaml | 4 +- .../ASR-refactor/hparams/LSTM/encodec.yaml | 4 +- .../hparams/LSTM/speech_tokenizer.yaml | 2 +- .../ASR-refactor/hparams/contextnet/dac.yaml | 4 +- .../hparams/contextnet/encodec.yaml | 5 +- .../hparams/contextnet/speech_tokenizer.yaml | 4 +- .../DASB/LibriSpeech/ASR-refactor/train.py | 78 +++++++++-------- benchmarks/DASB/model/ __init__.py | 2 +- benchmarks/DASB/model/custom_model.py | 6 +- benchmarks/DASB/model/tokenizer_interface.py | 84 +++++++------------ 10 files changed, 84 insertions(+), 109 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index 4accc2241..806305774 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -182,7 +182,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -193,7 +193,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True 
-scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml index 03c29ddbb..18d967244 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -182,7 +182,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -193,7 +193,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml index 8105204a5..55d7c3c91 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -172,7 +172,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml index eabeef113..aa7d2e141 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml @@ -175,7 +175,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -186,7 +186,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml index c0411bd76..a1b5262d3 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml @@ -107,7 +107,6 @@ encoder_dim: 1024 pretrain_embeddings: False freeze_embedding: False - output_neurons: 31 # BPE parameters @@ -173,7 +172,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -184,7 +183,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml 
b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml index 77ef2c540..c12d6f79f 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml @@ -163,7 +163,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -174,7 +174,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index 61b6c56f4..baa80c80e 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -18,6 +18,7 @@ from speechbrain.tokenizers.SentencePiece import SentencePiece from hyperpyyaml import load_hyperpyyaml from pathlib import Path + base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) sys.path.append(base_dir) @@ -32,11 +33,10 @@ def compute_forward(self, batch, stage): """Forward computations from the waveform batches to the output probabilities.""" batch = batch.to(self.device) wavs, wav_lens = batch.sig - # Add waveform augmentation if specified. if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] current_epoch = self.hparams.epoch_counter.current @@ -49,33 +49,38 @@ def compute_forward(self, batch, stage): except KeyError: with torch.no_grad(): self.hparams.tokenizer.eval().to(self.device) - in_toks = self.hparams.tokenizer.sig_to_tokens(wavs, wav_lens,num_codebooks=hparams['num_codebooks']) #[B, T, N-Q] + in_toks = self.hparams.tokenizer.sig_to_tokens( + wavs, wav_lens, num_codebooks=hparams["num_codebooks"] + ) # [B, T, N-Q] if stage != sb.Stage.TRAIN or ( - stage == sb.Stage.TRAIN and (not hasattr(self.hparams, "wav_augment")) + stage == sb.Stage.TRAIN + and (not hasattr(self.hparams, "wav_augment")) ): if _CACHE["size"] < self.hparams.cache_size: _CACHE[key] = in_toks.cpu() _CACHE["size"] += in_toks.numel() # Extract embeddings - in_embs = self.modules.discrete_embedding_layer(in_toks) #[B, T, N-Q, D] + in_embs = self.modules.discrete_embedding_layer( + in_toks + ) # [B, T, N-Q, D] - # Attention-Pooling - att_w = self.modules.attention_mlp(in_embs) #[B, T, N-Q, 1] - in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze(-2) #[B, T, D] + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) # [B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze( + -2 + ) # [B, T, D] # forward modules if type(self.modules.encoder).__name__ == "ContextNet": enc_out = self.modules.encoder(in_embs) elif type(self.modules.encoder).__name__ == "LSTM": - enc_out, _ = self.modules.encoder( - in_embs - ) + enc_out, _ = self.modules.encoder(in_embs) else: raise NotImplementedError - + # output layer for ctc log-probabilities logits = self.modules.ctc_lin(enc_out) p_ctc = self.hparams.log_softmax(logits) @@ -89,7 +94,6 @@ def compute_forward(self, batch, stage): p_tokens = test_searcher(p_ctc, wav_lens) return p_ctc, wav_lens, 
p_tokens - def compute_objectives(self, predictions, batch, stage): """Computes the loss (CTC+NLL) given predictions and targets.""" @@ -98,14 +102,13 @@ def compute_objectives(self, predictions, batch, stage): ids = batch.id tokens, tokens_lens = batch.tokens - # Label Augmentation if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): tokens = self.hparams.wav_augment.replicate_labels(tokens) tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - + if stage == sb.Stage.VALID: # Decode token terms to words predicted_words = self.tokenizer( @@ -149,19 +152,15 @@ def on_stage_end(self, stage, stage_loss, epoch): # log stats and save checkpoint at end-of-epoch if stage == sb.Stage.VALID: if type(self.hparams.scheduler).__name__ == "NewBobScheduler": - lr, new_lr = self.hparams.scheduler( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.optimizer, new_lr - ) - elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr, new_lr = self.hparams.scheduler(stage_stats["loss"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": lr = self.hparams.scheduler.current_lr steps = self.optimizer_step - + else: raise NotImplementedError - + optimizer = self.optimizer.__class__.__name__ epoch_stats = { "epoch": epoch, @@ -185,15 +184,19 @@ def on_stage_end(self, stage, stage_loss, epoch): test_stats=stage_stats, ) if if_main_process(): - with open(self.hparams.output_wer_folder, "w", encoding="utf-8") as w: + with open( + self.hparams.output_wer_folder, "w", encoding="utf-8" + ) as w: self.wer_metric.write_stats(w) def on_fit_batch_end(self, batch, outputs, loss, should_step): - if should_step and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + if ( + should_step + and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler" + ): self.hparams.scheduler(self.optimizer) - def dataio_prepare(hparams, tokenizer): """This function prepares the datasets to be used in the brain class. It also defines the data processing pipeline through user-defined functions.""" @@ -251,7 +254,7 @@ def audio_pipeline(wav): resampled = torchaudio.transforms.Resample( info.sample_rate, hparams["sample_rate"], )(sig) - #resampled = resampled.unsqueeze(0) + # resampled = resampled.unsqueeze(0) return resampled sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) @@ -272,7 +275,6 @@ def text_pipeline(wrd): sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - # 4. 
Set output: sb.dataio.dataset.set_output_keys( datasets, ["id", "sig", "wrd", "char_list", "tokens"], @@ -319,7 +321,6 @@ def text_pipeline(wrd): # create ddp_group with the right communication protocol sb.utils.distributed.ddp_init_group(run_opts) - # Create experiment directory sb.create_experiment_directory( experiment_directory=hparams["output_folder"], @@ -327,7 +328,6 @@ def text_pipeline(wrd): overrides=overrides, ) - # Dataset prep (parsing Librispeech) from librispeech_prepare import prepare_librispeech # noqa @@ -369,12 +369,17 @@ def text_pipeline(wrd): # Use pretrained embeddings if hparams["pretrain_embeddings"]: - embs= hparams["tokenizer"].get_pretrained_embeddings(device=run_opts["device"],num_codebooks=hparams['num_codebooks'], vocab_size=hparams["vocab_size"]) + embs = hparams["tokenizer"].get_pretrained_embeddings( + device=run_opts["device"], + num_codebooks=hparams["num_codebooks"], + vocab_size=hparams["vocab_size"], + ) hparams["discrete_embedding_layer"].init_embedding(embs) - # Log number of parameters/buffers - codec_params = sum([x.numel() for x in hparams["tokenizer"].state_dict().values()]) + codec_params = sum( + [x.numel() for x in hparams["tokenizer"].state_dict().values()] + ) model_params = sum( [ x.numel() @@ -407,8 +412,7 @@ def text_pipeline(wrd): from speechbrain.decoders.ctc import CTCBeamSearcher test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], - vocab_list=vocab_list, + **hparams["test_beam_search"], vocab_list=vocab_list, ) train_dataloader_opts = hparams["train_dataloader_opts"] diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py index e7db8766a..b59bcdfa5 100644 --- a/benchmarks/DASB/model/ __init__.py +++ b/benchmarks/DASB/model/ __init__.py @@ -1 +1 @@ -from model.tokenizer_interface import EncodecTokenizer \ No newline at end of file +from model.tokenizer_interface import EncodecTokenizer diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index d3bf3cc9f..1c655fc65 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -59,7 +59,7 @@ def __init__( emb_dim, init=False, freeze=False, - hidden_dim =None, + hidden_dim=None, ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size @@ -76,11 +76,9 @@ def __init__( else: self.proj_layer = None - def init_embedding(self, weights): self.embedding.weight.data.copy_(weights) - - + def forward(self, in_tokens): """Computes the embedding for discrete tokens. a sample. diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 351652a57..604e3a403 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -1,4 +1,3 @@ - """ Unified interface for tokenizers, standardizing the output shape of encode and decode functions. 
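The contract this interface standardizes can be stated with a toy stand-in (not one of the real codecs; the hop size of 320 and all sizes below are made-up illustration values): sig_to_tokens maps a [B, T] signal to [B, T', N_Q] integer tokens, tokens_to_sig maps tokens back to a [B, T] signal, and get_pretrained_embeddings returns one row per (codebook, code) pair.

import torch

class ToyTokenizer:
    vocab_size = 16
    num_codebooks = 2

    @torch.no_grad()
    def sig_to_tokens(self, signal, lengths, **kwargs):
        batch, time = signal.shape
        return torch.randint(
            0, self.vocab_size, (batch, time // 320, self.num_codebooks)
        )  # [B, T', N_Q]

    @torch.no_grad()
    def tokens_to_sig(self, tokens, **kwargs):
        return torch.randn(tokens.shape[0], tokens.shape[1] * 320)  # [B, T]

    @torch.no_grad()
    def get_pretrained_embeddings(self, **kwargs):
        return torch.randn(self.num_codebooks * self.vocab_size, 8)  # [N_Q * V, H]

signal = torch.randn(4, 16000)
lengths = torch.tensor([1.0, 0.5, 0.75, 1.0])
tokens = ToyTokenizer().sig_to_tokens(signal, lengths, num_codebooks=2)
print(tokens.shape)  # torch.Size([4, 50, 2])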
@@ -12,9 +11,13 @@ import torch from abc import ABC, abstractmethod from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec -from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( + DiscreteSSL, +) from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface +from speechbrain.lobes.models.discrete.speechtokenizer_interface import ( + SpeechTokenizer_interface, +) class BaseTokenizer(ABC): @@ -29,13 +32,14 @@ def sig_to_tokens(self, signal, lengths, **kwargs): def tokens_to_sig(self, tokens, **kwargs): """Abstract method to decode tokens into a signal.""" pass - + @abstractmethod @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" pass + class EncodecTokenizer(Encodec, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): @@ -50,20 +54,21 @@ def tokens_to_sig(self, tokens, **kwargs): self.eval() signal = self.decode(tokens)[:, 0] # [B, T] return signal - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" embeddings = self.vocabulary return embeddings.reshape(-1, embeddings.shape[-1]) + class DACTokenizer(DAC, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): # signal: [B, T] self.eval() tokens, _ = self( - signal[:, None], n_quantizers=kwargs['num_codebooks'] + signal[:, None], n_quantizers=kwargs["num_codebooks"] ) # [B, N_Q, T] return tokens.movedim(-1, -2) # [B, T, N_Q] @@ -76,7 +81,7 @@ def tokens_to_sig(self, tokens, **kwargs): ) signal = self.decode(quantized_feats)[:, 0] # [B, T] return signal - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" @@ -88,24 +93,25 @@ def get_pretrained_embeddings(self, **kwargs): self.to(kwargs["device"]).eval() with torch.no_grad(): z_q, z_p, _ = self.quantizer.from_codes(toks) - z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) # [C, D, 1] * K + z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) z_qs = [] for i, z_p_i in enumerate(z_ps): with torch.no_grad(): - z_q_i = ( - self.quantizer.quantizers[i].out_proj(z_p_i) + z_q_i = self.quantizer.quantizers[i].out_proj( + z_p_i ) # [C, H, 1] z_qs.append(z_q_i) assert (z_q == sum(z_qs)).all() - embeddings = torch.cat(z_qs)[:, :, 0] # [CK, H] + embeddings = torch.cat(z_qs)[:, :, 0] return embeddings + class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): # signal: [B, T] self.eval() - tokens = self(signal)[: kwargs['num_codebooks']] # [N_Q, B, T] + tokens = self(signal)[: kwargs["num_codebooks"]] # [N_Q, B, T] return tokens.movedim(-3, -1) # [B, T, N_Q] @torch.no_grad() @@ -114,7 +120,7 @@ def tokens_to_sig(self, tokens, **kwargs): self.eval() tokens = tokens.movedim(-1, -3) # [N_Q, B, T] return self.decode(tokens) # [B, T] - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" @@ -128,54 +134,22 @@ def get_pretrained_embeddings(self, **kwargs): for i, indices in enumerate(toks): layer = self.model.quantizer.vq.layers[i] with torch.no_grad(): - quantized = layer.decode(indices) # [C, H, 1] + quantized = layer.decode(indices) embs.append(quantized) - assert ( - self.model.quantizer.decode(toks) == sum(embs) - 
).all() - embeddings = torch.cat(embs)[:, :, 0] # [CK, H] + assert (self.model.quantizer.decode(toks) == sum(embs)).all() + embeddings = torch.cat(embs)[:, :, 0] return embeddings + class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths): - # signal: [B, T] - self.hparams.codec_quantizer.to(self.device).eval() - tokens, _, _ = self.hparams.codec_quantizer( - signal, - lengths, - SSL_layers=self.hparams.SSL_layers, - deduplicates=[False] * len(self.hparams.SSL_layers), - bpe_tokenizers=[None] * len(self.hparams.SSL_layers), - ) # [B, T, N_Q] - return tokens + pass @torch.no_grad() def tokens_to_sig(self, tokens): - # tokens: [B, T, N_Q] - self.hparams.codec_vocoder.to(self.device).eval() - - all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids - offsets = torch.arange( - 0, - len(all_layer_ids) * self.hparams.vocab_size, - self.hparams.vocab_size, - device=self.device, - ) - offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] - offsets = offsets[offset_idxes] - tokens += offsets + 1 - - if len(self.hparams.SSL_layers) < len(all_layer_ids): - full_tokens = torch.zeros( - *tokens.shape[:2], - len(all_layer_ids), - dtype=tokens.dtype, - device=self.device, - ) - for i, idx in enumerate(offset_idxes): - full_tokens[..., idx] = tokens[..., i] - tokens = full_tokens - - self.hparams.codec_vocoder.tokenize = False - return self.hparams.codec_vocoder(tokens)[:, 0] # [B, T] + pass + + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + pass From db1590ee346dab0896723cf8184ba8b1e12355b8 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:54:09 -0500 Subject: [PATCH 4/9] fix flake --- benchmarks/DASB/LibriSpeech/ASR-refactor/train.py | 5 +---- benchmarks/DASB/model/ __init__.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index baa80c80e..99eeb81fe 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -27,6 +27,7 @@ _CACHE = {"size": 0} + # Define training procedure class ASR(sb.Brain): def compute_forward(self, batch, stage): @@ -38,8 +39,6 @@ def compute_forward(self, batch, stage): if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] - current_epoch = self.hparams.epoch_counter.current - # compute features # Extract tokens (cache them at first epoch if augmentation is disabled) key = tuple(sorted(batch.id)) @@ -156,8 +155,6 @@ def on_stage_end(self, stage, stage_loss, epoch): sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": lr = self.hparams.scheduler.current_lr - steps = self.optimizer_step - else: raise NotImplementedError diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py index b59bcdfa5..e69de29bb 100644 --- a/benchmarks/DASB/model/ __init__.py +++ b/benchmarks/DASB/model/ __init__.py @@ -1 +0,0 @@ -from model.tokenizer_interface import EncodecTokenizer From 3361ac6e9c21e94d2957d76347c7c19bfeab88ad Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:56:08 -0500 Subject: [PATCH 5/9] fix blank index --- .../LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml index 55d7c3c91..99d423b87 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -183,7 +183,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 From 1dd6c9e7d6cbce93335cfcb8e2ee152e09782331 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 26 Nov 2024 18:35:02 -0500 Subject: [PATCH 6/9] add hyp tuning [draft] --- .../ASR-refactor/hparams/LSTM/dac.yaml | 4 +- benchmarks/DASB/extra_requirements.txt | 2 + benchmarks/DASB/orion/hparams_bohb.yaml | 6 + benchmarks/DASB/run_experiments.sh | 220 ++++++++++ benchmarks/DASB/run_hparam_optimization.sh | 405 ++++++++++++++++++ benchmarks/DASB/utils/aggregate_results.py | 145 +++++++ 6 files changed, 780 insertions(+), 2 deletions(-) create mode 100755 benchmarks/DASB/orion/hparams_bohb.yaml create mode 100755 benchmarks/DASB/run_experiments.sh create mode 100644 benchmarks/DASB/run_hparam_optimization.sh create mode 100644 benchmarks/DASB/utils/aggregate_results.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index 806305774..e02076cfb 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -23,9 +23,9 @@ data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech # then data_folder_rirs should be /localscratch/xxx_corpus # otherwise the dataset will automatically be downloaded # data_folder_rirs: !ref -train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +train_splits: ["train-clean-100"] dev_splits: ["dev-clean"] -test_splits: ["dev-clean", "test-clean", "test-other"] +test_splits: ["dev-clean", "test-clean"] skip_prep: False train_csv: !ref /train.csv valid_csv: !ref /dev-clean.csv diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index 4d1d241c3..4b693ec1b 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -8,3 +8,5 @@ speechtokenizer>=0.1.2 tensorboard tgt unidecode +orion[bohb] +ConfigSpace==0.7.1 diff --git a/benchmarks/DASB/orion/hparams_bohb.yaml b/benchmarks/DASB/orion/hparams_bohb.yaml new file mode 100755 index 000000000..e68509559 --- /dev/null +++ b/benchmarks/DASB/orion/hparams_bohb.yaml @@ -0,0 +1,6 @@ +experiment: + algorithms: + bohb: + seed: 1986 + min_points_in_model: 20 + num_samples: 24 diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh new file mode 100755 index 000000000..36f6a845f --- /dev/null +++ b/benchmarks/DASB/run_experiments.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +########################################################### +# Script to run leave-one-subject-out and/or leave-one-session-out training, optionally with multiple seeds. +# This script loops over the different subjects and sessions and trains different models. +# At the end, the final performance is computed with the aggregate_results.py script that provides the average performance. 
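The aggregation step mentioned above boils down to a mean-and-standard-deviation summary of the metric across the per-run result files (a sketch with made-up WER numbers; the actual parser is utils/aggregate_results.py, added further down in this patch):

import numpy as np

wer_per_run = [6.91, 7.04, 6.88]  # hypothetical run1/run2/run3 results
print(f"wer avg: {np.mean(wer_per_run):.2f} ± {np.std(wer_per_run):.2f}")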
+# +# Usage: +# ./run_experiments.sh --hparams=hparams/MotorImagery/BNCI2014001/EEGNet.yaml --data_folder=eeg_data \ +# --output_folder=results/MotorImagery/BNCI2014001/EEGNet --nsbj=9 --nsess=2 --seed=1986 --nruns=2 --number_of_epochs=10 +# +# Authors: +# - Pooneh Mousavi (2024) +########################################################### + +# Initialize variables +data_folder="" +cached_data_folder="" +output_folder="" +task="" +downstream="" +tokenizer_name="" +dataset="" +seed="" +nruns="" +eval_metric="acc" +eval_set="test" +rnd_dir=False +additional_flags="" + + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --data_folder data_folder_path Data folder path" + echo " --output_folder output_path Output folder path" + echo " --task task downstream task" + echo " --downstream downstream probing head" + echo " --tokenizer_name tokenizer_name tokenizer choice" + echo " --dataset dataset dataset" + echo " --seed random_seed Seed (random if not specified)" + echo " --nruns num_runs Number of runs" + echo " --eval_metric metric Evaluation metric (e.g., acc or f1)" + echo " --eval_set dev or test Evaluation set. Default: test" + echo " --rnd_dir If True the results are stored in a subdir of the output folder with a random name (useful to store all the results of an hparam tuning). Default: False" + exit 1 +} + + +# Parse command line +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --data_folder) + data_folder="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --task) + task="$2" + shift + shift + ;; + + + --downstream) + downstream="$2" + shift + shift + ;; + + --tokenizer_name) + tokenizer_name="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + --eval_set) + eval_set="$2" + shift + shift + ;; + + --rnd_dir) + rnd_dir="$2" + shift + shift + ;; + + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." + print_argument_descriptions +fi + +# Process eval_set argument +if [ "$eval_set" = "dev" ]; then + metric_file=valid_metrics.pkl +elif [ "$eval_set" = "test" ]; then + metric_file=test_metrics.pkl +else + echo "Invalid eval_set value: $eval_set. It can be test or dev only." 
+ exit 1 +fi + +# Manage Seed (optional argument) +seed="${seed:-$RANDOM}" + + + +if [ "$rnd_dir" = True ]; then + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" +fi + +# Make sure the output_folder is created +mkdir -p $output_folder + +# Print command line arguments and save to file +{ + echo "hparams: $hparams" + echo "data_folder: $data_folder" + echo "output_folder: $output_folder" + echo "task: $task" + echo "downstream: $downstream" + echo "tokenizer_name: $tokenizer_name" + echo "dataset: $dataset" + echo "seed: $seed" + echo "nruns: $nruns" + echo "eval_metric: $eval_metric" + echo "eval_set: $eval_set" + echo "rnd_dir: $rnd_dir" + echo "additional flags: $additional_flags" +} | tee "$output_folder/flags.txt" + + +# Creating output folder +mkdir -p $output_folder +mkdir -p $data_folder +mkdir -p $cached_data_folder + +# Function to run the training experiment +run_experiment() { + +python $dataset/$task/train.py $dataset/$task/hparams/$downstream/$tokenizer_name.yaml --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp\ +$additional_flags --debug + +} + +# Run multiple training experiments (with different seeds) +for i in $(seq 0 1 $(( nruns - 1 ))); do + ((run_idx = i + 1)) + run_name=run"$run_idx" + output_folder_exp="$output_folder"/"$run_name"/$seed + + run_experiment $output_folder_exp + + + # Store the results + # python utils/parse_results.py $output_folder_exp $metric_file $eval_metric | tee -a $output_folder/$run_name\_results.txt + + # Changing Random seed + seed=$((seed+1)) +done + + +echo 'Final Results (Performance Aggregation)' +python utils/aggregate_results.py $output_folder $eval_metric | tee -a $output_folder/aggregated_performance.txt diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh new file mode 100644 index 000000000..1b2570675 --- /dev/null +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -0,0 +1,405 @@ +#!/bin/bash + +########################################################### +# Hyperparameter Tuning Script for EEG Model with Orion +########################################################### + +# Description: +# This script facilitates hyperparameter tuning for a given EEG model and dataset using Orion. +# It supports leave-one-subject-out and/or leave-one-session-out training strategies. + +# Usage: +# ./run_hparam_optimization.sh --exp_name 'EEGNet_BNCI2014001_hopt' \ +# --output_folder results/MotorImagery/BNCI2014001/EEGNet/hopt \ +# --data_folder eeg_data/ \ +# --hparams hparams/MotorImagery/BNCI2014001/EEGNet.yaml \ +# --nruns 1 --nruns_eval 10 \ +# --eval_metric acc \ +# --exp_max_trials 50 \ +# --store_all True \ +# --device 'cpu' +# +# Optimization Steps: +# The script supports multiple hyperparameter optimization steps. +# We found it convenient to first optimize training and model hyperparameters, +# and then optimize data augmentation hyperparameters in a separate step. + +# Script Workflow: +# 1. Search for the orion flags in the specified hparam file. +# 2. Run the orion-hunt command for hyperparameter tuning. +# By default, TPE (Tree-structured Parzen Estimator) hyperparameter tuning is +# performed, as specified in the default orion config file at hparams/orion/hparams_tpe.yaml. +# 3. Save the best hyperparameters, which can be viewed using torch-info. +# 4. Loop until flags like @orion_step are found in the YAML file. 
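To make step 1 concrete: the get_flag helper defined later in this script greps the hparam YAML for in-line annotations such as '# @orion_step1: --lr_model~"loguniform(0.00001,0.5)"' (the style used in the dac.yaml changes further down) and concatenates whatever follows the marker. A rough Python equivalent, for illustration only:

import re

def get_orion_flags(yaml_path, step=1):
    pattern = re.compile(rf"@orion_step{step}:\s*(.*)")
    flags = []
    with open(yaml_path) as fin:
        for line in fin:
            match = pattern.search(line)
            if match:
                flags.append(match.group(1).strip())
    return " ".join(flags)

# For the dac.yaml line 'lr_model: 0.0001  # @orion_step1: --lr_model~"loguniform(0.00001,0.5)"'
# this returns: --lr_model~"loguniform(0.00001,0.5)"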
+# +# Final Performance Evaluation: +# At the end of the optimization process, the script computes the final performance +# using the best hyperparameters on the test set. +# This is done by averaging over nruns_eval different seeds. +# +# Note: More detailed information can be found in the README.md file. + +# Authors: +# - Mirco Ravanelli (2023) +# - Davide Borra (2023) +########################################################### + +# Initialize variables +exp_name="hopt" +output_folder="" +data_folder="" +cached_data_folder="" +hparams="" +nruns="" +nruns_eval=10 +eval_metric="acc" +seed=1986 +config_file="hparams/orion/hparams_tpe.yaml" +mne_dir="" +orion_db_address="" +orion_db_type="PickledDB" +exp_max_trials=50 +store_all=True +compress_exp=True + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --exp_name Name Name that Orion gives to the experiment" + echo " --output_folder output_path Output folder were the results will be stored" + echo " --data_folder data_path Folder were the data are stored. If not available, they will be downloaded there." + echo " --cached_data_folder path [Optional] Folder were the data in pkl format will be cached." + echo " --hparms hparam_file YAML file containing the hparam to optimize. The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" + echo " --nruns num_runs Number of runs for each hparam selection." + echo " --nruns_eval num_runs Number of runs for the final evaluation (with best hparams) on the test set" + echo " --eval_metric metric [Optional] Evaluation metric description. Default:acc" + echo " --seed random_seed [Optional] Seed (random if not specified)" + echo " --config_file config_file [Optional] Orion config file. Default: hparams/orion/hparams_tpe.yaml" + echo " --mne_dir mne_dir [Optional] MNE directory. Need it different from your home (see notes on MNE in README.md)" + echo " --orion_db_address [Optional] Path of the database where orion will store hparams and performance" + echo " --orion_db_type db_type [Optional] Type of the dataset that orion will use. Default: PickledDB" + echo " --exp_max_trials int [Optional] Maximum number of hparam trials for each oprimization step. Default:50" + echo " --store_all Bool [Optional] When set to True, the output folders of all hparam trials will be stored in randomly named folders. Default: False" + echo " --compress_exp Bool [Optional] When set to True, this option compresses the output folders of all hyperparameter trials into a single tar.gz file. This is particularly useful when store_all is set to True, as it helps prevent the accumulation of a large number of files. 
Default: False" + exit 1 +} + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + + --exp_name) + exp_name="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --data_folder) + data_folder="$2" + shift + shift + ;; + + --hparams) + hparams="$2" + shift + shift + ;; + + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --nruns_eval) + nruns_eval="$2" + shift + shift + ;; + + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + + + --config_file) + config_file="$2" + shift + shift + ;; + + --mne_dir) + mne_dir="$2" + shift + shift + ;; + + --orion_db_address) + orion_db_address="$2" + shift + shift + ;; + + --orion_db_type) + orion_db_type="$2" + shift + shift + ;; + + --exp_max_trials) + exp_max_trials="$2" + shift + shift + ;; + + --store_all) + store_all="$2" + shift + shift + ;; + + --compress_exp) + compress_exp="$2" + shift + shift + ;; + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$output_folder" ] || [ -z "$data_folder" ] || [ -z "$hparams" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." + print_argument_descriptions +fi + +# Set mne_dir if specified +if [ "$mne_dir" ]; then + export _MNE_FAKE_HOME_DIR=$mne_dir +fi + +# Assign default value to cached_data_folder +if [ -z "$cached_data_folder" ]; then + cached_data_folder="$data_folder/pkl" +fi + + +# Set orion db address if specified +if [ -z "$orion_db_address" ]; then + orion_db_address=$output_folder'/'$exp_name'.pkl' +fi +export ORION_DB_ADDRESS=$orion_db_address +export ORION_DB_TYPE=$orion_db_type + +echo "-------------------------------------" +echo "Experiment Name: $exp_name" +echo "Output Folder: $output_folder" +echo "Data Folder: $data_folder" +echo "Cached Data Folder: $cached_data_folder" +echo "Hparam File: $hparams" +echo "Number of Runs: $nruns" +echo "Number of Eval Runs: $nruns_eval" +echo "Eval Metric: $eval_metric" +echo "Seed: $seed" +echo "Additional Flags: $additional_flags" +echo "Orion Config File: $config_file" +echo "Orion Database type: $orion_db_type" +echo "Orion Database file: $orion_db_address" +echo "Experiment Max Trials: $exp_max_trials" +echo "-------------------------------------" + + +# This function will extract all the optimization flags added in the yaml file +# The input is a text file (e.g, a yaml file) and a pattern (e.g, "@orion_step1:") +# The ouput are the detected flags (e.g., --dropout~"uniform(0.0, 0.5)"). +get_flag() { + local file_path="$1" + local pattern="$2" + + # Check if the file exists + if [ ! -f "$file_path" ]; then + echo "Error: File '$file_path' not found." 
+ return 1 + fi + + # Use grep to find all lines containing the pattern and then extract the flags using sed + grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | tr -d '\n' +} + + +# Function for updatading the hparam yaml file with the best hparams found at step 1 +update_hparams() { + local best_hparams_file="$1" + local hparams_yaml_file="$2" + local output_yaml_file="$3" + + # Read the values from best_hparams.txt into an associative array + declare -A best_hparams + while IFS=": " read -r key value; do + best_hparams["$key"]=$value + done < "$best_hparams_file" + + + # Read the hparams.yaml file into a variable + local hparams_content=$(cat "$hparams_yaml_file") + + # Update values in hparams_content using values from best_hparams + for key in "${!best_hparams[@]}"; do + local pattern="^$key: .*" + local replacement="$key: ${best_hparams[$key]}" + hparams_content=$(sed "s/$pattern/$replacement/g" <<< "$hparams_content") + done + + # Write the updated content to a new YAML file + echo "$hparams_content" > "$output_yaml_file" +} + +# Function for extracting the best hparams from orion-info +function extract_best_params() { + local input_file="$1" + local best_trial_line=$(grep -n "best trial:" "$input_file" | cut -d ":" -f 1) + local params_lines=$(tail -n +$best_trial_line "$input_file" | awk '/params:/{flag=1;next}/start time:/{flag=0}flag') + local formatted_params=$(echo "$params_lines" | sed -e 's/^[[:space:]]*//' -e 's/: /: /' -e '/^$/d' -e 's#^/##') + echo "$formatted_params" +} + +# Running hparam tuning (loop over multiple steps) +step_id=1 +hparams_step=$hparams +pattern="@orion_step1:" +opt_flags=$(get_flag "$hparams_step" "$pattern") + +# Check if the string is empty and exit with an error if it is +if [ -z "$opt_flags" ]; then + echo "Error: Optimization flags not found in '$hparams'" + echo "Please ensure that the Orion optimization flags are set in the hparam file using in-line comments like:" + echo "# @orion_step1: --dropout~\"uniform(0.0, 0.5)\"" + exit 1 # Exit with a non-zero error code +fi + + +while [ -n "$opt_flags" ]; do + # Do something + output_folder_step="$output_folder"/step"$step_id" + mkdir -p $output_folder_step + exp_name_step="$exp_name"_step"$step_id" + + echo + echo "**********************************************************************************************" + echo "Running hparam tuning (step $step_id)..." + echo "- This might take several hours!" + echo "- The best set of hparams will be save in $output_folder_step" + echo "- You can monitor the evolution of the hparam optimization with: orion status -n $exp_name" + echo "......" 
+ echo "**********************************************************************************************" + echo + # Setting up orion command + orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \ + ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --seed $seed \ + --output_folder $output_folder_step/exp --nruns $nruns \ + --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags" + + + # Appending the optimization flags + orion_hunt_command="$orion_hunt_command $opt_flags" + + echo $orion_hunt_command &> "$output_folder_step/orion_hunt_command.txt" + + # Execute the command for hparm tuning + eval $orion_hunt_command + + # Compress the exp folder (if required) + if [ "$compress_exp" = True ]; then + tar -czf "$output_folder_step/exp.tar.gz" "$output_folder_step/exp" + if [ -d "$output_folder_step/exp" ]; then + rm -rf "$output_folder_step/exp" + fi + + fi + + # Storing best haprams + orion info --name $exp_name_step &> $output_folder_step/orion-info.txt + + # Extract list of the best hparams from orion-info + # Find the line number where "best trial:" appears + best_trial_line=$(grep -n "best trial:" $output_folder_step/orion-info.txt | cut -d ":" -f 1) + + # Extract and store the best set of hparams + best_params_output=$(extract_best_params "$output_folder_step/orion-info.txt") + best_hparams_file="$output_folder_step/best_hparams.txt" + echo "$best_params_output" > $best_hparams_file + + # Store the current best yaml file + best_yaml_file="$output_folder_step/best_hparams.yaml" + update_hparams "$best_hparams_file" "$hparams_step" "$best_yaml_file" + + # Update best hparam step + hparams_step=$best_yaml_file + + # Update step variable + ((step_id++)) + + # Update search pattern + pattern="@orion_step$step_id:" + + # update optimization flags pattern + opt_flags=$(get_flag "$hparams_step" "$pattern") +done + +echo +echo "**********************************************************************************************" +echo "Running Final Evaluation on the best hparams (test-set)..." +echo "**********************************************************************************************" +echo + +final_yaml_file="$output_folder/best_hparams.yaml" +scp $best_yaml_file $final_yaml_file + +# Running evaluation on the test set for the best models + ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder \ + --seed $seed --output_folder $output_folder/best --nsbj $nsbj --nsess $nsess \ + --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ + --train_mode $train_mode --rnd_dir $store_all $additional_flags + + +echo "The test performance with best hparams is available at $output_folder/best" diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py new file mode 100644 index 000000000..1ba94c7e1 --- /dev/null +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -0,0 +1,145 @@ +#!/usr/bin/python +""" +Snippet to aggregate the results over multiple runs of the same experiment. +This is useful when we run multiple experiments with different seeds and we +want to compute the average performance. The script also reports the final +metric to Orion (when needed for hyperparameter tuning). + +The script searches for the result files (_results.txt) and computes the mean +and the standard deviation of the given evaluation metrics (e.g., acc or f1). +The results must have an identical format (with only different performance +numbers). 
+ +To run this script: + + > python aggregate_results.py your_result_folder acc + +Author +------ +Mirco Ravanelli, 2022 +""" + +import sys +import numpy as np +from orion.client import report_objective +from speechbrain.utils.data_utils import get_all_files + + +def get_prototype(res_file, eval_metric): + """Parses a result file and adds a placeholder where the aggregated metrics + should be printed. It also returns the number of detected metrics. + + Arguments + --------- + res_file: path + Path of the result file to parse. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + prototype: list + List of the lines of the result file (with as placeholder). + n_metrics: int + Number of metrics to replace in the result files. + """ + prototype = [] + n_metrics = 0 + + # Open the first res file and figure out where the metrics are + with open(res_file) as file_in: + for line in file_in: + if eval_metric in line: + line = line.split(eval_metric)[0] + # The placeholder for the metric is + line = line + eval_metric + " " + n_metrics = n_metrics + 1 + prototype.append(line) + return prototype, n_metrics + + +def get_metrics(res_files, eval_metric): + """Summarizes the metrics of interest in a matrix. + + Arguments + --------- + res_files: list + List of all the result files. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. + """ + + # Metric initialization + metrics = np.zeros([n_metrics, len(res_files)]) + + # Loop over files + for i in range(len(res_files)): + cnt = 0 + # Metric extraction + with open(res_files[i]) as file_in: + for line in file_in: + if eval_metric in line: + value = line.split(eval_metric + " ")[1] + value = value.split(" ")[0] + value = float(value) + metrics[cnt, i] = value + cnt = cnt + 1 + return metrics + + +def aggregate_metrics(prototype, metrics): + """Prints the aggregated metrics.It replaces the placeholders with + the corresponding metrics. + + Arguments + --------- + prototype: list + List of the lines of the result file (with as placeholder). + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. + """ + cnt = 0 + for line in prototype: + if eval_metric in line: + values_line = "[" + for i in range(len(res_files)): + values_line = values_line + "%f " % float(metrics[cnt, i]) + values_line = values_line[:-1] + values_line = values_line + "] avg: %f ± %f " % ( + float(metrics[cnt, :].mean()), + float(metrics[cnt, :].std()), + ) + line = line.replace("", values_line) + cnt = cnt + 1 + print(line) + + +if __name__ == "__main__": + # output_folder = sys.argv[1] + # eval_metric = sys.argv[2] + output_folder = "benchmarks/DASB/result" + eval_metric = "wer" + # Getting the list of the result files in the output folder + res_files = get_all_files(output_folder, match_and=["_results.txt"]) + + # Gettin a prototype file + prototype, n_metrics = get_prototype(res_files[0], eval_metric) + + # Extracting the metrics of interest + metrics = get_metrics(res_files, eval_metric) + + # print aggregated metrics + aggregate_metrics(prototype, metrics) + + final_metric = metrics[-1, :].mean() + + # Report final metric to Orion + # Remember: orion expects metrics to be minimized! 
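Since Orion minimizes its objective, score-style metrics (acc, f1) are flipped to an error before reporting, while error-style metrics such as WER pass through unchanged, which is exactly what the lines below do. A quick illustration of the convention with made-up numbers:

for metric, value in [("acc", 0.93), ("wer", 6.9)]:
    objective = 1 - value if metric in ("acc", "f1") else value
    print(metric, objective)  # acc -> 0.07 (approx), wer -> 6.9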
+ if eval_metric == "acc" or eval_metric == "f1": + final_metric = 1 - final_metric + report_objective(final_metric) From 78cb049c7712d47b63645d2d14b63fc72f25c6d5 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 27 Nov 2024 12:58:55 -0500 Subject: [PATCH 7/9] add bobh --- .../ASR-refactor/hparams/LSTM/dac.yaml | 21 +++++--- .../DASB/LibriSpeech/ASR-refactor/train.py | 4 +- benchmarks/DASB/run_experiments.sh | 49 +++++++++---------- benchmarks/DASB/run_hparam_optimization.sh | 33 ++++++++++--- benchmarks/DASB/utils/aggregate_results.py | 22 +++++---- 5 files changed, 76 insertions(+), 53 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index e02076cfb..e1d4680b1 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -15,7 +15,7 @@ output_folder: !ref results/dac/LSTM/ output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt - +cached_data_folder: !PLACEHOLDER #'path/to/cache' # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -27,16 +27,21 @@ train_splits: ["train-clean-100"] dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /dev-clean.csv - - !ref /test-clean.csv + - !ref /dev-clean.csv + - !ref /test-clean.csv ####################### Training Parameters #################################### -number_of_epochs: 20 -batch_size: 4 # This works for 2x GPUs with 32GB +# number_of_epochs: 20 +number_of_epochs_: 200 # @orion_step1: --number_of_epochs~"fidelity(15, 1000, base=4)" +number_of_epochs: !apply:int + - !apply:math.floor + - !ref +batch_size_exponent: 6 # @orion_step1: --batch_size_exponent~"uniform(4, 6,discrete=True)" +batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -48,7 +53,7 @@ valid_search_interval: 1 avg_checkpoints: 10 # Number of checkpoints to average for evaluation cache_size: 1.e+10 -lr_model: 0.001 +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" weight_decay: 0.0005 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index 99eeb81fe..177d79f8f 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -336,7 +336,7 @@ def text_pipeline(wrd): "tr_splits": hparams["train_splits"], "dev_splits": hparams["dev_splits"], "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], + "save_folder": hparams["cached_data_folder"], "merge_lst": hparams["train_splits"], "merge_name": "train.csv", "skip_prep": hparams["skip_prep"], @@ -345,7 +345,7 @@ def text_pipeline(wrd): # Defining tokenizer and loading it tokenizer = SentencePiece( - model_dir=hparams["save_folder"], + model_dir=hparams["cached_data_folder"], vocab_size=hparams["output_neurons"], annotation_train=hparams["train_csv"], annotation_read="wrd", diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh index 36f6a845f..596556c2c 100755 --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -14,12 +14,11 @@ ########################################################### # Initialize variables +hparams="" data_folder="" cached_data_folder="" 
output_folder="" task="" -downstream="" -tokenizer_name="" dataset="" seed="" nruns="" @@ -33,12 +32,12 @@ additional_flags="" print_argument_descriptions() { echo "Usage: $0 [options]" echo "Options:" + echo " --hparams hparams_path Hparam YAML file" echo " --data_folder data_folder_path Data folder path" + echo " --cached_data_folder cache_path Cached data folder path" echo " --output_folder output_path Output folder path" echo " --task task downstream task" - echo " --downstream downstream probing head" - echo " --tokenizer_name tokenizer_name tokenizer choice" - echo " --dataset dataset dataset" + echo " --dataset dataset dataset" echo " --seed random_seed Seed (random if not specified)" echo " --nruns num_runs Number of runs" echo " --eval_metric metric Evaluation metric (e.g., acc or f1)" @@ -53,12 +52,24 @@ POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do case $1 in + --hparams) + hparams="$2" + shift + shift + ;; + --data_folder) data_folder="$2" shift shift ;; + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + --output_folder) output_folder="$2" shift @@ -70,20 +81,7 @@ while [[ $# -gt 0 ]]; do shift shift ;; - - - --downstream) - downstream="$2" - shift - shift - ;; - - --tokenizer_name) - tokenizer_name="$2" - shift - shift - ;; - + --dataset) dataset="$2" shift @@ -140,7 +138,7 @@ done # Check for required arguments -if [ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$nruns" ]; then +if [ -z "$hparams" ] ||[ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$nruns" ]; then echo "ERROR: Missing required arguments! Please provide all required options." print_argument_descriptions fi @@ -172,10 +170,9 @@ mkdir -p $output_folder { echo "hparams: $hparams" echo "data_folder: $data_folder" + echo "cached_data_folder: $cached_data_folder" echo "output_folder: $output_folder" echo "task: $task" - echo "downstream: $downstream" - echo "tokenizer_name: $tokenizer_name" echo "dataset: $dataset" echo "seed: $seed" echo "nruns: $nruns" @@ -194,8 +191,8 @@ mkdir -p $cached_data_folder # Function to run the training experiment run_experiment() { -python $dataset/$task/train.py $dataset/$task/hparams/$downstream/$tokenizer_name.yaml --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp\ -$additional_flags --debug +python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ +$additional_flags } @@ -208,7 +205,7 @@ for i in $(seq 0 1 $(( nruns - 1 ))); do run_experiment $output_folder_exp - # Store the results + # # Store the results # python utils/parse_results.py $output_folder_exp $metric_file $eval_metric | tee -a $output_folder/$run_name\_results.txt # Changing Random seed @@ -217,4 +214,4 @@ done echo 'Final Results (Performance Aggregation)' -python utils/aggregate_results.py $output_folder $eval_metric | tee -a $output_folder/aggregated_performance.txt +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 1b2570675..4eefc8292 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -49,12 +49,14 @@ exp_name="hopt" output_folder="" data_folder="" cached_data_folder="" +task="" +dataset="" hparams="" nruns="" nruns_eval=10 eval_metric="acc" seed=1986 -config_file="hparams/orion/hparams_tpe.yaml" 
+config_file="orion/hparams_bohb.yaml" mne_dir="" orion_db_address="" orion_db_type="PickledDB" @@ -70,6 +72,8 @@ print_argument_descriptions() { echo " --output_folder output_path Output folder were the results will be stored" echo " --data_folder data_path Folder were the data are stored. If not available, they will be downloaded there." echo " --cached_data_folder path [Optional] Folder were the data in pkl format will be cached." + echo " --task task downstream task" + echo " --dataset dataset dataset" echo " --hparms hparam_file YAML file containing the hparam to optimize. The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" echo " --nruns num_runs Number of runs for each hparam selection." echo " --nruns_eval num_runs Number of runs for the final evaluation (with best hparams) on the test set" @@ -120,6 +124,18 @@ while [[ $# -gt 0 ]]; do shift ;; + --task) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + --seed) seed="$2" shift @@ -220,7 +236,7 @@ fi # Assign default value to cached_data_folder if [ -z "$cached_data_folder" ]; then - cached_data_folder="$data_folder/pkl" + cached_data_folder="$data_folder/cache" fi @@ -233,9 +249,12 @@ export ORION_DB_TYPE=$orion_db_type echo "-------------------------------------" echo "Experiment Name: $exp_name" +echo "hparams: $hparams" echo "Output Folder: $output_folder" echo "Data Folder: $data_folder" echo "Cached Data Folder: $cached_data_folder" +echo "task: $task" +echo "dataset: $dataset" echo "Hparam File: $hparams" echo "Number of Runs: $nruns" echo "Number of Eval Runs: $nruns_eval" @@ -335,8 +354,8 @@ while [ -n "$opt_flags" ]; do echo # Setting up orion command orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \ - ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --seed $seed \ - --output_folder $output_folder_step/exp --nruns $nruns \ + ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder=$cached_data_folder --seed $seed \ + --output_folder $output_folder_step/exp --task=$task --dataset=$dataset --nruns $nruns \ --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags" @@ -396,10 +415,10 @@ final_yaml_file="$output_folder/best_hparams.yaml" scp $best_yaml_file $final_yaml_file # Running evaluation on the test set for the best models - ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder \ - --seed $seed --output_folder $output_folder/best --nsbj $nsbj --nsess $nsess \ + ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder=$cached_data_folder \ + --seed $seed --output_folder $output_folder/best --task=$task --dataset=$dataset \ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --train_mode $train_mode --rnd_dir $store_all $additional_flags + --rnd_dir $store_all $additional_flags echo "The test performance with best hparams is available at $output_folder/best" diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index 1ba94c7e1..73a35cbad 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -20,6 +20,7 @@ """ import sys +import re import numpy as np from orion.client import report_objective from speechbrain.utils.data_utils import get_all_files @@ -84,11 +85,13 @@ def get_metrics(res_files, eval_metric): with open(res_files[i]) as file_in: 
for line in file_in: if eval_metric in line: - value = line.split(eval_metric + " ")[1] - value = value.split(" ")[0] - value = float(value) - metrics[cnt, i] = value - cnt = cnt + 1 + # Use regex to find the test WER value + match = re.search(rf'{eval_metric}: (\d+\.\d+e[+-]\d+)', line) + if match: + value = match.group(1) + value = float(value) + metrics[cnt, i] = value + cnt = cnt + 1 return metrics @@ -120,12 +123,11 @@ def aggregate_metrics(prototype, metrics): if __name__ == "__main__": - # output_folder = sys.argv[1] - # eval_metric = sys.argv[2] - output_folder = "benchmarks/DASB/result" - eval_metric = "wer" + output_folder = sys.argv[1] + eval_metric = sys.argv[2] + # Getting the list of the result files in the output folder - res_files = get_all_files(output_folder, match_and=["_results.txt"]) + res_files = get_all_files(output_folder, match_and=["train_log.txt"]) # Gettin a prototype file prototype, n_metrics = get_prototype(res_files[0], eval_metric) From 1f959a612b5497bf7238404474f75d9ae2c1402b Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Thu, 28 Nov 2024 12:09:21 -0500 Subject: [PATCH 8/9] fix bug --- .../DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml | 4 ++-- benchmarks/DASB/run_hparam_optimization.sh | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index e1d4680b1..47a803de8 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -40,7 +40,7 @@ number_of_epochs_: 200 # @orion_step1: --number_of_epochs~"fidelity(15, 1000, ba number_of_epochs: !apply:int - !apply:math.floor - !ref -batch_size_exponent: 6 # @orion_step1: --batch_size_exponent~"uniform(4, 6,discrete=True)" +batch_size_exponent: 6 # @orion_step1: --batch_size_exponent~"uniform(1, 2,discrete=True)" batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 @@ -119,7 +119,7 @@ freeze_embedding: False # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(2, 4,discrete=True)" dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 4eefc8292..12d38d131 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -354,8 +354,8 @@ while [ -n "$opt_flags" ]; do echo # Setting up orion command orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \ - ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder=$cached_data_folder --seed $seed \ - --output_folder $output_folder_step/exp --task=$task --dataset=$dataset --nruns $nruns \ + ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder $cached_data_folder \ + --output_folder $output_folder_step/exp --task $task --dataset $dataset --seed $seed --nruns $nruns \ --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags" @@ -415,10 +415,9 @@ final_yaml_file="$output_folder/best_hparams.yaml" scp $best_yaml_file $final_yaml_file # Running evaluation on the test set for the best models - ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder=$cached_data_folder \ - --seed $seed --output_folder $output_folder/best --task=$task 
--dataset=$dataset \ +./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ + --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ --rnd_dir $store_all $additional_flags - echo "The test performance with best hparams is available at $output_folder/best" From 081c4e7a9bc1bf2f2271d1bff9f1af5a14f4dda8 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 16 Dec 2024 18:16:33 +0000 Subject: [PATCH 9/9] fix aggregat_result bug --- .../ASR-refactor/hparams/LSTM/dac.yaml | 17 ++++---- .../ASR-refactor/hparams/LSTM/encodec.yaml | 39 ++++++++++--------- .../DASB/LibriSpeech/ASR-refactor/train.py | 23 +++++------ benchmarks/DASB/orion/hparams_bohb.yaml | 4 +- benchmarks/DASB/orion/hparams_tpe.yaml | 6 +++ benchmarks/DASB/utils/aggregate_results.py | 4 +- 6 files changed, 50 insertions(+), 43 deletions(-) create mode 100755 benchmarks/DASB/orion/hparams_tpe.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index 47a803de8..ef9d20349 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -16,7 +16,7 @@ output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt cached_data_folder: !PLACEHOLDER #'path/to/cache' - +testing: True # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech # If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES @@ -25,7 +25,7 @@ data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech # data_folder_rirs: !ref train_splits: ["train-clean-100"] dev_splits: ["dev-clean"] -test_splits: ["dev-clean", "test-clean"] +test_splits: ["test-clean"] skip_prep: False train_csv: !ref /train.csv valid_csv: !ref /dev-clean.csv @@ -33,13 +33,9 @@ test_csv: - !ref /dev-clean.csv - !ref /test-clean.csv - ####################### Training Parameters #################################### # number_of_epochs: 20 -number_of_epochs_: 200 # @orion_step1: --number_of_epochs~"fidelity(15, 1000, base=4)" -number_of_epochs: !apply:int - - !apply:math.floor - - !ref +number_of_epochs: 200 # @orion_step1: --number_of_epochs~"fidelity(1, 2, base=4)" batch_size_exponent: 6 # @orion_step1: --batch_size_exponent~"uniform(1, 2,discrete=True)" batch_size: !ref 2 ** test_batch_size: 1 @@ -119,7 +115,7 @@ freeze_embedding: False # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(2, 4,discrete=True)" +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 2,discrete=True)" dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 @@ -224,8 +220,9 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer # Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter # epoch counter + limit: !new:int + - !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml index 18d967244..cfd42f3cc 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -15,28 +15,29 @@ output_folder: !ref results/enocdec/LSTM/ 
output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt - +cached_data_folder: !PLACEHOLDER #'path/to/cache' +testing: True # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech # If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES # then data_folder_rirs should be /localscratch/xxx_corpus # otherwise the dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +data_folder_rirs: !ref +train_splits: ["train-clean-100"] dev_splits: ["dev-clean"] -test_splits: ["dev-clean", "test-clean", "test-other"] +test_splits: ["test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /dev-clean.csv - - !ref /test-clean.csv - + - !ref /test-clean.csv + - !ref /test-other.csv ####################### Training Parameters #################################### -number_of_epochs: 20 -batch_size: 4 # This works for 2x GPUs with 32GB +number_of_epochs: 20 # @orion_step1: --number_of_epochs~"fidelity(5, 20, base=4)" +batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)" +batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -48,7 +49,7 @@ valid_search_interval: 1 avg_checkpoints: 10 # Number of checkpoints to average for evaluation cache_size: 1.e+10 -lr_model: 0.001 +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" weight_decay: 0.0005 @@ -99,8 +100,8 @@ test_dataloader_opts: # bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] # num_codebooks: [2, 4, 8, 16, 32] vocab_size: 1024 -bandwidth: 1.5 -num_codebooks: 2 +bandwidth: 6.0 +num_codebooks: 8 sample_rate: 24000 # Feature parameters encoder_dim: 1024 @@ -109,9 +110,10 @@ pretrain_embeddings: False freeze_embedding: False + # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 4,discrete=True)" dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 @@ -134,7 +136,7 @@ prune_history: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:model.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False @@ -219,8 +221,9 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer # Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter # epoch counter + limit: !new:int + - !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index 177d79f8f..9f9f05ae0 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -434,15 +434,16 @@ def text_pipeline(wrd): ) # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) + if hparams['testing']: + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.output_wer_folder = os.path.join( - hparams["output_wer_folder"], 
f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/orion/hparams_bohb.yaml b/benchmarks/DASB/orion/hparams_bohb.yaml index e68509559..a360d2beb 100755 --- a/benchmarks/DASB/orion/hparams_bohb.yaml +++ b/benchmarks/DASB/orion/hparams_bohb.yaml @@ -2,5 +2,5 @@ experiment: algorithms: bohb: seed: 1986 - min_points_in_model: 20 - num_samples: 24 + min_points_in_model: 5 + num_samples: 5 diff --git a/benchmarks/DASB/orion/hparams_tpe.yaml b/benchmarks/DASB/orion/hparams_tpe.yaml new file mode 100755 index 000000000..fb6a7c9b0 --- /dev/null +++ b/benchmarks/DASB/orion/hparams_tpe.yaml @@ -0,0 +1,6 @@ +experiment: + algorithms: + tpe: + seed: 1986 + n_initial_points: 20 + n_ei_candidates: 24 diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index 73a35cbad..ae9c19ad2 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -86,7 +86,7 @@ def get_metrics(res_files, eval_metric): for line in file_in: if eval_metric in line: # Use regex to find the test WER value - match = re.search(rf'{eval_metric}: (\d+\.\d+e[+-]\d+)', line) + match = re.search(rf'{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)', line) if match: value = match.group(1) value = float(value) @@ -125,7 +125,7 @@ def aggregate_metrics(prototype, metrics): if __name__ == "__main__": output_folder = sys.argv[1] eval_metric = sys.argv[2] - + # Getting the list of the result files in the output folder res_files = get_all_files(output_folder, match_and=["train_log.txt"])