From a4f38bd2a9cbb76e8bd56c944c66a8fccf0a7c04 Mon Sep 17 00:00:00 2001
From: poonehmousavi
Date: Tue, 5 Nov 2024 09:31:52 -0500
Subject: [PATCH 1/9] add tokenizer_interface

---
 benchmarks/DASB/model/tokenizer_interface.py | 164 +++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 benchmarks/DASB/model/tokenizer_interface.py

diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py
new file mode 100644
index 000000000..892bef6b3
--- /dev/null
+++ b/benchmarks/DASB/model/tokenizer_interface.py
@@ -0,0 +1,164 @@
+
+"""
+Unified interface for tokenizers, standardizing the output shape of encode and decode functions.
+
+This class reshapes the outputs of various tokenizers to ensure consistency, simplifying integration with recipes and workflows.
+
+Authors
+---------
+* Pooneh Mousavi, 2024
+"""
+
+import torch
+
+from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec
+from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL
+from speechbrain.lobes.models.discrete.dac import DAC
+from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface
+
+
+class Tokenizer_Encodec(Encodec):
+    @torch.no_grad()
+    def sig_to_toks(self, sig, lens, **kwargs):
+        # sig: [B, T]
+        self.eval()
+        toks, _ = self.encode(sig, lens)  # [B, N, K]
+        return toks
+
+    @torch.no_grad()
+    def toks_to_sig(self, toks, **kwargs):
+        # toks: [B, N, K]
+        self.eval()
+        sig = self.decode(toks)[:, 0]  # [B, T]
+        return sig
+
+class Tokenizer_DAC(DAC):
+    @torch.no_grad()
+    def sig_to_toks(self, sig, lens, **kwargs):
+        # sig: [B, T]
+        self.eval()
+        toks, _ = self(
+            sig[:, None], n_quantizers=kwargs['num_codebooks']
+        )  # [B, K, N]
+        toks = toks.movedim(-1, -2)  # [B, N, K]
+        return toks
+
+    @torch.no_grad()
+    def toks_to_sig(self, toks, **kwargs):
+        # toks: [B, N, K]
+        self.eval()
+        qfeats, _, _ = self.quantizer.from_codes(
+            toks.movedim(-1, -2)  # [B, K, N]
+        )
+        sig = self.decode(qfeats)[:, 0]  # [B, T]
+        return sig
+
+class Tokenizer_SpeechTokenizer(SpeechTokenizer_interface):
+    @torch.no_grad()
+    def sig_to_toks(self, sig, lens, **kwargs):
+        # sig: [B, T]
+        self.eval()
+        toks = self(sig)[
+            : kwargs['num_codebooks']
+        ]  # [K, B, N]
+        toks = toks.movedim(-3, -1)  # [B, N, K]
+        return toks
+
+    @torch.no_grad()
+    def toks_to_sig(self, toks, **kwargs):
+        # toks: [B, N, K]
+        self.eval()
+        toks = toks.movedim(-1, -3)  # [K, B, N]
+        sig = self.decode(toks)  # [B, T]
+        return sig
+
+class Tokenizer_DiscreteSSL(DiscreteSSL):
+    @torch.no_grad()
+    def sig_to_toks(self, sig, lens):
+        # sig: [B, T]
+        self.hparams.codec_quantizer.to(self.device).eval()
+        toks, _, _ = self.hparams.codec_quantizer(
+            sig,
+            lens,
+            SSL_layers=self.hparams.SSL_layers,
+            deduplicates=[False] * len(self.hparams.SSL_layers),
+            bpe_tokenizers=[None] * len(self.hparams.SSL_layers),
+        )  # [B, N, K]
+        return toks
+
+    @torch.no_grad()
+    def toks_to_sig(self, toks):
+        # toks: [B, N, K]
+        self.hparams.codec_vocoder.device = self.device
+        self.hparams.codec_vocoder.to(self.device).eval()
+
+        # Add offset for embedding layer
+        all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids
+        # TODO: remove after testing
+        assert tuple(all_layer_ids) == (1, 3, 7, 12, 18, 23)
+        offsets = torch.arange(
+            0,
+            len(all_layer_ids) * self.hparams.vocab_size,
+            self.hparams.vocab_size,
+            device=self.device,
+        )
+        offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers]
+        offsets = offsets[offset_idxes]
+        toks = toks + offsets + 1
+
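+        # The shift above maps each selected layer's token IDs into one flat
+        # vocabulary (layer_position * vocab_size); the extra +1 keeps index 0
+        # free, which the zero-filled entries below use to mark codebooks that
+        # were not extracted.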
+        # Handle missing codebooks
+        if len(self.hparams.SSL_layers) < len(all_layer_ids):
+            full_toks = torch.zeros(
+                *toks.shape[:2],
+                len(all_layer_ids),
+                dtype=toks.dtype,
+                device=self.device,
+            )
+            for i, idx in enumerate(offset_idxes):
+                full_toks[..., idx] = toks[..., i]
+            toks = full_toks
+
+        self.hparams.codec_vocoder.tokenize = False
+        sig = self.hparams.codec_vocoder(toks)[:, 0]  # [B, T]
+        return sig
+
+class Tokenizer:
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+
+    @torch.no_grad()
+    def encode(self, sig, lens, **kwargs):
+        toks = self.tokenizer.sig_to_toks(sig, lens, **kwargs)
+        return toks
+
+    @torch.no_grad()
+    def decode(self, sig, **kwargs):
+        sig = self.tokenizer.toks_to_sig(sig, **kwargs)
+        return sig
+
+
+# model_hub = "facebook/encodec_24khz"
+# save_path = "savedir"
+# model = Tokenizer_Encodec(model_hub, save_path)
+# from speechbrain.lobes.models.huggingface_transformers.hubert import (HuBERT)
+# inputs = torch.rand([3, 2000])
+# model_hub = "facebook/hubert-large-ll60k"
+# save_path = "savedir"
+# ssl_layer_num = [7, 23]
+# deduplicate = [False, True]
+# bpe_tokenizers = [None, None]
+# kmeans_repo_id = "speechbrain/SSL_Quantization"
+# kmeans_dataset = "LJSpeech"
+# num_clusters = 1000
+# ssl_model = HuBERT(model_hub, save_path, output_all_hiddens=True)
+# model = DiscreteSSL(save_path, ssl_model, kmeans_repo_id=kmeans_repo_id, kmeans_dataset=kmeans_dataset, num_clusters=num_clusters)
+model_hub = "fnlp/SpeechTokenizer"
+save_path = "savedir"
+model = Tokenizer_SpeechTokenizer(model_hub, save_path)  # doctest: +SKIP
+tokenizer = Tokenizer(model)
+audio = torch.randn(4, 1000)
+length = torch.tensor([1.0, 0.5, 0.75, 1.0])
+tokens = tokenizer.encode(audio, length, num_codebooks=2)
+print(tokens.shape)
+rec = tokenizer.decode(tokens)
+print(rec.shape)
\ No newline at end of file

From 0c2b751c595c9a63a2bd66b14f32e2faa13478d8 Mon Sep 17 00:00:00 2001
From: poonehmousavi
Date: Wed, 6 Nov 2024 17:53:55 -0500
Subject: [PATCH 2/9] add refactored version of ASR

---
 .../ASR-refactor/hparams/LSTM/dac.yaml        | 232 +++++++++
 .../ASR-refactor/hparams/LSTM/encodec.yaml    | 232 +++++++++
 .../hparams/LSTM/speech_tokenizer.yaml        | 222 +++++++++
 .../ASR-refactor/hparams/contextnet/dac.yaml  | 225 +++++++++
 .../hparams/contextnet/encodec.yaml           | 223 +++++++++
 .../hparams/contextnet/speech_tokenizer.yaml  | 213 +++++++++
 .../ASR-refactor/librispeech_prepare.py       |   1 +
 .../DASB/LibriSpeech/ASR-refactor/train.py    | 447 ++++++++++++++++++
 benchmarks/DASB/model/ __init__.py            |   1 +
 benchmarks/DASB/model/custom_model.py         |  17 +-
 benchmarks/DASB/model/tokenizer_interface.py  | 231 ++++-----
 11 files changed, 1933 insertions(+), 111 deletions(-)
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml
 create mode 120000 benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py
 create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/train.py
 create mode 100644 benchmarks/DASB/model/ __init__.py

diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml
b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml new file mode 100644 index 000000000..4accc2241 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
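+# pretrain_embeddings: when True, train.py calls tokenizer.get_pretrained_embeddings()
+# and copies the codec's own codebook vectors into the embedding layer via
+# init_embedding (see model/custom_model.py), so encoder_dim must match the codec's embedding size.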
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml new file mode 100644 index 000000000..03c29ddbb --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml new file mode 100644 index 000000000..8105204a5 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -0,0 +1,222 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: LSTM Encoder +# Decoder: CTC 
beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml new file mode 100644 index 000000000..eabeef113 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml @@ -0,0 +1,225 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml new file mode 100644 index 000000000..c0411bd76 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml @@ -0,0 +1,223 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of 
yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/Contexnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml new file mode 100644 index 000000000..77ef2c540 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml @@ -0,0 +1,213 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: 
character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py new file mode 100644 index 000000000..61b6c56f4 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -0,0 +1,447 @@ +#!/usr/bin/env/python3 +"""Recipe for training an discrete tokens ctc ASR system with librispeech. 
+ +Decoding is performed with greedy decoding at validation time. +At test time, beamsearch is used with an optional external language model. + +Authors + * Pooneh Mousavi 2024 +""" + +import os +import sys +import torch +import torchaudio +import logging +import speechbrain as sb +from speechbrain.utils.distributed import run_on_main, if_main_process +from speechbrain.tokenizers.SentencePiece import SentencePiece +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + + +logger = logging.getLogger(__name__) + +_CACHE = {"size": 0} + +# Define training procedure +class ASR(sb.Brain): + def compute_forward(self, batch, stage): + """Forward computations from the waveform batches to the output probabilities.""" + batch = batch.to(self.device) + wavs, wav_lens = batch.sig + + + # Add waveform augmentation if specified. + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] + + current_epoch = self.hparams.epoch_counter.current + + # compute features + # Extract tokens (cache them at first epoch if augmentation is disabled) + key = tuple(sorted(batch.id)) + try: + in_toks = _CACHE[key] + in_toks = in_toks.to(self.device) + except KeyError: + with torch.no_grad(): + self.hparams.tokenizer.eval().to(self.device) + in_toks = self.hparams.tokenizer.sig_to_tokens(wavs, wav_lens,num_codebooks=hparams['num_codebooks']) #[B, T, N-Q] + if stage != sb.Stage.TRAIN or ( + stage == sb.Stage.TRAIN and (not hasattr(self.hparams, "wav_augment")) + ): + if _CACHE["size"] < self.hparams.cache_size: + _CACHE[key] = in_toks.cpu() + _CACHE["size"] += in_toks.numel() + + # Extract embeddings + in_embs = self.modules.discrete_embedding_layer(in_toks) #[B, T, N-Q, D] + + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) #[B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze(-2) #[B, T, D] + + # forward modules + if type(self.modules.encoder).__name__ == "ContextNet": + enc_out = self.modules.encoder(in_embs) + + elif type(self.modules.encoder).__name__ == "LSTM": + enc_out, _ = self.modules.encoder( + in_embs + ) + + else: + raise NotImplementedError + + # output layer for ctc log-probabilities + logits = self.modules.ctc_lin(enc_out) + p_ctc = self.hparams.log_softmax(logits) + + p_tokens = None + if stage == sb.Stage.VALID: + p_tokens = sb.decoders.ctc_greedy_decode( + p_ctc, wav_lens, blank_id=self.hparams.blank_index + ) + elif stage == sb.Stage.TEST: + p_tokens = test_searcher(p_ctc, wav_lens) + + return p_ctc, wav_lens, p_tokens + + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (CTC+NLL) given predictions and targets.""" + + p_ctc, wav_lens, predicted_tokens = predictions + ids = batch.id + tokens, tokens_lens = batch.tokens + + + # Label Augmentation + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) + elif stage == sb.Stage.TEST: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + target_words = [wrd.split(" 
") for wrd in batch.wrd] + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.wer_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of a epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + current_epoch = self.hparams.epoch_counter.current + valid_search_interval = self.hparams.valid_search_interval + if ( + current_epoch % valid_search_interval == 0 + or stage == sb.Stage.TEST + ): + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # log stats and save checkpoint at end-of-epoch + if stage == sb.Stage.VALID: + if type(self.hparams.scheduler).__name__ == "NewBobScheduler": + lr, new_lr = self.hparams.scheduler( + stage_stats["loss"] + ) + sb.nnet.schedulers.update_learning_rate( + self.optimizer, new_lr + ) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr = self.hparams.scheduler.current_lr + steps = self.optimizer_step + + else: + raise NotImplementedError + + optimizer = self.optimizer.__class__.__name__ + epoch_stats = { + "epoch": epoch, + "lr": lr, + "optimizer": optimizer, + } + self.hparams.train_logger.log_stats( + stats_meta=epoch_stats, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"], "epoch": epoch}, + min_keys=["WER"], + num_to_keep=self.hparams.avg_checkpoints, + ) + + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open(self.hparams.output_wer_folder, "w", encoding="utf-8") as w: + self.wer_metric.write_stats(w) + + def on_fit_batch_end(self, batch, outputs, loss, should_step): + if should_step and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + self.hparams.scheduler(self.optimizer) + + + +def dataio_prepare(hparams, tokenizer): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions.""" + data_folder = hparams["data_folder"] + + train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, + ) + + if hparams["sorting"] == "ascending": + # we sort training data to speed up training and get better results. + train_data = train_data.filtered_sorted(sort_key="duration") + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + train_data = train_data.filtered_sorted( + sort_key="duration", reverse=True + ) + # when sorting do not shuffle in dataloader ! 
otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, + ) + valid_data = valid_data.filtered_sorted(sort_key="duration") + + # test is separate + test_datasets = {} + for csv_file in hparams["test_csv"]: + name = Path(csv_file).stem + test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_file, replacements={"data_root": data_folder} + ) + test_datasets[name] = test_datasets[name].filtered_sorted( + sort_key="duration" + ) + + datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] + + # 2. Define audio pipeline: + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + info = torchaudio.info(wav) + resampled = torchaudio.transforms.Resample( + info.sample_rate, hparams["sample_rate"], + )(sig) + #resampled = resampled.unsqueeze(0) + return resampled + + sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) + + # 3. Define text pipeline: + @sb.utils.data_pipeline.takes("wrd") + @sb.utils.data_pipeline.provides( + "wrd", "char_list", "tokens_list", "tokens" + ) + def text_pipeline(wrd): + yield wrd + char_list = list(wrd) + yield char_list + tokens_list = tokenizer.sp.encode_as_ids(wrd) + yield tokens_list + tokens = torch.LongTensor(tokens_list) + yield tokens + + sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) + + + # 4. Set output: + sb.dataio.dataset.set_output_keys( + datasets, ["id", "sig", "wrd", "char_list", "tokens"], + ) + + # 5. If Dynamic Batching is used, we instantiate the needed samplers. 
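+    # DynamicBatchSampler groups utterances of similar duration into buckets and
+    # fills each batch up to max_batch_length; since length_func returns the
+    # duration, that limit is expressed in seconds of audio, which keeps the
+    # amount of padding per batch small.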
+ train_batch_sampler = None + valid_batch_sampler = None + if hparams["dynamic_batching"]: + from speechbrain.dataio.sampler import DynamicBatchSampler # noqa + + dynamic_hparams_train = hparams["dynamic_batch_sampler_train"] + dynamic_hparams_val = hparams["dynamic_batch_sampler_val"] + + train_batch_sampler = DynamicBatchSampler( + train_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_train, + ) + + valid_batch_sampler = DynamicBatchSampler( + valid_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_val, + ) + + return ( + train_data, + valid_data, + test_datasets, + train_batch_sampler, + valid_batch_sampler, + ) + + +if __name__ == "__main__": + + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # If distributed_launch=True then + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["output_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + # Defining tokenizer and loading it + tokenizer = SentencePiece( + model_dir=hparams["save_folder"], + vocab_size=hparams["output_neurons"], + annotation_train=hparams["train_csv"], + annotation_read="wrd", + model_type=hparams["token_type"], + character_coverage=hparams["character_coverage"], + bos_id=hparams["bos_index"], + eos_id=hparams["eos_index"], + ) + + # here we create the datasets objects as well as tokenization and encoding + ( + train_data, + valid_data, + test_datasets, + train_bsampler, + valid_bsampler, + ) = dataio_prepare(hparams, tokenizer) + + # Use pretrained embeddings + if hparams["pretrain_embeddings"]: + embs= hparams["tokenizer"].get_pretrained_embeddings(device=run_opts["device"],num_codebooks=hparams['num_codebooks'], vocab_size=hparams["vocab_size"]) + hparams["discrete_embedding_layer"].init_embedding(embs) + + + # Log number of parameters/buffers + codec_params = sum([x.numel() for x in hparams["tokenizer"].state_dict().values()]) + model_params = sum( + [ + x.numel() + for module in hparams["modules"].values() + for x in module.state_dict().values() + ] + ) + hparams["train_logger"].log_stats( + stats_meta={ + f"Codec parameters/buffers (M)": f"{codec_params / 1e6:.2f}", + "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}", + }, + ) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["model_opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # Adding objects to trainer. 
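+    # The SentencePiece tokenizer is attached to the Brain so compute_objectives
+    # can decode hypotheses back into words, and vocab_list provides the symbols
+    # scored by the CTC beam searcher at test time.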
+ asr_brain.tokenizer = tokenizer + vocab_list = [ + tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) + ] + + from speechbrain.decoders.ctc import CTCBeamSearcher + + test_searcher = CTCBeamSearcher( + **hparams["test_beam_search"], + vocab_list=vocab_list, + ) + + train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + + if train_bsampler is not None: + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if valid_bsampler is not None: + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py new file mode 100644 index 000000000..e7db8766a --- /dev/null +++ b/benchmarks/DASB/model/ __init__.py @@ -0,0 +1 @@ +from model.tokenizer_interface import EncodecTokenizer \ No newline at end of file diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index b6e11a0d2..d3bf3cc9f 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -57,9 +57,9 @@ def __init__( num_codebooks, vocab_size, emb_dim, - pad_index=0, init=False, freeze=False, + hidden_dim =None, ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size @@ -70,10 +70,17 @@ def __init__( ).requires_grad_(not self.freeze) self.init = init - def init_embedding(self, weights): - with torch.no_grad(): - self.embedding.weight = torch.nn.Parameter(weights) + # Add a linear layer to match dimensions if necessary + if hidden_dim is not None and hidden_dim != emb_dim: + self.proj_layer = torch.nn.Linear(emb_dim, hidden_dim) + else: + self.proj_layer = None + + def init_embedding(self, weights): + self.embedding.weight.data.copy_(weights) + + def forward(self, in_tokens): """Computes the embedding for discrete tokens. a sample. 
@@ -97,4 +104,6 @@ def forward(self, in_tokens): ) # Forward Pass to embedding and in_embs = self.embedding(in_tokens) + if self.proj_layer is not None: + in_embs = self.proj_layer(in_embs) return in_embs diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 892bef6b3..351652a57 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -10,92 +10,152 @@ """ import torch +from abc import ABC, abstractmethod +from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.discrete.dac import DAC +from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface -from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec -from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL -from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface +class BaseTokenizer(ABC): + @abstractmethod + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, **kwargs): + """Abstract method to encode a signal into tokens.""" + pass + + @abstractmethod + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + """Abstract method to decode tokens into a signal.""" + pass + + @abstractmethod + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + pass -class Tokenizer_Encodec(Encodec): +class EncodecTokenizer(Encodec, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks, _ = self.encode(sig, lens) # [B, N, K] - return toks + tokens, _ = self.encode(signal, lengths) # [B, T, N_Q] + return tokens @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - sig = self.decode(toks)[:, 0] # [B, T] - return sig - -class Tokenizer_DAC(DAC): + signal = self.decode(tokens)[:, 0] # [B, T] + return signal + @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + embeddings = self.vocabulary + return embeddings.reshape(-1, embeddings.shape[-1]) + +class DACTokenizer(DAC, BaseTokenizer): + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks, _ = self( - sig[:, None], n_quantizers=kwargs['num_codebooks'] - ) # [B, K, N] - toks = toks.movedim(-1, -2) # [B, N, K] - return toks + tokens, _ = self( + signal[:, None], n_quantizers=kwargs['num_codebooks'] + ) # [B, N_Q, T] + return tokens.movedim(-1, -2) # [B, T, N_Q] @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - qfeats, _, _ = self.quantizer.from_codes( - toks.movedim(-1, -2) # [B, K, N] + quantized_feats, _, _ = self.quantizer.from_codes( + tokens.movedim(-1, -2) # [B, N_Q, T] ) - sig = self.decode(qfeats)[:, 0] # [B, T] - return sig - -class Tokenizer_SpeechTokenizer(SpeechTokenizer_interface): + signal = self.decode(quantized_feats)[:, 0] # [B, T] + return signal + + @torch.no_grad() + def 
get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + # See https://github.com/descriptinc/descript-audio-codec/blob/c7cfc5d2647e26471dc394f95846a0830e7bec34/dac/nn/quantize.py#L200 + toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) + toks = ( + toks[:, None, None].expand(-1, kwargs["num_codebooks"], -1).clone() + ) # [C, K, 1] + self.to(kwargs["device"]).eval() + with torch.no_grad(): + z_q, z_p, _ = self.quantizer.from_codes(toks) + z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) # [C, D, 1] * K + z_qs = [] + for i, z_p_i in enumerate(z_ps): + with torch.no_grad(): + z_q_i = ( + self.quantizer.quantizers[i].out_proj(z_p_i) + ) # [C, H, 1] + z_qs.append(z_q_i) + assert (z_q == sum(z_qs)).all() + embeddings = torch.cat(z_qs)[:, :, 0] # [CK, H] + return embeddings + +class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks = self(sig)[ - : kwargs['num_codebooks'] - ] # [K, B, N] - toks = toks.movedim(-3, -1) # [B, N, K] - return toks + tokens = self(signal)[: kwargs['num_codebooks']] # [N_Q, B, T] + return tokens.movedim(-3, -1) # [B, T, N_Q] @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - toks = toks.movedim(-1, -3) # [K, B, N] - sig = self.decode(toks) # [B, T] - return sig - -class Tokenizer_DiscreteSSL(DiscreteSSL): + tokens = tokens.movedim(-1, -3) # [N_Q, B, T] + return self.decode(tokens) # [B, T] + + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + # See https://github.com/ZhangXInFD/SpeechTokenizer/blob/a9f88dc72642b600654a62861e34342babae6c71/speechtokenizer/quantization/core_vq.py#L360 + toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) + toks = ( + toks[None, :, None].expand(kwargs["num_codebooks"], -1, -1).clone() + ) # [K, C, 1] + self.to(kwargs["device"]).eval() + embs = [] + for i, indices in enumerate(toks): + layer = self.model.quantizer.vq.layers[i] + with torch.no_grad(): + quantized = layer.decode(indices) # [C, H, 1] + embs.append(quantized) + assert ( + self.model.quantizer.decode(toks) == sum(embs) + ).all() + embeddings = torch.cat(embs)[:, :, 0] # [CK, H] + return embeddings + +class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths): + # signal: [B, T] self.hparams.codec_quantizer.to(self.device).eval() - toks, _, _ = self.hparams.codec_quantizer( - sig, - lens, + tokens, _, _ = self.hparams.codec_quantizer( + signal, + lengths, SSL_layers=self.hparams.SSL_layers, deduplicates=[False] * len(self.hparams.SSL_layers), bpe_tokenizers=[None] * len(self.hparams.SSL_layers), - ) # [B, N, K] - return toks + ) # [B, T, N_Q] + return tokens @torch.no_grad() - def toks_to_sig(self, toks): - # toks: [B, N, K] - self.hparams.codec_vocoder.device = self.device + def tokens_to_sig(self, tokens): + # tokens: [B, T, N_Q] self.hparams.codec_vocoder.to(self.device).eval() - # Add offset for embedding layer all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids - # TODO: remove after testing - assert tuple(all_layer_ids) == (1, 3, 7, 12, 18, 23) offsets = torch.arange( 0, len(all_layer_ids) * self.hparams.vocab_size, @@ -104,61 +164,18 @@ def 
toks_to_sig(self, toks): ) offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] offsets = offsets[offset_idxes] - toks = toks + offsets + 1 + tokens += offsets + 1 - # Handle missing codebooks if len(self.hparams.SSL_layers) < len(all_layer_ids): - full_toks = torch.zeros( - *toks.shape[:2], + full_tokens = torch.zeros( + *tokens.shape[:2], len(all_layer_ids), - dtype=toks.dtype, + dtype=tokens.dtype, device=self.device, ) for i, idx in enumerate(offset_idxes): - full_toks[..., idx] = toks[..., i] - toks = full_toks + full_tokens[..., idx] = tokens[..., i] + tokens = full_tokens self.hparams.codec_vocoder.tokenize = False - sig = self.hparams.codec_vocoder(toks)[:, 0] # [B, T] - return sig - -class Tokenizer: - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - @torch.no_grad() - def encode(self,sig, lens,**kwargs): - toks = self.tokenizer.sig_to_toks(sig, lens,**kwargs) - return toks - - @torch.no_grad() - def decode(self,sig,**kwargs): - sig = self.tokenizer.toks_to_sig(sig,**kwargs) - return sig - - -# model_hub = "facebook/encodec_24khz" -# save_path = "savedir" -# model = Tokenizer_Encodec(model_hub, save_path) -# from speechbrain.lobes.models.huggingface_transformers.hubert import (HuBERT) -# inputs = torch.rand([3, 2000]) -# model_hub = "facebook/hubert-large-ll60k" -# save_path = "savedir" -# ssl_layer_num = [7,23] -# deduplicate =[False, True] -# bpe_tokenizers=[None, None] -# kmeans_repo_id = "speechbrain/SSL_Quantization" -# kmeans_dataset = "LJSpeech" -# num_clusters = 1000 -# ssl_model = HuBERT(model_hub, save_path,output_all_hiddens=True) -# model = DiscreteSSL(save_path, ssl_model, kmeans_repo_id=kmeans_repo_id, kmeans_dataset=kmeans_dataset,num_clusters=num_clusters) -model_hub = "fnlp/SpeechTokenizer" -save_path = "savedir" -model =Tokenizer_SpeechTokenizer(model_hub, save_path) # doctest: +SKIP -tokenizer= Tokenizer(model) -audio = torch.randn(4, 1000) -length = torch.tensor([1.0, .5, .75, 1.0]) -tokens = tokenizer.encode(audio, length,num_codebooks=2) -print(tokens.shape) -rec = tokenizer.decode(tokens) -print(rec.shape) \ No newline at end of file + return self.hparams.codec_vocoder(tokens)[:, 0] # [B, T] From 17898c3472ec45ae2173b5894f7c7e550918d9d4 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:40:14 -0500 Subject: [PATCH 3/9] fix precommit --- .../ASR-refactor/hparams/LSTM/dac.yaml | 4 +- .../ASR-refactor/hparams/LSTM/encodec.yaml | 4 +- .../hparams/LSTM/speech_tokenizer.yaml | 2 +- .../ASR-refactor/hparams/contextnet/dac.yaml | 4 +- .../hparams/contextnet/encodec.yaml | 5 +- .../hparams/contextnet/speech_tokenizer.yaml | 4 +- .../DASB/LibriSpeech/ASR-refactor/train.py | 78 +++++++++-------- benchmarks/DASB/model/ __init__.py | 2 +- benchmarks/DASB/model/custom_model.py | 6 +- benchmarks/DASB/model/tokenizer_interface.py | 84 +++++++------------ 10 files changed, 84 insertions(+), 109 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index 4accc2241..806305774 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -182,7 +182,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -193,7 +193,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True 
-scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml index 03c29ddbb..18d967244 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -182,7 +182,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -193,7 +193,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml index 8105204a5..55d7c3c91 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -172,7 +172,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml index eabeef113..aa7d2e141 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml @@ -175,7 +175,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -186,7 +186,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml index c0411bd76..a1b5262d3 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml @@ -107,7 +107,6 @@ encoder_dim: 1024 pretrain_embeddings: False freeze_embedding: False - output_neurons: 31 # BPE parameters @@ -173,7 +172,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -184,7 +183,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml 
b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml index 77ef2c540..c12d6f79f 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml @@ -163,7 +163,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -174,7 +174,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index 61b6c56f4..baa80c80e 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -18,6 +18,7 @@ from speechbrain.tokenizers.SentencePiece import SentencePiece from hyperpyyaml import load_hyperpyyaml from pathlib import Path + base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) sys.path.append(base_dir) @@ -32,11 +33,10 @@ def compute_forward(self, batch, stage): """Forward computations from the waveform batches to the output probabilities.""" batch = batch.to(self.device) wavs, wav_lens = batch.sig - # Add waveform augmentation if specified. if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] current_epoch = self.hparams.epoch_counter.current @@ -49,33 +49,38 @@ def compute_forward(self, batch, stage): except KeyError: with torch.no_grad(): self.hparams.tokenizer.eval().to(self.device) - in_toks = self.hparams.tokenizer.sig_to_tokens(wavs, wav_lens,num_codebooks=hparams['num_codebooks']) #[B, T, N-Q] + in_toks = self.hparams.tokenizer.sig_to_tokens( + wavs, wav_lens, num_codebooks=hparams["num_codebooks"] + ) # [B, T, N-Q] if stage != sb.Stage.TRAIN or ( - stage == sb.Stage.TRAIN and (not hasattr(self.hparams, "wav_augment")) + stage == sb.Stage.TRAIN + and (not hasattr(self.hparams, "wav_augment")) ): if _CACHE["size"] < self.hparams.cache_size: _CACHE[key] = in_toks.cpu() _CACHE["size"] += in_toks.numel() # Extract embeddings - in_embs = self.modules.discrete_embedding_layer(in_toks) #[B, T, N-Q, D] + in_embs = self.modules.discrete_embedding_layer( + in_toks + ) # [B, T, N-Q, D] - # Attention-Pooling - att_w = self.modules.attention_mlp(in_embs) #[B, T, N-Q, 1] - in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze(-2) #[B, T, D] + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) # [B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze( + -2 + ) # [B, T, D] # forward modules if type(self.modules.encoder).__name__ == "ContextNet": enc_out = self.modules.encoder(in_embs) elif type(self.modules.encoder).__name__ == "LSTM": - enc_out, _ = self.modules.encoder( - in_embs - ) + enc_out, _ = self.modules.encoder(in_embs) else: raise NotImplementedError - + # output layer for ctc log-probabilities logits = self.modules.ctc_lin(enc_out) p_ctc = self.hparams.log_softmax(logits) @@ -89,7 +94,6 @@ def compute_forward(self, batch, stage): p_tokens = test_searcher(p_ctc, wav_lens) return p_ctc, wav_lens, 
p_tokens - def compute_objectives(self, predictions, batch, stage): """Computes the loss (CTC+NLL) given predictions and targets.""" @@ -98,14 +102,13 @@ def compute_objectives(self, predictions, batch, stage): ids = batch.id tokens, tokens_lens = batch.tokens - # Label Augmentation if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): tokens = self.hparams.wav_augment.replicate_labels(tokens) tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - + if stage == sb.Stage.VALID: # Decode token terms to words predicted_words = self.tokenizer( @@ -149,19 +152,15 @@ def on_stage_end(self, stage, stage_loss, epoch): # log stats and save checkpoint at end-of-epoch if stage == sb.Stage.VALID: if type(self.hparams.scheduler).__name__ == "NewBobScheduler": - lr, new_lr = self.hparams.scheduler( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.optimizer, new_lr - ) - elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr, new_lr = self.hparams.scheduler(stage_stats["loss"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": lr = self.hparams.scheduler.current_lr steps = self.optimizer_step - + else: raise NotImplementedError - + optimizer = self.optimizer.__class__.__name__ epoch_stats = { "epoch": epoch, @@ -185,15 +184,19 @@ def on_stage_end(self, stage, stage_loss, epoch): test_stats=stage_stats, ) if if_main_process(): - with open(self.hparams.output_wer_folder, "w", encoding="utf-8") as w: + with open( + self.hparams.output_wer_folder, "w", encoding="utf-8" + ) as w: self.wer_metric.write_stats(w) def on_fit_batch_end(self, batch, outputs, loss, should_step): - if should_step and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + if ( + should_step + and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler" + ): self.hparams.scheduler(self.optimizer) - def dataio_prepare(hparams, tokenizer): """This function prepares the datasets to be used in the brain class. It also defines the data processing pipeline through user-defined functions.""" @@ -251,7 +254,7 @@ def audio_pipeline(wav): resampled = torchaudio.transforms.Resample( info.sample_rate, hparams["sample_rate"], )(sig) - #resampled = resampled.unsqueeze(0) + # resampled = resampled.unsqueeze(0) return resampled sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) @@ -272,7 +275,6 @@ def text_pipeline(wrd): sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - # 4. 
Set output: sb.dataio.dataset.set_output_keys( datasets, ["id", "sig", "wrd", "char_list", "tokens"], @@ -319,7 +321,6 @@ def text_pipeline(wrd): # create ddp_group with the right communication protocol sb.utils.distributed.ddp_init_group(run_opts) - # Create experiment directory sb.create_experiment_directory( experiment_directory=hparams["output_folder"], @@ -327,7 +328,6 @@ def text_pipeline(wrd): overrides=overrides, ) - # Dataset prep (parsing Librispeech) from librispeech_prepare import prepare_librispeech # noqa @@ -369,12 +369,17 @@ def text_pipeline(wrd): # Use pretrained embeddings if hparams["pretrain_embeddings"]: - embs= hparams["tokenizer"].get_pretrained_embeddings(device=run_opts["device"],num_codebooks=hparams['num_codebooks'], vocab_size=hparams["vocab_size"]) + embs = hparams["tokenizer"].get_pretrained_embeddings( + device=run_opts["device"], + num_codebooks=hparams["num_codebooks"], + vocab_size=hparams["vocab_size"], + ) hparams["discrete_embedding_layer"].init_embedding(embs) - # Log number of parameters/buffers - codec_params = sum([x.numel() for x in hparams["tokenizer"].state_dict().values()]) + codec_params = sum( + [x.numel() for x in hparams["tokenizer"].state_dict().values()] + ) model_params = sum( [ x.numel() @@ -407,8 +412,7 @@ def text_pipeline(wrd): from speechbrain.decoders.ctc import CTCBeamSearcher test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], - vocab_list=vocab_list, + **hparams["test_beam_search"], vocab_list=vocab_list, ) train_dataloader_opts = hparams["train_dataloader_opts"] diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py index e7db8766a..b59bcdfa5 100644 --- a/benchmarks/DASB/model/ __init__.py +++ b/benchmarks/DASB/model/ __init__.py @@ -1 +1 @@ -from model.tokenizer_interface import EncodecTokenizer \ No newline at end of file +from model.tokenizer_interface import EncodecTokenizer diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index d3bf3cc9f..1c655fc65 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -59,7 +59,7 @@ def __init__( emb_dim, init=False, freeze=False, - hidden_dim =None, + hidden_dim=None, ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size @@ -76,11 +76,9 @@ def __init__( else: self.proj_layer = None - def init_embedding(self, weights): self.embedding.weight.data.copy_(weights) - - + def forward(self, in_tokens): """Computes the embedding for discrete tokens. a sample. diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 351652a57..604e3a403 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -1,4 +1,3 @@ - """ Unified interface for tokenizers, standardizing the output shape of encode and decode functions. 
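The contract this interface standardizes can be stated with a toy stand-in (not one of the real codecs; the hop size of 320 and all sizes below are made-up illustration values): sig_to_tokens maps a [B, T] signal to [B, T', N_Q] integer tokens, tokens_to_sig maps tokens back to a [B, T] signal, and get_pretrained_embeddings returns one row per (codebook, code) pair.

import torch

class ToyTokenizer:
    vocab_size = 16
    num_codebooks = 2

    @torch.no_grad()
    def sig_to_tokens(self, signal, lengths, **kwargs):
        batch, time = signal.shape
        return torch.randint(
            0, self.vocab_size, (batch, time // 320, self.num_codebooks)
        )  # [B, T', N_Q]

    @torch.no_grad()
    def tokens_to_sig(self, tokens, **kwargs):
        return torch.randn(tokens.shape[0], tokens.shape[1] * 320)  # [B, T]

    @torch.no_grad()
    def get_pretrained_embeddings(self, **kwargs):
        return torch.randn(self.num_codebooks * self.vocab_size, 8)  # [N_Q * V, H]

signal = torch.randn(4, 16000)
lengths = torch.tensor([1.0, 0.5, 0.75, 1.0])
tokens = ToyTokenizer().sig_to_tokens(signal, lengths, num_codebooks=2)
print(tokens.shape)  # torch.Size([4, 50, 2])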
@@ -12,9 +11,13 @@ import torch from abc import ABC, abstractmethod from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec -from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( + DiscreteSSL, +) from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface +from speechbrain.lobes.models.discrete.speechtokenizer_interface import ( + SpeechTokenizer_interface, +) class BaseTokenizer(ABC): @@ -29,13 +32,14 @@ def sig_to_tokens(self, signal, lengths, **kwargs): def tokens_to_sig(self, tokens, **kwargs): """Abstract method to decode tokens into a signal.""" pass - + @abstractmethod @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" pass + class EncodecTokenizer(Encodec, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): @@ -50,20 +54,21 @@ def tokens_to_sig(self, tokens, **kwargs): self.eval() signal = self.decode(tokens)[:, 0] # [B, T] return signal - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" embeddings = self.vocabulary return embeddings.reshape(-1, embeddings.shape[-1]) + class DACTokenizer(DAC, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): # signal: [B, T] self.eval() tokens, _ = self( - signal[:, None], n_quantizers=kwargs['num_codebooks'] + signal[:, None], n_quantizers=kwargs["num_codebooks"] ) # [B, N_Q, T] return tokens.movedim(-1, -2) # [B, T, N_Q] @@ -76,7 +81,7 @@ def tokens_to_sig(self, tokens, **kwargs): ) signal = self.decode(quantized_feats)[:, 0] # [B, T] return signal - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" @@ -88,24 +93,25 @@ def get_pretrained_embeddings(self, **kwargs): self.to(kwargs["device"]).eval() with torch.no_grad(): z_q, z_p, _ = self.quantizer.from_codes(toks) - z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) # [C, D, 1] * K + z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) z_qs = [] for i, z_p_i in enumerate(z_ps): with torch.no_grad(): - z_q_i = ( - self.quantizer.quantizers[i].out_proj(z_p_i) + z_q_i = self.quantizer.quantizers[i].out_proj( + z_p_i ) # [C, H, 1] z_qs.append(z_q_i) assert (z_q == sum(z_qs)).all() - embeddings = torch.cat(z_qs)[:, :, 0] # [CK, H] + embeddings = torch.cat(z_qs)[:, :, 0] return embeddings + class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): # signal: [B, T] self.eval() - tokens = self(signal)[: kwargs['num_codebooks']] # [N_Q, B, T] + tokens = self(signal)[: kwargs["num_codebooks"]] # [N_Q, B, T] return tokens.movedim(-3, -1) # [B, T, N_Q] @torch.no_grad() @@ -114,7 +120,7 @@ def tokens_to_sig(self, tokens, **kwargs): self.eval() tokens = tokens.movedim(-1, -3) # [N_Q, B, T] return self.decode(tokens) # [B, T] - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" @@ -128,54 +134,22 @@ def get_pretrained_embeddings(self, **kwargs): for i, indices in enumerate(toks): layer = self.model.quantizer.vq.layers[i] with torch.no_grad(): - quantized = layer.decode(indices) # [C, H, 1] + quantized = layer.decode(indices) embs.append(quantized) - assert ( - self.model.quantizer.decode(toks) == sum(embs) - 
).all() - embeddings = torch.cat(embs)[:, :, 0] # [CK, H] + assert (self.model.quantizer.decode(toks) == sum(embs)).all() + embeddings = torch.cat(embs)[:, :, 0] return embeddings + class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths): - # signal: [B, T] - self.hparams.codec_quantizer.to(self.device).eval() - tokens, _, _ = self.hparams.codec_quantizer( - signal, - lengths, - SSL_layers=self.hparams.SSL_layers, - deduplicates=[False] * len(self.hparams.SSL_layers), - bpe_tokenizers=[None] * len(self.hparams.SSL_layers), - ) # [B, T, N_Q] - return tokens + pass @torch.no_grad() def tokens_to_sig(self, tokens): - # tokens: [B, T, N_Q] - self.hparams.codec_vocoder.to(self.device).eval() - - all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids - offsets = torch.arange( - 0, - len(all_layer_ids) * self.hparams.vocab_size, - self.hparams.vocab_size, - device=self.device, - ) - offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] - offsets = offsets[offset_idxes] - tokens += offsets + 1 - - if len(self.hparams.SSL_layers) < len(all_layer_ids): - full_tokens = torch.zeros( - *tokens.shape[:2], - len(all_layer_ids), - dtype=tokens.dtype, - device=self.device, - ) - for i, idx in enumerate(offset_idxes): - full_tokens[..., idx] = tokens[..., i] - tokens = full_tokens - - self.hparams.codec_vocoder.tokenize = False - return self.hparams.codec_vocoder(tokens)[:, 0] # [B, T] + pass + + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + pass From db1590ee346dab0896723cf8184ba8b1e12355b8 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:54:09 -0500 Subject: [PATCH 4/9] fix flake --- benchmarks/DASB/LibriSpeech/ASR-refactor/train.py | 5 +---- benchmarks/DASB/model/ __init__.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index baa80c80e..99eeb81fe 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -27,6 +27,7 @@ _CACHE = {"size": 0} + # Define training procedure class ASR(sb.Brain): def compute_forward(self, batch, stage): @@ -38,8 +39,6 @@ def compute_forward(self, batch, stage): if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] - current_epoch = self.hparams.epoch_counter.current - # compute features # Extract tokens (cache them at first epoch if augmentation is disabled) key = tuple(sorted(batch.id)) @@ -156,8 +155,6 @@ def on_stage_end(self, stage, stage_loss, epoch): sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": lr = self.hparams.scheduler.current_lr - steps = self.optimizer_step - else: raise NotImplementedError diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py index b59bcdfa5..e69de29bb 100644 --- a/benchmarks/DASB/model/ __init__.py +++ b/benchmarks/DASB/model/ __init__.py @@ -1 +0,0 @@ -from model.tokenizer_interface import EncodecTokenizer From 3361ac6e9c21e94d2957d76347c7c19bfeab88ad Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:56:08 -0500 Subject: [PATCH 5/9] fix blank index --- .../LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml index 55d7c3c91..99d423b87 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -183,7 +183,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 From 1dd6c9e7d6cbce93335cfcb8e2ee152e09782331 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 26 Nov 2024 18:35:02 -0500 Subject: [PATCH 6/9] add hyp tuning [draft] --- .../ASR-refactor/hparams/LSTM/dac.yaml | 4 +- benchmarks/DASB/extra_requirements.txt | 2 + benchmarks/DASB/orion/hparams_bohb.yaml | 6 + benchmarks/DASB/run_experiments.sh | 220 ++++++++++ benchmarks/DASB/run_hparam_optimization.sh | 405 ++++++++++++++++++ benchmarks/DASB/utils/aggregate_results.py | 145 +++++++ 6 files changed, 780 insertions(+), 2 deletions(-) create mode 100755 benchmarks/DASB/orion/hparams_bohb.yaml create mode 100755 benchmarks/DASB/run_experiments.sh create mode 100644 benchmarks/DASB/run_hparam_optimization.sh create mode 100644 benchmarks/DASB/utils/aggregate_results.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index 806305774..e02076cfb 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -23,9 +23,9 @@ data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech # then data_folder_rirs should be /localscratch/xxx_corpus # otherwise the dataset will automatically be downloaded # data_folder_rirs: !ref -train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +train_splits: ["train-clean-100"] dev_splits: ["dev-clean"] -test_splits: ["dev-clean", "test-clean", "test-other"] +test_splits: ["dev-clean", "test-clean"] skip_prep: False train_csv: !ref /train.csv valid_csv: !ref /dev-clean.csv diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index 4d1d241c3..4b693ec1b 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -8,3 +8,5 @@ speechtokenizer>=0.1.2 tensorboard tgt unidecode +orion[bohb] +ConfigSpace==0.7.1 diff --git a/benchmarks/DASB/orion/hparams_bohb.yaml b/benchmarks/DASB/orion/hparams_bohb.yaml new file mode 100755 index 000000000..e68509559 --- /dev/null +++ b/benchmarks/DASB/orion/hparams_bohb.yaml @@ -0,0 +1,6 @@ +experiment: + algorithms: + bohb: + seed: 1986 + min_points_in_model: 20 + num_samples: 24 diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh new file mode 100755 index 000000000..36f6a845f --- /dev/null +++ b/benchmarks/DASB/run_experiments.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +########################################################### +# Script to run leave-one-subject-out and/or leave-one-session-out training, optionally with multiple seeds. +# This script loops over the different subjects and sessions and trains different models. +# At the end, the final performance is computed with the aggregate_results.py script that provides the average performance. 
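The aggregation step mentioned above boils down to a mean-and-standard-deviation summary of the metric across the per-run result files (a sketch with made-up WER numbers; the actual parser is utils/aggregate_results.py, added further down in this patch):

import numpy as np

wer_per_run = [6.91, 7.04, 6.88]  # hypothetical run1/run2/run3 results
print(f"wer avg: {np.mean(wer_per_run):.2f} ± {np.std(wer_per_run):.2f}")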
+# +# Usage: +# ./run_experiments.sh --hparams=hparams/MotorImagery/BNCI2014001/EEGNet.yaml --data_folder=eeg_data \ +# --output_folder=results/MotorImagery/BNCI2014001/EEGNet --nsbj=9 --nsess=2 --seed=1986 --nruns=2 --number_of_epochs=10 +# +# Authors: +# - Pooneh Mousavi (2024) +########################################################### + +# Initialize variables +data_folder="" +cached_data_folder="" +output_folder="" +task="" +downstream="" +tokenizer_name="" +dataset="" +seed="" +nruns="" +eval_metric="acc" +eval_set="test" +rnd_dir=False +additional_flags="" + + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --data_folder data_folder_path Data folder path" + echo " --output_folder output_path Output folder path" + echo " --task task downstream task" + echo " --downstream downstream probing head" + echo " --tokenizer_name tokenizer_name tokenizer choice" + echo " --dataset dataset dataset" + echo " --seed random_seed Seed (random if not specified)" + echo " --nruns num_runs Number of runs" + echo " --eval_metric metric Evaluation metric (e.g., acc or f1)" + echo " --eval_set dev or test Evaluation set. Default: test" + echo " --rnd_dir If True the results are stored in a subdir of the output folder with a random name (useful to store all the results of an hparam tuning). Default: False" + exit 1 +} + + +# Parse command line +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --data_folder) + data_folder="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --task) + task="$2" + shift + shift + ;; + + + --downstream) + downstream="$2" + shift + shift + ;; + + --tokenizer_name) + tokenizer_name="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + --eval_set) + eval_set="$2" + shift + shift + ;; + + --rnd_dir) + rnd_dir="$2" + shift + shift + ;; + + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." + print_argument_descriptions +fi + +# Process eval_set argument +if [ "$eval_set" = "dev" ]; then + metric_file=valid_metrics.pkl +elif [ "$eval_set" = "test" ]; then + metric_file=test_metrics.pkl +else + echo "Invalid eval_set value: $eval_set. It can be test or dev only." 
+ exit 1 +fi + +# Manage Seed (optional argument) +seed="${seed:-$RANDOM}" + + + +if [ "$rnd_dir" = True ]; then + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" +fi + +# Make sure the output_folder is created +mkdir -p $output_folder + +# Print command line arguments and save to file +{ + echo "hparams: $hparams" + echo "data_folder: $data_folder" + echo "output_folder: $output_folder" + echo "task: $task" + echo "downstream: $downstream" + echo "tokenizer_name: $tokenizer_name" + echo "dataset: $dataset" + echo "seed: $seed" + echo "nruns: $nruns" + echo "eval_metric: $eval_metric" + echo "eval_set: $eval_set" + echo "rnd_dir: $rnd_dir" + echo "additional flags: $additional_flags" +} | tee "$output_folder/flags.txt" + + +# Creating output folder +mkdir -p $output_folder +mkdir -p $data_folder +mkdir -p $cached_data_folder + +# Function to run the training experiment +run_experiment() { + +python $dataset/$task/train.py $dataset/$task/hparams/$downstream/$tokenizer_name.yaml --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp\ +$additional_flags --debug + +} + +# Run multiple training experiments (with different seeds) +for i in $(seq 0 1 $(( nruns - 1 ))); do + ((run_idx = i + 1)) + run_name=run"$run_idx" + output_folder_exp="$output_folder"/"$run_name"/$seed + + run_experiment $output_folder_exp + + + # Store the results + # python utils/parse_results.py $output_folder_exp $metric_file $eval_metric | tee -a $output_folder/$run_name\_results.txt + + # Changing Random seed + seed=$((seed+1)) +done + + +echo 'Final Results (Performance Aggregation)' +python utils/aggregate_results.py $output_folder $eval_metric | tee -a $output_folder/aggregated_performance.txt diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh new file mode 100644 index 000000000..1b2570675 --- /dev/null +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -0,0 +1,405 @@ +#!/bin/bash + +########################################################### +# Hyperparameter Tuning Script for EEG Model with Orion +########################################################### + +# Description: +# This script facilitates hyperparameter tuning for a given EEG model and dataset using Orion. +# It supports leave-one-subject-out and/or leave-one-session-out training strategies. + +# Usage: +# ./run_hparam_optimization.sh --exp_name 'EEGNet_BNCI2014001_hopt' \ +# --output_folder results/MotorImagery/BNCI2014001/EEGNet/hopt \ +# --data_folder eeg_data/ \ +# --hparams hparams/MotorImagery/BNCI2014001/EEGNet.yaml \ +# --nruns 1 --nruns_eval 10 \ +# --eval_metric acc \ +# --exp_max_trials 50 \ +# --store_all True \ +# --device 'cpu' +# +# Optimization Steps: +# The script supports multiple hyperparameter optimization steps. +# We found it convenient to first optimize training and model hyperparameters, +# and then optimize data augmentation hyperparameters in a separate step. + +# Script Workflow: +# 1. Search for the orion flags in the specified hparam file. +# 2. Run the orion-hunt command for hyperparameter tuning. +# By default, TPE (Tree-structured Parzen Estimator) hyperparameter tuning is +# performed, as specified in the default orion config file at hparams/orion/hparams_tpe.yaml. +# 3. Save the best hyperparameters, which can be viewed using torch-info. +# 4. Loop until flags like @orion_step are found in the YAML file. 
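To make step 1 concrete: the get_flag helper defined later in this script greps the hparam YAML for in-line annotations such as '# @orion_step1: --lr_model~"loguniform(0.00001,0.5)"' (the style used in the dac.yaml changes further down) and concatenates whatever follows the marker. A rough Python equivalent, for illustration only:

import re

def get_orion_flags(yaml_path, step=1):
    pattern = re.compile(rf"@orion_step{step}:\s*(.*)")
    flags = []
    with open(yaml_path) as fin:
        for line in fin:
            match = pattern.search(line)
            if match:
                flags.append(match.group(1).strip())
    return " ".join(flags)

# For the dac.yaml line 'lr_model: 0.0001  # @orion_step1: --lr_model~"loguniform(0.00001,0.5)"'
# this returns: --lr_model~"loguniform(0.00001,0.5)"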
+# +# Final Performance Evaluation: +# At the end of the optimization process, the script computes the final performance +# using the best hyperparameters on the test set. +# This is done by averaging over nruns_eval different seeds. +# +# Note: More detailed information can be found in the README.md file. + +# Authors: +# - Mirco Ravanelli (2023) +# - Davide Borra (2023) +########################################################### + +# Initialize variables +exp_name="hopt" +output_folder="" +data_folder="" +cached_data_folder="" +hparams="" +nruns="" +nruns_eval=10 +eval_metric="acc" +seed=1986 +config_file="hparams/orion/hparams_tpe.yaml" +mne_dir="" +orion_db_address="" +orion_db_type="PickledDB" +exp_max_trials=50 +store_all=True +compress_exp=True + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --exp_name Name Name that Orion gives to the experiment" + echo " --output_folder output_path Output folder were the results will be stored" + echo " --data_folder data_path Folder were the data are stored. If not available, they will be downloaded there." + echo " --cached_data_folder path [Optional] Folder were the data in pkl format will be cached." + echo " --hparms hparam_file YAML file containing the hparam to optimize. The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" + echo " --nruns num_runs Number of runs for each hparam selection." + echo " --nruns_eval num_runs Number of runs for the final evaluation (with best hparams) on the test set" + echo " --eval_metric metric [Optional] Evaluation metric description. Default:acc" + echo " --seed random_seed [Optional] Seed (random if not specified)" + echo " --config_file config_file [Optional] Orion config file. Default: hparams/orion/hparams_tpe.yaml" + echo " --mne_dir mne_dir [Optional] MNE directory. Need it different from your home (see notes on MNE in README.md)" + echo " --orion_db_address [Optional] Path of the database where orion will store hparams and performance" + echo " --orion_db_type db_type [Optional] Type of the dataset that orion will use. Default: PickledDB" + echo " --exp_max_trials int [Optional] Maximum number of hparam trials for each oprimization step. Default:50" + echo " --store_all Bool [Optional] When set to True, the output folders of all hparam trials will be stored in randomly named folders. Default: False" + echo " --compress_exp Bool [Optional] When set to True, this option compresses the output folders of all hyperparameter trials into a single tar.gz file. This is particularly useful when store_all is set to True, as it helps prevent the accumulation of a large number of files. 
Default: False" + exit 1 +} + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + + --exp_name) + exp_name="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --data_folder) + data_folder="$2" + shift + shift + ;; + + --hparams) + hparams="$2" + shift + shift + ;; + + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --nruns_eval) + nruns_eval="$2" + shift + shift + ;; + + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + + + --config_file) + config_file="$2" + shift + shift + ;; + + --mne_dir) + mne_dir="$2" + shift + shift + ;; + + --orion_db_address) + orion_db_address="$2" + shift + shift + ;; + + --orion_db_type) + orion_db_type="$2" + shift + shift + ;; + + --exp_max_trials) + exp_max_trials="$2" + shift + shift + ;; + + --store_all) + store_all="$2" + shift + shift + ;; + + --compress_exp) + compress_exp="$2" + shift + shift + ;; + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$output_folder" ] || [ -z "$data_folder" ] || [ -z "$hparams" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." + print_argument_descriptions +fi + +# Set mne_dir if specified +if [ "$mne_dir" ]; then + export _MNE_FAKE_HOME_DIR=$mne_dir +fi + +# Assign default value to cached_data_folder +if [ -z "$cached_data_folder" ]; then + cached_data_folder="$data_folder/pkl" +fi + + +# Set orion db address if specified +if [ -z "$orion_db_address" ]; then + orion_db_address=$output_folder'/'$exp_name'.pkl' +fi +export ORION_DB_ADDRESS=$orion_db_address +export ORION_DB_TYPE=$orion_db_type + +echo "-------------------------------------" +echo "Experiment Name: $exp_name" +echo "Output Folder: $output_folder" +echo "Data Folder: $data_folder" +echo "Cached Data Folder: $cached_data_folder" +echo "Hparam File: $hparams" +echo "Number of Runs: $nruns" +echo "Number of Eval Runs: $nruns_eval" +echo "Eval Metric: $eval_metric" +echo "Seed: $seed" +echo "Additional Flags: $additional_flags" +echo "Orion Config File: $config_file" +echo "Orion Database type: $orion_db_type" +echo "Orion Database file: $orion_db_address" +echo "Experiment Max Trials: $exp_max_trials" +echo "-------------------------------------" + + +# This function will extract all the optimization flags added in the yaml file +# The input is a text file (e.g, a yaml file) and a pattern (e.g, "@orion_step1:") +# The ouput are the detected flags (e.g., --dropout~"uniform(0.0, 0.5)"). +get_flag() { + local file_path="$1" + local pattern="$2" + + # Check if the file exists + if [ ! -f "$file_path" ]; then + echo "Error: File '$file_path' not found." 
+ return 1 + fi + + # Use grep to find all lines containing the pattern and then extract the flags using sed + grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | tr -d '\n' +} + + +# Function for updatading the hparam yaml file with the best hparams found at step 1 +update_hparams() { + local best_hparams_file="$1" + local hparams_yaml_file="$2" + local output_yaml_file="$3" + + # Read the values from best_hparams.txt into an associative array + declare -A best_hparams + while IFS=": " read -r key value; do + best_hparams["$key"]=$value + done < "$best_hparams_file" + + + # Read the hparams.yaml file into a variable + local hparams_content=$(cat "$hparams_yaml_file") + + # Update values in hparams_content using values from best_hparams + for key in "${!best_hparams[@]}"; do + local pattern="^$key: .*" + local replacement="$key: ${best_hparams[$key]}" + hparams_content=$(sed "s/$pattern/$replacement/g" <<< "$hparams_content") + done + + # Write the updated content to a new YAML file + echo "$hparams_content" > "$output_yaml_file" +} + +# Function for extracting the best hparams from orion-info +function extract_best_params() { + local input_file="$1" + local best_trial_line=$(grep -n "best trial:" "$input_file" | cut -d ":" -f 1) + local params_lines=$(tail -n +$best_trial_line "$input_file" | awk '/params:/{flag=1;next}/start time:/{flag=0}flag') + local formatted_params=$(echo "$params_lines" | sed -e 's/^[[:space:]]*//' -e 's/: /: /' -e '/^$/d' -e 's#^/##') + echo "$formatted_params" +} + +# Running hparam tuning (loop over multiple steps) +step_id=1 +hparams_step=$hparams +pattern="@orion_step1:" +opt_flags=$(get_flag "$hparams_step" "$pattern") + +# Check if the string is empty and exit with an error if it is +if [ -z "$opt_flags" ]; then + echo "Error: Optimization flags not found in '$hparams'" + echo "Please ensure that the Orion optimization flags are set in the hparam file using in-line comments like:" + echo "# @orion_step1: --dropout~\"uniform(0.0, 0.5)\"" + exit 1 # Exit with a non-zero error code +fi + + +while [ -n "$opt_flags" ]; do + # Do something + output_folder_step="$output_folder"/step"$step_id" + mkdir -p $output_folder_step + exp_name_step="$exp_name"_step"$step_id" + + echo + echo "**********************************************************************************************" + echo "Running hparam tuning (step $step_id)..." + echo "- This might take several hours!" + echo "- The best set of hparams will be save in $output_folder_step" + echo "- You can monitor the evolution of the hparam optimization with: orion status -n $exp_name" + echo "......" 
+ echo "**********************************************************************************************" + echo + # Setting up orion command + orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \ + ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --seed $seed \ + --output_folder $output_folder_step/exp --nruns $nruns \ + --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags" + + + # Appending the optimization flags + orion_hunt_command="$orion_hunt_command $opt_flags" + + echo $orion_hunt_command &> "$output_folder_step/orion_hunt_command.txt" + + # Execute the command for hparm tuning + eval $orion_hunt_command + + # Compress the exp folder (if required) + if [ "$compress_exp" = True ]; then + tar -czf "$output_folder_step/exp.tar.gz" "$output_folder_step/exp" + if [ -d "$output_folder_step/exp" ]; then + rm -rf "$output_folder_step/exp" + fi + + fi + + # Storing best haprams + orion info --name $exp_name_step &> $output_folder_step/orion-info.txt + + # Extract list of the best hparams from orion-info + # Find the line number where "best trial:" appears + best_trial_line=$(grep -n "best trial:" $output_folder_step/orion-info.txt | cut -d ":" -f 1) + + # Extract and store the best set of hparams + best_params_output=$(extract_best_params "$output_folder_step/orion-info.txt") + best_hparams_file="$output_folder_step/best_hparams.txt" + echo "$best_params_output" > $best_hparams_file + + # Store the current best yaml file + best_yaml_file="$output_folder_step/best_hparams.yaml" + update_hparams "$best_hparams_file" "$hparams_step" "$best_yaml_file" + + # Update best hparam step + hparams_step=$best_yaml_file + + # Update step variable + ((step_id++)) + + # Update search pattern + pattern="@orion_step$step_id:" + + # update optimization flags pattern + opt_flags=$(get_flag "$hparams_step" "$pattern") +done + +echo +echo "**********************************************************************************************" +echo "Running Final Evaluation on the best hparams (test-set)..." +echo "**********************************************************************************************" +echo + +final_yaml_file="$output_folder/best_hparams.yaml" +scp $best_yaml_file $final_yaml_file + +# Running evaluation on the test set for the best models + ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder \ + --seed $seed --output_folder $output_folder/best --nsbj $nsbj --nsess $nsess \ + --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ + --train_mode $train_mode --rnd_dir $store_all $additional_flags + + +echo "The test performance with best hparams is available at $output_folder/best" diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py new file mode 100644 index 000000000..1ba94c7e1 --- /dev/null +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -0,0 +1,145 @@ +#!/usr/bin/python +""" +Snippet to aggregate the results over multiple runs of the same experiment. +This is useful when we run multiple experiments with different seeds and we +want to compute the average performance. The script also reports the final +metric to Orion (when needed for hyperparameter tuning). + +The script searches for the result files (_results.txt) and computes the mean +and the standard deviation of the given evaluation metrics (e.g., acc or f1). +The results must have an identical format (with only different performance +numbers). 
+ +To run this script: + + > python aggregate_results.py your_result_folder acc + +Author +------ +Mirco Ravanelli, 2022 +""" + +import sys +import numpy as np +from orion.client import report_objective +from speechbrain.utils.data_utils import get_all_files + + +def get_prototype(res_file, eval_metric): + """Parses a result file and adds a placeholder where the aggregated metrics + should be printed. It also returns the number of detected metrics. + + Arguments + --------- + res_file: path + Path of the result file to parse. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + prototype: list + List of the lines of the result file (with as placeholder). + n_metrics: int + Number of metrics to replace in the result files. + """ + prototype = [] + n_metrics = 0 + + # Open the first res file and figure out where the metrics are + with open(res_file) as file_in: + for line in file_in: + if eval_metric in line: + line = line.split(eval_metric)[0] + # The placeholder for the metric is + line = line + eval_metric + " " + n_metrics = n_metrics + 1 + prototype.append(line) + return prototype, n_metrics + + +def get_metrics(res_files, eval_metric): + """Summarizes the metrics of interest in a matrix. + + Arguments + --------- + res_files: list + List of all the result files. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. + """ + + # Metric initialization + metrics = np.zeros([n_metrics, len(res_files)]) + + # Loop over files + for i in range(len(res_files)): + cnt = 0 + # Metric extraction + with open(res_files[i]) as file_in: + for line in file_in: + if eval_metric in line: + value = line.split(eval_metric + " ")[1] + value = value.split(" ")[0] + value = float(value) + metrics[cnt, i] = value + cnt = cnt + 1 + return metrics + + +def aggregate_metrics(prototype, metrics): + """Prints the aggregated metrics.It replaces the placeholders with + the corresponding metrics. + + Arguments + --------- + prototype: list + List of the lines of the result file (with as placeholder). + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. + """ + cnt = 0 + for line in prototype: + if eval_metric in line: + values_line = "[" + for i in range(len(res_files)): + values_line = values_line + "%f " % float(metrics[cnt, i]) + values_line = values_line[:-1] + values_line = values_line + "] avg: %f ± %f " % ( + float(metrics[cnt, :].mean()), + float(metrics[cnt, :].std()), + ) + line = line.replace("", values_line) + cnt = cnt + 1 + print(line) + + +if __name__ == "__main__": + # output_folder = sys.argv[1] + # eval_metric = sys.argv[2] + output_folder = "benchmarks/DASB/result" + eval_metric = "wer" + # Getting the list of the result files in the output folder + res_files = get_all_files(output_folder, match_and=["_results.txt"]) + + # Gettin a prototype file + prototype, n_metrics = get_prototype(res_files[0], eval_metric) + + # Extracting the metrics of interest + metrics = get_metrics(res_files, eval_metric) + + # print aggregated metrics + aggregate_metrics(prototype, metrics) + + final_metric = metrics[-1, :].mean() + + # Report final metric to Orion + # Remember: orion expects metrics to be minimized! 
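Since Orion minimizes its objective, score-style metrics (acc, f1) are flipped to an error before reporting, while error-style metrics such as WER pass through unchanged, which is exactly what the lines below do. A quick illustration of the convention with made-up numbers:

for metric, value in [("acc", 0.93), ("wer", 6.9)]:
    objective = 1 - value if metric in ("acc", "f1") else value
    print(metric, objective)  # acc -> 0.07 (approx), wer -> 6.9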
+ if eval_metric == "acc" or eval_metric == "f1": + final_metric = 1 - final_metric + report_objective(final_metric) From 78cb049c7712d47b63645d2d14b63fc72f25c6d5 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 27 Nov 2024 12:58:55 -0500 Subject: [PATCH 7/9] add bobh --- .../ASR-refactor/hparams/LSTM/dac.yaml | 21 +++++--- .../DASB/LibriSpeech/ASR-refactor/train.py | 4 +- benchmarks/DASB/run_experiments.sh | 49 +++++++++---------- benchmarks/DASB/run_hparam_optimization.sh | 33 ++++++++++--- benchmarks/DASB/utils/aggregate_results.py | 22 +++++---- 5 files changed, 76 insertions(+), 53 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index e02076cfb..e1d4680b1 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -15,7 +15,7 @@ output_folder: !ref results/dac/LSTM/ output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt - +cached_data_folder: !PLACEHOLDER #'path/to/cache' # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -27,16 +27,21 @@ train_splits: ["train-clean-100"] dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /dev-clean.csv - - !ref /test-clean.csv + - !ref /dev-clean.csv + - !ref /test-clean.csv ####################### Training Parameters #################################### -number_of_epochs: 20 -batch_size: 4 # This works for 2x GPUs with 32GB +# number_of_epochs: 20 +number_of_epochs_: 200 # @orion_step1: --number_of_epochs~"fidelity(15, 1000, base=4)" +number_of_epochs: !apply:int + - !apply:math.floor + - !ref +batch_size_exponent: 6 # @orion_step1: --batch_size_exponent~"uniform(4, 6,discrete=True)" +batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -48,7 +53,7 @@ valid_search_interval: 1 avg_checkpoints: 10 # Number of checkpoints to average for evaluation cache_size: 1.e+10 -lr_model: 0.001 +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" weight_decay: 0.0005 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index 99eeb81fe..177d79f8f 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -336,7 +336,7 @@ def text_pipeline(wrd): "tr_splits": hparams["train_splits"], "dev_splits": hparams["dev_splits"], "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], + "save_folder": hparams["cached_data_folder"], "merge_lst": hparams["train_splits"], "merge_name": "train.csv", "skip_prep": hparams["skip_prep"], @@ -345,7 +345,7 @@ def text_pipeline(wrd): # Defining tokenizer and loading it tokenizer = SentencePiece( - model_dir=hparams["save_folder"], + model_dir=hparams["cached_data_folder"], vocab_size=hparams["output_neurons"], annotation_train=hparams["train_csv"], annotation_read="wrd", diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh index 36f6a845f..596556c2c 100755 --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -14,12 +14,11 @@ ########################################################### # Initialize variables +hparams="" data_folder="" cached_data_folder="" 
output_folder="" task="" -downstream="" -tokenizer_name="" dataset="" seed="" nruns="" @@ -33,12 +32,12 @@ additional_flags="" print_argument_descriptions() { echo "Usage: $0 [options]" echo "Options:" + echo " --hparams hparams_path Hparam YAML file" echo " --data_folder data_folder_path Data folder path" + echo " --cached_data_folder cache_path Cached data folder path" echo " --output_folder output_path Output folder path" echo " --task task downstream task" - echo " --downstream downstream probing head" - echo " --tokenizer_name tokenizer_name tokenizer choice" - echo " --dataset dataset dataset" + echo " --dataset dataset dataset" echo " --seed random_seed Seed (random if not specified)" echo " --nruns num_runs Number of runs" echo " --eval_metric metric Evaluation metric (e.g., acc or f1)" @@ -53,12 +52,24 @@ POSITIONAL_ARGS=() while [[ $# -gt 0 ]]; do case $1 in + --hparams) + hparams="$2" + shift + shift + ;; + --data_folder) data_folder="$2" shift shift ;; + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + --output_folder) output_folder="$2" shift @@ -70,20 +81,7 @@ while [[ $# -gt 0 ]]; do shift shift ;; - - - --downstream) - downstream="$2" - shift - shift - ;; - - --tokenizer_name) - tokenizer_name="$2" - shift - shift - ;; - + --dataset) dataset="$2" shift @@ -140,7 +138,7 @@ done # Check for required arguments -if [ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$nruns" ]; then +if [ -z "$hparams" ] ||[ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$nruns" ]; then echo "ERROR: Missing required arguments! Please provide all required options." print_argument_descriptions fi @@ -172,10 +170,9 @@ mkdir -p $output_folder { echo "hparams: $hparams" echo "data_folder: $data_folder" + echo "cached_data_folder: $cached_data_folder" echo "output_folder: $output_folder" echo "task: $task" - echo "downstream: $downstream" - echo "tokenizer_name: $tokenizer_name" echo "dataset: $dataset" echo "seed: $seed" echo "nruns: $nruns" @@ -194,8 +191,8 @@ mkdir -p $cached_data_folder # Function to run the training experiment run_experiment() { -python $dataset/$task/train.py $dataset/$task/hparams/$downstream/$tokenizer_name.yaml --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp\ -$additional_flags --debug +python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ +$additional_flags } @@ -208,7 +205,7 @@ for i in $(seq 0 1 $(( nruns - 1 ))); do run_experiment $output_folder_exp - # Store the results + # # Store the results # python utils/parse_results.py $output_folder_exp $metric_file $eval_metric | tee -a $output_folder/$run_name\_results.txt # Changing Random seed @@ -217,4 +214,4 @@ done echo 'Final Results (Performance Aggregation)' -python utils/aggregate_results.py $output_folder $eval_metric | tee -a $output_folder/aggregated_performance.txt +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 1b2570675..4eefc8292 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -49,12 +49,14 @@ exp_name="hopt" output_folder="" data_folder="" cached_data_folder="" +task="" +dataset="" hparams="" nruns="" nruns_eval=10 eval_metric="acc" seed=1986 -config_file="hparams/orion/hparams_tpe.yaml" 
+config_file="orion/hparams_bohb.yaml" mne_dir="" orion_db_address="" orion_db_type="PickledDB" @@ -70,6 +72,8 @@ print_argument_descriptions() { echo " --output_folder output_path Output folder were the results will be stored" echo " --data_folder data_path Folder were the data are stored. If not available, they will be downloaded there." echo " --cached_data_folder path [Optional] Folder were the data in pkl format will be cached." + echo " --task task downstream task" + echo " --dataset dataset dataset" echo " --hparms hparam_file YAML file containing the hparam to optimize. The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" echo " --nruns num_runs Number of runs for each hparam selection." echo " --nruns_eval num_runs Number of runs for the final evaluation (with best hparams) on the test set" @@ -120,6 +124,18 @@ while [[ $# -gt 0 ]]; do shift ;; + --task) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + --seed) seed="$2" shift @@ -220,7 +236,7 @@ fi # Assign default value to cached_data_folder if [ -z "$cached_data_folder" ]; then - cached_data_folder="$data_folder/pkl" + cached_data_folder="$data_folder/cache" fi @@ -233,9 +249,12 @@ export ORION_DB_TYPE=$orion_db_type echo "-------------------------------------" echo "Experiment Name: $exp_name" +echo "hparams: $hparams" echo "Output Folder: $output_folder" echo "Data Folder: $data_folder" echo "Cached Data Folder: $cached_data_folder" +echo "task: $task" +echo "dataset: $dataset" echo "Hparam File: $hparams" echo "Number of Runs: $nruns" echo "Number of Eval Runs: $nruns_eval" @@ -335,8 +354,8 @@ while [ -n "$opt_flags" ]; do echo # Setting up orion command orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \ - ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --seed $seed \ - --output_folder $output_folder_step/exp --nruns $nruns \ + ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder=$cached_data_folder --seed $seed \ + --output_folder $output_folder_step/exp --task=$task --dataset=$dataset --nruns $nruns \ --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags" @@ -396,10 +415,10 @@ final_yaml_file="$output_folder/best_hparams.yaml" scp $best_yaml_file $final_yaml_file # Running evaluation on the test set for the best models - ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder \ - --seed $seed --output_folder $output_folder/best --nsbj $nsbj --nsess $nsess \ + ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder=$cached_data_folder \ + --seed $seed --output_folder $output_folder/best --task=$task --dataset=$dataset \ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --train_mode $train_mode --rnd_dir $store_all $additional_flags + --rnd_dir $store_all $additional_flags echo "The test performance with best hparams is available at $output_folder/best" diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index 1ba94c7e1..73a35cbad 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -20,6 +20,7 @@ """ import sys +import re import numpy as np from orion.client import report_objective from speechbrain.utils.data_utils import get_all_files @@ -84,11 +85,13 @@ def get_metrics(res_files, eval_metric): with open(res_files[i]) as file_in: 
for line in file_in: if eval_metric in line: - value = line.split(eval_metric + " ")[1] - value = value.split(" ")[0] - value = float(value) - metrics[cnt, i] = value - cnt = cnt + 1 + # Use regex to find the test WER value + match = re.search(rf'{eval_metric}: (\d+\.\d+e[+-]\d+)', line) + if match: + value = match.group(1) + value = float(value) + metrics[cnt, i] = value + cnt = cnt + 1 return metrics @@ -120,12 +123,11 @@ def aggregate_metrics(prototype, metrics): if __name__ == "__main__": - # output_folder = sys.argv[1] - # eval_metric = sys.argv[2] - output_folder = "benchmarks/DASB/result" - eval_metric = "wer" + output_folder = sys.argv[1] + eval_metric = sys.argv[2] + # Getting the list of the result files in the output folder - res_files = get_all_files(output_folder, match_and=["_results.txt"]) + res_files = get_all_files(output_folder, match_and=["train_log.txt"]) # Gettin a prototype file prototype, n_metrics = get_prototype(res_files[0], eval_metric) From 1f959a612b5497bf7238404474f75d9ae2c1402b Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Thu, 28 Nov 2024 12:09:21 -0500 Subject: [PATCH 8/9] fix bug --- .../DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml | 4 ++-- benchmarks/DASB/run_hparam_optimization.sh | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index e1d4680b1..47a803de8 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -40,7 +40,7 @@ number_of_epochs_: 200 # @orion_step1: --number_of_epochs~"fidelity(15, 1000, ba number_of_epochs: !apply:int - !apply:math.floor - !ref -batch_size_exponent: 6 # @orion_step1: --batch_size_exponent~"uniform(4, 6,discrete=True)" +batch_size_exponent: 6 # @orion_step1: --batch_size_exponent~"uniform(1, 2,discrete=True)" batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 @@ -119,7 +119,7 @@ freeze_embedding: False # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(2, 4,discrete=True)" dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 4eefc8292..12d38d131 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -354,8 +354,8 @@ while [ -n "$opt_flags" ]; do echo # Setting up orion command orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \ - ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder=$cached_data_folder --seed $seed \ - --output_folder $output_folder_step/exp --task=$task --dataset=$dataset --nruns $nruns \ + ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder $cached_data_folder \ + --output_folder $output_folder_step/exp --task $task --dataset $dataset --seed $seed --nruns $nruns \ --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags" @@ -415,10 +415,9 @@ final_yaml_file="$output_folder/best_hparams.yaml" scp $best_yaml_file $final_yaml_file # Running evaluation on the test set for the best models - ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder=$cached_data_folder \ - --seed $seed --output_folder $output_folder/best --task=$task 
--dataset=$dataset \ +./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ + --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ --rnd_dir $store_all $additional_flags - echo "The test performance with best hparams is available at $output_folder/best" From 081c4e7a9bc1bf2f2271d1bff9f1af5a14f4dda8 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 16 Dec 2024 18:16:33 +0000 Subject: [PATCH 9/9] fix aggregat_result bug --- .../ASR-refactor/hparams/LSTM/dac.yaml | 17 ++++---- .../ASR-refactor/hparams/LSTM/encodec.yaml | 39 ++++++++++--------- .../DASB/LibriSpeech/ASR-refactor/train.py | 23 +++++------ benchmarks/DASB/orion/hparams_bohb.yaml | 4 +- benchmarks/DASB/orion/hparams_tpe.yaml | 6 +++ benchmarks/DASB/utils/aggregate_results.py | 4 +- 6 files changed, 50 insertions(+), 43 deletions(-) create mode 100755 benchmarks/DASB/orion/hparams_tpe.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index 47a803de8..ef9d20349 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -16,7 +16,7 @@ output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt cached_data_folder: !PLACEHOLDER #'path/to/cache' - +testing: True # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech # If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES @@ -25,7 +25,7 @@ data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech # data_folder_rirs: !ref train_splits: ["train-clean-100"] dev_splits: ["dev-clean"] -test_splits: ["dev-clean", "test-clean"] +test_splits: ["test-clean"] skip_prep: False train_csv: !ref /train.csv valid_csv: !ref /dev-clean.csv @@ -33,13 +33,9 @@ test_csv: - !ref /dev-clean.csv - !ref /test-clean.csv - ####################### Training Parameters #################################### # number_of_epochs: 20 -number_of_epochs_: 200 # @orion_step1: --number_of_epochs~"fidelity(15, 1000, base=4)" -number_of_epochs: !apply:int - - !apply:math.floor - - !ref +number_of_epochs: 200 # @orion_step1: --number_of_epochs~"fidelity(1, 2, base=4)" batch_size_exponent: 6 # @orion_step1: --batch_size_exponent~"uniform(1, 2,discrete=True)" batch_size: !ref 2 ** test_batch_size: 1 @@ -119,7 +115,7 @@ freeze_embedding: False # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(2, 4,discrete=True)" +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 2,discrete=True)" dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 @@ -224,8 +220,9 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer # Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter # epoch counter + limit: !new:int + - !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml index 18d967244..cfd42f3cc 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -15,28 +15,29 @@ output_folder: !ref results/enocdec/LSTM/ 
output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt - +cached_data_folder: !PLACEHOLDER #'path/to/cache' +testing: True # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech # If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES # then data_folder_rirs should be /localscratch/xxx_corpus # otherwise the dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +data_folder_rirs: !ref +train_splits: ["train-clean-100"] dev_splits: ["dev-clean"] -test_splits: ["dev-clean", "test-clean", "test-other"] +test_splits: ["test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /dev-clean.csv - - !ref /test-clean.csv - + - !ref /test-clean.csv + - !ref /test-other.csv ####################### Training Parameters #################################### -number_of_epochs: 20 -batch_size: 4 # This works for 2x GPUs with 32GB +number_of_epochs: 20 # @orion_step1: --number_of_epochs~"fidelity(5, 20, base=4)" +batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)" +batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -48,7 +49,7 @@ valid_search_interval: 1 avg_checkpoints: 10 # Number of checkpoints to average for evaluation cache_size: 1.e+10 -lr_model: 0.001 +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" weight_decay: 0.0005 @@ -99,8 +100,8 @@ test_dataloader_opts: # bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] # num_codebooks: [2, 4, 8, 16, 32] vocab_size: 1024 -bandwidth: 1.5 -num_codebooks: 2 +bandwidth: 6.0 +num_codebooks: 8 sample_rate: 24000 # Feature parameters encoder_dim: 1024 @@ -109,9 +110,10 @@ pretrain_embeddings: False freeze_embedding: False + # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 4,discrete=True)" dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 @@ -134,7 +136,7 @@ prune_history: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:model.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False @@ -219,8 +221,9 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer # Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter # epoch counter + limit: !new:int + - !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index 177d79f8f..9f9f05ae0 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -434,15 +434,16 @@ def text_pipeline(wrd): ) # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) + if hparams['testing']: + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.output_wer_folder = os.path.join( - hparams["output_wer_folder"], 
f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/orion/hparams_bohb.yaml b/benchmarks/DASB/orion/hparams_bohb.yaml index e68509559..a360d2beb 100755 --- a/benchmarks/DASB/orion/hparams_bohb.yaml +++ b/benchmarks/DASB/orion/hparams_bohb.yaml @@ -2,5 +2,5 @@ experiment: algorithms: bohb: seed: 1986 - min_points_in_model: 20 - num_samples: 24 + min_points_in_model: 5 + num_samples: 5 diff --git a/benchmarks/DASB/orion/hparams_tpe.yaml b/benchmarks/DASB/orion/hparams_tpe.yaml new file mode 100755 index 000000000..fb6a7c9b0 --- /dev/null +++ b/benchmarks/DASB/orion/hparams_tpe.yaml @@ -0,0 +1,6 @@ +experiment: + algorithms: + tpe: + seed: 1986 + n_initial_points: 20 + n_ei_candidates: 24 diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index 73a35cbad..ae9c19ad2 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -86,7 +86,7 @@ def get_metrics(res_files, eval_metric): for line in file_in: if eval_metric in line: # Use regex to find the test WER value - match = re.search(rf'{eval_metric}: (\d+\.\d+e[+-]\d+)', line) + match = re.search(rf'{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)', line) if match: value = match.group(1) value = float(value) @@ -125,7 +125,7 @@ def aggregate_metrics(prototype, metrics): if __name__ == "__main__": output_folder = sys.argv[1] eval_metric = sys.argv[2] - + # Getting the list of the result files in the output folder res_files = get_all_files(output_folder, match_and=["train_log.txt"])