From a4f38bd2a9cbb76e8bd56c944c66a8fccf0a7c04 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 5 Nov 2024 09:31:52 -0500 Subject: [PATCH 001/270] add tokenizer_interface --- benchmarks/DASB/model/tokenizer_interface.py | 164 +++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 benchmarks/DASB/model/tokenizer_interface.py diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py new file mode 100644 index 000000000..892bef6b3 --- /dev/null +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -0,0 +1,164 @@ + +""" +Unified interface for tokenizers, standardizing the output shape of encode and decode functions. + +This class reshapes the outputs of various tokenizers to ensure consistency, simplifying integration with recipes and workflows. + +Authors +--------- +* Pooneh Mousavi, 2024 +""" + +import torch + +from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.discrete.dac import DAC +from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface + + +class Tokenizer_Encodec(Encodec): + @torch.no_grad() + def sig_to_toks(self, sig, lens,**kwargs): + # sig: [B, T] + self.eval() + toks, _ = self.encode(sig, lens) # [B, N, K] + return toks + + @torch.no_grad() + def toks_to_sig(self, toks,**kwargs): + # toks: [B, N, K] + self.eval() + sig = self.decode(toks)[:, 0] # [B, T] + return sig + +class Tokenizer_DAC(DAC): + @torch.no_grad() + def sig_to_toks(self, sig, lens,**kwargs): + # sig: [B, T] + self.eval() + toks, _ = self( + sig[:, None], n_quantizers=kwargs['num_codebooks'] + ) # [B, K, N] + toks = toks.movedim(-1, -2) # [B, N, K] + return toks + + @torch.no_grad() + def toks_to_sig(self, toks,**kwargs): + # toks: [B, N, K] + self.eval() + qfeats, _, _ = self.quantizer.from_codes( + toks.movedim(-1, -2) # [B, K, N] + ) + sig = self.decode(qfeats)[:, 0] # [B, T] + return sig + +class Tokenizer_SpeechTokenizer(SpeechTokenizer_interface): + @torch.no_grad() + def sig_to_toks(self, sig, lens,**kwargs): + # sig: [B, T] + self.eval() + toks = self(sig)[ + : kwargs['num_codebooks'] + ] # [K, B, N] + toks = toks.movedim(-3, -1) # [B, N, K] + return toks + + @torch.no_grad() + def toks_to_sig(self, toks,**kwargs): + # toks: [B, N, K] + self.eval() + toks = toks.movedim(-1, -3) # [K, B, N] + sig = self.decode(toks) # [B, T] + return sig + +class Tokenizer_DiscreteSSL(DiscreteSSL): + @torch.no_grad() + def sig_to_toks(self, sig, lens): + # sig: [B, T] + self.hparams.codec_quantizer.to(self.device).eval() + toks, _, _ = self.hparams.codec_quantizer( + sig, + lens, + SSL_layers=self.hparams.SSL_layers, + deduplicates=[False] * len(self.hparams.SSL_layers), + bpe_tokenizers=[None] * len(self.hparams.SSL_layers), + ) # [B, N, K] + return toks + + @torch.no_grad() + def toks_to_sig(self, toks): + # toks: [B, N, K] + self.hparams.codec_vocoder.device = self.device + self.hparams.codec_vocoder.to(self.device).eval() + + # Add offset for embedding layer + all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids + # TODO: remove after testing + assert tuple(all_layer_ids) == (1, 3, 7, 12, 18, 23) + offsets = torch.arange( + 0, + len(all_layer_ids) * self.hparams.vocab_size, + self.hparams.vocab_size, + device=self.device, + ) + offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] + offsets = offsets[offset_idxes] + toks = toks + offsets + 1 
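+        # Tokens from the k-th layer in all_layer_ids are shifted into their own
+        # vocab_size-wide slice of the vocoder's shared embedding table; the extra +1
+        # is assumed to reserve index 0 (e.g. for padding).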
+ + # Handle missing codebooks + if len(self.hparams.SSL_layers) < len(all_layer_ids): + full_toks = torch.zeros( + *toks.shape[:2], + len(all_layer_ids), + dtype=toks.dtype, + device=self.device, + ) + for i, idx in enumerate(offset_idxes): + full_toks[..., idx] = toks[..., i] + toks = full_toks + + self.hparams.codec_vocoder.tokenize = False + sig = self.hparams.codec_vocoder(toks)[:, 0] # [B, T] + return sig + +class Tokenizer: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + @torch.no_grad() + def encode(self,sig, lens,**kwargs): + toks = self.tokenizer.sig_to_toks(sig, lens,**kwargs) + return toks + + @torch.no_grad() + def decode(self,sig,**kwargs): + sig = self.tokenizer.toks_to_sig(sig,**kwargs) + return sig + + +# model_hub = "facebook/encodec_24khz" +# save_path = "savedir" +# model = Tokenizer_Encodec(model_hub, save_path) +# from speechbrain.lobes.models.huggingface_transformers.hubert import (HuBERT) +# inputs = torch.rand([3, 2000]) +# model_hub = "facebook/hubert-large-ll60k" +# save_path = "savedir" +# ssl_layer_num = [7,23] +# deduplicate =[False, True] +# bpe_tokenizers=[None, None] +# kmeans_repo_id = "speechbrain/SSL_Quantization" +# kmeans_dataset = "LJSpeech" +# num_clusters = 1000 +# ssl_model = HuBERT(model_hub, save_path,output_all_hiddens=True) +# model = DiscreteSSL(save_path, ssl_model, kmeans_repo_id=kmeans_repo_id, kmeans_dataset=kmeans_dataset,num_clusters=num_clusters) +model_hub = "fnlp/SpeechTokenizer" +save_path = "savedir" +model =Tokenizer_SpeechTokenizer(model_hub, save_path) # doctest: +SKIP +tokenizer= Tokenizer(model) +audio = torch.randn(4, 1000) +length = torch.tensor([1.0, .5, .75, 1.0]) +tokens = tokenizer.encode(audio, length,num_codebooks=2) +print(tokens.shape) +rec = tokenizer.decode(tokens) +print(rec.shape) \ No newline at end of file From 0c2b751c595c9a63a2bd66b14f32e2faa13478d8 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 6 Nov 2024 17:53:55 -0500 Subject: [PATCH 002/270] add reactored version of ASR --- .../ASR-refactor/hparams/LSTM/dac.yaml | 232 +++++++++ .../ASR-refactor/hparams/LSTM/encodec.yaml | 232 +++++++++ .../hparams/LSTM/speech_tokenizer.yaml | 222 +++++++++ .../ASR-refactor/hparams/contextnet/dac.yaml | 225 +++++++++ .../hparams/contextnet/encodec.yaml | 223 +++++++++ .../hparams/contextnet/speech_tokenizer.yaml | 213 +++++++++ .../ASR-refactor/librispeech_prepare.py | 1 + .../DASB/LibriSpeech/ASR-refactor/train.py | 447 ++++++++++++++++++ benchmarks/DASB/model/ __init__.py | 1 + benchmarks/DASB/model/custom_model.py | 17 +- benchmarks/DASB/model/tokenizer_interface.py | 231 ++++----- 11 files changed, 1933 insertions(+), 111 deletions(-) create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml create mode 120000 benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/train.py create mode 100644 benchmarks/DASB/model/ __init__.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml 
b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml new file mode 100644 index 000000000..4accc2241 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
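+# For the DAC 24khz model used here the embedding dim is 1024 (see the embedding_dim
+# list above); the pretrained weights are loaded via tokenizer.get_pretrained_embeddings()
+# in train.py and copied into the discrete_embedding_layer.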
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml new file mode 100644 index 000000000..03c29ddbb --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
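+# Note: num_codebooks must be consistent with the chosen bandwidth
+# (1.5 kbps -> 2, 3.0 -> 4, 6.0 -> 8, 12.0 -> 16, 24.0 -> 32; see the table above).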
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml new file mode 100644 index 000000000..8105204a5 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -0,0 +1,222 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: LSTM Encoder +# Decoder: CTC 
beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
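+# SpeechTokenizer operates on 16 kHz audio, hence sample_rate: 16000 above; input
+# waveforms are resampled to this rate in the data pipeline defined in train.py.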
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml new file mode 100644 index 000000000..eabeef113 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml @@ -0,0 +1,225 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
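+# The attention_mlp below pools the num_codebooks embedding vectors of every frame into
+# a single encoder_dim-dimensional feature before it is fed to the ContextNet encoder.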
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml new file mode 100644 index 000000000..c0411bd76 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml @@ -0,0 +1,223 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of 
yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/Contexnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml new file mode 100644 index 000000000..77ef2c540 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml @@ -0,0 +1,213 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: 
character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py new file mode 100644 index 000000000..61b6c56f4 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -0,0 +1,447 @@ +#!/usr/bin/env/python3 +"""Recipe for training an discrete tokens ctc ASR system with librispeech. 
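+To run this recipe, pass one of the hparams files, e.g.:
+> python train.py hparams/LSTM/encodec.yaml --data_folder /path/to/LibriSpeech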
+ +Decoding is performed with greedy decoding at validation time. +At test time, beamsearch is used with an optional external language model. + +Authors + * Pooneh Mousavi 2024 +""" + +import os +import sys +import torch +import torchaudio +import logging +import speechbrain as sb +from speechbrain.utils.distributed import run_on_main, if_main_process +from speechbrain.tokenizers.SentencePiece import SentencePiece +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + + +logger = logging.getLogger(__name__) + +_CACHE = {"size": 0} + +# Define training procedure +class ASR(sb.Brain): + def compute_forward(self, batch, stage): + """Forward computations from the waveform batches to the output probabilities.""" + batch = batch.to(self.device) + wavs, wav_lens = batch.sig + + + # Add waveform augmentation if specified. + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] + + current_epoch = self.hparams.epoch_counter.current + + # compute features + # Extract tokens (cache them at first epoch if augmentation is disabled) + key = tuple(sorted(batch.id)) + try: + in_toks = _CACHE[key] + in_toks = in_toks.to(self.device) + except KeyError: + with torch.no_grad(): + self.hparams.tokenizer.eval().to(self.device) + in_toks = self.hparams.tokenizer.sig_to_tokens(wavs, wav_lens,num_codebooks=hparams['num_codebooks']) #[B, T, N-Q] + if stage != sb.Stage.TRAIN or ( + stage == sb.Stage.TRAIN and (not hasattr(self.hparams, "wav_augment")) + ): + if _CACHE["size"] < self.hparams.cache_size: + _CACHE[key] = in_toks.cpu() + _CACHE["size"] += in_toks.numel() + + # Extract embeddings + in_embs = self.modules.discrete_embedding_layer(in_toks) #[B, T, N-Q, D] + + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) #[B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze(-2) #[B, T, D] + + # forward modules + if type(self.modules.encoder).__name__ == "ContextNet": + enc_out = self.modules.encoder(in_embs) + + elif type(self.modules.encoder).__name__ == "LSTM": + enc_out, _ = self.modules.encoder( + in_embs + ) + + else: + raise NotImplementedError + + # output layer for ctc log-probabilities + logits = self.modules.ctc_lin(enc_out) + p_ctc = self.hparams.log_softmax(logits) + + p_tokens = None + if stage == sb.Stage.VALID: + p_tokens = sb.decoders.ctc_greedy_decode( + p_ctc, wav_lens, blank_id=self.hparams.blank_index + ) + elif stage == sb.Stage.TEST: + p_tokens = test_searcher(p_ctc, wav_lens) + + return p_ctc, wav_lens, p_tokens + + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (CTC+NLL) given predictions and targets.""" + + p_ctc, wav_lens, predicted_tokens = predictions + ids = batch.id + tokens, tokens_lens = batch.tokens + + + # Label Augmentation + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) + elif stage == sb.Stage.TEST: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + target_words = [wrd.split(" 
") for wrd in batch.wrd] + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.wer_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of a epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + current_epoch = self.hparams.epoch_counter.current + valid_search_interval = self.hparams.valid_search_interval + if ( + current_epoch % valid_search_interval == 0 + or stage == sb.Stage.TEST + ): + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # log stats and save checkpoint at end-of-epoch + if stage == sb.Stage.VALID: + if type(self.hparams.scheduler).__name__ == "NewBobScheduler": + lr, new_lr = self.hparams.scheduler( + stage_stats["loss"] + ) + sb.nnet.schedulers.update_learning_rate( + self.optimizer, new_lr + ) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr = self.hparams.scheduler.current_lr + steps = self.optimizer_step + + else: + raise NotImplementedError + + optimizer = self.optimizer.__class__.__name__ + epoch_stats = { + "epoch": epoch, + "lr": lr, + "optimizer": optimizer, + } + self.hparams.train_logger.log_stats( + stats_meta=epoch_stats, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"], "epoch": epoch}, + min_keys=["WER"], + num_to_keep=self.hparams.avg_checkpoints, + ) + + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open(self.hparams.output_wer_folder, "w", encoding="utf-8") as w: + self.wer_metric.write_stats(w) + + def on_fit_batch_end(self, batch, outputs, loss, should_step): + if should_step and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + self.hparams.scheduler(self.optimizer) + + + +def dataio_prepare(hparams, tokenizer): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions.""" + data_folder = hparams["data_folder"] + + train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, + ) + + if hparams["sorting"] == "ascending": + # we sort training data to speed up training and get better results. + train_data = train_data.filtered_sorted(sort_key="duration") + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + train_data = train_data.filtered_sorted( + sort_key="duration", reverse=True + ) + # when sorting do not shuffle in dataloader ! 
otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, + ) + valid_data = valid_data.filtered_sorted(sort_key="duration") + + # test is separate + test_datasets = {} + for csv_file in hparams["test_csv"]: + name = Path(csv_file).stem + test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_file, replacements={"data_root": data_folder} + ) + test_datasets[name] = test_datasets[name].filtered_sorted( + sort_key="duration" + ) + + datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] + + # 2. Define audio pipeline: + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + info = torchaudio.info(wav) + resampled = torchaudio.transforms.Resample( + info.sample_rate, hparams["sample_rate"], + )(sig) + #resampled = resampled.unsqueeze(0) + return resampled + + sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) + + # 3. Define text pipeline: + @sb.utils.data_pipeline.takes("wrd") + @sb.utils.data_pipeline.provides( + "wrd", "char_list", "tokens_list", "tokens" + ) + def text_pipeline(wrd): + yield wrd + char_list = list(wrd) + yield char_list + tokens_list = tokenizer.sp.encode_as_ids(wrd) + yield tokens_list + tokens = torch.LongTensor(tokens_list) + yield tokens + + sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) + + + # 4. Set output: + sb.dataio.dataset.set_output_keys( + datasets, ["id", "sig", "wrd", "char_list", "tokens"], + ) + + # 5. If Dynamic Batching is used, we instantiate the needed samplers. 
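+    # Batches are built by total audio duration (max_batch_length) rather than by a fixed
+    # number of examples, using the "duration" field of each item as its length.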
+ train_batch_sampler = None + valid_batch_sampler = None + if hparams["dynamic_batching"]: + from speechbrain.dataio.sampler import DynamicBatchSampler # noqa + + dynamic_hparams_train = hparams["dynamic_batch_sampler_train"] + dynamic_hparams_val = hparams["dynamic_batch_sampler_val"] + + train_batch_sampler = DynamicBatchSampler( + train_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_train, + ) + + valid_batch_sampler = DynamicBatchSampler( + valid_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_val, + ) + + return ( + train_data, + valid_data, + test_datasets, + train_batch_sampler, + valid_batch_sampler, + ) + + +if __name__ == "__main__": + + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # If distributed_launch=True then + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["output_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + # Defining tokenizer and loading it + tokenizer = SentencePiece( + model_dir=hparams["save_folder"], + vocab_size=hparams["output_neurons"], + annotation_train=hparams["train_csv"], + annotation_read="wrd", + model_type=hparams["token_type"], + character_coverage=hparams["character_coverage"], + bos_id=hparams["bos_index"], + eos_id=hparams["eos_index"], + ) + + # here we create the datasets objects as well as tokenization and encoding + ( + train_data, + valid_data, + test_datasets, + train_bsampler, + valid_bsampler, + ) = dataio_prepare(hparams, tokenizer) + + # Use pretrained embeddings + if hparams["pretrain_embeddings"]: + embs= hparams["tokenizer"].get_pretrained_embeddings(device=run_opts["device"],num_codebooks=hparams['num_codebooks'], vocab_size=hparams["vocab_size"]) + hparams["discrete_embedding_layer"].init_embedding(embs) + + + # Log number of parameters/buffers + codec_params = sum([x.numel() for x in hparams["tokenizer"].state_dict().values()]) + model_params = sum( + [ + x.numel() + for module in hparams["modules"].values() + for x in module.state_dict().values() + ] + ) + hparams["train_logger"].log_stats( + stats_meta={ + f"Codec parameters/buffers (M)": f"{codec_params / 1e6:.2f}", + "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}", + }, + ) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["model_opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # Adding objects to trainer. 
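+    # The SentencePiece tokenizer is attached for decoding, and its vocabulary is used
+    # below to build the CTC beam searcher employed at test time.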
+ asr_brain.tokenizer = tokenizer + vocab_list = [ + tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) + ] + + from speechbrain.decoders.ctc import CTCBeamSearcher + + test_searcher = CTCBeamSearcher( + **hparams["test_beam_search"], + vocab_list=vocab_list, + ) + + train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + + if train_bsampler is not None: + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if valid_bsampler is not None: + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py new file mode 100644 index 000000000..e7db8766a --- /dev/null +++ b/benchmarks/DASB/model/ __init__.py @@ -0,0 +1 @@ +from model.tokenizer_interface import EncodecTokenizer \ No newline at end of file diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index b6e11a0d2..d3bf3cc9f 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -57,9 +57,9 @@ def __init__( num_codebooks, vocab_size, emb_dim, - pad_index=0, init=False, freeze=False, + hidden_dim =None, ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size @@ -70,10 +70,17 @@ def __init__( ).requires_grad_(not self.freeze) self.init = init - def init_embedding(self, weights): - with torch.no_grad(): - self.embedding.weight = torch.nn.Parameter(weights) + # Add a linear layer to match dimensions if necessary + if hidden_dim is not None and hidden_dim != emb_dim: + self.proj_layer = torch.nn.Linear(emb_dim, hidden_dim) + else: + self.proj_layer = None + + def init_embedding(self, weights): + self.embedding.weight.data.copy_(weights) + + def forward(self, in_tokens): """Computes the embedding for discrete tokens. a sample. 
@@ -97,4 +104,6 @@ def forward(self, in_tokens): ) # Forward Pass to embedding and in_embs = self.embedding(in_tokens) + if self.proj_layer is not None: + in_embs = self.proj_layer(in_embs) return in_embs diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 892bef6b3..351652a57 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -10,92 +10,152 @@ """ import torch +from abc import ABC, abstractmethod +from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.discrete.dac import DAC +from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface -from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec -from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL -from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface +class BaseTokenizer(ABC): + @abstractmethod + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, **kwargs): + """Abstract method to encode a signal into tokens.""" + pass + + @abstractmethod + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + """Abstract method to decode tokens into a signal.""" + pass + + @abstractmethod + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + pass -class Tokenizer_Encodec(Encodec): +class EncodecTokenizer(Encodec, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks, _ = self.encode(sig, lens) # [B, N, K] - return toks + tokens, _ = self.encode(signal, lengths) # [B, T, N_Q] + return tokens @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - sig = self.decode(toks)[:, 0] # [B, T] - return sig - -class Tokenizer_DAC(DAC): + signal = self.decode(tokens)[:, 0] # [B, T] + return signal + @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + embeddings = self.vocabulary + return embeddings.reshape(-1, embeddings.shape[-1]) + +class DACTokenizer(DAC, BaseTokenizer): + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks, _ = self( - sig[:, None], n_quantizers=kwargs['num_codebooks'] - ) # [B, K, N] - toks = toks.movedim(-1, -2) # [B, N, K] - return toks + tokens, _ = self( + signal[:, None], n_quantizers=kwargs['num_codebooks'] + ) # [B, N_Q, T] + return tokens.movedim(-1, -2) # [B, T, N_Q] @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - qfeats, _, _ = self.quantizer.from_codes( - toks.movedim(-1, -2) # [B, K, N] + quantized_feats, _, _ = self.quantizer.from_codes( + tokens.movedim(-1, -2) # [B, N_Q, T] ) - sig = self.decode(qfeats)[:, 0] # [B, T] - return sig - -class Tokenizer_SpeechTokenizer(SpeechTokenizer_interface): + signal = self.decode(quantized_feats)[:, 0] # [B, T] + return signal + + @torch.no_grad() + def 
get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + # See https://github.com/descriptinc/descript-audio-codec/blob/c7cfc5d2647e26471dc394f95846a0830e7bec34/dac/nn/quantize.py#L200 + toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) + toks = ( + toks[:, None, None].expand(-1, kwargs["num_codebooks"], -1).clone() + ) # [C, K, 1] + self.to(kwargs["device"]).eval() + with torch.no_grad(): + z_q, z_p, _ = self.quantizer.from_codes(toks) + z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) # [C, D, 1] * K + z_qs = [] + for i, z_p_i in enumerate(z_ps): + with torch.no_grad(): + z_q_i = ( + self.quantizer.quantizers[i].out_proj(z_p_i) + ) # [C, H, 1] + z_qs.append(z_q_i) + assert (z_q == sum(z_qs)).all() + embeddings = torch.cat(z_qs)[:, :, 0] # [CK, H] + return embeddings + +class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks = self(sig)[ - : kwargs['num_codebooks'] - ] # [K, B, N] - toks = toks.movedim(-3, -1) # [B, N, K] - return toks + tokens = self(signal)[: kwargs['num_codebooks']] # [N_Q, B, T] + return tokens.movedim(-3, -1) # [B, T, N_Q] @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - toks = toks.movedim(-1, -3) # [K, B, N] - sig = self.decode(toks) # [B, T] - return sig - -class Tokenizer_DiscreteSSL(DiscreteSSL): + tokens = tokens.movedim(-1, -3) # [N_Q, B, T] + return self.decode(tokens) # [B, T] + + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + # See https://github.com/ZhangXInFD/SpeechTokenizer/blob/a9f88dc72642b600654a62861e34342babae6c71/speechtokenizer/quantization/core_vq.py#L360 + toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) + toks = ( + toks[None, :, None].expand(kwargs["num_codebooks"], -1, -1).clone() + ) # [K, C, 1] + self.to(kwargs["device"]).eval() + embs = [] + for i, indices in enumerate(toks): + layer = self.model.quantizer.vq.layers[i] + with torch.no_grad(): + quantized = layer.decode(indices) # [C, H, 1] + embs.append(quantized) + assert ( + self.model.quantizer.decode(toks) == sum(embs) + ).all() + embeddings = torch.cat(embs)[:, :, 0] # [CK, H] + return embeddings + +class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths): + # signal: [B, T] self.hparams.codec_quantizer.to(self.device).eval() - toks, _, _ = self.hparams.codec_quantizer( - sig, - lens, + tokens, _, _ = self.hparams.codec_quantizer( + signal, + lengths, SSL_layers=self.hparams.SSL_layers, deduplicates=[False] * len(self.hparams.SSL_layers), bpe_tokenizers=[None] * len(self.hparams.SSL_layers), - ) # [B, N, K] - return toks + ) # [B, T, N_Q] + return tokens @torch.no_grad() - def toks_to_sig(self, toks): - # toks: [B, N, K] - self.hparams.codec_vocoder.device = self.device + def tokens_to_sig(self, tokens): + # tokens: [B, T, N_Q] self.hparams.codec_vocoder.to(self.device).eval() - # Add offset for embedding layer all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids - # TODO: remove after testing - assert tuple(all_layer_ids) == (1, 3, 7, 12, 18, 23) offsets = torch.arange( 0, len(all_layer_ids) * self.hparams.vocab_size, @@ -104,61 +164,18 @@ def 
toks_to_sig(self, toks): ) offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] offsets = offsets[offset_idxes] - toks = toks + offsets + 1 + tokens += offsets + 1 - # Handle missing codebooks if len(self.hparams.SSL_layers) < len(all_layer_ids): - full_toks = torch.zeros( - *toks.shape[:2], + full_tokens = torch.zeros( + *tokens.shape[:2], len(all_layer_ids), - dtype=toks.dtype, + dtype=tokens.dtype, device=self.device, ) for i, idx in enumerate(offset_idxes): - full_toks[..., idx] = toks[..., i] - toks = full_toks + full_tokens[..., idx] = tokens[..., i] + tokens = full_tokens self.hparams.codec_vocoder.tokenize = False - sig = self.hparams.codec_vocoder(toks)[:, 0] # [B, T] - return sig - -class Tokenizer: - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - @torch.no_grad() - def encode(self,sig, lens,**kwargs): - toks = self.tokenizer.sig_to_toks(sig, lens,**kwargs) - return toks - - @torch.no_grad() - def decode(self,sig,**kwargs): - sig = self.tokenizer.toks_to_sig(sig,**kwargs) - return sig - - -# model_hub = "facebook/encodec_24khz" -# save_path = "savedir" -# model = Tokenizer_Encodec(model_hub, save_path) -# from speechbrain.lobes.models.huggingface_transformers.hubert import (HuBERT) -# inputs = torch.rand([3, 2000]) -# model_hub = "facebook/hubert-large-ll60k" -# save_path = "savedir" -# ssl_layer_num = [7,23] -# deduplicate =[False, True] -# bpe_tokenizers=[None, None] -# kmeans_repo_id = "speechbrain/SSL_Quantization" -# kmeans_dataset = "LJSpeech" -# num_clusters = 1000 -# ssl_model = HuBERT(model_hub, save_path,output_all_hiddens=True) -# model = DiscreteSSL(save_path, ssl_model, kmeans_repo_id=kmeans_repo_id, kmeans_dataset=kmeans_dataset,num_clusters=num_clusters) -model_hub = "fnlp/SpeechTokenizer" -save_path = "savedir" -model =Tokenizer_SpeechTokenizer(model_hub, save_path) # doctest: +SKIP -tokenizer= Tokenizer(model) -audio = torch.randn(4, 1000) -length = torch.tensor([1.0, .5, .75, 1.0]) -tokens = tokenizer.encode(audio, length,num_codebooks=2) -print(tokens.shape) -rec = tokenizer.decode(tokens) -print(rec.shape) \ No newline at end of file + return self.hparams.codec_vocoder(tokens)[:, 0] # [B, T] From 17898c3472ec45ae2173b5894f7c7e550918d9d4 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:40:14 -0500 Subject: [PATCH 003/270] fix precommit --- .../ASR-refactor/hparams/LSTM/dac.yaml | 4 +- .../ASR-refactor/hparams/LSTM/encodec.yaml | 4 +- .../hparams/LSTM/speech_tokenizer.yaml | 2 +- .../ASR-refactor/hparams/contextnet/dac.yaml | 4 +- .../hparams/contextnet/encodec.yaml | 5 +- .../hparams/contextnet/speech_tokenizer.yaml | 4 +- .../DASB/LibriSpeech/ASR-refactor/train.py | 78 +++++++++-------- benchmarks/DASB/model/ __init__.py | 2 +- benchmarks/DASB/model/custom_model.py | 6 +- benchmarks/DASB/model/tokenizer_interface.py | 84 +++++++------------ 10 files changed, 84 insertions(+), 109 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index 4accc2241..806305774 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -182,7 +182,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -193,7 +193,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True 
-scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml index 03c29ddbb..18d967244 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -182,7 +182,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -193,7 +193,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml index 8105204a5..55d7c3c91 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -172,7 +172,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml index eabeef113..aa7d2e141 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml @@ -175,7 +175,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -186,7 +186,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml index c0411bd76..a1b5262d3 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml @@ -107,7 +107,6 @@ encoder_dim: 1024 pretrain_embeddings: False freeze_embedding: False - output_neurons: 31 # BPE parameters @@ -173,7 +172,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -184,7 +183,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml 
b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml index 77ef2c540..c12d6f79f 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml @@ -163,7 +163,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -174,7 +174,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index 61b6c56f4..baa80c80e 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -18,6 +18,7 @@ from speechbrain.tokenizers.SentencePiece import SentencePiece from hyperpyyaml import load_hyperpyyaml from pathlib import Path + base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) sys.path.append(base_dir) @@ -32,11 +33,10 @@ def compute_forward(self, batch, stage): """Forward computations from the waveform batches to the output probabilities.""" batch = batch.to(self.device) wavs, wav_lens = batch.sig - # Add waveform augmentation if specified. if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] current_epoch = self.hparams.epoch_counter.current @@ -49,33 +49,38 @@ def compute_forward(self, batch, stage): except KeyError: with torch.no_grad(): self.hparams.tokenizer.eval().to(self.device) - in_toks = self.hparams.tokenizer.sig_to_tokens(wavs, wav_lens,num_codebooks=hparams['num_codebooks']) #[B, T, N-Q] + in_toks = self.hparams.tokenizer.sig_to_tokens( + wavs, wav_lens, num_codebooks=hparams["num_codebooks"] + ) # [B, T, N-Q] if stage != sb.Stage.TRAIN or ( - stage == sb.Stage.TRAIN and (not hasattr(self.hparams, "wav_augment")) + stage == sb.Stage.TRAIN + and (not hasattr(self.hparams, "wav_augment")) ): if _CACHE["size"] < self.hparams.cache_size: _CACHE[key] = in_toks.cpu() _CACHE["size"] += in_toks.numel() # Extract embeddings - in_embs = self.modules.discrete_embedding_layer(in_toks) #[B, T, N-Q, D] + in_embs = self.modules.discrete_embedding_layer( + in_toks + ) # [B, T, N-Q, D] - # Attention-Pooling - att_w = self.modules.attention_mlp(in_embs) #[B, T, N-Q, 1] - in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze(-2) #[B, T, D] + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) # [B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze( + -2 + ) # [B, T, D] # forward modules if type(self.modules.encoder).__name__ == "ContextNet": enc_out = self.modules.encoder(in_embs) elif type(self.modules.encoder).__name__ == "LSTM": - enc_out, _ = self.modules.encoder( - in_embs - ) + enc_out, _ = self.modules.encoder(in_embs) else: raise NotImplementedError - + # output layer for ctc log-probabilities logits = self.modules.ctc_lin(enc_out) p_ctc = self.hparams.log_softmax(logits) @@ -89,7 +94,6 @@ def compute_forward(self, batch, stage): p_tokens = test_searcher(p_ctc, wav_lens) return p_ctc, wav_lens, 
p_tokens - def compute_objectives(self, predictions, batch, stage): """Computes the loss (CTC+NLL) given predictions and targets.""" @@ -98,14 +102,13 @@ def compute_objectives(self, predictions, batch, stage): ids = batch.id tokens, tokens_lens = batch.tokens - # Label Augmentation if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): tokens = self.hparams.wav_augment.replicate_labels(tokens) tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - + if stage == sb.Stage.VALID: # Decode token terms to words predicted_words = self.tokenizer( @@ -149,19 +152,15 @@ def on_stage_end(self, stage, stage_loss, epoch): # log stats and save checkpoint at end-of-epoch if stage == sb.Stage.VALID: if type(self.hparams.scheduler).__name__ == "NewBobScheduler": - lr, new_lr = self.hparams.scheduler( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.optimizer, new_lr - ) - elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr, new_lr = self.hparams.scheduler(stage_stats["loss"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": lr = self.hparams.scheduler.current_lr steps = self.optimizer_step - + else: raise NotImplementedError - + optimizer = self.optimizer.__class__.__name__ epoch_stats = { "epoch": epoch, @@ -185,15 +184,19 @@ def on_stage_end(self, stage, stage_loss, epoch): test_stats=stage_stats, ) if if_main_process(): - with open(self.hparams.output_wer_folder, "w", encoding="utf-8") as w: + with open( + self.hparams.output_wer_folder, "w", encoding="utf-8" + ) as w: self.wer_metric.write_stats(w) def on_fit_batch_end(self, batch, outputs, loss, should_step): - if should_step and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + if ( + should_step + and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler" + ): self.hparams.scheduler(self.optimizer) - def dataio_prepare(hparams, tokenizer): """This function prepares the datasets to be used in the brain class. It also defines the data processing pipeline through user-defined functions.""" @@ -251,7 +254,7 @@ def audio_pipeline(wav): resampled = torchaudio.transforms.Resample( info.sample_rate, hparams["sample_rate"], )(sig) - #resampled = resampled.unsqueeze(0) + # resampled = resampled.unsqueeze(0) return resampled sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) @@ -272,7 +275,6 @@ def text_pipeline(wrd): sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - # 4. 
Set output: sb.dataio.dataset.set_output_keys( datasets, ["id", "sig", "wrd", "char_list", "tokens"], @@ -319,7 +321,6 @@ def text_pipeline(wrd): # create ddp_group with the right communication protocol sb.utils.distributed.ddp_init_group(run_opts) - # Create experiment directory sb.create_experiment_directory( experiment_directory=hparams["output_folder"], @@ -327,7 +328,6 @@ def text_pipeline(wrd): overrides=overrides, ) - # Dataset prep (parsing Librispeech) from librispeech_prepare import prepare_librispeech # noqa @@ -369,12 +369,17 @@ def text_pipeline(wrd): # Use pretrained embeddings if hparams["pretrain_embeddings"]: - embs= hparams["tokenizer"].get_pretrained_embeddings(device=run_opts["device"],num_codebooks=hparams['num_codebooks'], vocab_size=hparams["vocab_size"]) + embs = hparams["tokenizer"].get_pretrained_embeddings( + device=run_opts["device"], + num_codebooks=hparams["num_codebooks"], + vocab_size=hparams["vocab_size"], + ) hparams["discrete_embedding_layer"].init_embedding(embs) - # Log number of parameters/buffers - codec_params = sum([x.numel() for x in hparams["tokenizer"].state_dict().values()]) + codec_params = sum( + [x.numel() for x in hparams["tokenizer"].state_dict().values()] + ) model_params = sum( [ x.numel() @@ -407,8 +412,7 @@ def text_pipeline(wrd): from speechbrain.decoders.ctc import CTCBeamSearcher test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], - vocab_list=vocab_list, + **hparams["test_beam_search"], vocab_list=vocab_list, ) train_dataloader_opts = hparams["train_dataloader_opts"] diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py index e7db8766a..b59bcdfa5 100644 --- a/benchmarks/DASB/model/ __init__.py +++ b/benchmarks/DASB/model/ __init__.py @@ -1 +1 @@ -from model.tokenizer_interface import EncodecTokenizer \ No newline at end of file +from model.tokenizer_interface import EncodecTokenizer diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index d3bf3cc9f..1c655fc65 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -59,7 +59,7 @@ def __init__( emb_dim, init=False, freeze=False, - hidden_dim =None, + hidden_dim=None, ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size @@ -76,11 +76,9 @@ def __init__( else: self.proj_layer = None - def init_embedding(self, weights): self.embedding.weight.data.copy_(weights) - - + def forward(self, in_tokens): """Computes the embedding for discrete tokens. a sample. diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 351652a57..604e3a403 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -1,4 +1,3 @@ - """ Unified interface for tokenizers, standardizing the output shape of encode and decode functions. 
@@ -12,9 +11,13 @@ import torch from abc import ABC, abstractmethod from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec -from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( + DiscreteSSL, +) from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface +from speechbrain.lobes.models.discrete.speechtokenizer_interface import ( + SpeechTokenizer_interface, +) class BaseTokenizer(ABC): @@ -29,13 +32,14 @@ def sig_to_tokens(self, signal, lengths, **kwargs): def tokens_to_sig(self, tokens, **kwargs): """Abstract method to decode tokens into a signal.""" pass - + @abstractmethod @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" pass + class EncodecTokenizer(Encodec, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): @@ -50,20 +54,21 @@ def tokens_to_sig(self, tokens, **kwargs): self.eval() signal = self.decode(tokens)[:, 0] # [B, T] return signal - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" embeddings = self.vocabulary return embeddings.reshape(-1, embeddings.shape[-1]) + class DACTokenizer(DAC, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): # signal: [B, T] self.eval() tokens, _ = self( - signal[:, None], n_quantizers=kwargs['num_codebooks'] + signal[:, None], n_quantizers=kwargs["num_codebooks"] ) # [B, N_Q, T] return tokens.movedim(-1, -2) # [B, T, N_Q] @@ -76,7 +81,7 @@ def tokens_to_sig(self, tokens, **kwargs): ) signal = self.decode(quantized_feats)[:, 0] # [B, T] return signal - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" @@ -88,24 +93,25 @@ def get_pretrained_embeddings(self, **kwargs): self.to(kwargs["device"]).eval() with torch.no_grad(): z_q, z_p, _ = self.quantizer.from_codes(toks) - z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) # [C, D, 1] * K + z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) z_qs = [] for i, z_p_i in enumerate(z_ps): with torch.no_grad(): - z_q_i = ( - self.quantizer.quantizers[i].out_proj(z_p_i) + z_q_i = self.quantizer.quantizers[i].out_proj( + z_p_i ) # [C, H, 1] z_qs.append(z_q_i) assert (z_q == sum(z_qs)).all() - embeddings = torch.cat(z_qs)[:, :, 0] # [CK, H] + embeddings = torch.cat(z_qs)[:, :, 0] return embeddings + class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): # signal: [B, T] self.eval() - tokens = self(signal)[: kwargs['num_codebooks']] # [N_Q, B, T] + tokens = self(signal)[: kwargs["num_codebooks"]] # [N_Q, B, T] return tokens.movedim(-3, -1) # [B, T, N_Q] @torch.no_grad() @@ -114,7 +120,7 @@ def tokens_to_sig(self, tokens, **kwargs): self.eval() tokens = tokens.movedim(-1, -3) # [N_Q, B, T] return self.decode(tokens) # [B, T] - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" @@ -128,54 +134,22 @@ def get_pretrained_embeddings(self, **kwargs): for i, indices in enumerate(toks): layer = self.model.quantizer.vq.layers[i] with torch.no_grad(): - quantized = layer.decode(indices) # [C, H, 1] + quantized = layer.decode(indices) embs.append(quantized) - assert ( - self.model.quantizer.decode(toks) == sum(embs) - 
).all() - embeddings = torch.cat(embs)[:, :, 0] # [CK, H] + assert (self.model.quantizer.decode(toks) == sum(embs)).all() + embeddings = torch.cat(embs)[:, :, 0] return embeddings + class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths): - # signal: [B, T] - self.hparams.codec_quantizer.to(self.device).eval() - tokens, _, _ = self.hparams.codec_quantizer( - signal, - lengths, - SSL_layers=self.hparams.SSL_layers, - deduplicates=[False] * len(self.hparams.SSL_layers), - bpe_tokenizers=[None] * len(self.hparams.SSL_layers), - ) # [B, T, N_Q] - return tokens + pass @torch.no_grad() def tokens_to_sig(self, tokens): - # tokens: [B, T, N_Q] - self.hparams.codec_vocoder.to(self.device).eval() - - all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids - offsets = torch.arange( - 0, - len(all_layer_ids) * self.hparams.vocab_size, - self.hparams.vocab_size, - device=self.device, - ) - offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] - offsets = offsets[offset_idxes] - tokens += offsets + 1 - - if len(self.hparams.SSL_layers) < len(all_layer_ids): - full_tokens = torch.zeros( - *tokens.shape[:2], - len(all_layer_ids), - dtype=tokens.dtype, - device=self.device, - ) - for i, idx in enumerate(offset_idxes): - full_tokens[..., idx] = tokens[..., i] - tokens = full_tokens - - self.hparams.codec_vocoder.tokenize = False - return self.hparams.codec_vocoder(tokens)[:, 0] # [B, T] + pass + + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + pass From db1590ee346dab0896723cf8184ba8b1e12355b8 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:54:09 -0500 Subject: [PATCH 004/270] fix flake --- benchmarks/DASB/LibriSpeech/ASR-refactor/train.py | 5 +---- benchmarks/DASB/model/ __init__.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index baa80c80e..99eeb81fe 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -27,6 +27,7 @@ _CACHE = {"size": 0} + # Define training procedure class ASR(sb.Brain): def compute_forward(self, batch, stage): @@ -38,8 +39,6 @@ def compute_forward(self, batch, stage): if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] - current_epoch = self.hparams.epoch_counter.current - # compute features # Extract tokens (cache them at first epoch if augmentation is disabled) key = tuple(sorted(batch.id)) @@ -156,8 +155,6 @@ def on_stage_end(self, stage, stage_loss, epoch): sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": lr = self.hparams.scheduler.current_lr - steps = self.optimizer_step - else: raise NotImplementedError diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py index b59bcdfa5..e69de29bb 100644 --- a/benchmarks/DASB/model/ __init__.py +++ b/benchmarks/DASB/model/ __init__.py @@ -1 +0,0 @@ -from model.tokenizer_interface import EncodecTokenizer From 3361ac6e9c21e94d2957d76347c7c19bfeab88ad Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:56:08 -0500 Subject: [PATCH 005/270] fix blank index --- .../LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml index 55d7c3c91..99d423b87 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -183,7 +183,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 From 2678a24761d92ba71745574a006b76c10113b828 Mon Sep 17 00:00:00 2001 From: Chaanks Date: Sat, 9 Nov 2024 14:36:05 +0100 Subject: [PATCH 006/270] add tokens extraction / loading --- benchmarks/DASB/utils/tokens.py | 249 ++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 benchmarks/DASB/utils/tokens.py diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py new file mode 100644 index 000000000..b334106f1 --- /dev/null +++ b/benchmarks/DASB/utils/tokens.py @@ -0,0 +1,249 @@ +import math +import logging +import pathlib as pl +import kaldiio +import torch +import numpy as np +from tqdm.auto import tqdm +import speechbrain as sb +from speechbrain.dataio.dataloader import make_dataloader +from speechbrain.dataio.dataset import DynamicItemDataset + + +logger = logging.getLogger(__name__) + + +def get_device(use_cuda): + logger.info("=" * 30) + logger.info(f"USE_CUDA SET TO: {use_cuda}") + logger.info(f"CUDA AVAILABLE?: {torch.cuda.is_available()}") + logger.info("=" * 30) + use_cuda = use_cuda and torch.cuda.is_available() + return torch.device("cuda" if use_cuda else "cpu") + + +class TokensExtractor: + """ + Extracts tokens from audio data using a tokenizer and saves them to a specified format. + + Arguments + --------- + tokenizer : torch.nn.Module + The tokenizer model to use for token extraction. + save_path : str + The directory where the tokens will be saved. + src_key : str, optional + The key in the dataset that contains the audio data (default: "wav"). + id_key : str, optional + The key in the dataset that contains unique identifiers (default: "id"). + save_format : str, optional + The format to save the tokens ('numpy', 'pickle', 'soundfile_flac') (default: "numpy"). + use_cuda : bool, optional + Whether to use CUDA for computation (default: True). + dataloader_opts : dict, optional + Options for the data loader (default: None). + pipelines : list, optional + List of data processing pipelines to apply (default: None). + save_name : str, optional + Base name for the saved token files (default: "tokens"). + """ + + def __init__( + self, + tokenizer, + save_path, + src_key="wav", + id_key="id", + save_format="numpy", + use_cuda=True, + dataloader_opts=None, + pipelines=None, + save_name="tokens", + ): + """ + Initializes the TokensExtractor. + + Arguments + --------- + tokenizer : torch.nn.Module + The tokenizer model to use for token extraction. + save_path : str + The directory where the tokens will be saved. + src_key : str, optional + The key in the dataset that contains the audio data (default: "wav"). + id_key : str, optional + The key in the dataset that contains unique identifiers (default: "id"). + save_format : str, optional + The format to save the tokens ('numpy', 'pickle', 'soundfile_flac') (default: "numpy"). 
+ use_cuda : bool, optional + Whether to use CUDA for computation (default: True). + dataloader_opts : dict, optional + Options for the data loader (default: None). + pipelines : list, optional + List of data processing pipelines to apply (default: None). + save_name : str, optional + Base name for the saved token files (default: "tokens"). + + Raises + ------ + ValueError + If an unsupported save_format is provided. + """ + self.save_path = pl.Path(save_path).absolute() + self.save_path.mkdir(parents=True, exist_ok=True) + self.save_name = save_name + + self.id_key = id_key + self.src_key = src_key + + self.device = get_device(use_cuda) + self.tokenizer = tokenizer.to(self.device) + + if save_format not in ["numpy", "pickle", "soundfile_flac"]: + raise ValueError(f"Unsupported save_format: {save_format}") + self.save_format = save_format + + if not dataloader_opts: + dataloader_opts = {} + self.dataloader_opts = dataloader_opts + self.pipelines = pipelines if pipelines is not None else [] + + self.wspecifier = f"ark,scp,t:{self.save_path}/{self.save_name}.ark,{self.save_path}/{self.save_name}.scp" + self.writer = kaldiio.WriteHelper( + self.wspecifier, write_function="numpy" + ) + + def extract(self, dataset): + """ + Extracts tokens from the dataset and saves them to the specified format. + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset or dict + The dataset from which to extract tokens. Can be a DynamicItemDataset or a dictionary. + """ + if isinstance(dataset, dict): + dataset = DynamicItemDataset(dataset) + dataset.set_output_keys([self.src_key, self.id_key]) + for pipeline in self.pipelines: + dataset.add_dynamic_item(pipeline) + + dataloader = make_dataloader(dataset, **self.dataloader_opts) + batch_size = self.dataloader_opts.get("batch_size", 1) + batch_count = int(math.ceil(len(dataset) / batch_size)) + for batch in tqdm(dataloader, total=batch_count): + batch = batch.to(self.device) + x, x_lengths = batch[self.src_key] + ids = batch[self.id_key] + batch_tokens = self.tokenizer.sig_to_tokens(x, x_lengths) + batch_tokens = sb.utils.data_utils.undo_padding( + batch_tokens, x_lengths + ) + self.process_batch(batch_tokens, ids) + + def process_batch(self, batch, ids): + """ + Processes a batch of tokens and writes them to the output files. + + Arguments + --------- + batch : list + A list of tokens for each item in the batch. + ids : list + A list of unique identifiers corresponding to each item in the batch. + """ + for tokens, utt_id in zip(batch, ids): + tokens = np.array(tokens) + self.writer(utt_id, tokens) + + def __del__(self): + """ + Close the writer. + """ + self.writer.close() + + +class TokensLoader: + """ + A loader class for retrieving tokens corresponding to utterance IDs. + + Arguments + --------- + data_path: str + The path to the data directory containing the token files. + save_name: str, optional + The base name of the tokens files (default: "tokens"). + """ + + def __init__( + self, + data_path, + save_name="tokens", + ): + """ + Initializes the TokensLoader. + + Arguments + --------- + data_path: str + The path to the data directory containing the token files. + save_name: str, optional + The base name of the tokens files (default: "tokens"). 
+ """ + self.data_path = pl.Path(data_path) + if not self.data_path.exists(): + raise ValueError( + f"Data folder not found: {self.data_path.as_posix()}" + ) + self.tokens = self._load(data_path, save_name) + + def tokens_by_uttid(self, utt_id): + """ + Retrieves the tokens corresponding to a given utterance ID. + + Arguments + --------- + utt_id: str + The utterance ID to retrieve tokens for. + + Returns + ------- + result: torch.LongTensor [T, N_Q] + The tokens associated with the utterance ID. + + Raises + ------ + KeyError + If the utterance ID is not found in the tokens. + """ + if utt_id not in self.tokens: + raise KeyError(f"Utterance ID '{utt_id}' not found in tokens.") + tokens_path = self.tokens[utt_id] + tokens = kaldiio.load_mat(tokens_path) + tokens = torch.from_numpy(tokens).long() + return tokens + + def _load(self, data_path, save_name): + """ + Loads the mapping from utterance IDs to token file paths. + + Arguments + --------- + data_path: str + The path to the data directory containing the token files. + save_name: str + The base name of the tokens files. + + Returns + ------- + utt2toks: dict + A dictionary mapping utterance IDs to their corresponding token file paths. + """ + scp_path = f"{data_path}/{save_name}.scp" + with open(scp_path, "r") as f: + utt2toks = { + line.strip().split(None, 1)[0]: line.strip().split(None, 1)[1] + for line in f + if line.strip() + } + return utt2toks From 0694249f678572e455305f790fa751859d750193 Mon Sep 17 00:00:00 2001 From: Chaanks Date: Sat, 9 Nov 2024 18:57:21 +0100 Subject: [PATCH 007/270] update tokens extraction script --- benchmarks/DASB/utils/tokens.py | 192 +++++++++++++++++++++++--------- 1 file changed, 137 insertions(+), 55 deletions(-) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index b334106f1..3762457ec 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -3,14 +3,17 @@ import pathlib as pl import kaldiio import torch +import torchaudio import numpy as np from tqdm.auto import tqdm import speechbrain as sb from speechbrain.dataio.dataloader import make_dataloader from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.dataio.dataio import load_pkl, save_pkl logger = logging.getLogger(__name__) +OPT_FILE = "opt_extract.pkl" def get_device(use_cuda): @@ -30,8 +33,8 @@ class TokensExtractor: --------- tokenizer : torch.nn.Module The tokenizer model to use for token extraction. - save_path : str - The directory where the tokens will be saved. + sample_rate : int + The sample rate of the audio data. src_key : str, optional The key in the dataset that contains the audio data (default: "wav"). id_key : str, optional @@ -42,62 +45,36 @@ class TokensExtractor: Whether to use CUDA for computation (default: True). dataloader_opts : dict, optional Options for the data loader (default: None). - pipelines : list, optional - List of data processing pipelines to apply (default: None). - save_name : str, optional - Base name for the saved token files (default: "tokens"). + + Raises + ------ + ValueError + If an unsupported save_format is provided. + ValueError + If the tokenizer's sample rate does not match the provided sample_rate. """ def __init__( self, tokenizer, - save_path, + sample_rate, src_key="wav", id_key="id", save_format="numpy", use_cuda=True, dataloader_opts=None, - pipelines=None, - save_name="tokens", ): - """ - Initializes the TokensExtractor. 
- - Arguments - --------- - tokenizer : torch.nn.Module - The tokenizer model to use for token extraction. - save_path : str - The directory where the tokens will be saved. - src_key : str, optional - The key in the dataset that contains the audio data (default: "wav"). - id_key : str, optional - The key in the dataset that contains unique identifiers (default: "id"). - save_format : str, optional - The format to save the tokens ('numpy', 'pickle', 'soundfile_flac') (default: "numpy"). - use_cuda : bool, optional - Whether to use CUDA for computation (default: True). - dataloader_opts : dict, optional - Options for the data loader (default: None). - pipelines : list, optional - List of data processing pipelines to apply (default: None). - save_name : str, optional - Base name for the saved token files (default: "tokens"). - - Raises - ------ - ValueError - If an unsupported save_format is provided. - """ - self.save_path = pl.Path(save_path).absolute() - self.save_path.mkdir(parents=True, exist_ok=True) - self.save_name = save_name - self.id_key = id_key self.src_key = src_key self.device = get_device(use_cuda) self.tokenizer = tokenizer.to(self.device) + self.sample_rate = sample_rate + + if tokenizer.sample_rate != self.sample_rate: + raise ValueError( + f"Sample rate mismatch: {self.sample_rate} != {tokenizer.sample_rate}" + ) if save_format not in ["numpy", "pickle", "soundfile_flac"]: raise ValueError(f"Unsupported save_format: {save_format}") @@ -106,14 +83,9 @@ def __init__( if not dataloader_opts: dataloader_opts = {} self.dataloader_opts = dataloader_opts - self.pipelines = pipelines if pipelines is not None else [] - - self.wspecifier = f"ark,scp,t:{self.save_path}/{self.save_name}.ark,{self.save_path}/{self.save_name}.scp" - self.writer = kaldiio.WriteHelper( - self.wspecifier, write_function="numpy" - ) + self.pipelines = self._make_pipelines() - def extract(self, dataset): + def extract_tokens(self, dataset, save_path, save_name="tokens"): """ Extracts tokens from the dataset and saves them to the specified format. @@ -122,9 +94,30 @@ def extract(self, dataset): dataset : speechbrain.dataio.dataset.DynamicItemDataset or dict The dataset from which to extract tokens. Can be a DynamicItemDataset or a dictionary. 
""" + conf = { + "sample_rate": self.sample_rate, + "save_folder": save_path, + "dataset_length": len(dataset), + } + + save_path = pl.Path(save_path).absolute() + save_path.mkdir(parents=True, exist_ok=True) + + # Check if the extraction is already done (if so, skip it) + if _skip(save_path, save_name, conf): + logger.info("Skipping preparation, completed in previous run.") + return + + self.wspecifier = ( + f"ark,scp,t:{save_path}/{save_name}.ark,{save_path}/{save_name}.scp" + ) + self.writer = kaldiio.WriteHelper( + self.wspecifier, write_function="numpy" + ) + if isinstance(dataset, dict): dataset = DynamicItemDataset(dataset) - dataset.set_output_keys([self.src_key, self.id_key]) + dataset.set_output_keys([self.src_key, self.id_key, "sig"]) for pipeline in self.pipelines: dataset.add_dynamic_item(pipeline) @@ -133,7 +126,7 @@ def extract(self, dataset): batch_count = int(math.ceil(len(dataset) / batch_size)) for batch in tqdm(dataloader, total=batch_count): batch = batch.to(self.device) - x, x_lengths = batch[self.src_key] + x, x_lengths = batch["sig"] ids = batch[self.id_key] batch_tokens = self.tokenizer.sig_to_tokens(x, x_lengths) batch_tokens = sb.utils.data_utils.undo_padding( @@ -141,6 +134,11 @@ def extract(self, dataset): ) self.process_batch(batch_tokens, ids) + logger.info("Extraction completed.") + + save_opt = save_path / OPT_FILE + save_pkl(conf, save_opt.as_posix()) + def process_batch(self, batch, ids): """ Processes a batch of tokens and writes them to the output files. @@ -156,6 +154,32 @@ def process_batch(self, batch, ids): tokens = np.array(tokens) self.writer(utt_id, tokens) + def _make_pipelines(self): + """ + Creates the data processing pipeline for audio data. + + The pipeline reads audio files, resamples them to the desired sample rate, and provides + the processed signal under the key "sig". + + Returns + ------- + pipeline : list + A list containing the audio processing pipeline function. + """ + + @sb.utils.data_pipeline.takes(self.src_key) + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + info = torchaudio.info(wav) + sig = sb.dataio.dataio.read_audio(wav) + sig = torchaudio.transforms.Resample( + info.sample_rate, + self.sample_rate, + )(sig) + return sig + + return [audio_pipeline] + def __del__(self): """ Close the writer. @@ -163,6 +187,46 @@ def __del__(self): self.writer.close() +def _skip(save_path, save_name, conf): + """ + Detects if the dataset extraction has been already done. + If the extraction has been done, we can skip it. + + Arguments + --------- + save_path : str + The path to the directory containing extracted tokens. + conf : dict + Configuration to match against saved config. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. + """ + skip = True + + # Checking ark,scp files + for ext in [".ark", ".scp"]: + save_file = save_path / f"{save_name}{ext}" + if not save_file.exists: + skip = False + + # Checking saved options + save_opt = save_path / OPT_FILE + if skip is True: + if save_opt.exists(): + opts_old = load_pkl(save_opt.as_posix()) + if opts_old == conf: + skip = True + else: + skip = False + else: + skip = False + return skip + + class TokensLoader: """ A loader class for retrieving tokens corresponding to utterance IDs. 
@@ -197,30 +261,48 @@ def __init__( ) self.tokens = self._load(data_path, save_name) - def tokens_by_uttid(self, utt_id): + def tokens_by_uttid(self, utt_id, num_codebooks=None): """ Retrieves the tokens corresponding to a given utterance ID. Arguments --------- - utt_id: str + utt_id : str The utterance ID to retrieve tokens for. + num_codebooks : int, optional + The number of codebooks to retrieve from the tokens. If specified, the tokens will be truncated + to include only the first `num_codebooks` codebooks. If not specified, all codebooks are returned. Returns ------- - result: torch.LongTensor [T, N_Q] - The tokens associated with the utterance ID. + result : torch.LongTensor [T, N_Q] + The tokens associated with the utterance ID, possibly truncated to `num_codebooks` codebooks. Raises ------ KeyError If the utterance ID is not found in the tokens. + ValueError + If `num_codebooks` is invalid or exceeds the number of available codebooks. """ if utt_id not in self.tokens: raise KeyError(f"Utterance ID '{utt_id}' not found in tokens.") tokens_path = self.tokens[utt_id] tokens = kaldiio.load_mat(tokens_path) tokens = torch.from_numpy(tokens).long() + + if num_codebooks is not None: + if not isinstance(num_codebooks, int) or num_codebooks <= 0: + raise ValueError( + f"Invalid num_codebooks value: {num_codebooks}. It must be a positive integer." + ) + if num_codebooks > tokens.size(-1): + raise ValueError( + f"Invalid number of codebooks: {num_codebooks}. " + f"Available codebooks: {tokens.size(-1)}." + ) + tokens = tokens[:, :num_codebooks] + return tokens def _load(self, data_path, save_name): From 2c30adeec940b80b988b01fdfef0fd65148faa73 Mon Sep 17 00:00:00 2001 From: Chaanks Date: Mon, 11 Nov 2024 01:04:36 +0100 Subject: [PATCH 008/270] update tokens extraction script --- .../DASB/LibriSpeech/extraction/extract.py | 96 +++++++++++++++++++ .../extraction/hparams/encodec.yaml | 57 +++++++++++ .../extraction/librispeech_prepare.py | 1 + benchmarks/DASB/utils/tokens.py | 73 +++++++++++--- 4 files changed, 216 insertions(+), 11 deletions(-) create mode 100644 benchmarks/DASB/LibriSpeech/extraction/extract.py create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml create mode 120000 benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py new file mode 100644 index 000000000..935c013bd --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -0,0 +1,96 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting a discrete tokens with librispeech. 
+ +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import torch +import torchaudio +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +print(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["output_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid"]: + csv_path = hparams[f"{split}_csv"] + name = pl.Path(csv_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_path, + replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + for split in hparams["test_csv"]: + name = pl.Path(split).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=split, + replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, (save_folder / "librispeech").as_posix() + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info(f"Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix() + ) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..815b8aae6 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref 
/train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 32 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +save_embedding: True + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 3762457ec..493a0598a 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -1,3 +1,11 @@ +""" +Unified interface for token extraction and pretrained embeddings handling for speech tokenizers. + +Authors +--------- +* Jarod Duret, 2024 +""" + import math import logging import pathlib as pl @@ -105,7 +113,7 @@ def extract_tokens(self, dataset, save_path, save_name="tokens"): # Check if the extraction is already done (if so, skip it) if _skip(save_path, save_name, conf): - logger.info("Skipping preparation, completed in previous run.") + logger.info("Skipping extraction, completed in previous run.") return self.wspecifier = ( @@ -180,6 +188,29 @@ def audio_pipeline(wav): return [audio_pipeline] + def save_pretrained_embeddings(self, save_path, save_name="embeddings"): + """ + Saves the pretrained embeddings of the tokenizer to a specified directory. + + This method retrieves the pretrained embeddings from the tokenizer, + converts them to a NumPy array, and saves them as a `.npy` file. + + Parameters + ---------- + save_path : str or pathlib.Path + The directory where the pretrained embeddings will be saved. + If the directory does not exist, it will be created. + save_name : str, optional + The base name of the saved embeddings file (default is "embeddings"). + The embeddings will be saved as `.npy` in the specified directory. + """ + save_path = pl.Path(save_path).absolute() + save_path.mkdir(parents=True, exist_ok=True) + + embeddings = self.tokenizer.get_pretrained_embeddings() + embeddings = embeddings.cpu().numpy() + np.save(save_path / save_name, embeddings) + def __del__(self): """ Close the writer. @@ -196,6 +227,8 @@ def _skip(save_path, save_name, conf): --------- save_path : str The path to the directory containing extracted tokens. + save_name : str + The base name of the saved tokens file. conf : dict Configuration to match against saved config. @@ -244,16 +277,6 @@ def __init__( data_path, save_name="tokens", ): - """ - Initializes the TokensLoader. - - Arguments - --------- - data_path: str - The path to the data directory containing the token files. - save_name: str, optional - The base name of the tokens files (default: "tokens"). 
- """ self.data_path = pl.Path(data_path) if not self.data_path.exists(): raise ValueError( @@ -329,3 +352,31 @@ def _load(self, data_path, save_name): if line.strip() } return utt2toks + + def load_pretrained_embeddings(self, data_path, save_name="embeddings"): + """ + Loads pretrained embeddings from a specified path. + + Arguments + --------- + data_path : str + The directory where the embeddings are saved. + save_name : str, optional + The name of the embeddings file (default: "embeddings"). + + Returns + ------- + embeddings : torch.Tensor + The loaded embeddings as a PyTorch tensor. + + Raises + ------ + FileNotFoundError + If the embeddings file does not exist at the specified path. + """ + data_path = pl.Path(data_path).absolute() + if not self.data_path.exists(): + raise ValueError(f"Data folder not found: {data_path.as_posix()}") + embeddings = np.load(data_path / save_name) + embeddings = torch.from_numpy(embeddings) + return embeddings From 336dd64f561d3174fc1cc31ab7fdd9253df9cabd Mon Sep 17 00:00:00 2001 From: Chaanks Date: Tue, 12 Nov 2024 16:48:10 +0100 Subject: [PATCH 009/270] update LibriSpeech ASR recipe --- .../hparams/LSTM/train.yaml | 239 ++++++++++ .../hparams/contextnet/train.yaml | 232 ++++++++++ .../librispeech_prepare.py | 1 + .../LibriSpeech/ASR-refactor-tokens/train.py | 438 ++++++++++++++++++ .../extraction/hparams/encodec.yaml | 2 +- benchmarks/DASB/utils/tokens.py | 2 +- 6 files changed, 912 insertions(+), 2 deletions(-) create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml create mode 120000 benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml new file mode 100644 index 000000000..7ae90ad4e --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml @@ -0,0 +1,239 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100"] #["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + +tokens_folder: !PLACEHOLDER +pretain_embeddings_folder: !PLACEHOLDER # Optional + + +####################### Training Parameters 
#################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32 +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +# bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
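For readers unfamiliar with SpeechBrain's duration-based batching, here is a minimal, self-contained sketch of how the dynamic_batch_sampler_train block above is consumed; it mirrors the sampler construction that appears in train.py later in this patch, with a made-up CSV path:

import speechbrain as sb
from speechbrain.dataio.sampler import DynamicBatchSampler

dynamic_hparams_train = {       # same keys as dynamic_batch_sampler_train above
    "max_batch_length": 850,    # total duration budget per batch
    "num_buckets": 200,
    "shuffle": False,
    "batch_ordering": "random",
    "max_batch_ex": 128,
}

train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
    csv_path="train.csv",       # hypothetical manifest with a "duration" column
    replacements={"data_root": "/path/to/LibriSpeech"},
)

# Batches are grouped by total duration rather than a fixed number of examples,
# then handed to the DataLoader through batch_sampler.
train_sampler = DynamicBatchSampler(
    train_data,
    length_func=lambda x: x["duration"],
    **dynamic_hparams_train,
)
train_dataloader_opts = {"batch_sampler": train_sampler, "num_workers": 8}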
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +# tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +# source: facebook/encodec_24khz # Only the 24kHz version supports mono audio +# save_path: !ref +# sample_rate: !ref +# bandwidth: !ref +# flat_embeddings: False +# freeze: True +# renorm_embeddings: False + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + # tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml new file mode 100644 index 000000000..c28fdead0 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR 
with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +# bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
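Since Discrete_EmbeddingLayer lives in model/custom_model.py and is not shown in this patch, the following is only a rough functional sketch of what initializing it from extracted codebook vectors amounts to; the per-codebook offset and the flattened [num_codebooks * vocab_size, emb_dim] table are assumptions about how the recipe wires pretrained embeddings, not a verbatim copy of the class:

import torch
import torch.nn as nn

vocab_size, num_codebooks, emb_dim = 1024, 2, 128   # EnCodec values from this YAML

# One row per (codebook, code) pair, as saved by the extraction recipe.
pretrained = torch.randn(num_codebooks * vocab_size, emb_dim)  # stand-in for embeddings.npy

embedding = nn.Embedding(num_codebooks * vocab_size, emb_dim)
with torch.no_grad():
    embedding.weight.copy_(pretrained)       # rough equivalent of init_embedding(...)
embedding.weight.requires_grad = False       # what freeze_embedding: True would imply

# Codes from codebook k are shifted by k * vocab_size so that each codebook
# addresses its own block of rows in the flattened table.
toks = torch.randint(0, vocab_size, (4, 50, num_codebooks))   # [B, T, K]
offsets = torch.arange(num_codebooks) * vocab_size
embs = embedding(toks + offsets)                               # [B, T, K, D]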
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +# tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +# source: facebook/encodec_24khz # Only the 24kHz version supports mono audio +# save_path: !ref +# sample_rate: !ref +# bandwidth: !ref +# flat_embeddings: False +# freeze: True +# renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + # tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py new file 
mode 100644 index 000000000..927d7ea84 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py @@ -0,0 +1,438 @@ +#!/usr/bin/env/python3 +"""Recipe for training an discrete tokens ctc ASR system with librispeech. + +Decoding is performed with greedy decoding at validation time. +At test time, beamsearch is used with an optional external language model. + +Authors + * Pooneh Mousavi 2024 +""" + +import os +import sys +import torch +import torchaudio +import logging +import speechbrain as sb +from speechbrain.utils.distributed import run_on_main, if_main_process +from speechbrain.tokenizers.SentencePiece import SentencePiece +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + + +logger = logging.getLogger(__name__) + +_CACHE = {"size": 0} + + +# Define training procedure +class ASR(sb.Brain): + def compute_forward(self, batch, stage): + """Forward computations from the waveform batches to the output probabilities.""" + batch = batch.to(self.device) + wavs, wav_lens = batch.sig + in_toks, _ = batch.speech_tokens + + # Extract embeddings + in_embs = self.modules.discrete_embedding_layer( + in_toks + ) # [B, T, N-Q, D] + + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) # [B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze( + -2 + ) # [B, T, D] + + # forward modules + if type(self.modules.encoder).__name__ == "ContextNet": + enc_out = self.modules.encoder(in_embs) + + elif type(self.modules.encoder).__name__ == "LSTM": + enc_out, _ = self.modules.encoder(in_embs) + + else: + raise NotImplementedError + + # output layer for ctc log-probabilities + logits = self.modules.ctc_lin(enc_out) + p_ctc = self.hparams.log_softmax(logits) + + p_tokens = None + if stage == sb.Stage.VALID: + p_tokens = sb.decoders.ctc_greedy_decode( + p_ctc, wav_lens, blank_id=self.hparams.blank_index + ) + elif stage == sb.Stage.TEST: + p_tokens = test_searcher(p_ctc, wav_lens) + + return p_ctc, wav_lens, p_tokens + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (CTC+NLL) given predictions and targets.""" + + p_ctc, wav_lens, predicted_tokens = predictions + ids = batch.id + tokens, tokens_lens = batch.tokens + + # Label Augmentation + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) + elif stage == sb.Stage.TEST: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + target_words = [wrd.split(" ") for wrd in batch.wrd] + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.wer_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of a epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + 
self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + current_epoch = self.hparams.epoch_counter.current + valid_search_interval = self.hparams.valid_search_interval + if ( + current_epoch % valid_search_interval == 0 + or stage == sb.Stage.TEST + ): + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # log stats and save checkpoint at end-of-epoch + if stage == sb.Stage.VALID: + if type(self.hparams.scheduler).__name__ == "NewBobScheduler": + lr, new_lr = self.hparams.scheduler(stage_stats["loss"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr = self.hparams.scheduler.current_lr + else: + raise NotImplementedError + + optimizer = self.optimizer.__class__.__name__ + epoch_stats = { + "epoch": epoch, + "lr": lr, + "optimizer": optimizer, + } + self.hparams.train_logger.log_stats( + stats_meta=epoch_stats, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"], "epoch": epoch}, + min_keys=["WER"], + num_to_keep=self.hparams.avg_checkpoints, + ) + + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open( + self.hparams.output_wer_folder, "w", encoding="utf-8" + ) as w: + self.wer_metric.write_stats(w) + + def on_fit_batch_end(self, batch, outputs, loss, should_step): + if ( + should_step + and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler" + ): + self.hparams.scheduler(self.optimizer) + + +def dataio_prepare(hparams, tokenizer): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + """ + data_folder = hparams["data_folder"] + + train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["train_csv"], + replacements={"data_root": data_folder}, + ) + + if hparams["sorting"] == "ascending": + # we sort training data to speed up training and get better results. + train_data = train_data.filtered_sorted(sort_key="duration") + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + train_data = train_data.filtered_sorted( + sort_key="duration", reverse=True + ) + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["valid_csv"], + replacements={"data_root": data_folder}, + ) + valid_data = valid_data.filtered_sorted(sort_key="duration") + + # test is separate + test_datasets = {} + for csv_file in hparams["test_csv"]: + name = Path(csv_file).stem + test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_file, replacements={"data_root": data_folder} + ) + test_datasets[name] = test_datasets[name].filtered_sorted( + sort_key="duration" + ) + + datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] + + # 1. 
Define tokens pipeline: + tokens_loader = hparams["tokens_loader"] + num_codebooks = hparams["num_codebooks"] + + @sb.utils.data_pipeline.takes("id") + @sb.utils.data_pipeline.provides("speech_tokens") + def tokens_pipeline(id): + tokens = tokens_loader.tokens_by_uttid(id, num_codebooks=num_codebooks) + return tokens + + sb.dataio.dataset.add_dynamic_item(datasets, tokens_pipeline) + + # 2. Define audio pipeline: + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + info = torchaudio.info(wav) + resampled = torchaudio.transforms.Resample( + info.sample_rate, + hparams["sample_rate"], + )(sig) + # resampled = resampled.unsqueeze(0) + return resampled + + sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) + + # 3. Define text pipeline: + @sb.utils.data_pipeline.takes("wrd") + @sb.utils.data_pipeline.provides( + "wrd", "char_list", "tokens_list", "tokens" + ) + def text_pipeline(wrd): + yield wrd + char_list = list(wrd) + yield char_list + tokens_list = tokenizer.sp.encode_as_ids(wrd) + yield tokens_list + tokens = torch.LongTensor(tokens_list) + yield tokens + + sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) + + # 4. Set output: + sb.dataio.dataset.set_output_keys( + datasets, + ["id", "sig", "wrd", "char_list", "tokens", "speech_tokens"], + ) + + # 5. If Dynamic Batching is used, we instantiate the needed samplers. + train_batch_sampler = None + valid_batch_sampler = None + if hparams["dynamic_batching"]: + from speechbrain.dataio.sampler import DynamicBatchSampler # noqa + + dynamic_hparams_train = hparams["dynamic_batch_sampler_train"] + dynamic_hparams_val = hparams["dynamic_batch_sampler_val"] + + train_batch_sampler = DynamicBatchSampler( + train_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_train, + ) + + valid_batch_sampler = DynamicBatchSampler( + valid_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_val, + ) + + return ( + train_data, + valid_data, + test_datasets, + train_batch_sampler, + valid_batch_sampler, + ) + + +if __name__ == "__main__": + + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # If distributed_launch=True then + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["output_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + # Defining tokenizer and loading it + tokenizer = SentencePiece( + model_dir=hparams["save_folder"], + vocab_size=hparams["output_neurons"], + annotation_train=hparams["train_csv"], + annotation_read="wrd", + model_type=hparams["token_type"], + character_coverage=hparams["character_coverage"], + bos_id=hparams["bos_index"], + eos_id=hparams["eos_index"], + ) + + # here we create the datasets objects as 
well as tokenization and encoding + ( + train_data, + valid_data, + test_datasets, + train_bsampler, + valid_bsampler, + ) = dataio_prepare(hparams, tokenizer) + + # Use pretrained embeddings + if hparams["pretrain_embeddings"]: + tokens_loader = hparams["tokens_loader"] + embs = tokens_loader.load_pretrained_embeddings( + hparams["pretain_embeddings_folder"] + ) + hparams["discrete_embedding_layer"].init_embedding(embs) + + # Log number of parameters/buffers + model_params = sum( + [ + x.numel() + for module in hparams["modules"].values() + for x in module.state_dict().values() + ] + ) + hparams["train_logger"].log_stats( + stats_meta={ + "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}", + }, + ) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["model_opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # Adding objects to trainer. + asr_brain.tokenizer = tokenizer + vocab_list = [ + tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) + ] + + from speechbrain.decoders.ctc import CTCBeamSearcher + + test_searcher = CTCBeamSearcher( + **hparams["test_beam_search"], + vocab_list=vocab_list, + ) + + train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + + if train_bsampler is not None: + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if valid_bsampler is not None: + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 815b8aae6..f68ab9b37 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -23,7 +23,7 @@ test_csv: - !ref /test-clean.csv - !ref /test-other.csv -batch_size: 32 +batch_size: 8 num_workers: 8 src_key: wav id_key: id diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 493a0598a..272e01ebe 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -377,6 +377,6 @@ def load_pretrained_embeddings(self, data_path, save_name="embeddings"): data_path = pl.Path(data_path).absolute() if not self.data_path.exists(): raise ValueError(f"Data folder not found: {data_path.as_posix()}") - embeddings = np.load(data_path / save_name) + embeddings = np.load(data_path / f"{save_name}.npy") embeddings = torch.from_numpy(embeddings) return embeddings From cf4041207ff844c75f714b0dd1950c7baf7d69ff Mon Sep 17 00:00:00 2001 From: Chaanks Date: Tue, 3 Dec 2024 00:29:49 +0100 Subject: [PATCH 010/270] update LibriSpeech ASR recipe --- .../hparams/LSTM/train.yaml | 20 +-- .../LibriSpeech/ASR-refactor-tokens/train.py | 3 - .../DASB/LibriSpeech/extraction/extract.py 
| 4 +- .../LibriSpeech/extraction/hparams/dac.yaml | 65 +++++++ .../hparams/discrete_ssl_wavlm.yaml | 83 +++++++++ .../extraction/hparams/encodec.yaml | 2 +- .../extraction/hparams/speech_tokenizer.yaml | 54 ++++++ benchmarks/DASB/model/tokenizer_interface.py | 170 ++++++++++-------- benchmarks/DASB/utils/tokens.py | 14 +- 9 files changed, 319 insertions(+), 96 deletions(-) create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml index 7ae90ad4e..89d347862 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml @@ -11,7 +11,8 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/enocdec/LSTM/ +run_name: !PLACEHOLDER +output_folder: !ref results/LSTM// output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt @@ -39,7 +40,7 @@ pretain_embeddings_folder: !PLACEHOLDER # Optional ####################### Training Parameters #################################### number_of_epochs: 20 -batch_size: 4 # This works for 2x GPUs with 32GB +batch_size: 4 test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -56,10 +57,6 @@ weight_decay: 0.0005 # Training parameters -# To make Transformers converge, the global bath size should be large enough. -# The global batch size is max_batch_len * n_gpus * gradient_accumulation. -# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. -# Please, set your parameters accordingly. 
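The run_name placeholder introduced above is resolved by HyperPyYAML when the recipe loads its hyperparameters; a minimal, self-contained illustration of that mechanism follows (the keys and values are made up for the example, not the recipe's exact references):

from hyperpyyaml import load_hyperpyyaml

yaml_string = """
run_name: !PLACEHOLDER
output_folder: !ref results/LSTM/<run_name>
save_folder: !ref <output_folder>/save
"""

# !PLACEHOLDER entries must be supplied via overrides (or on the command line
# when going through speechbrain.parse_arguments).
hparams = load_hyperpyyaml(yaml_string, overrides={"run_name": "encodec_2cb"})
print(hparams["save_folder"])  # -> results/LSTM/encodec_2cb/save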
dynamic_batching: True max_batch_length_train: 850 max_batch_len_val: 100 @@ -134,20 +131,9 @@ token_prune_min_logp: -1.2 prune_history: False ############################## models ################################ -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -# tokenizer: !new:model.tokenizer_interface.EncodecTokenizer -# source: facebook/encodec_24khz # Only the 24kHz version supports mono audio -# save_path: !ref -# sample_rate: !ref -# bandwidth: !ref -# flat_embeddings: False -# freeze: True -# renorm_embeddings: False - tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref - discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer num_codebooks: !ref vocab_size: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py index 927d7ea84..746a068e1 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py @@ -25,8 +25,6 @@ logger = logging.getLogger(__name__) -_CACHE = {"size": 0} - # Define training procedure class ASR(sb.Brain): @@ -36,7 +34,6 @@ def compute_forward(self, batch, stage): wavs, wav_lens = batch.sig in_toks, _ = batch.speech_tokens - # Extract embeddings in_embs = self.modules.discrete_embedding_layer( in_toks ) # [B, T, N-Q, D] diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 935c013bd..ef3e677b5 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -85,7 +85,9 @@ save_folder = pl.Path(hparams["save_folder"]) logger.info("Extracting dataset tokens ...") tokens_extractor.extract_tokens( - merged_dataset, (save_folder / "librispeech").as_posix() + merged_dataset, + hparams["num_codebooks"], + (save_folder / "librispeech").as_posix(), ) if hparams["save_embedding"]: diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml new file mode 100644 index 000000000..c380f0478 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -0,0 +1,65 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 
1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml new file mode 100644 index 000000000..2263547c5 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml @@ -0,0 +1,83 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: wavml +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +num_clusters: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +# ssl_layer_num: [3, 7, 12, 23] +# deduplicate: [False, False, False, False] +# bpe_tokenizer_path: [null , null, null, null] +ssl_layer_num: [1, 3, 7, 12, 18, 23] +num_codebooks: 6 +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index f68ab9b37..81cbd0fb2 100644 --- 
a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -37,7 +37,7 @@ dataloader_opts: bandwidth: 1.5 num_codebooks: 2 sample_rate: 24000 -save_embedding: True +save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:model.tokenizer_interface.EncodecTokenizer diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..176768d5e --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,54 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 604e3a403..652fa53e1 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -21,135 +21,161 @@ class BaseTokenizer(ABC): + def __init__(self): + super().__init__() + @abstractmethod @torch.no_grad() - def sig_to_tokens(self, signal, lengths, **kwargs): - """Abstract method to encode a signal into tokens.""" + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + """Encode signal into tokens.""" pass @abstractmethod @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - """Abstract method to decode tokens into a signal.""" + """Decode tokens to signal.""" pass @abstractmethod @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - """Return pretrained codebook embedding.""" + def get_pretrained_embeddings( + self, vocab_size, num_codebooks, device="cpu", **kwargs + ): + """Get codebook embeddings.""" pass class EncodecTokenizer(Encodec, BaseTokenizer): + def __init__(self, source, **kwargs): + Encodec.__init__(self, source=source, **kwargs) + BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths, 
**kwargs): - # signal: [B, T] + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): self.eval() - tokens, _ = self.encode(signal, lengths) # [B, T, N_Q] + tokens, _ = self.encode(signal, lengths) + if num_codebooks: + if tokens.shape[-1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[..., :num_codebooks] return tokens @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - # tokens: [B, T, N_Q] self.eval() - signal = self.decode(tokens)[:, 0] # [B, T] + signal = self.decode(tokens)[:, 0] return signal @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - """Return pretrained codebook embedding.""" + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, device=None, **kwargs + ): embeddings = self.vocabulary return embeddings.reshape(-1, embeddings.shape[-1]) class DACTokenizer(DAC, BaseTokenizer): + def __init__(self, *args, **kwargs): + DAC.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths, **kwargs): - # signal: [B, T] + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): self.eval() - tokens, _ = self( - signal[:, None], n_quantizers=kwargs["num_codebooks"] - ) # [B, N_Q, T] - return tokens.movedim(-1, -2) # [B, T, N_Q] + tokens, _ = self(signal[:, None], n_quantizers=num_codebooks) + return tokens.movedim(-1, -2) @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - # tokens: [B, T, N_Q] self.eval() quantized_feats, _, _ = self.quantizer.from_codes( - tokens.movedim(-1, -2) # [B, N_Q, T] + tokens.movedim(-1, -2) ) - signal = self.decode(quantized_feats)[:, 0] # [B, T] - return signal + return self.decode(quantized_feats)[:, 0] @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - """Return pretrained codebook embedding.""" - # See https://github.com/descriptinc/descript-audio-codec/blob/c7cfc5d2647e26471dc394f95846a0830e7bec34/dac/nn/quantize.py#L200 - toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) - toks = ( - toks[:, None, None].expand(-1, kwargs["num_codebooks"], -1).clone() - ) # [C, K, 1] - self.to(kwargs["device"]).eval() - with torch.no_grad(): - z_q, z_p, _ = self.quantizer.from_codes(toks) + def get_pretrained_embeddings( + self, vocab_size, num_codebooks, device="cpu", **kwargs + ): + toks = torch.arange(vocab_size, device=device) + toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() + self.to(device).eval() + z_q, z_p, _ = self.quantizer.from_codes(toks) z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) - z_qs = [] - for i, z_p_i in enumerate(z_ps): - with torch.no_grad(): - z_q_i = self.quantizer.quantizers[i].out_proj( - z_p_i - ) # [C, H, 1] - z_qs.append(z_q_i) - assert (z_q == sum(z_qs)).all() - embeddings = torch.cat(z_qs)[:, :, 0] - return embeddings + z_qs = [ + self.quantizer.quantizers[i].out_proj(z_p_i) + for i, z_p_i in enumerate(z_ps) + ] + return torch.cat(z_qs)[:, :, 0] class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): + def __init__(self, *args, **kwargs): + SpeechTokenizer_interface.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths, **kwargs): - # signal: [B, T] + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): self.eval() - tokens = self(signal)[: kwargs["num_codebooks"]] # [N_Q, B, T] - return tokens.movedim(-3, -1) # [B, T, N_Q] + tokens 
= self(signal) + if num_codebooks: + if len(tokens) < num_codebooks: + raise ValueError( + f"Model only outputs {len(tokens)} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[:num_codebooks] + return tokens.movedim(-3, -1) @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - # tokens: [B, T, N_Q] self.eval() - tokens = tokens.movedim(-1, -3) # [N_Q, B, T] - return self.decode(tokens) # [B, T] + return self.decode(tokens.movedim(-1, -3)) @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - """Return pretrained codebook embedding.""" - # See https://github.com/ZhangXInFD/SpeechTokenizer/blob/a9f88dc72642b600654a62861e34342babae6c71/speechtokenizer/quantization/core_vq.py#L360 - toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) - toks = ( - toks[None, :, None].expand(kwargs["num_codebooks"], -1, -1).clone() - ) # [K, C, 1] - self.to(kwargs["device"]).eval() - embs = [] - for i, indices in enumerate(toks): - layer = self.model.quantizer.vq.layers[i] - with torch.no_grad(): - quantized = layer.decode(indices) - embs.append(quantized) - assert (self.model.quantizer.decode(toks) == sum(embs)).all() - embeddings = torch.cat(embs)[:, :, 0] - return embeddings + def get_pretrained_embeddings( + self, vocab_size, num_codebooks, device="cpu", **kwargs + ): + toks = torch.arange(vocab_size, device=device) + toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() + self.to(device).eval() + embs = [ + self.model.quantizer.vq.layers[i].decode(indices) + for i, indices in enumerate(toks) + ] + return torch.cat(embs)[:, :, 0] class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): + def __init__(self, *args, **kwargs): + DiscreteSSL.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths): - pass + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + self.eval() + tokens, _, _ = self.encode(signal, lengths) + if num_codebooks: + if tokens.shape[-1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[..., :num_codebooks] + return tokens @torch.no_grad() - def tokens_to_sig(self, tokens): - pass + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + return self.decode(tokens) @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - pass + def get_pretrained_embeddings( + self, vocab_size, num_codebooks, device="cpu", **kwargs + ): + toks = torch.arange(vocab_size, device=device) + toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() + self.to(device).eval() + return torch.cat( + [self.quantizer.codebooks[i] for i in range(num_codebooks)] + ) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 272e01ebe..705184d80 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -93,7 +93,9 @@ def __init__( self.dataloader_opts = dataloader_opts self.pipelines = self._make_pipelines() - def extract_tokens(self, dataset, save_path, save_name="tokens"): + def extract_tokens( + self, dataset, num_codebooks, save_path, save_name="tokens" + ): """ Extracts tokens from the dataset and saves them to the specified format. @@ -101,6 +103,12 @@ def extract_tokens(self, dataset, save_path, save_name="tokens"): --------- dataset : speechbrain.dataio.dataset.DynamicItemDataset or dict The dataset from which to extract tokens. Can be a DynamicItemDataset or a dictionary. 
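To make the refactored tokenizer interface above concrete, here is a usage sketch of the EnCodec variant; it assumes the benchmarks/DASB directory is on sys.path and that the facebook/encodec_24khz checkpoint can be fetched, and the cache directory name is a placeholder:

import torch
from model.tokenizer_interface import EncodecTokenizer  # assumes benchmarks/DASB on sys.path

tokenizer = EncodecTokenizer(
    source="facebook/encodec_24khz",   # same checkpoint as the extraction YAML
    save_path="pretrained_models",     # hypothetical cache directory
    sample_rate=24000,
    bandwidth=1.5,
    freeze=True,
)

signal = torch.randn(2, 24000)   # [B, T], one second of audio per example
lengths = torch.ones(2)          # relative lengths
tokens = tokenizer.sig_to_tokens(signal, lengths, num_codebooks=2)  # [B, N, 2]
resynth = tokenizer.tokens_to_sig(tokens)                           # [B, T'] waveform
codebook_table = tokenizer.get_pretrained_embeddings()              # flattened [K * V, D]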
+ num_codebooks: int + The number of codebooks to retrieve from the tokens. + save_path: str + The path where tokens will be saved. + save_name: str + The name of the .scp and .ark files. """ conf = { "sample_rate": self.sample_rate, @@ -136,7 +144,9 @@ def extract_tokens(self, dataset, save_path, save_name="tokens"): batch = batch.to(self.device) x, x_lengths = batch["sig"] ids = batch[self.id_key] - batch_tokens = self.tokenizer.sig_to_tokens(x, x_lengths) + batch_tokens = self.tokenizer.sig_to_tokens( + x, x_lengths, num_codebooks=num_codebooks + ) batch_tokens = sb.utils.data_utils.undo_padding( batch_tokens, x_lengths ) From 973e12b97a502af4dd04c8f2738c82d1462d6939 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 20 Dec 2024 13:34:46 -0500 Subject: [PATCH 011/270] change name --- .../{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/dac.yaml | 0 .../{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/encodec.yaml | 0 .../hparams/LSTM/speech_tokenizer.yaml | 0 .../{ASR-refactor => ASR-on-the-fly}/hparams/contextnet/dac.yaml | 0 .../hparams/contextnet/encodec.yaml | 0 .../hparams/contextnet/speech_tokenizer.yaml | 0 .../{ASR-refactor => ASR-on-the-fly}/librispeech_prepare.py | 0 .../DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/train.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/dac.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/encodec.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/speech_tokenizer.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/contextnet/dac.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/contextnet/encodec.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/contextnet/speech_tokenizer.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/librispeech_prepare.py (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/train.py (100%) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml diff --git 
a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/librispeech_prepare.py similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/librispeech_prepare.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/train.py rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py From 8dca49daf10eb89a86922287e6d8a016f15cb249 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 12:09:28 -0500 Subject: [PATCH 012/270] add discrete_ssl, reorgnaize folder --- .../DASB/LibriSpeech/ASR/LSTM/custom_model.py | 1 - .../ASR/LSTM/hparams/train_dac.yaml | 178 --------- .../ASR/LSTM/hparams/train_discrete_ssl.yaml | 216 ----------- .../ASR/LSTM/hparams/train_encodec.yaml | 183 ---------- .../LSTM/hparams/train_speech_tokenizer.yaml | 169 --------- .../ASR/LSTM/hparams/train_weighted_ssl.yaml | 162 --------- .../ASR/LSTM/librispeech_prepare.py | 1 - .../DASB/LibriSpeech/ASR/LSTM/train_dac.py | 335 ----------------- .../ASR/LSTM/train_discrete_ssl.py | 333 ----------------- .../LibriSpeech/ASR/LSTM/train_encodec.py | 340 ------------------ .../ASR/LSTM/train_speech_tokenizer.py | 335 ----------------- .../ASR/LSTM/train_weighted_ssl.py | 322 ----------------- .../ASR/contextnet/custom_model.py | 1 - .../ASR/contextnet/hparams/train_dac.yaml | 172 --------- .../hparams/train_discrete_ssl.yaml | 214 ----------- .../ASR/contextnet/hparams/train_encodec.yaml | 178 --------- .../hparams/train_speech_tokenizer.yaml | 160 --------- .../hparams/train_weighted_ssl.yaml | 157 -------- .../ASR/contextnet/librispeech_prepare.py | 1 - .../LibriSpeech/ASR/contextnet/train_dac.py | 321 ----------------- .../ASR/contextnet/train_discrete_ssl.py | 319 ---------------- .../ASR/contextnet/train_encodec.py | 316 ---------------- .../ASR/contextnet/train_speech_tokenizer.py | 319 ---------------- .../ASR/contextnet/train_weighted_ssl.py | 318 ---------------- .../hparams/LSTM/train.yaml | 0 .../hparams/contextnet/train.yaml | 0 .../librispeech_prepare.py | 0 .../{ASR-refactor-tokens => ASR}/train.py | 0 .../DASB/LibriSpeech/extraction/extract.py | 6 +- benchmarks/DASB/model/tokenizer_interface.py | 50 +-- benchmarks/DASB/utils/tokens.py | 7 +- 31 files changed, 30 insertions(+), 5084 deletions(-) delete mode 120000 benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml delete mode 100644 
benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml delete mode 120000 benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py delete mode 120000 benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml delete mode 120000 benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py rename benchmarks/DASB/LibriSpeech/{ASR-refactor-tokens => ASR}/hparams/LSTM/train.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor-tokens => ASR}/hparams/contextnet/train.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor-tokens => ASR}/librispeech_prepare.py (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor-tokens => ASR}/train.py (100%) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py deleted file mode 120000 index 4b3f08ebb..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py +++ /dev/null @@ -1 +0,0 @@ -../../../model/custom_model.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml deleted file mode 100644 index 0b00db1f7..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
-# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/dac/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - - -### Config for Tokenizer -# DAC parameters -# model_type: [16khz, 24khz, 44khz, 44khz] -# vocab_size: [1024, 1024, 1024, 1024] -# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] -# max_num_codebooks: [12, 32, 9, 18] -# embedding_dim: [1024, 1024, 1024, 128] -model_type: 24khz -vocab_size: 1024 -model_bitrate: 8kbps -num_codebooks: 2 # NOTE: must be smaller or equal to the maximum number of codebooks for the given model type -sample_rate: 24000 -encoder_dim: 1024 - - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 768 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# Modules -# DAC model (see https://github.com/descriptinc/descript-audio-codec) -codec: !new:speechbrain.lobes.models.discrete.dac.DAC - model_type: !ref - model_bitrate: !ref - load_pretrained: True - tag: latest - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - 
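-# Note: `model` above gathers only the trainable modules handed to
-# model_opt_class below; the pretrained codec is excluded and is used frozen
-# during training.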
-model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml deleted file mode 100644 index c5a920693..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml +++ /dev/null @@ -1,216 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/discrete_ssl/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -### Configuration for discrete SSL model -# ssl_model_type: hubert, wavlm, wav2vec2 -# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: hubert # hubert, wavml or wav2vec2 -ssl_hub: facebook/hubert-large-ll60k -ssl_folder: !ref /ssl_checkpoint -kmeans_repo_id: speechbrain/SSL_Quantization -kmeans_cache_dir: !ref /kmeans_checkpoint -kmeans_dataset: LibriSpeech-100-360-500 -freeze_ssl: True -freeze_feature_extractor: True -num_clusters: 1000 - -### Config for Tokenizer -# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 -deduplicate: [False, False, False, False, False, False] -bpe_tokenizer_path: [null, null, null, null, null, null] -sample_rate: 16000 -encoder_dim: 1024 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: 
!name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 1024 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer_config: - SSL_layers: !ref - deduplicates: !ref - bpe_tokenizers: !ref - -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - -codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - save_path: !ref - ssl_model: !ref - kmeans_dataset: !ref - kmeans_repo_id: !ref - num_clusters: !ref - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml 
b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml deleted file mode 100644 index e2477819a..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml +++ /dev/null @@ -1,183 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: data # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - - -### Config for Tokenizer -# EnCodec parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] -vocab_size: 1024 -bandwidth: 1.5 -num_codebooks: 2 -sample_rate: 24000 -# Feature parameters -encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
-init_embedding: False -freeze_embedding: False - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 1024 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - freeze: !ref - init: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml deleted file mode 100644 index eda9a2bad..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml +++ /dev/null @@ -1,169 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
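-# SpeechTokenizer operates on 16 kHz audio; this recipe keeps only the first
-# num_codebooks token streams returned by the codec as the discrete input features.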
-# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/speech_tokenizer/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - - -### Config for Tokenizer -vocab_size: 1024 -num_codebooks: 2 -sample_rate: 16000 - -# Feature parameters - -encoder_dim: 1024 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 1024 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - 
improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml deleted file mode 100644 index bcfbe8d50..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml +++ /dev/null @@ -1,162 +0,0 @@ -# ################################ -# Recipe for training an SSL-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * Salah Zaiem 2023 -# * Youcef Kemiche 2023 -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/weighted_ssl/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -ssl_hub: microsoft/wavlm-large -ssl_folder: !ref /ssl_checkpoints -encoder_dim: 1024 - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -lr_weights: 0.01 -sorting: ascending -precision: fp32 -sample_rate: 16000 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 768 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -weighted_ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.WeightedSSLModel # yamllint disable-line rule:line-length - hub: !ref - save_path: !ref - 
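-# The weighted SSL front-end learns one weight per hidden layer of the WavLM
-# model configured above and feeds their weighted sum to the LSTM encoder below;
-# these layer weights get their own optimizer (weights_opt_class, lr_weights).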
-enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - weighted_ssl_model: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -weights_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -lr_annealing_weights: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.9 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - ssl_model: !ref - scheduler_model: !ref - scheduler_encoder: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py deleted file mode 120000 index cf4adfd79..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py +++ /dev/null @@ -1 +0,0 @@ -../../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py deleted file mode 100644 index 479d6719b..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py +++ /dev/null @@ -1,335 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
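-
-DAC returns codes of shape [batch, codebook, time]; the forward pass below moves
-them to [batch, time, codebook], embeds each codebook stream, and lets an attention
-MLP pool the codebook embeddings into one feature vector per frame before the LSTM.
-
-Illustrative invocation (adjust paths and overrides to your setup):
-
-> python train_dac.py hparams/train_dac.yaml --data_folder /path/to/LibriSpeech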
- -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec( - wavs.unsqueeze(1), n_quantizers=self.hparams.num_codebooks - ) - embeddings = self.modules.discrete_embedding_layer( - tokens.movedim(-2, -1) - ) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - # "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py deleted file mode 100644 index 2aac19193..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _, _ = self.hparams.codec( - wavs, wav_lens, **self.hparams.tokenizer_config - ) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, 
target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - # "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! 
otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. 
Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py deleted file mode 100644 index d2215ce45..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py +++ /dev/null @@ -1,340 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
- -Authors - * Adel Moumen 2024 - * Salah Zaiem 2023 - * Youcef Kemiche 2023 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec.encode(wavs, wav_lens) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - if hparams["discrete_embedding_layer"].init: - hparams["discrete_embedding_layer"].init_embedding( - hparams["codec"] - .vocabulary[: hparams["num_codebooks"], :, :] - .flatten(0, 1) - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py deleted file mode 100644 index 1493b5972..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py +++ /dev/null @@ -1,335 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Adel Moumen 2024 - * Salah Zaiem 2023 - * Youcef Kemiche 2023 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens = self.hparams.codec(wavs).permute(1, 2, 0)[ - :, :, : self.hparams.num_codebooks - ] - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - 
self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! 
otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. 
Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py deleted file mode 100644 index 4a7aed382..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py +++ /dev/null @@ -1,322 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
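# A self-contained sketch of the test-time beam search mentioned above, using
# dummy log-probabilities; in the recipe, vocab_list comes from the label
# encoder and the search settings from hparams["test_beam_search"].
import torch
from speechbrain.decoders.ctc import CTCBeamSearcher

vocab_list = ["-", "a", "b", " "]        # index 0 plays the role of the CTC blank here
searcher = CTCBeamSearcher(blank_index=0, vocab_list=vocab_list, beam_size=10)
p_ctc = torch.randn(1, 50, len(vocab_list)).log_softmax(dim=-1)      # [B, T, V]
hyps = searcher(p_ctc, torch.tensor([1.0]))                          # relative lengths
best_text = hyps[0][0].text              # top hypothesis for the first utterance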
- -Authors - * Adel Moumen 2024 - * Salah Zaiem 2023 - * Youcef Kemiche 2023 - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - feats = self.modules.weighted_ssl_model(wavs) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
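# A minimal sketch of the annealing step that follows: each NewBobScheduler is
# queried with the current validation loss and the learning rate it returns is
# pushed into the matching optimizer. The loss value and parameters are dummies.
import torch
from speechbrain.nnet.schedulers import NewBobScheduler, update_learning_rate

scheduler = NewBobScheduler(
    initial_value=0.0002, improvement_threshold=0.0025, annealing_factor=0.8, patient=0
)
optimizer = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr=0.0002)
old_lr, new_lr = scheduler(0.95)         # 0.95 stands in for stage_stats["loss"]
update_learning_rate(optimizer, new_lr)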
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - sb.nnet.schedulers.update_learning_rate( - self.weights_optimizer, new_lr_weights - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - self.weights_optimizer = self.hparams.weights_opt_class( - [self.modules.weighted_ssl_model.weights] - ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - self.checkpointer.add_recoverable( - "weights_opt", self.weights_optimizer - ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - return sig - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. 
Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. 
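# A toy illustration of what the character-level pipeline above produces: in
# these recipes blank and unk take indices 0 and 1, and every character seen in
# the training transcripts gets its own id (the real inventory is built and
# stored by CTCTextEncoder.load_or_create).
blank_index, unk_index = 0, 1
char2index = {"<blank>": blank_index, "<unk>": unk_index}
for ch in "HELLO WORLD":
    char2index.setdefault(ch, len(char2index))
tokens = [char2index[ch] for ch in "HELLO WORLD"]
# "HELLO WORLD" -> [2, 3, 4, 4, 5, 6, 7, 5, 8, 4, 9]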
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py deleted file mode 120000 index 4b3f08ebb..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py +++ /dev/null @@ -1 +0,0 @@ -../../../model/custom_model.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml deleted file mode 100644 index 4533e2e8d..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml +++ /dev/null @@ -1,172 +0,0 @@ -# ################################ -# Recipe for training an dac-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/dac/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -### Config for Tokenizer -# DAC parameters -# model_type: [16khz, 24khz, 44khz, 44khz] -# vocab_size: [1024, 1024, 1024, 1024] -# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] -# max_num_codebooks: [12, 32, 9, 18] -# embedding_dim: [1024, 1024, 1024, 128] -model_type: 24khz -vocab_size: 1024 -model_bitrate: 8kbps -num_codebooks: 2 # NOTE: must be smaller or equal to the maximum number of codebooks for the given model type -sample_rate: 24000 -encoder_dim: 1024 - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: 
!name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.discrete.dac.DAC - model_type: !ref - model_bitrate: !ref - load_pretrained: True - tag: latest - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml deleted file mode 100644 index c394c73c1..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml +++ /dev/null @@ -1,214 +0,0 @@ -# ################################ -# Recipe for training an discrete_ssl-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
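# A minimal sketch of how hparams files like these are consumed: after loading,
# `!new:` entries are already-instantiated objects, `!name:` entries are
# callables, and `!ref` values are resolved in place. The path and the override
# below are placeholders (data_folder is a !PLACEHOLDER and must be supplied).
from hyperpyyaml import load_hyperpyyaml

with open("hparams/train_dac.yaml") as fin:
    hparams = load_hyperpyyaml(fin, {"data_folder": "/path/to/LibriSpeech"})

epoch_counter = hparams["epoch_counter"]      # EpochCounter instance (!new:)
model_opt_class = hparams["model_opt_class"]  # callable wrapping torch.optim.Adam (!name:)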
-# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) - -### Configuration for discrete SSL model -# ssl_model_type: hubert, wavlm, wav2vec2 -# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: hubert # hubert, wavml or wav2vec2 -ssl_hub: facebook/hubert-large-ll60k -ssl_folder: !ref /ssl_checkpoint -kmeans_repo_id: speechbrain/SSL_Quantization -kmeans_cache_dir: !ref /kmeans_checkpoint -kmeans_dataset: LibriSpeech-100-360-500 -freeze_ssl: True -freeze_feature_extractor: True -num_clusters: 1000 - -### Config for Tokenizer -# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 -deduplicate: [False, False, False, False, False, False] -bpe_tokenizer_path: [null, null, null, null, null, null] -sample_rate: 16000 -encoder_dim: 1024 - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer_config: - SSL_layers: !ref - deduplicates: !ref - bpe_tokenizers: !ref - -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - 
output_all_hiddens: True - save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - -codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - save_path: !ref - ssl_model: !ref - kmeans_dataset: !ref - kmeans_repo_id: !ref - num_clusters: !ref - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml deleted file mode 100644 index 6163550e9..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# ################################ -# Recipe for training an encodec-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
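# A plain-Python stand-in for the `!apply:speechbrain.utils.hparams.choice`
# block above: the ssl_model_type string selects which SSL encoder (and hub)
# is used. The helper below is a local illustration, not the library function.
def choice(value, choices):
    return choices[value]

ssl_hub = choice(
    value="hubert",
    choices={
        "hubert": "facebook/hubert-large-ll60k",
        "wavlm": "microsoft/wavlm-large",
        "wav2vec2": "facebook/wav2vec2-large",
    },
)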
-# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -### Config for Tokenizer -# EnCodec parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] -vocab_size: 1024 -bandwidth: 1.5 -num_codebooks: 2 -sample_rate: 24000 -# Feature parameters -encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. -init_embedding: False -freeze_embedding: False - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - freeze: !ref - init: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: 
!new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml deleted file mode 100644 index aef1307ec..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml +++ /dev/null @@ -1,160 +0,0 @@ -# ################################ -# Recipe for training an speech_tokenizer-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/speech_tokenizer/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -### Config for Tokenizer -vocab_size: 1024 -num_codebooks: 2 -sample_rate: 16000 - -encoder_dim: 1024 -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here 
https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml deleted file mode 100644 index 6d806f0a5..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# ################################ -# Recipe for training an encodec-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
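# A minimal sketch of the checkpointing pattern used throughout these recipes:
# everything listed under `recoverables` is snapshotted together and only the
# checkpoint with the lowest WER is kept. The directory, module and metric
# value are dummies.
import torch
from speechbrain.utils.checkpoints import Checkpointer

model = torch.nn.Linear(4, 4)
checkpointer = Checkpointer("results/demo/save", recoverables={"model": model})
checkpointer.save_and_keep_only(meta={"WER": 12.3}, min_keys=["WER"])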
-# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -ssl_hub: microsoft/wavlm-large -ssl_folder: !ref /ssl_checkpoints -encoder_dim: 1024 - -# Training parameters -number_of_epochs: 2 -lr: 0.0002 -lr_weights: 0.01 -sorting: ascending -precision: fp32 -sample_rate: 16000 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -weighted_ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.WeightedSSLModel # yamllint disable-line rule:line-length - hub: !ref - save_path: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - weighted_ssl_model: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -weights_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -lr_annealing_weights: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.9 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: 
!new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - ssl_model: !ref - scheduler_model: !ref - scheduler_encoder: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py deleted file mode 120000 index cf4adfd79..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py +++ /dev/null @@ -1 +0,0 @@ -../../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py deleted file mode 100644 index a177e48a5..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py +++ /dev/null @@ -1,321 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec( - wavs.unsqueeze(1), n_quantizers=self.hparams.num_codebooks - ) - embeddings = self.modules.discrete_embedding_layer( - tokens.movedim(-2, -1) - ) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def 
on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
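# A shape-level sketch of the attention pooling used in compute_forward above:
# per-frame weights over the K codebook embeddings collapse [B, N, K, D] into a
# single feature stream [B, N, D]. The softmax weights stand in for the
# AttentionMLP module.
import torch

B, N, K, D = 2, 50, 2, 1024
embeddings = torch.randn(B, N, K, D)                     # discrete-token embeddings
att_w = torch.softmax(torch.randn(B, N, K, 1), dim=2)    # stand-in for attention_mlp
feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2)  # [B, N, D]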
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py deleted file mode 100644 index 640f6a220..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _, _ = self.hparams.codec( - wavs, wav_lens, **self.hparams.tokenizer_config - ) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - 
self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py deleted file mode 100644 index eb7232303..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec.encode(wavs, wav_lens) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def 
on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py deleted file mode 100644 index cd784c80c..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens = self.hparams.codec(wavs).permute(1, 2, 0)[ - :, :, : self.hparams.num_codebooks - ] - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, 
target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! 
otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. 
Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py deleted file mode 100644 index 6d053fceb..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py +++ /dev/null @@ -1,318 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
- -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - feats = self.modules.weighted_ssl_model(wavs) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - sb.nnet.schedulers.update_learning_rate( - self.weights_optimizer, new_lr_weights - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - self.weights_optimizer = self.hparams.weights_opt_class( - [self.modules.weighted_ssl_model.weights] - ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - self.checkpointer.add_recoverable( - "weights_opt", self.weights_optimizer - ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - return sig - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. 
Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml rename to benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml rename to benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR/librispeech_prepare.py similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py rename to benchmarks/DASB/LibriSpeech/ASR/librispeech_prepare.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py rename to benchmarks/DASB/LibriSpeech/ASR/train.py diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index ef3e677b5..62d45cfec 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -62,16 +62,14 @@ csv_path = hparams[f"{split}_csv"] name = pl.Path(csv_path).stem dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_path, - replacements={"data_root": data_folder}, + csv_path=csv_path, replacements={"data_root": data_folder}, ) datasets.append(dataset) for split in hparams["test_csv"]: name = pl.Path(split).stem dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=split, - replacements={"data_root": data_folder}, + csv_path=split, replacements={"data_root": data_folder}, ) datasets.append(dataset) diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 652fa53e1..3499bba9e 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -39,15 +39,15 @@ def tokens_to_sig(self, tokens, **kwargs): @abstractmethod @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size, num_codebooks, device="cpu", **kwargs + self, vocab_size, num_codebooks, **kwargs ): """Get codebook embeddings.""" pass class EncodecTokenizer(Encodec, BaseTokenizer): - def __init__(self, source, **kwargs): - 
Encodec.__init__(self, source=source, **kwargs) + def __init__(self, *args, **kwargs): + Encodec.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) @torch.no_grad() @@ -70,7 +70,7 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size=None, num_codebooks=None, device=None, **kwargs + self, vocab_size=None, num_codebooks=None, **kwargs ): embeddings = self.vocabulary return embeddings.reshape(-1, embeddings.shape[-1]) @@ -97,7 +97,7 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size, num_codebooks, device="cpu", **kwargs + self, vocab_size=None, num_codebooks=None , **kwargs ): toks = torch.arange(vocab_size, device=device) toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() @@ -135,11 +135,11 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size, num_codebooks, device="cpu", **kwargs + self, vocab_size=None, num_codebooks=None , **kwargs ): - toks = torch.arange(vocab_size, device=device) + toks = torch.arange(vocab_size) toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() - self.to(device).eval() + self.eval() embs = [ self.model.quantizer.vq.layers[i].decode(indices) for i, indices in enumerate(toks) @@ -153,29 +153,31 @@ def __init__(self, *args, **kwargs): BaseTokenizer.__init__(self) @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths, num_codebooks=None,**kwargs): self.eval() - tokens, _, _ = self.encode(signal, lengths) - if num_codebooks: - if tokens.shape[-1] < num_codebooks: - raise ValueError( - f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" - ) - tokens = tokens[..., :num_codebooks] + tokens, _, _ = self.encode(signal, lengths, SSL_layers=num_codebooks,**kwargs) return tokens @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): self.eval() - return self.decode(tokens) + return self.decode(tokens, **kwargs) @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size, num_codebooks, device="cpu", **kwargs + self, vocab_size=None, num_codebooks=None, **kwargs ): - toks = torch.arange(vocab_size, device=device) - toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() - self.to(device).eval() - return torch.cat( - [self.quantizer.codebooks[i] for i in range(num_codebooks)] - ) + embs = [] + for layer_num, vocabulary in zip( + self.ssl_layer_ids, + self.vocabularies, + ): + if layer_num not in num_codebooks: + continue + embs.append( + torch.as_tensor( + vocabulary, dtype=torch.float32 + ) + ) + embs = torch.cat(embs) + return embs diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 705184d80..7090325db 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -191,8 +191,7 @@ def audio_pipeline(wav): info = torchaudio.info(wav) sig = sb.dataio.dataio.read_audio(wav) sig = torchaudio.transforms.Resample( - info.sample_rate, - self.sample_rate, + info.sample_rate, self.sample_rate, )(sig) return sig @@ -283,9 +282,7 @@ class TokensLoader: """ def __init__( - self, - data_path, - save_name="tokens", + self, data_path, save_name="tokens", ): self.data_path = pl.Path(data_path) if not self.data_path.exists(): From e317d3a95e1839e86503799269f458abce62794a Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 13:25:00 -0500 Subject: [PATCH 013/270] clean code and fix 
speechtokenzier bug --- .../extraction/hparams/discrete_ssl.yaml | 100 ++++++++++++++++++ benchmarks/DASB/extra_requirements.txt | 1 + benchmarks/DASB/model/tokenizer_interface.py | 1 + 3 files changed, 102 insertions(+) create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..6a58b0135 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,100 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: wavml +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +num_clusters: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +# ssl_layer_num: [3, 7, 12, 23] +# deduplicate: [False, False, False, False] +# bpe_tokenizer_path: [null , null, null, null] +ssl_layer_num: [1, 3, 7, 12, 18, 23] +num_codebooks: 6 +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: 
!ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index 4d1d241c3..db9ae4376 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -8,3 +8,4 @@ speechtokenizer>=0.1.2 tensorboard tgt unidecode +kaldiio diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 3499bba9e..68fdf4221 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -115,6 +115,7 @@ class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): def __init__(self, *args, **kwargs): SpeechTokenizer_interface.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) + self.sample_rate = 16000 @torch.no_grad() def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): From fcb5209e80ae7f4588ec4c205884f4d14a06bbc0 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 14:29:59 -0500 Subject: [PATCH 014/270] fix discrete_ssl bug --- .../extraction/hparams/discrete_ssl.yaml | 2 +- .../hparams/discrete_ssl_wavlm.yaml | 83 ------------------- 2 files changed, 1 insertion(+), 84 deletions(-) delete mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 6a58b0135..d6715c54e 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -54,7 +54,7 @@ save_embedding: False # deduplicate: [False, False, False, False] # bpe_tokenizer_path: [null , null, null, null] ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 +num_codebooks: [1, 3, 7, 12, 18, 23] deduplicate: [False, False, False, False, False, False] bpe_tokenizer_path: [null, null, null, null, null, null] sample_rate: 16000 diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml deleted file mode 100644 index 2263547c5..000000000 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# ############################################################################ -# Auido Tokenizer: WavLM -# Extraction: Librispeech 960h -# Authors: Jarod Duret 2024 -# ############################################################################ -# Seed needs to be set at top of yaml, before objects with parameters are made - -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/wavlm -save_folder: !ref /save -train_log: !ref /extraction_log.txt - -# Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" -dev_splits: ["dev-clean"] -test_splits: ["dev-clean", "test-clean", "test-other"] -skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -batch_size: 8 -num_workers: 8 -src_key: wav -id_key: id - -# Dataloader options -dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref - -### Configuration for discrete SSL model -# ssl_model_type: hubert, wavlm, wav2vec2 -# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large 
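The discrete SSL extraction config added above (hparams/discrete_ssl.yaml) wires the WavLM SSL model, the k-means quantizer and the vocoder into a single `tokenizer` object that `tokens_extractor` consumes. A minimal sketch of how the pieces resolve when the file is loaded directly with hyperpyyaml — the LibriSpeech path is a placeholder, and the working directory is assumed to be the benchmark root so that `model` and `utils` are importable:

    from hyperpyyaml import load_hyperpyyaml

    # data_folder is declared as !PLACEHOLDER in the yaml, so it must be overridden here
    with open("hparams/discrete_ssl.yaml") as fin:
        hparams = load_hyperpyyaml(fin, {"data_folder": "/path/to/LibriSpeech"})

    tokenizer = hparams["tokenizer"]          # DiscreteSSLTokenizer (WavLM + k-means + vocoder)
    extractor = hparams["tokens_extractor"]   # TokensExtractor wrapping the tokenizer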
-ssl_model_type: wavml -ssl_hub: microsoft/wavlm-large -ssl_folder: !ref /ssl_checkpoint -kmeans_cache_dir: !ref /kmeans_checkpoint -kmeans_dataset: LibriSpeech -vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS -freeze_ssl: True -freeze_feature_extractor: True -num_clusters: 1000 -save_embedding: False - -### Config for Tokenizer -# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 -deduplicate: [False, False, False, False, False, False] -bpe_tokenizer_path: [null, null, null, null, null, null] -sample_rate: 16000 -encoder_dim: 1024 - -ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - -tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref - -tokens_extractor: !new:utils.tokens.TokensExtractor - tokenizer: !ref - sample_rate: !ref - src_key: !ref - id_key: !ref - dataloader_opts: !ref \ No newline at end of file From 0d575d43cb4e818d5bf01d90906d9844e21d8a05 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 17:55:16 -0500 Subject: [PATCH 015/270] fix bug --- .../DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index d6715c54e..9ce170b66 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -37,7 +37,7 @@ dataloader_opts: ### Configuration for discrete SSL model # ssl_model_type: hubert, wavlm, wav2vec2 # ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: wavml +ssl_model_type: wavlm ssl_hub: microsoft/wavlm-large ssl_folder: !ref /ssl_checkpoint kmeans_cache_dir: !ref /kmeans_checkpoint From 447844c3fcb5ff22e9bc5212725afad02d648d76 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 20:52:21 -0500 Subject: [PATCH 016/270] fix bug --- benchmarks/DASB/model/tokenizer_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 68fdf4221..6c5ab5acd 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -99,9 +99,9 @@ def tokens_to_sig(self, tokens, **kwargs): def get_pretrained_embeddings( self, vocab_size=None, num_codebooks=None , **kwargs ): - toks = torch.arange(vocab_size, device=device) + toks = torch.arange(vocab_size) toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() - self.to(device).eval() + self.eval() z_q, z_p, _ = self.quantizer.from_codes(toks) z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) z_qs = [ From 8aeaeb92a8238cece88e619cb0674809368409d9 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 21:04:39 -0500 Subject: [PATCH 017/270] fix discrete_ssl train.py for specifiying which layer to use --- benchmarks/DASB/utils/tokens.py | 37 
+++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 7090325db..994490958 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -299,9 +299,10 @@ def tokens_by_uttid(self, utt_id, num_codebooks=None): --------- utt_id : str The utterance ID to retrieve tokens for. - num_codebooks : int, optional - The number of codebooks to retrieve from the tokens. If specified, the tokens will be truncated - to include only the first `num_codebooks` codebooks. If not specified, all codebooks are returned. + num_codebooks : int or list, optional + The number of codebooks to retrieve from the tokens. If specified as an int, the tokens + will be truncated to include only the first `num_codebooks` codebooks. If specified as a list, + the tokens will include only the codebooks at the specified indices. If not specified, all codebooks are returned. Returns ------- @@ -322,16 +323,26 @@ def tokens_by_uttid(self, utt_id, num_codebooks=None): tokens = torch.from_numpy(tokens).long() if num_codebooks is not None: - if not isinstance(num_codebooks, int) or num_codebooks <= 0: - raise ValueError( - f"Invalid num_codebooks value: {num_codebooks}. It must be a positive integer." - ) - if num_codebooks > tokens.size(-1): - raise ValueError( - f"Invalid number of codebooks: {num_codebooks}. " - f"Available codebooks: {tokens.size(-1)}." - ) - tokens = tokens[:, :num_codebooks] + if isinstance(num_codebooks, int): + if num_codebooks <= 0: + raise ValueError( + f"Invalid num_codebooks value: {num_codebooks}. It must be a positive integer." + ) + if num_codebooks > tokens.size(-1): + raise ValueError( + f"Invalid number of codebooks: {num_codebooks}. " + f"Available codebooks: {tokens.size(-1)}." + ) + tokens = tokens[:, :num_codebooks] + elif isinstance(num_codebooks, list): + if not all(isinstance(idx, int) and 0 <= idx < tokens.size(-1) for idx in num_codebooks): + raise ValueError( + f"Invalid indices in num_codebooks list: {num_codebooks}. " + f"All indices must be integers within the range [0, {tokens.size(-1) - 1}]." 
+ ) + tokens = tokens[:, num_codebooks] + else: + raise ValueError("num_codebooks must be an int or a list.") return tokens From c831e609dc78d251089433a01baf39ec2beccc24 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 21:18:05 -0500 Subject: [PATCH 018/270] fix discrete_ssl --- benchmarks/DASB/model/custom_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 1c655fc65..3ad6830c6 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -63,7 +63,7 @@ def __init__( ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size - self.num_codebooks = num_codebooks + self.num_codebooks = len(num_codebooks) if isinstance(num_codebooks, list) else num_codebooks self.freeze = freeze self.embedding = torch.nn.Embedding( num_codebooks * vocab_size, emb_dim From ecf761a99e2593a5fa9d28a97339864dc9247878 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 21:19:46 -0500 Subject: [PATCH 019/270] fix bug introduced in last commit --- benchmarks/DASB/model/custom_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 3ad6830c6..01ff586df 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -66,7 +66,7 @@ def __init__( self.num_codebooks = len(num_codebooks) if isinstance(num_codebooks, list) else num_codebooks self.freeze = freeze self.embedding = torch.nn.Embedding( - num_codebooks * vocab_size, emb_dim + self.num_codebooks * vocab_size, emb_dim ).requires_grad_(not self.freeze) self.init = init From 0d2e30989e772e44faeb94e2bdb841b7fa26c9cf Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 22:40:51 -0500 Subject: [PATCH 020/270] fix bug in saving pretrained embedding --- benchmarks/DASB/LibriSpeech/extraction/extract.py | 4 +++- benchmarks/DASB/utils/tokens.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 62d45cfec..93b309ff5 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -92,5 +92,7 @@ save_folder = pl.Path(hparams["save_folder"]) logger.info(f"Saving embeddings ...") tokens_extractor.save_pretrained_embeddings( - (save_folder / "embeddings").as_posix() + (save_folder / "embeddings").as_posix(), + hparams["num_codebooks"], + hparams["vocab_size"] ) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 994490958..474ec496f 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -197,7 +197,7 @@ def audio_pipeline(wav): return [audio_pipeline] - def save_pretrained_embeddings(self, save_path, save_name="embeddings"): + def save_pretrained_embeddings(self, save_path, save_name="embeddings",num_codebooks=None, vocab_size=None): """ Saves the pretrained embeddings of the tokenizer to a specified directory. 
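The change to `tokens_by_uttid` above lets `num_codebooks` be either an int (keep the first K codebooks) or a list of explicit codebook indices, which is what the discrete SSL recipes need in order to pick specific SSL layers. A minimal usage sketch, assuming tokens were already extracted to a hypothetical data path and using a made-up utterance ID:

    from utils.tokens import TokensLoader

    loader = TokensLoader(data_path="results/wavlm/save/librispeech")
    toks_first2 = loader.tokens_by_uttid("1089-134686-0000", num_codebooks=2)          # first two codebooks
    toks_picked = loader.tokens_by_uttid("1089-134686-0000", num_codebooks=[0, 2, 5])  # codebooks 0, 2 and 5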
@@ -216,7 +216,7 @@ def save_pretrained_embeddings(self, save_path, save_name="embeddings"): save_path = pl.Path(save_path).absolute() save_path.mkdir(parents=True, exist_ok=True) - embeddings = self.tokenizer.get_pretrained_embeddings() + embeddings = self.tokenizer.get_pretrained_embeddings(num_codebooks,vocab_size) embeddings = embeddings.cpu().numpy() np.save(save_path / save_name, embeddings) From 4729007d099d3d474c4a47f89602ed64a18214f0 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 22:44:08 -0500 Subject: [PATCH 021/270] fix --- benchmarks/DASB/LibriSpeech/extraction/extract.py | 3 ++- benchmarks/DASB/utils/tokens.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 93b309ff5..5ee2bbbba 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -93,6 +93,7 @@ logger.info(f"Saving embeddings ...") tokens_extractor.save_pretrained_embeddings( (save_folder / "embeddings").as_posix(), + hparams["vocab_size"], hparams["num_codebooks"], - hparams["vocab_size"] + ) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 474ec496f..930b10253 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -197,7 +197,7 @@ def audio_pipeline(wav): return [audio_pipeline] - def save_pretrained_embeddings(self, save_path, save_name="embeddings",num_codebooks=None, vocab_size=None): + def save_pretrained_embeddings(self, save_path, save_name="embeddings",vocab_size=None,num_codebooks=None): """ Saves the pretrained embeddings of the tokenizer to a specified directory. @@ -216,7 +216,7 @@ def save_pretrained_embeddings(self, save_path, save_name="embeddings",num_codeb save_path = pl.Path(save_path).absolute() save_path.mkdir(parents=True, exist_ok=True) - embeddings = self.tokenizer.get_pretrained_embeddings(num_codebooks,vocab_size) + embeddings = self.tokenizer.get_pretrained_embeddings(vocab_size,num_codebooks) embeddings = embeddings.cpu().numpy() np.save(save_path / save_name, embeddings) From 7a0ecc2875db91f06301c11adc5ef080a2b3647c Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 23:00:46 -0500 Subject: [PATCH 022/270] fix bug intriduced in prev commit --- benchmarks/DASB/LibriSpeech/extraction/extract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 5ee2bbbba..7310a1469 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -93,7 +93,7 @@ logger.info(f"Saving embeddings ...") tokens_extractor.save_pretrained_embeddings( (save_folder / "embeddings").as_posix(), - hparams["vocab_size"], - hparams["num_codebooks"], + vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], ) From 73dfa4d32429dc64f18be78201779c39771d2c44 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 23:30:09 -0500 Subject: [PATCH 023/270] fix bug for saveing embeedng --- .../DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml | 4 ++-- benchmarks/DASB/model/tokenizer_interface.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 9ce170b66..6d38e285c 100644 
--- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -45,7 +45,7 @@ kmeans_dataset: LibriSpeech vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS freeze_ssl: True freeze_feature_extractor: True -num_clusters: 1000 +vocab_size: 1000 save_embedding: False ### Config for Tokenizer @@ -90,7 +90,7 @@ tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer ssl_model: !ref vocoder_repo_id: !ref kmeans_dataset: !ref - num_clusters: !ref + num_clusters: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 6c5ab5acd..a4d3ae111 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -138,7 +138,7 @@ def tokens_to_sig(self, tokens, **kwargs): def get_pretrained_embeddings( self, vocab_size=None, num_codebooks=None , **kwargs ): - toks = torch.arange(vocab_size) + toks = torch.arange(vocab_size).to(next(self.parameters()).device) toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() self.eval() embs = [ @@ -181,4 +181,4 @@ def get_pretrained_embeddings( ) ) embs = torch.cat(embs) - return embs + return embs \ No newline at end of file From a9e8f3b8b2e76831c7c460fde426479b95b2c769 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 23:35:46 -0500 Subject: [PATCH 024/270] add vocab_size to encodec --- benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 81cbd0fb2..255914c86 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -36,6 +36,7 @@ dataloader_opts: bandwidth: 1.5 num_codebooks: 2 +vocab_size: 1024 sample_rate: 24000 save_embedding: False From 4237bacf10713ffa5281447909fb6e6a51230af7 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 23:48:45 -0500 Subject: [PATCH 025/270] fix bug --- benchmarks/DASB/model/tokenizer_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index a4d3ae111..91dea8042 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -99,7 +99,7 @@ def tokens_to_sig(self, tokens, **kwargs): def get_pretrained_embeddings( self, vocab_size=None, num_codebooks=None , **kwargs ): - toks = torch.arange(vocab_size) + toks = torch.arange(vocab_size).to(next(self.parameters()).device) toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() self.eval() z_q, z_p, _ = self.quantizer.from_codes(toks) From 867228ebcb1ca586944315c91ecea312650cf7ac Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 00:24:15 -0500 Subject: [PATCH 026/270] fix embedding loading for train.py --- benchmarks/DASB/LibriSpeech/ASR/train.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index 746a068e1..b6a9f712e 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -361,6 +361,12 @@ def text_pipeline(wrd): embs = tokens_loader.load_pretrained_embeddings( 
hparams["pretain_embeddings_folder"] ) + if isinstance(hparams['num_codebooks'], int): + embs= embs[:hparams['num_codebooks']*hparams['vocab_size'],] + elif isinstance(hparams['num_codebooks'], list): + indices = [i for codebook_idx in hparams['num_codebooks'] for i in range(codebook_idx * hparams['vocab_size'], (codebook_idx + 1) * hparams['vocab_size'])] + indices = torch.tensor(indices, dtype=torch.long) + embs = embs[indices] hparams["discrete_embedding_layer"].init_embedding(embs) # Log number of parameters/buffers From 3570b636309becb4318371f546deefece4599d6d Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 10:45:25 -0500 Subject: [PATCH 027/270] fix precommit --- .../LibriSpeech/ASR/hparams/LSTM/train.yaml | 2 +- benchmarks/DASB/LibriSpeech/ASR/train.py | 32 ++++++---- .../DASB/LibriSpeech/extraction/extract.py | 3 - .../LibriSpeech/extraction/hparams/dac.yaml | 16 ++--- .../extraction/hparams/discrete_ssl.yaml | 64 +++++++++---------- .../extraction/hparams/encodec.yaml | 22 +++---- .../extraction/hparams/speech_tokenizer.yaml | 12 ++-- benchmarks/DASB/extra_requirements.txt | 2 +- benchmarks/DASB/model/custom_model.py | 6 +- benchmarks/DASB/model/tokenizer_interface.py | 25 +++----- benchmarks/DASB/utils/tokens.py | 17 ++++- 11 files changed, 106 insertions(+), 95 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 89d347862..0f807c937 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -132,7 +132,7 @@ prune_history: False ############################## models ################################ tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer num_codebooks: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index b6a9f712e..d7b86f659 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -175,8 +175,7 @@ def dataio_prepare(hparams, tokenizer): data_folder = hparams["data_folder"] train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], - replacements={"data_root": data_folder}, + csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, ) if hparams["sorting"] == "ascending": @@ -201,8 +200,7 @@ def dataio_prepare(hparams, tokenizer): ) valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], - replacements={"data_root": data_folder}, + csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, ) valid_data = valid_data.filtered_sorted(sort_key="duration") @@ -238,8 +236,7 @@ def audio_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) info = torchaudio.info(wav) resampled = torchaudio.transforms.Resample( - info.sample_rate, - hparams["sample_rate"], + info.sample_rate, hparams["sample_rate"], )(sig) # resampled = resampled.unsqueeze(0) return resampled @@ -264,8 +261,7 @@ def text_pipeline(wrd): # 4. Set output: sb.dataio.dataset.set_output_keys( - datasets, - ["id", "sig", "wrd", "char_list", "tokens", "speech_tokens"], + datasets, ["id", "sig", "wrd", "char_list", "tokens", "speech_tokens"], ) # 5. If Dynamic Batching is used, we instantiate the needed samplers. 
@@ -361,10 +357,19 @@ def text_pipeline(wrd): embs = tokens_loader.load_pretrained_embeddings( hparams["pretain_embeddings_folder"] ) - if isinstance(hparams['num_codebooks'], int): - embs= embs[:hparams['num_codebooks']*hparams['vocab_size'],] - elif isinstance(hparams['num_codebooks'], list): - indices = [i for codebook_idx in hparams['num_codebooks'] for i in range(codebook_idx * hparams['vocab_size'], (codebook_idx + 1) * hparams['vocab_size'])] + if isinstance(hparams["num_codebooks"], int): + embs = embs[ + : hparams["num_codebooks"] * hparams["vocab_size"], + ] + elif isinstance(hparams["num_codebooks"], list): + indices = [ + i + for codebook_idx in hparams["num_codebooks"] + for i in range( + codebook_idx * hparams["vocab_size"], + (codebook_idx + 1) * hparams["vocab_size"], + ) + ] indices = torch.tensor(indices, dtype=torch.long) embs = embs[indices] hparams["discrete_embedding_layer"].init_embedding(embs) @@ -401,8 +406,7 @@ def text_pipeline(wrd): from speechbrain.decoders.ctc import CTCBeamSearcher test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], - vocab_list=vocab_list, + **hparams["test_beam_search"], vocab_list=vocab_list, ) train_dataloader_opts = hparams["train_dataloader_opts"] diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 7310a1469..3979ba731 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -7,8 +7,6 @@ import os import sys -import torch -import torchaudio import logging import pathlib as pl import speechbrain as sb @@ -95,5 +93,4 @@ (save_folder / "embeddings").as_posix(), vocab_size=hparams["vocab_size"], num_codebooks=hparams["num_codebooks"], - ) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml index c380f0478..13356cf63 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -30,9 +30,9 @@ id_key: id # Dataloader options dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref + batch_size: !ref + shuffle: True + num_workers: !ref ####################### Model parameters ########################### # Tokenizer parameters @@ -52,14 +52,14 @@ encoder_dim: 1024 save_embedding: False tokenizer: !new:model.tokenizer_interface.DACTokenizer - model_type: !ref - model_bitrate: !ref - load_pretrained: True - tag: latest + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref sample_rate: !ref src_key: !ref id_key: !ref - dataloader_opts: !ref \ No newline at end of file + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 6d38e285c..847038dd2 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -30,9 +30,9 @@ id_key: id # Dataloader options dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref + batch_size: !ref + shuffle: True + num_workers: !ref ### Configuration for discrete SSL model # ssl_model_type: hubert, wavlm, wav2vec2 @@ -61,40 +61,40 @@ sample_rate: 16000 encoder_dim: 1024 ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: 
!new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref sample_rate: !ref src_key: !ref id_key: !ref - dataloader_opts: !ref \ No newline at end of file + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 255914c86..3cd3b691a 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -30,9 +30,9 @@ id_key: id # Dataloader options dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref + batch_size: !ref + shuffle: True + num_workers: !ref bandwidth: 1.5 num_codebooks: 2 @@ -42,17 +42,17 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:model.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref sample_rate: !ref src_key: !ref id_key: !ref - dataloader_opts: !ref \ No newline at end of file + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 176768d5e..7726422f3 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -30,9 +30,9 @@ id_key: id # Dataloader options dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref + batch_size: !ref + shuffle: True + num_workers: !ref vocab_size: 1024 
num_codebooks: 2 @@ -43,12 +43,12 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:model.tokenizer_interface.SpeechTokenizer - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref sample_rate: !ref src_key: !ref id_key: !ref - dataloader_opts: !ref \ No newline at end of file + dataloader_opts: !ref diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index db9ae4376..e04ccf781 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -1,5 +1,6 @@ beartype jsonlines +kaldiio librosa>=0.9.2 onnxruntime>=1.16.3 scikit-learn @@ -8,4 +9,3 @@ speechtokenizer>=0.1.2 tensorboard tgt unidecode -kaldiio diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 01ff586df..972d35c66 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -63,7 +63,11 @@ def __init__( ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size - self.num_codebooks = len(num_codebooks) if isinstance(num_codebooks, list) else num_codebooks + self.num_codebooks = ( + len(num_codebooks) + if isinstance(num_codebooks, list) + else num_codebooks + ) self.freeze = freeze self.embedding = torch.nn.Embedding( self.num_codebooks * vocab_size, emb_dim diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 91dea8042..f63ddd6aa 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -38,9 +38,7 @@ def tokens_to_sig(self, tokens, **kwargs): @abstractmethod @torch.no_grad() - def get_pretrained_embeddings( - self, vocab_size, num_codebooks, **kwargs - ): + def get_pretrained_embeddings(self, vocab_size, num_codebooks, **kwargs): """Get codebook embeddings.""" pass @@ -97,7 +95,7 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size=None, num_codebooks=None , **kwargs + self, vocab_size=None, num_codebooks=None, **kwargs ): toks = torch.arange(vocab_size).to(next(self.parameters()).device) toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() @@ -136,7 +134,7 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size=None, num_codebooks=None , **kwargs + self, vocab_size=None, num_codebooks=None, **kwargs ): toks = torch.arange(vocab_size).to(next(self.parameters()).device) toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() @@ -154,9 +152,11 @@ def __init__(self, *args, **kwargs): BaseTokenizer.__init__(self) @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None,**kwargs): + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): self.eval() - tokens, _, _ = self.encode(signal, lengths, SSL_layers=num_codebooks,**kwargs) + tokens, _, _ = self.encode( + signal, lengths, SSL_layers=num_codebooks, **kwargs + ) return tokens @torch.no_grad() @@ -170,15 +170,10 @@ def get_pretrained_embeddings( ): embs = [] for layer_num, vocabulary in zip( - self.ssl_layer_ids, - self.vocabularies, + self.ssl_layer_ids, self.vocabularies, ): if layer_num not in num_codebooks: continue - embs.append( - 
torch.as_tensor( - vocabulary, dtype=torch.float32 - ) - ) + embs.append(torch.as_tensor(vocabulary, dtype=torch.float32)) embs = torch.cat(embs) - return embs \ No newline at end of file + return embs diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 930b10253..03ea5049c 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -197,7 +197,13 @@ def audio_pipeline(wav): return [audio_pipeline] - def save_pretrained_embeddings(self, save_path, save_name="embeddings",vocab_size=None,num_codebooks=None): + def save_pretrained_embeddings( + self, + save_path, + save_name="embeddings", + vocab_size=None, + num_codebooks=None, + ): """ Saves the pretrained embeddings of the tokenizer to a specified directory. @@ -216,7 +222,9 @@ def save_pretrained_embeddings(self, save_path, save_name="embeddings",vocab_siz save_path = pl.Path(save_path).absolute() save_path.mkdir(parents=True, exist_ok=True) - embeddings = self.tokenizer.get_pretrained_embeddings(vocab_size,num_codebooks) + embeddings = self.tokenizer.get_pretrained_embeddings( + vocab_size, num_codebooks + ) embeddings = embeddings.cpu().numpy() np.save(save_path / save_name, embeddings) @@ -335,7 +343,10 @@ def tokens_by_uttid(self, utt_id, num_codebooks=None): ) tokens = tokens[:, :num_codebooks] elif isinstance(num_codebooks, list): - if not all(isinstance(idx, int) and 0 <= idx < tokens.size(-1) for idx in num_codebooks): + if not all( + isinstance(idx, int) and 0 <= idx < tokens.size(-1) + for idx in num_codebooks + ): raise ValueError( f"Invalid indices in num_codebooks list: {num_codebooks}. " f"All indices must be integers within the range [0, {tokens.size(-1) - 1}]." From 3ef996451bd4c189bb2b9d0032068f89824a58c1 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 10:51:32 -0500 Subject: [PATCH 028/270] move tokenizer_interface to util --- .../DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml | 2 +- .../DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml | 2 +- .../ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml | 2 +- .../DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml | 2 +- .../LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml | 2 +- .../ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml | 2 +- benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml | 2 +- .../DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml | 2 +- benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml | 2 +- .../DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml | 2 +- benchmarks/DASB/{model => utils}/tokenizer_interface.py | 0 11 files changed, 10 insertions(+), 10 deletions(-) rename benchmarks/DASB/{model => utils}/tokenizer_interface.py (100%) diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml index 806305774..ff1749fab 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml @@ -135,7 +135,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.DACTokenizer +tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref model_bitrate: !ref load_pretrained: True diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml 
b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml index 18d967244..dd4f62bf4 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml @@ -132,7 +132,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio save_path: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml index 99d423b87..bb0b32a43 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml @@ -127,7 +127,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml index aa7d2e141..b60b32604 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml @@ -131,7 +131,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.DACTokenizer +tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref model_bitrate: !ref load_pretrained: True diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml index a1b5262d3..7c0dcfc45 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml @@ -125,7 +125,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio save_path: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml index c12d6f79f..3dcd7eea7 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml @@ -121,7 +121,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see 
https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml index 13356cf63..3f3d7e92f 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -51,7 +51,7 @@ sample_rate: 24000 encoder_dim: 1024 save_embedding: False -tokenizer: !new:model.tokenizer_interface.DACTokenizer +tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref model_bitrate: !ref load_pretrained: True diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 847038dd2..12b738bfd 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -85,7 +85,7 @@ ssl_model: !apply:speechbrain.utils.hparams.choice output_all_hiddens: True save_path: !ref -tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer save_path: !ref ssl_model: !ref vocoder_repo_id: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 3cd3b691a..1e226c45b 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -41,7 +41,7 @@ sample_rate: 24000 save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio save_path: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 7726422f3..acd292a19 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -42,7 +42,7 @@ freeze_embedding: False save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py similarity index 100% rename from benchmarks/DASB/model/tokenizer_interface.py rename to benchmarks/DASB/utils/tokenizer_interface.py From ca05ac6189d9c77ee6c3272328c235e4f5b42c39 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 11:13:24 -0500 Subject: [PATCH 029/270] update extract doc and comments and set to highest bitrate --- .../LibriSpeech/extraction/hparams/dac.yaml | 2 +- .../extraction/hparams/discrete_ssl.yaml | 18 ++++++++++-------- .../extraction/hparams/encodec.yaml | 9 +++++++-- .../extraction/hparams/speech_tokenizer.yaml | 2 +- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git 
a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml index 3f3d7e92f..d2d935ed0 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -45,7 +45,7 @@ dataloader_opts: model_type: 24khz vocab_size: 1024 model_bitrate: 8kbps -num_codebooks: 2 +num_codebooks: 32 sample_rate: 24000 # Feature parameters encoder_dim: 1024 diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 12b738bfd..7d4938625 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -35,9 +35,15 @@ dataloader_opts: num_workers: !ref ### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + # ssl_model_type: hubert, wavlm, wav2vec2 # ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: wavlm +ssl_model_type: WavLM ssl_hub: microsoft/wavlm-large ssl_folder: !ref /ssl_checkpoint kmeans_cache_dir: !ref /kmeans_checkpoint @@ -50,10 +56,6 @@ save_embedding: False ### Config for Tokenizer # Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] num_codebooks: [1, 3, 7, 12, 18, 23] deduplicate: [False, False, False, False, False, False] bpe_tokenizer_path: [null, null, null, null, null, null] @@ -63,21 +65,21 @@ encoder_dim: 1024 ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM source: !ref output_norm: False freeze: !ref freeze_feature_extractor: !ref output_all_hiddens: True save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT source: !ref output_norm: False freeze: !ref freeze_feature_extractor: !ref output_all_hiddens: True save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 source: !ref output_norm: False freeze: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 1e226c45b..ee0a7e910 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -34,8 +34,13 @@ dataloader_opts: shuffle: True num_workers: !ref -bandwidth: 1.5 -num_codebooks: 2 +# EnCodec parameters +# 
sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 vocab_size: 1024 sample_rate: 24000 save_embedding: False diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index acd292a19..5d897a782 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -35,7 +35,7 @@ dataloader_opts: num_workers: !ref vocab_size: 1024 -num_codebooks: 2 +num_codebooks: 8 sample_rate: 16000 encoder_dim: 1024 freeze_embedding: False From a08891eb9647e14ede66844c46cd8c0231b6363d Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 13:52:11 -0500 Subject: [PATCH 030/270] add run_script.sh --- .../LibriSpeech/ASR/hparams/LSTM/train.yaml | 32 ++- .../ASR/hparams/contextnet/train.yaml | 62 ++---- benchmarks/DASB/LibriSpeech/ASR/train.py | 3 + benchmarks/DASB/run_experiment.sh | 203 ++++++++++++++++++ benchmarks/DASB/utils/aggregate_results.py | 147 +++++++++++++ 5 files changed, 389 insertions(+), 58 deletions(-) create mode 100644 benchmarks/DASB/run_experiment.sh create mode 100644 benchmarks/DASB/utils/aggregate_results.py diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 0f807c937..69e74ca54 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -5,7 +5,9 @@ # Decoder: CTC beam searcher and greedy searcher # Tokens: character # Training: Librispeech 960h -# Authors: Pooneh Mousavi 2024 +# Authors: +# - Pooneh Mousavi 2024 +# - Jarod Duret 2024 # ############################################################################ # Seed needs to be set at top of yaml, before objects with parameters are made @@ -20,6 +22,7 @@ train_log: !ref /train_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache # If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES # then data_folder_rirs should be /localscratch/xxx_corpus # otherwise the dataset will automatically be downloaded @@ -28,15 +31,14 @@ train_splits: ["train-clean-100"] #["train-clean-100", "train-clean-360", "train dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /dev-clean.csv - - !ref /test-clean.csv - -tokens_folder: !PLACEHOLDER -pretain_embeddings_folder: !PLACEHOLDER # Optional + - !ref /dev-clean.csv + - !ref /test-clean.csv +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved. 
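Because tokens_folder and pretrain_embeddings_folder above point at artifacts produced by the separate extraction recipe, a quick shape check of the saved embedding file can catch a vocab_size / num_codebooks / encoder_dim mismatch before launching a run. The sketch below is illustrative only: it assumes the default "embeddings" save name used by the extraction utilities (np.save, hence an .npy file) and the flattened one-block-per-codebook layout, and the path is a placeholder.

import numpy as np

vocab_size = 1024
num_codebooks = 2
encoder_dim = 128  # e.g. Encodec embeddings when pretrain_embeddings is True

embs = np.load("path/to/embeddings/embeddings.npy")
# Whole codebook blocks, at least as many as requested for training.
assert embs.shape[0] % vocab_size == 0, embs.shape
assert embs.shape[0] >= num_codebooks * vocab_size, embs.shape
assert embs.shape[1] == encoder_dim, embs.shape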
####################### Training Parameters #################################### number_of_epochs: 20 @@ -94,21 +96,17 @@ test_dataloader_opts: ####################### Model parameters ########################### # Tokenizer parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] +# These parameters should be set according to the tokenizer used to extract tokens saved in . vocab_size: 1024 -# bandwidth: 1.5 num_codebooks: 2 sample_rate: 24000 + # Feature parameters encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. +# If set to True, encoder_dim should match the dimension of the tokenizer. For Encodec, it is 128. pretrain_embeddings: False freeze_embedding: False - # LSTM activation: !name:torch.nn.Sigmoid dnn_layers: 2 @@ -191,10 +189,6 @@ scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler improvement_threshold: 0.0025 annealing_factor: 0.8 patient: 0 -# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler -# lr_initial: !ref -# n_warmup_steps: 7500 -# n_keep_steps: 36000 model_opt_class: !name:torch.optim.AdamW lr: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml index c28fdead0..dcedc415d 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml @@ -5,13 +5,16 @@ # Decoder: CTC beam searcher and greedy searcher # Tokens: character # Training: Librispeech 960h -# Authors: Pooneh Mousavi 2024 +# Authors: +# - Pooneh Mousavi 2024 +# - Jarod Duret 2024 # ############################################################################ # Seed needs to be set at top of yaml, before objects with parameters are made seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/enocdec/LSTM/ +run_name: !PLACEHOLDER +output_folder: !ref results/LSTM// output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt @@ -19,24 +22,27 @@ train_log: !ref /train_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache # If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES # then data_folder_rirs should be /localscratch/xxx_corpus # otherwise the dataset will automatically be downloaded # data_folder_rirs: !ref -train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +train_splits: ["train-clean-100"] #["train-clean-100", "train-clean-360", "train-other-500"] dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /dev-clean.csv - - !ref /test-clean.csv + - !ref /dev-clean.csv + - !ref /test-clean.csv +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved. 
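The Discrete_EmbeddingLayer built by these recipes allocates a single table of num_codebooks * vocab_size embeddings, so each codebook presumably owns its own offset range of ids inside that table; the per-codebook vectors can then be weighted and merged (likely the role of the AttentionMLP defined alongside it) before reaching the encoder. A toy illustration of that indexing, under that assumption rather than the layer's actual forward code:

import torch

vocab_size, num_codebooks, emb_dim = 5, 3, 4  # toy sizes for illustration
emb = torch.nn.Embedding(num_codebooks * vocab_size, emb_dim)

# tokens: [batch, time, num_codebooks], each id in [0, vocab_size)
tokens = torch.randint(0, vocab_size, (2, 7, num_codebooks))
offsets = torch.arange(num_codebooks) * vocab_size  # one offset per codebook
flat_ids = tokens + offsets  # broadcasts over [batch, time, num_codebooks]
feats = emb(flat_ids)  # [batch, time, num_codebooks, emb_dim]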
####################### Training Parameters #################################### number_of_epochs: 20 -batch_size: 4 # This works for 2x GPUs with 32GB +batch_size: 4 test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -53,10 +59,6 @@ weight_decay: 0.0005 # Training parameters -# To make Transformers converge, the global bath size should be large enough. -# The global batch size is max_batch_len * n_gpus * gradient_accumulation. -# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. -# Please, set your parameters accordingly. dynamic_batching: True max_batch_length_train: 850 max_batch_len_val: 100 @@ -94,21 +96,17 @@ test_dataloader_opts: ####################### Model parameters ########################### # Tokenizer parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] +# These parameters should be set according to the tokenizer used to extract tokens saved in . vocab_size: 1024 -# bandwidth: 1.5 num_codebooks: 2 sample_rate: 24000 + # Feature parameters encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. +# If set to True, encoder_dim should match the dimension of the tokenizer. For Encodec, it is 128. pretrain_embeddings: False freeze_embedding: False - # LSTM activation: !name:torch.nn.Sigmoid dnn_layers: 2 @@ -131,15 +129,8 @@ token_prune_min_logp: -1.2 prune_history: False ############################## models ################################ -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -# tokenizer: !new:model.tokenizer_interface.EncodecTokenizer -# source: facebook/encodec_24khz # Only the 24kHz version supports mono audio -# save_path: !ref -# sample_rate: !ref -# bandwidth: !ref -# flat_embeddings: False -# freeze: True -# renorm_embeddings: False +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer num_codebooks: !ref @@ -153,15 +144,12 @@ attention_mlp: !new:model.custom_model.AttentionMLP input_dim: !ref hidden_dim: !ref -encoder: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: !ref - bidirectional: True - dropout: !ref - hidden_size: !ref +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 + input_size: 640 n_neurons: !ref modules: @@ -198,10 +186,6 @@ scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler improvement_threshold: 0.0025 annealing_factor: 0.8 patient: 0 -# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler -# lr_initial: !ref -# n_warmup_steps: 7500 -# n_keep_steps: 36000 model_opt_class: !name:torch.optim.AdamW lr: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index d7b86f659..2758eb0eb 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -6,6 +6,7 @@ Authors * Pooneh Mousavi 2024 + * Jarod Duret 2024 """ import os @@ -361,6 +362,8 @@ def text_pipeline(wrd): embs = embs[ : hparams["num_codebooks"] * hparams["vocab_size"], ] + # For discrete SSL, num_codebooks is a list used to determine which layers to use. 
+ # It is not sequential and can be, for example, [0, 1] or [1, 4]. elif isinstance(hparams["num_codebooks"], list): indices = [ i diff --git a/benchmarks/DASB/run_experiment.sh b/benchmarks/DASB/run_experiment.sh new file mode 100644 index 000000000..35a3ba4bc --- /dev/null +++ b/benchmarks/DASB/run_experiment.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +########################################################### +# Script to run downstream evaluation training, optionally with multiple seeds. +# This script loops over seeds and trains different models. +# At the end, the final performance is computed with the aggregate_results.py script that provides the average performance. +# +# Usage: +# ./run_experiments.sh --hparams=hparams/MotorImagery/BNCI2014001/EEGNet.yaml --data_folder=eeg_data \ +# --output_folder=results/MotorImagery/BNCI2014001/EEGNet --nsbj=9 --nsess=2 --seed=1986 --nruns=2 --number_of_epochs=10 +# +# Authors: +# - Pooneh Mousavi (2024) +########################################################### + +# Initialize variables +hparams="" +data_folder="" +cached_data_folder="" +output_folder="" +task="" +dataset="" +seed="" +nruns="" +eval_metric="acc" +eval_set="test" +rnd_dir=False +additional_flags="" + + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --hparams hparams_path Hparam YAML file" + echo " --data_folder data_folder_path Data folder path" + echo " --cached_data_folder cache_path Cached data folder path" + echo " --output_folder output_path Output folder path" + echo " --task task downstream task" + echo " --dataset dataset dataset" + echo " --seed random_seed Seed (random if not specified)" + echo " --nruns num_runs Number of runs" + echo " --eval_metric metric Evaluation metric (e.g., acc or WER)" + echo " --eval_set dev or test Evaluation set. Default: test" + echo " --rnd_dir If True the results are stored in a subdir of the output folder with a random name (useful to store all the results of an hparam tuning). Default: False" + exit 1 +} + + +# Parse command line +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --hparams) + hparams="$2" + shift + shift + ;; + + --data_folder) + data_folder="$2" + shift + shift + ;; + + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --task) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + --eval_set) + eval_set="$2" + shift + shift + ;; + + --rnd_dir) + rnd_dir="$2" + shift + shift + ;; + + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$hparams" ] ||[ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." 
+ print_argument_descriptions +fi + +# Manage Seed (optional argument) +seed="${seed:-$RANDOM}" + + +if [ "$rnd_dir" = True ]; then + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" +fi + +# Make sure the output_folder is created +mkdir -p $output_folder + +# Print command line arguments and save to file +{ + echo "hparams: $hparams" + echo "data_folder: $data_folder" + echo "cached_data_folder: $cached_data_folder" + echo "output_folder: $output_folder" + echo "task: $task" + echo "dataset: $dataset" + echo "seed: $seed" + echo "nruns: $nruns" + echo "eval_metric: $eval_metric" + echo "eval_set: $eval_set" + echo "rnd_dir: $rnd_dir" + echo "additional flags: $additional_flags" +} | tee "$output_folder/flags.txt" + + +# Creating output folder +mkdir -p $output_folder +mkdir -p $data_folder +mkdir -p $cached_data_folder + +# Function to run the training experiment +run_experiment() { + +python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ +$additional_flags + +} + +# Run multiple training experiments (with different seeds) +for i in $(seq 0 1 $(( nruns - 1 ))); do + ((run_idx = i + 1)) + run_name=run"$run_idx" + output_folder_exp="$output_folder"/"$run_name"/$seed + + run_experiment $output_folder_exp + + + # Changing Random seed + seed=$((seed+1)) +done + + +echo 'Final Results (Performance Aggregation)' +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt \ No newline at end of file diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py new file mode 100644 index 000000000..be30bdb85 --- /dev/null +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -0,0 +1,147 @@ +#!/usr/bin/python +""" +Snippet to aggregate the results over multiple runs of the same experiment. +This is useful when we run multiple experiments with different seeds and we +want to compute the average performance. The script also reports the final +metric to Orion (when needed for hyperparameter tuning). + +The script searches for the result files (_results.txt) and computes the mean +and the standard deviation of the given evaluation metrics (e.g., acc or f1). +The results must have an identical format (with only different performance +numbers). + +To run this script: + + > python aggregate_results.py your_result_folder acc + +Author +------ +Pooneh Mousavi 2024 +""" + +import sys +import re +import numpy as np +from orion.client import report_objective +from speechbrain.utils.data_utils import get_all_files + + +def get_prototype(res_file, eval_metric): + """Parses a result file and adds a placeholder where the aggregated metrics + should be printed. It also returns the number of detected metrics. + + Arguments + --------- + res_file: path + Path of the result file to parse. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + prototype: list + List of the lines of the result file (with as placeholder). + n_metrics: int + Number of metrics to replace in the result files. 
+ """ + prototype = [] + n_metrics = 0 + + # Open the first res file and figure out where the metrics are + with open(res_file) as file_in: + for line in file_in: + if eval_metric in line: + line = line.split(eval_metric)[0] + # The placeholder for the metric is + line = line + eval_metric + " " + n_metrics = n_metrics + 1 + prototype.append(line) + return prototype, n_metrics + + +def get_metrics(res_files, eval_metric): + """Summarizes the metrics of interest in a matrix. + + Arguments + --------- + res_files: list + List of all the result files. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. + """ + + # Metric initialization + metrics = np.zeros([n_metrics, len(res_files)]) + + # Loop over files + for i in range(len(res_files)): + cnt = 0 + # Metric extraction + with open(res_files[i]) as file_in: + for line in file_in: + if eval_metric in line: + # Use regex to find the test WER value + match = re.search(rf'{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)', line) + if match: + value = match.group(1) + value = float(value) + metrics[cnt, i] = value + cnt = cnt + 1 + return metrics + + +def aggregate_metrics(prototype, metrics): + """Prints the aggregated metrics.It replaces the placeholders with + the corresponding metrics. + + Arguments + --------- + prototype: list + List of the lines of the result file (with as placeholder). + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. + """ + cnt = 0 + for line in prototype: + if eval_metric in line: + values_line = "[" + for i in range(len(res_files)): + values_line = values_line + "%f " % float(metrics[cnt, i]) + values_line = values_line[:-1] + values_line = values_line + "] avg: %f ± %f " % ( + float(metrics[cnt, :].mean()), + float(metrics[cnt, :].std()), + ) + line = line.replace("", values_line) + cnt = cnt + 1 + print(line) + + +if __name__ == "__main__": + output_folder = sys.argv[1] + eval_metric = sys.argv[2] + + # Getting the list of the result files in the output folder + res_files = get_all_files(output_folder, match_and=["train_log.txt"]) + + # Gettin a prototype file + prototype, n_metrics = get_prototype(res_files[0], eval_metric) + + # Extracting the metrics of interest + metrics = get_metrics(res_files, eval_metric) + + # print aggregated metrics + aggregate_metrics(prototype, metrics) + + final_metric = metrics[-1, :].mean() + + # Report final metric to Orion + # Remember: orion expects metrics to be minimized! 
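+    # (Example: with eval_metric "acc" and an average accuracy of 0.93, the
+    # reported objective is 1 - 0.93 = 0.07, so minimizing the objective
+    # maximizes accuracy; error metrics such as WER are reported unchanged.)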
+ if eval_metric == "acc" or eval_metric == "f1": + final_metric = 1 - final_metric + report_objective(final_metric) \ No newline at end of file From d41c6e4a51591bd2cd57d9a8a0fd5f11e0756ddb Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 18:28:34 -0500 Subject: [PATCH 031/270] fix run_experiments.sh bug --- .../DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml | 2 +- .../LibriSpeech/ASR/hparams/contextnet/train.yaml | 2 +- benchmarks/DASB/LibriSpeech/ASR/train.py | 4 ++-- benchmarks/DASB/extra_requirements.txt | 1 + .../DASB/{run_experiment.sh => run_experiments.sh} | 11 ++++++----- benchmarks/DASB/utils/aggregate_results.py | 8 +++++--- 6 files changed, 16 insertions(+), 12 deletions(-) rename benchmarks/DASB/{run_experiment.sh => run_experiments.sh} (93%) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 69e74ca54..eb0d98d4b 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -5,7 +5,7 @@ # Decoder: CTC beam searcher and greedy searcher # Tokens: character # Training: Librispeech 960h -# Authors: +# Authors: # - Pooneh Mousavi 2024 # - Jarod Duret 2024 # ############################################################################ diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml index dcedc415d..aaca2668d 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml @@ -5,7 +5,7 @@ # Decoder: CTC beam searcher and greedy searcher # Tokens: character # Training: Librispeech 960h -# Authors: +# Authors: # - Pooneh Mousavi 2024 # - Jarod Duret 2024 # ############################################################################ diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index 2758eb0eb..19aa43786 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -324,7 +324,7 @@ def text_pipeline(wrd): "tr_splits": hparams["train_splits"], "dev_splits": hparams["dev_splits"], "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], + "save_folder": hparams["cached_data_folder"], "merge_lst": hparams["train_splits"], "merge_name": "train.csv", "skip_prep": hparams["skip_prep"], @@ -333,7 +333,7 @@ def text_pipeline(wrd): # Defining tokenizer and loading it tokenizer = SentencePiece( - model_dir=hparams["save_folder"], + model_dir=hparams["cached_data_folder"], vocab_size=hparams["output_neurons"], annotation_train=hparams["train_csv"], annotation_read="wrd", diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index e04ccf781..1068c2b2a 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -3,6 +3,7 @@ jsonlines kaldiio librosa>=0.9.2 onnxruntime>=1.16.3 +orion scikit-learn speechbrain>=1.0.0 speechtokenizer>=0.1.2 diff --git a/benchmarks/DASB/run_experiment.sh b/benchmarks/DASB/run_experiments.sh similarity index 93% rename from benchmarks/DASB/run_experiment.sh rename to benchmarks/DASB/run_experiments.sh index 35a3ba4bc..e0f848aef 100644 --- a/benchmarks/DASB/run_experiment.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -6,8 +6,9 @@ # At the end, the final performance is computed with the aggregate_results.py script that provides the average 
performance. # # Usage: -# ./run_experiments.sh --hparams=hparams/MotorImagery/BNCI2014001/EEGNet.yaml --data_folder=eeg_data \ -# --output_folder=results/MotorImagery/BNCI2014001/EEGNet --nsbj=9 --nsess=2 --seed=1986 --nruns=2 --number_of_epochs=10 +# ./run_experiments.sh --hparams benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml --data_folder LibriSpeech --cached_data_folder cache/ \ +# --output_folder results/LibriSpeech/ASR/encodec/LSTM --task ASR --dataset LibriSpeech --seed 1986 --nruns 2 --eval_metric WER --tokens_folder LibriSpeech/extraction-emb/speech_tokenizer/save/librispeech/ + # # Authors: # - Pooneh Mousavi (2024) @@ -75,13 +76,13 @@ while [[ $# -gt 0 ]]; do shift shift ;; - + --task) task="$2" shift shift ;; - + --dataset) dataset="$2" shift @@ -181,7 +182,7 @@ mkdir -p $cached_data_folder run_experiment() { python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ -$additional_flags +$additional_flags } diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index be30bdb85..0df315b7e 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -86,7 +86,9 @@ def get_metrics(res_files, eval_metric): for line in file_in: if eval_metric in line: # Use regex to find the test WER value - match = re.search(rf'{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)', line) + match = re.search( + rf"{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)", line + ) if match: value = match.group(1) value = float(value) @@ -125,7 +127,7 @@ def aggregate_metrics(prototype, metrics): if __name__ == "__main__": output_folder = sys.argv[1] eval_metric = sys.argv[2] - + # Getting the list of the result files in the output folder res_files = get_all_files(output_folder, match_and=["train_log.txt"]) @@ -144,4 +146,4 @@ def aggregate_metrics(prototype, metrics): # Remember: orion expects metrics to be minimized! if eval_metric == "acc" or eval_metric == "f1": final_metric = 1 - final_metric - report_objective(final_metric) \ No newline at end of file + report_objective(final_metric) From 04ea1e62a1b74310466891a3e5f8e0fd950364ea Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 18:48:32 -0500 Subject: [PATCH 032/270] add bash script for token extraction --- .../DASB/run_discriminative_benchmark.sh | 36 ------ benchmarks/DASB/run_extraction.sh | 114 ++++++++++++++++++ benchmarks/DASB/run_generative_benchmark.sh | 67 ---------- 3 files changed, 114 insertions(+), 103 deletions(-) delete mode 100644 benchmarks/DASB/run_discriminative_benchmark.sh create mode 100644 benchmarks/DASB/run_extraction.sh delete mode 100644 benchmarks/DASB/run_generative_benchmark.sh diff --git a/benchmarks/DASB/run_discriminative_benchmark.sh b/benchmarks/DASB/run_discriminative_benchmark.sh deleted file mode 100644 index 79383deb2..000000000 --- a/benchmarks/DASB/run_discriminative_benchmark.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Please consult the README.md file for instructions on how to run the benchmark. 
- -tokenizer_name=$1 -if [[ "$tokenizer_name" == "" ]]; then - echo "Usage: run_generative_benchmark.sh " - exit 1 -fi - -output_folder='/path/to/output' -declare -a DatasetsFolders=('path/to/LibriSpeech' 'path/to/CommonVoice' 'path/to/IEMOCAP' 'path/to/SLURP' 'path/to/Google-speech-commands' 'path/to/VoiceCeleb1') -declare -a ConsideredTasks=('LibriSpeech/ASR' 'CommonVoice/ASR' 'IEMOCAP/emotion_recognition' 'SLURP/intent_classification' 'Google-speech-commands/keyword-spotting' 'VoiceCeleb1/speaker_ver') -declare -a DownStreams=('LSTM' 'LSTM' 'ecapa_tdnn' 'LSTM_linear' 'Xvector','Xvector') -declare -a Locales=('cy' 'eu') -declare -a LocalesVobSize=(100 200) - -shift -script_args="$@" - -for i in "${!ConsideredTasks[@]}"; do - task=${ConsideredTasks[i]} - downstream=${DownStreams[i]} - dataset_folder=${DatasetsFolders[i]} - recipe_extra_args="$script_args" - set -- "$recipe_extra_args" - if [[ "$task" == "CommonVoice/ASR" ]]; then - echo "${tokenizer_name}/${task}/${downstream}" - for j in "${!Locales[@]}"; do - locale=${Locales[j]} - vocab=${LocalesVobSize[j]} - python $task/$downstream/train_$tokenizer_name.py $task/$downstream/hparams/train_$tokenizer_name.yaml --output_folder $output_folder/$tokenizer_name/$task/$downstream/$locale --data_folder $dataset_folder/$locale --language $locale --output_neurons $vocab $@ - done - else - python $task/$downstream/train_$tokenizer_name.py $task/$downstream/hparams/train_$tokenizer_name.yaml --output_folder $output_folder/$tokenizer_name/$task/$downstream --data_folder $dataset_folder $@ - fi -done diff --git a/benchmarks/DASB/run_extraction.sh b/benchmarks/DASB/run_extraction.sh new file mode 100644 index 000000000..2d419bac5 --- /dev/null +++ b/benchmarks/DASB/run_extraction.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +########################################################### +# Script to extracts and save tokens from dataset. +# +# Usage: +# ./ $run_extraction.sh --data_folder LibriSpeech --output_folder results/LibriSpeech/ASR/encodec/LSTM --tokenizer encidec --dataset LibriSpeech + +# Authors: +# - Pooneh Mousavi (2024) +########################################################### + +# Initialize variables +data_folder="" +output_folder="" +tokenizer="" +dataset="" +save_embedding=False +additional_flags="" + + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --data_folder data_folder_path Data folder path" + echo " --output_folder output_path Output folder path" + echo " --tokenizer tokenizer tokenizer" + echo " --dataset dataset dataset" + echo " --save_embedding save_embedding If True the the embedding are saved. Default: False" + exit 1 +} + + +# Parse command line +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --data_folder) + data_folder="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --tokenizer) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --save_embedding) + save_embedding="$2" + shift + shift + ;; + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$tokenizer" ] ||[ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$dataset" ]; then + echo "ERROR: Missing required arguments! 
Please provide all required options." + print_argument_descriptions +fi + + +# Make sure the output_folder is created +mkdir -p $output_folder + +# Print command line arguments and save to file +{ + echo "data_folder: $data_folder" + echo "output_folder: $output_folder" + echo "tokenizer: $tokenizer" + echo "dataset: $dataset" + echo "save_embedding: $save_embedding" + echo "additional flags: $additional_flags" +} | tee "$output_folder/flags.txt" + + +# Creating output folder +mkdir -p $output_folder +mkdir -p $data_folder + +python $dataset/extraction/extract.py $dataset/extraction/hparams/$tokenizer.yaml --data_folder=$data_folder --output_folder=$output_folder --save_embedding=$save_embedding \ +$additional_flags diff --git a/benchmarks/DASB/run_generative_benchmark.sh b/benchmarks/DASB/run_generative_benchmark.sh deleted file mode 100644 index d5dc0d1d4..000000000 --- a/benchmarks/DASB/run_generative_benchmark.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# Please consult the README.md file for instructions on how to run the benchmark. - -tokenizer_name=$1 -if [[ "$tokenizer_name" == "" ]]; then - echo "Usage: run_generative_benchmark.sh " - exit 1 -fi - -output_folder='path/to/output' -librimix_path='path/to/Libri2Mix' -voicebank_path='path/to/VoiceBank' -ljspeech_path='path/to/ljspeech' -utmos_path='path/to/utmos' -tts_args="--token_list_file_text %recipe_root%/hparams/char_en.txt --utmos_model_path $utmos_path" - -declare -a DatasetsFolders=(\ - "$librimix_path" \ - "$voicebank_path" \ - "$ljspeech_path" \ - "$ljspeech_path" \ -) -declare -a ConsideredTasks=(\ - 'Libri2Mix/separation' \ - 'VoiceBank/enhancement' \ - 'LJSpeech/TTS' \ - 'LJSpeech/TTS' \ -) -declare -a DownStreams=(\ - 'conformer' \ - 'conformer' \ - 'tokotron' \ - 'tokotron' \ -) -declare -a ExtraArgs=(\ - '' \ - '' \ - "$tts_args" \ - "$tts_args --enc_num_layers 3 --dec_num_layers 6" \ -) - -declare -a OutputSuffix=(\ - '' \ - '' \ - '' \ - '-small' -) - -shift -script_args="$@" - -for i in "${!ConsideredTasks[@]}"; do - task=${ConsideredTasks[i]} - downstream=${DownStreams[i]} - dataset_folder=${DatasetsFolders[i]} - extra_args=${ExtraArgs[i]} - suffix=${OutputSuffix[i]} - recipe_root="$task/$downstream" - recipe_extra_args="$script_args ${extra_args//%recipe_root%/$recipe_root}" - set -- "$recipe_extra_args" - echo "${tokenizer_name}/${task}/${downstream}" - python $task/$downstream/train_$tokenizer_name.py \ - $task/$downstream/hparams/train_$tokenizer_name.yaml \ - --output_folder $output_folder/$tokenizer_name/$task/$downstream$suffix \ - --data_folder $dataset_folder \ - $@ -done From 95333cf4c9ab0a19c5254840fb6a7d14505eefd7 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 18:56:21 -0500 Subject: [PATCH 033/270] fix bug --- benchmarks/DASB/run_extraction.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/run_extraction.sh b/benchmarks/DASB/run_extraction.sh index 2d419bac5..e121c35cb 100644 --- a/benchmarks/DASB/run_extraction.sh +++ b/benchmarks/DASB/run_extraction.sh @@ -49,13 +49,13 @@ while [[ $# -gt 0 ]]; do shift ;; - --tokenizer) - task="$2" + --tokenizer) + tokenizer="$2" shift shift ;; - --dataset) + --dataset) dataset="$2" shift shift From 096fc43c659122952e7be36257ac6a4d7f75ce39 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 13:48:17 -0500 Subject: [PATCH 034/270] add hyperparam tuning --- .../LibriSpeech/ASR/hparams/LSTM/train.yaml | 8 +- benchmarks/DASB/extra_requirements.txt | 1 + 
benchmarks/DASB/orion/hparams_tpe.yaml | 6 + benchmarks/DASB/run_hparam_optimization.sh | 422 ++++++++++++++++++ 4 files changed, 433 insertions(+), 4 deletions(-) create mode 100644 benchmarks/DASB/orion/hparams_tpe.yaml create mode 100644 benchmarks/DASB/run_hparam_optimization.sh diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index eb0d98d4b..98ba22d23 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -42,7 +42,8 @@ pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, th ####################### Training Parameters #################################### number_of_epochs: 20 -batch_size: 4 +batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)" +batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -54,7 +55,7 @@ valid_search_interval: 1 avg_checkpoints: 10 # Number of checkpoints to average for evaluation cache_size: 1.e+10 -lr_model: 0.001 +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" weight_decay: 0.0005 @@ -109,8 +110,7 @@ freeze_embedding: False # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 -dnn_neurons: 1024 +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 4,discrete=True)"dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index 1068c2b2a..e97e16b28 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -4,6 +4,7 @@ kaldiio librosa>=0.9.2 onnxruntime>=1.16.3 orion +orion[profet] scikit-learn speechbrain>=1.0.0 speechtokenizer>=0.1.2 diff --git a/benchmarks/DASB/orion/hparams_tpe.yaml b/benchmarks/DASB/orion/hparams_tpe.yaml new file mode 100644 index 000000000..cf2f6fd54 --- /dev/null +++ b/benchmarks/DASB/orion/hparams_tpe.yaml @@ -0,0 +1,6 @@ +experiment: + algorithms: + tpe: + seed: 1986 + n_initial_points: 20 + n_ei_candidates: 24 \ No newline at end of file diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh new file mode 100644 index 000000000..de5110b96 --- /dev/null +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -0,0 +1,422 @@ +#!/bin/bash + +########################################################### +# Hyperparameter Tuning Script for EEG Model with Orion +########################################################### + +# Description: +# This script facilitates hyperparameter tuning for a given EEG model and dataset using Orion. +# It supports leave-one-subject-out and/or leave-one-session-out training strategies. + +# Usage: +# ./run_hparam_optimization.sh --exp_name 'EEGNet_BNCI2014001_hopt' \ +# --output_folder results/MotorImagery/BNCI2014001/EEGNet/hopt \ +# --data_folder eeg_data/ \ +# --hparams hparams/MotorImagery/BNCI2014001/EEGNet.yaml \ +# --nruns 1 --nruns_eval 10 \ +# --eval_metric acc \ +# --exp_max_trials 50 \ +# --store_all True \ +# --device 'cpu' +# +# Optimization Steps: +# The script supports multiple hyperparameter optimization steps. +# We found it convenient to first optimize training and model hyperparameters, +# and then optimize data augmentation hyperparameters in a separate step. + +# Script Workflow: +# 1. Search for the orion flags in the specified hparam file. +# 2. Run the orion-hunt command for hyperparameter tuning. 
+# By default, TPE (Tree-structured Parzen Estimator) hyperparameter tuning is +# performed, as specified in the default orion config file at hparams/orion/hparams_tpe.yaml. +# 3. Save the best hyperparameters, which can be viewed using torch-info. +# 4. Loop until flags like @orion_step are found in the YAML file. +# +# Final Performance Evaluation: +# At the end of the optimization process, the script computes the final performance +# using the best hyperparameters on the test set. +# This is done by averaging over nruns_eval different seeds. +# +# Note: More detailed information can be found in the README.md file. + +# Authors: +# - Pooneh Mousavi 2024 +########################################################### + +# Initialize variables +exp_name="hopt" +output_folder="" +data_folder="" +cached_data_folder="" +task="" +dataset="" +hparams="" +nruns="" +nruns_eval=10 +eval_metric="acc" +seed=1986 +config_file="orion/hparams_tpe.yaml" +mne_dir="" +orion_db_address="" +orion_db_type="PickledDB" +exp_max_trials=50 +store_all=True +compress_exp=True + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --exp_name Name Name that Orion gives to the experiment" + echo " --output_folder output_path Output folder were the results will be stored" + echo " --data_folder data_path Folder were the data are stored. If not available, they will be downloaded there." + echo " --cached_data_folder path [Optional] Folder were the data in pkl format will be cached." + echo " --task task downstream task" + echo " --dataset dataset dataset" + echo " --hparms hparam_file YAML file containing the hparam to optimize. The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" + echo " --nruns num_runs Number of runs for each hparam selection." + echo " --nruns_eval num_runs Number of runs for the final evaluation (with best hparams) on the test set" + echo " --eval_metric metric [Optional] Evaluation metric description. Default:acc" + echo " --seed random_seed [Optional] Seed (random if not specified)" + echo " --config_file config_file [Optional] Orion config file. Default: hparams/orion/hparams_tpe.yaml" + echo " --mne_dir mne_dir [Optional] MNE directory. Need it different from your home (see notes on MNE in README.md)" + echo " --orion_db_address [Optional] Path of the database where orion will store hparams and performance" + echo " --orion_db_type db_type [Optional] Type of the dataset that orion will use. Default: PickledDB" + echo " --exp_max_trials int [Optional] Maximum number of hparam trials for each oprimization step. Default:50" + echo " --store_all Bool [Optional] When set to True, the output folders of all hparam trials will be stored in randomly named folders. Default: False" + echo " --compress_exp Bool [Optional] When set to True, this option compresses the output folders of all hyperparameter trials into a single tar.gz file. This is particularly useful when store_all is set to True, as it helps prevent the accumulation of a large number of files. 
Default: False" + exit 1 +} + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + + --exp_name) + exp_name="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --data_folder) + data_folder="$2" + shift + shift + ;; + + --hparams) + hparams="$2" + shift + shift + ;; + + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + + --task) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --nruns_eval) + nruns_eval="$2" + shift + shift + ;; + + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + + + --config_file) + config_file="$2" + shift + shift + ;; + + --mne_dir) + mne_dir="$2" + shift + shift + ;; + + --orion_db_address) + orion_db_address="$2" + shift + shift + ;; + + --orion_db_type) + orion_db_type="$2" + shift + shift + ;; + + --exp_max_trials) + exp_max_trials="$2" + shift + shift + ;; + + --store_all) + store_all="$2" + shift + shift + ;; + + --compress_exp) + compress_exp="$2" + shift + shift + ;; + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$output_folder" ] || [ -z "$data_folder" ] || [ -z "$hparams" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." + print_argument_descriptions +fi + +# Set mne_dir if specified +if [ "$mne_dir" ]; then + export _MNE_FAKE_HOME_DIR=$mne_dir +fi + +# Assign default value to cached_data_folder +if [ -z "$cached_data_folder" ]; then + cached_data_folder="$data_folder/cache" +fi + + +# Set orion db address if specified +if [ -z "$orion_db_address" ]; then + orion_db_address=$output_folder'/'$exp_name'.pkl' +fi +export ORION_DB_ADDRESS=$orion_db_address +export ORION_DB_TYPE=$orion_db_type + +echo "-------------------------------------" +echo "Experiment Name: $exp_name" +echo "hparams: $hparams" +echo "Output Folder: $output_folder" +echo "Data Folder: $data_folder" +echo "Cached Data Folder: $cached_data_folder" +echo "task: $task" +echo "dataset: $dataset" +echo "Hparam File: $hparams" +echo "Number of Runs: $nruns" +echo "Number of Eval Runs: $nruns_eval" +echo "Eval Metric: $eval_metric" +echo "Seed: $seed" +echo "Additional Flags: $additional_flags" +echo "Orion Config File: $config_file" +echo "Orion Database type: $orion_db_type" +echo "Orion Database file: $orion_db_address" +echo "Experiment Max Trials: $exp_max_trials" +echo "-------------------------------------" + + +# This function will extract all the optimization flags added in the yaml file +# The input is a text file (e.g, a yaml file) and a pattern (e.g, "@orion_step1:") +# The ouput are the detected flags (e.g., --dropout~"uniform(0.0, 0.5)"). +get_flag() { + local file_path="$1" + local pattern="$2" + + # Check if the file exists + if [ ! -f "$file_path" ]; then + echo "Error: File '$file_path' not found." 
+        return 1
+    fi
+
+    # Use grep to find all lines containing the pattern and then extract the flags using sed
+    grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | tr -d '\n'
+}
+
+
+# Function for updating the hparam yaml file with the best hparams found at step 1
+update_hparams() {
+    local best_hparams_file="$1"
+    local hparams_yaml_file="$2"
+    local output_yaml_file="$3"
+
+    # Read the values from best_hparams.txt into an associative array
+    declare -A best_hparams
+    while IFS=": " read -r key value; do
+        best_hparams["$key"]=$value
+    done < "$best_hparams_file"
+
+
+    # Read the hparams.yaml file into a variable
+    local hparams_content=$(cat "$hparams_yaml_file")
+
+    # Update values in hparams_content using values from best_hparams
+    for key in "${!best_hparams[@]}"; do
+        local pattern="^$key: .*"
+        local replacement="$key: ${best_hparams[$key]}"
+        hparams_content=$(sed "s/$pattern/$replacement/g" <<< "$hparams_content")
+    done
+
+    # Write the updated content to a new YAML file
+    echo "$hparams_content" > "$output_yaml_file"
+}
+
+# Function for extracting the best hparams from orion-info
+function extract_best_params() {
+    local input_file="$1"
+    local best_trial_line=$(grep -n "best trial:" "$input_file" | cut -d ":" -f 1)
+    local params_lines=$(tail -n +$best_trial_line "$input_file" | awk '/params:/{flag=1;next}/start time:/{flag=0}flag')
+    local formatted_params=$(echo "$params_lines" | sed -e 's/^[[:space:]]*//' -e 's/: /: /' -e '/^$/d' -e 's#^/##')
+    echo "$formatted_params"
+}
+
+# Running hparam tuning (loop over multiple steps)
+step_id=1
+hparams_step=$hparams
+pattern="@orion_step1:"
+opt_flags=$(get_flag "$hparams_step" "$pattern")
+
+# Check if the string is empty and exit with an error if it is
+if [ -z "$opt_flags" ]; then
+    echo "Error: Optimization flags not found in '$hparams'"
+    echo "Please ensure that the Orion optimization flags are set in the hparam file using in-line comments like:"
+    echo "# @orion_step1: --dropout~\"uniform(0.0, 0.5)\""
+    exit 1 # Exit with a non-zero error code
+fi
+
+
+while [ -n "$opt_flags" ]; do
+    # Set up the output folder and experiment name for this optimization step
+    output_folder_step="$output_folder"/step"$step_id"
+    mkdir -p $output_folder_step
+    exp_name_step="$exp_name"_step"$step_id"
+
+    echo
+    echo "**********************************************************************************************"
+    echo "Running hparam tuning (step $step_id)..."
+    echo "- This might take several hours!"
+    echo "- The best set of hparams will be saved in $output_folder_step"
+    echo "- You can monitor the evolution of the hparam optimization with: orion status -n $exp_name"
+    echo "......"
+    echo "**********************************************************************************************"
+    echo
+    # Setting up orion command
+    orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \
+    ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder $cached_data_folder \
+    --output_folder $output_folder_step/exp --task $task --dataset $dataset --seed $seed --nruns $nruns \
+    --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags"
+
+
+    # Appending the optimization flags
+    orion_hunt_command="$orion_hunt_command $opt_flags"
+
+    echo $orion_hunt_command &> "$output_folder_step/orion_hunt_command.txt"
+
+    # Execute the command for hparam tuning
+    eval $orion_hunt_command
+
+    # Compress the exp folder (if required)
+    if [ "$compress_exp" = True ]; then
+        tar -czf "$output_folder_step/exp.tar.gz" "$output_folder_step/exp"
+        if [ -d "$output_folder_step/exp" ]; then
+            rm -rf "$output_folder_step/exp"
+        fi
+
+    fi
+
+    # Storing best hparams
+    orion info --name $exp_name_step &> $output_folder_step/orion-info.txt
+
+    # Extract list of the best hparams from orion-info
+    # Find the line number where "best trial:" appears
+    best_trial_line=$(grep -n "best trial:" $output_folder_step/orion-info.txt | cut -d ":" -f 1)
+
+    # Extract and store the best set of hparams
+    best_params_output=$(extract_best_params "$output_folder_step/orion-info.txt")
+    best_hparams_file="$output_folder_step/best_hparams.txt"
+    echo "$best_params_output" > $best_hparams_file
+
+    # Store the current best yaml file
+    best_yaml_file="$output_folder_step/best_hparams.yaml"
+    update_hparams "$best_hparams_file" "$hparams_step" "$best_yaml_file"
+
+    # Update best hparam step
+    hparams_step=$best_yaml_file
+
+    # Update step variable
+    ((step_id++))
+
+    # Update search pattern
+    pattern="@orion_step$step_id:"
+
+    # update optimization flags pattern
+    opt_flags=$(get_flag "$hparams_step" "$pattern")
+done
+
+echo
+echo "**********************************************************************************************"
+echo "Running Final Evaluation on the best hparams (test-set)..."
+echo "**********************************************************************************************" +echo + +final_yaml_file="$output_folder/best_hparams.yaml" +scp $best_yaml_file $final_yaml_file + +# Running evaluation on the test set for the best models +./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ + --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ + --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ + --rnd_dir $store_all $additional_flags + +echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From 8dc0161dd7c088e97faf8d7e22429646678d535b Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 13:50:59 -0500 Subject: [PATCH 035/270] fix precommit --- benchmarks/DASB/orion/hparams_tpe.yaml | 2 +- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/orion/hparams_tpe.yaml b/benchmarks/DASB/orion/hparams_tpe.yaml index cf2f6fd54..fb6a7c9b0 100644 --- a/benchmarks/DASB/orion/hparams_tpe.yaml +++ b/benchmarks/DASB/orion/hparams_tpe.yaml @@ -3,4 +3,4 @@ experiment: tpe: seed: 1986 n_initial_points: 20 - n_ei_candidates: 24 \ No newline at end of file + n_ei_candidates: 24 diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index de5110b96..39766018f 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -128,7 +128,7 @@ while [[ $# -gt 0 ]]; do shift shift ;; - + --dataset) dataset="$2" shift From c0f4feeafaad74e6e9dea038129917277e76f756 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 13:57:56 -0500 Subject: [PATCH 036/270] modify hparams.sh input order --- benchmarks/DASB/run_hparam_optimization.sh | 31 ++++++++++------------ 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 39766018f..3c84f5ad4 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -45,16 +45,16 @@ # Initialize variables exp_name="hopt" -output_folder="" +hparams="" data_folder="" cached_data_folder="" +output_folder="" task="" dataset="" -hparams="" +seed=1986 nruns="" nruns_eval=10 eval_metric="acc" -seed=1986 config_file="orion/hparams_tpe.yaml" mne_dir="" orion_db_address="" @@ -68,16 +68,16 @@ print_argument_descriptions() { echo "Usage: $0 [options]" echo "Options:" echo " --exp_name Name Name that Orion gives to the experiment" - echo " --output_folder output_path Output folder were the results will be stored" + echo " --hparms hparam_file YAML file containing the hparam to optimize. The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" echo " --data_folder data_path Folder were the data are stored. If not available, they will be downloaded there." echo " --cached_data_folder path [Optional] Folder were the data in pkl format will be cached." - echo " --task task downstream task" - echo " --dataset dataset dataset" - echo " --hparms hparam_file YAML file containing the hparam to optimize. 
The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" + echo " --output_folder output_path Output folder were the results will be stored" + echo " --task task downstream task" + echo " --dataset dataset dataset" + echo " --seed random_seed [Optional] Seed (random if not specified)" echo " --nruns num_runs Number of runs for each hparam selection." echo " --nruns_eval num_runs Number of runs for the final evaluation (with best hparams) on the test set" echo " --eval_metric metric [Optional] Evaluation metric description. Default:acc" - echo " --seed random_seed [Optional] Seed (random if not specified)" echo " --config_file config_file [Optional] Orion config file. Default: hparams/orion/hparams_tpe.yaml" echo " --mne_dir mne_dir [Optional] MNE directory. Need it different from your home (see notes on MNE in README.md)" echo " --orion_db_address [Optional] Path of the database where orion will store hparams and performance" @@ -99,8 +99,8 @@ while [[ $# -gt 0 ]]; do shift ;; - --output_folder) - output_folder="$2" + --hparams) + hparams="$2" shift shift ;; @@ -111,14 +111,14 @@ while [[ $# -gt 0 ]]; do shift ;; - --hparams) - hparams="$2" + --cached_data_folder) + cached_data_folder="$2" shift shift ;; - --cached_data_folder) - cached_data_folder="$2" + --output_folder) + output_folder="$2" shift shift ;; @@ -153,15 +153,12 @@ while [[ $# -gt 0 ]]; do shift ;; - --eval_metric) eval_metric="$2" shift shift ;; - - --config_file) config_file="$2" shift From a595cf6172f4c035cc3c2e9cc880ce411bf47a4c Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 16:13:54 -0500 Subject: [PATCH 037/270] only applying testing for final run HT --- .../LibriSpeech/ASR/hparams/LSTM/train.yaml | 2 +- benchmarks/DASB/LibriSpeech/ASR/train.py | 27 ++++++++++--------- benchmarks/DASB/run_hparam_optimization.sh | 9 +++---- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 98ba22d23..1be23bc0c 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -18,7 +18,7 @@ output_folder: !ref results/LSTM// output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt - +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
# Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index 19aa43786..49d2248cb 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -433,16 +433,17 @@ def text_pipeline(wrd): valid_loader_kwargs=hparams["valid_dataloader_opts"], ) - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.output_wer_folder = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) + if hparams["testing"]: + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 3c84f5ad4..390177b28 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -5,8 +5,7 @@ ########################################################### # Description: -# This script facilitates hyperparameter tuning for a given EEG model and dataset using Orion. -# It supports leave-one-subject-out and/or leave-one-session-out training strategies. +# This script facilitates hyperparameter tuning for a given audio tokenizer, dowsnteram model and dataset using Orion. # Usage: # ./run_hparam_optimization.sh --exp_name 'EEGNet_BNCI2014001_hopt' \ @@ -21,8 +20,6 @@ # # Optimization Steps: # The script supports multiple hyperparameter optimization steps. -# We found it convenient to first optimize training and model hyperparameters, -# and then optimize data augmentation hyperparameters in a separate step. # Script Workflow: # 1. Search for the orion flags in the specified hparam file. 
@@ -352,7 +349,7 @@ while [ -n "$opt_flags" ]; do orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \ ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder_step/exp --task $task --dataset $dataset --seed $seed --nruns $nruns \ - --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags" + --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all --testing False $additional_flags" # Appending the optimization flags @@ -414,6 +411,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all $additional_flags + --rnd_dir $store_all --testing False $additional_flags echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From 78da6c14e9c2b58d5cfce9a4341707689a9eab7e Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 21:08:03 -0500 Subject: [PATCH 038/270] fix bug --- benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 1be23bc0c..8b9581dc9 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -110,7 +110,8 @@ freeze_embedding: False # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 4,discrete=True)"dnn_neurons: 1024 +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 4,discrete=True)" +dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 From 6a3a7a5127c7f63534bc879305b602bb9170670e Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 21:26:45 -0500 Subject: [PATCH 039/270] fix bug --- benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 8b9581dc9..be5c18d5b 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -41,7 +41,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved. 
 ####################### Training Parameters ####################################
-number_of_epochs: 20
+number_of_epochs: 20 # @orion_step1: --number_of_epochs~"fidelity(5, 20, base=4)"
 batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)"
 batch_size: !ref 2 **
 test_batch_size: 1

From e9ff250486b8b1ed7adde95cc07e36a46c4b1441 Mon Sep 17 00:00:00 2001
From: poonehmousavi
Date: Wed, 25 Dec 2024 21:32:24 -0500
Subject: [PATCH 040/270] add hupertun for contextnet

---
 .../DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml   |  2 +-
 .../LibriSpeech/ASR/hparams/contextnet/train.yaml  | 12 +++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml
index be5c18d5b..8b9581dc9 100644
--- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml
+++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml
@@ -41,7 +41,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav
 pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved.

 ####################### Training Parameters ####################################
-number_of_epochs: 20 # @orion_step1: --number_of_epochs~"fidelity(5, 20, base=4)"
+number_of_epochs: 20
 batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)"
 batch_size: !ref 2 **
 test_batch_size: 1
diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml
index aaca2668d..cd45d7d9a 100644
--- a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml
+++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml
@@ -18,7 +18,7 @@ output_folder: !ref results/LSTM//
 output_wer_folder: !ref /wer.txt
 save_folder: !ref /save
 train_log: !ref /train_log.txt
-
+testing: True # If set to True, the test evaluation is done, otherwise skipped.
# Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -42,7 +42,8 @@ pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, th ####################### Training Parameters #################################### number_of_epochs: 20 -batch_size: 4 +batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)" +batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -107,11 +108,8 @@ encoder_dim: 1024 pretrain_embeddings: False freeze_embedding: False -# LSTM -activation: !name:torch.nn.Sigmoid -dnn_layers: 2 -dnn_neurons: 1024 -dropout: 0.2 +# Contextnet + output_neurons: 31 # BPE parameters From 3e2fe0c89050745a2375f4c10c6d5654059d1f96 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 21:52:00 -0500 Subject: [PATCH 041/270] add etsting to average run --- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 390177b28..5cbde3b20 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -411,6 +411,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing False $additional_flags + --rnd_dir $store_all --testing True $additional_flags echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From f378aeca46d2439d4bc747c8f656ec09173d24be Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 21:52:00 -0500 Subject: [PATCH 042/270] add lr for HT for contextnet --- benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml | 2 +- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml index cd45d7d9a..eab197c68 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml @@ -55,7 +55,7 @@ valid_search_interval: 1 avg_checkpoints: 10 # Number of checkpoints to average for evaluation cache_size: 1.e+10 -lr_model: 0.001 +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" weight_decay: 0.0005 diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 390177b28..5cbde3b20 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -411,6 +411,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing False $additional_flags + --rnd_dir $store_all --testing True $additional_flags echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From b2bd3165bfa497b1742961eba8dae405171f0d77 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 
22:29:14 -0500 Subject: [PATCH 043/270] add measuring time --- benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py | 9 +++++++++ benchmarks/DASB/LibriSpeech/ASR/train.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py index 99eeb81fe..9fa3e3f3d 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py @@ -10,6 +10,7 @@ import os import sys +import time import torch import torchaudio import logging @@ -423,6 +424,8 @@ def text_pipeline(wrd): if valid_bsampler is not None: valid_dataloader_opts = {"batch_sampler": valid_bsampler} + # Measure time + start_time = time.time() # Start the timer # Training asr_brain.fit( @@ -433,6 +436,12 @@ def text_pipeline(wrd): valid_loader_kwargs=hparams["valid_dataloader_opts"], ) + end_time = time.time() # End the timer + # Calculate elapsed time + elapsed_time = end_time - start_time + hparams["train_logger"].log_stats( + stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, + ) # Testing if not os.path.exists(hparams["output_wer_folder"]): os.makedirs(hparams["output_wer_folder"]) diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index 49d2248cb..a66c0c5bf 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -11,6 +11,7 @@ import os import sys +import time import torch import torchaudio import logging @@ -424,6 +425,8 @@ def text_pipeline(wrd): if valid_bsampler is not None: valid_dataloader_opts = {"batch_sampler": valid_bsampler} + # Measure time + start_time = time.time() # Start the timer # Training asr_brain.fit( asr_brain.hparams.epoch_counter, @@ -433,6 +436,12 @@ def text_pipeline(wrd): valid_loader_kwargs=hparams["valid_dataloader_opts"], ) + end_time = time.time() # End the timer + # Calculate elapsed time + elapsed_time = end_time - start_time + hparams["train_logger"].log_stats( + stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, + ) if hparams["testing"]: # Testing if not os.path.exists(hparams["output_wer_folder"]): From 9de693453ac350cd9f4997a380d41ca4a4537b4d Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Thu, 26 Dec 2024 01:14:50 -0500 Subject: [PATCH 044/270] add time measure --- benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py | 7 ++++--- benchmarks/DASB/LibriSpeech/ASR/train.py | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py index 9fa3e3f3d..938ce8b96 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py @@ -439,9 +439,10 @@ def text_pipeline(wrd): end_time = time.time() # End the timer # Calculate elapsed time elapsed_time = end_time - start_time - hparams["train_logger"].log_stats( - stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, - ) + logger.info(f"Model execution time: {elapsed_time:.6f} seconds") + # hparams["train_logger"].log_stats( + # stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, + # ) # Testing if not os.path.exists(hparams["output_wer_folder"]): os.makedirs(hparams["output_wer_folder"]) diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index a66c0c5bf..ec6ac1b42 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ 
b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -439,9 +439,8 @@ def text_pipeline(wrd): end_time = time.time() # End the timer # Calculate elapsed time elapsed_time = end_time - start_time - hparams["train_logger"].log_stats( - stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, - ) + logger.info(f"Model execution time: {elapsed_time:.6f} seconds") + if hparams["testing"]: # Testing if not os.path.exists(hparams["output_wer_folder"]): From c4e273852c2b1078bfb70b5d515867788c982218 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:24:50 -0500 Subject: [PATCH 045/270] update readme + minor changes --- .../ASR-on-the-fly/hparams/LSTM/dac.yaml | 1 - .../ASR-on-the-fly/hparams/LSTM/encodec.yaml | 1 - .../hparams/LSTM/speech_tokenizer.yaml | 1 - .../hparams/contextnet/dac.yaml | 1 - .../hparams/contextnet/encodec.yaml | 1 - .../hparams/contextnet/speech_tokenizer.yaml | 1 - benchmarks/DASB/README.md | 176 +++++++++++++++--- benchmarks/DASB/run_extraction.sh | 2 +- benchmarks/DASB/run_hparam_optimization.sh | 24 ++- benchmarks/DASB/utils/tokenizer_interface.py | 82 +++++++- 10 files changed, 240 insertions(+), 50 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml index ff1749fab..605b772b5 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml @@ -119,7 +119,6 @@ dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml index dd4f62bf4..f13e3cb53 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml @@ -116,7 +116,6 @@ dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml index bb0b32a43..d0e9aae5b 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml @@ -111,7 +111,6 @@ dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml index b60b32604..8e73e3601 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml @@ -115,7 +115,6 @@ freeze_embedding: False # LSTM output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml index 7c0dcfc45..4d88a7978 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml +++ 
b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml @@ -109,7 +109,6 @@ freeze_embedding: False output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml index 3dcd7eea7..7fdbf8d51 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml @@ -105,7 +105,6 @@ freeze_embedding: False output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index c3e42bf64..78d780739 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -29,10 +29,11 @@ For detailed information, refer to [paper](https://arxiv.org/pdf/2406.14294): - [Installation](#-installation) - [Discrete Audio Encoder](#-Discrete-Audio-Encoder) - [Datasets and Recipes](#-Datasets-and-Recipes) -- [Quickstart](#-quickstart) - - [Running a single task](#Running-a-single-task) - - [Running multiple tasks](#Runnin-multiple-tasks) +- [Training Scenarios](#-Training-Scenarios) + - [On-the-FlybToken Extraction](#On-the-Fly-Token-Extraction) + - [Offline-Token-Extraction](#Offline-Token-Extraction) - [‍Incorporating Your Audio Tokenizer](#-Incorporating-Your-Audio-Tokenizer) +- [Hyperparameter Tuning](#Hyperparameter-Tuning) - [Results](#-results) - [Contact](#-contact) - [Citing](#-citing) @@ -98,51 +99,166 @@ To set up SpeechBrain-DASB, follow these steps: | Libri2Mix | Speech Separation | Conformer | CRDNN | [github.com/JorisCos/LibriMix](https://github.com/JorisCos/LibriMix) | | LJSpeech | Text-to-Speech | Shallow Transformer | Deep Transformer | [keithito.com/LJ-Speech-Dataset/](https://keithito.com/LJ-Speech-Dataset/) | -# ▶️ Quickstart +# 📖 Training Scenarios -## Running a single task +We offer two different training scenarios: **on-the-fly token extraction** and **offline token extraction**. -If you have specific discrete model and want to benchmark it for a specific task, you need to run the following command: - ``` - python LibriSpeech/ASR/LSTM/train_[tokenzier_name].py LibriSpeech/ASR/LSTM/hparams/train_[tokenzier_name].yaml --output_folder my-output-folder --data_folder mypath/to/LibriSpeech - ``` +## 1. On-the-Fly Token Extraction +In this scenario, audio tokens are extracted dynamically during training. To enhance efficiency, we use a caching mechanism where tokens are saved in memory during the first epoch and retrieved for subsequent epochs. However, this approach has some limitations: +- It works best when the dataset is small, the bitrate is low, and batching is sorted (not random). +- It is unsuitable when data augmentation is required. -## Running multiple tasks +You can also disable the caching mechanism if needed. -To run all tasks, make the following changes: +Currently, the on-the-fly token extraction is applied only in the recipe located at: +`LibriSpeech/ASR-on-the-fly` -1. Edit the `run_discriminative_benchmark.sh` and `run_genarative_benchmark.sh` files and modify tokenizer related values for example the bitrate , number of codebooks, and etc. -2. 
Choose a set of tasks from the provided list and, for each task, select a downstream architecture from the available options (see list below). -3. Update the variables defined in `run_benchmark.sh` with two lists of equal size. In the `ConsideredTasks` list, specify the tasks you want to run (e.g., `'LibriSpeechASR' 'LibriSpeechASR' 'IEMOCAP'`). In the `Downstreams` list, specify the corresponding downstream architecture for each task (e.g., `'BiLSTM'`, `contextnet`, `'ecapa_tdnn'`). +If you wish to adapt this strategy for your own recipe, you can copy and modify the existing recipe as needed. Here's how to run the on-the-fly recipe: - For example, if you set `ConsideredTasks=('LibriSpeechASR' 'LibriSpeechASR' 'IEMOCAP')` and `Downstreams=('BiLSTM', 'contextnet', 'ecapa_tdnn')`, the benchmark will be executed as follows: - - LibriSpeechASR with BiLSTM as the probing head - - LibriSpeechASR with contextnet as the probing head - - IEMOCAP with ecapa_tdnn as the probing head. +```bash +python LibriSpeech/ASR-on-the-fly/train.py LibriSpeech/ASR-on-the-fly/hparams/LSTM/{TOKENIZER}.yaml --data_folder=path/LibriSpeech --output_folder=path/results/LibriSpeech/ASR/{TOKENIZER}/LSTM +``` -3. Run the following command: - ``` - bash run_discriminative_benchmark.sh [tokenzier_name] - bash run_genarative_benchmark.sh [tokenzier_name] - ``` - You could also pass extra arguments as far as they are consistent across all tasks. +> **Note:** On-the-fly extraction can be time-consuming, which is why we also provide an alternative approach: **offline token extraction**. + +--- + +## 2. Offline Token Extraction +In this scenario, all tokens are pre-extracted in a separate recipe. We recommend using the highest number of codebooks available for token extraction and then choosing the desired settings during training. + +### Token Extraction Command +To extract tokens, use the following command: + +```bash +python LibriSpeech/extraction/extract.py benchmarks/DASB/LibriSpeech/extraction/hparams/{tokenizer}.yaml --data_folder=path/LibriSpeech --num_codebooks=32 +``` + +If you wish to initialize your embedding layer with the tokenizer's embeddings while training your downstream model, set the flag `save_embedding` to `True`. For discrete SSL tokenizers, you can specify a list of layers for `--num_codebooks` instead of a single number (e.g., `--num_codebooks=[3,7,12]`). + +### Training with Pre-Extracted Tokens +Once tokens are extracted and saved, you can train a downstream model using the following command: + +```bash +bash run_experiments.sh --hparams benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml --data_folder LibriSpeech --cached_data_folder cache/ --output_folder results/LibriSpeech/ASR/encodec/LSTM --task ASR --dataset LibriSpeech --seed 1986 --nruns 2 --eval_metric WER --tokens_folder LibriSpeech/extraction-emb/speech_tokenizer/save/librispeech/ +``` + +--- + +This workflow ensures flexibility, efficiency, and reproducibility for both training scenarios. Adapt the recipes as needed for your specific requirements! + +Here's a polished and formatted version for clarity, suitable for a README or documentation: + + + +# 🎛️ Hyperparameter Tuning + +Efficient hyperparameter tuning is critical when introducing novel models or experimenting with diverse datasets. Our benchmark establishes a standardized protocol for hyperparameter tuning, leveraging [Orion](https://orion.readthedocs.io/en/stable/) to ensure fair and consistent model comparisons. 
+ +--- + +## **Overview** + +Hyperparameter tuning is managed using the `./run_hparam_optimization.sh` script. This script coordinates multiple hyperparameter trials via `run_experiments.sh`. + + + +## **Incorporating Orion Flags in Hparam Files** + +To enable tuning, Orion flags should be directly embedded in the YAML hparam file using comments. For example, to optimize the learning rate (`lr`) parameter within a defined range, include the following line in the YAML file: + +```yaml +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" +``` + + + +## **Workflow of the Script** + +The script operates as follows: + +1. **Scans** the YAML hparam file for Orion flags. +2. **Executes** hyperparameter tuning using the `orion-hunt` command. +3. **Saves** the best hyperparameters for reference via `torch-info`. +4. **Iterates** until encountering flags such as `@orion_step` in the YAML file. + + + +## **Running Hyperparameter Optimization** + +You can perform hyperparameter optimization using a command like this: + +```bash +bash run_hparam_optimization.sh \ + --exp_name 'ASR-encodec-LSTM_hopt' \ + --hparams LibriSpeech/ASR/hparams/LSTM/train.yaml \ + --data_folder path/LibriSpeech \ + --cached_data_folder path/cache/ \ + --output_folder results/LibriSpeech/ASR/encodec/LSTM \ + --task ASR \ + --dataset LibriSpeech \ + --seed 1986 \ + --nruns 1 \ + --nruns_eval 5 \ + --eval_metric WER \ + --exp_max_trials 50 \ + --tokens_folder results/LibriSpeech/extraction-emb/encodec/save/librispeech/ \ + --run_name encodec +``` + +For more details on the arguments and customization options, refer to `./run_hparam_optimization.sh`. + + +### **Notes** + +1. **Execution Time**: + - Hyperparameter tuning may take several hours or even days, depending on the model complexity and dataset. + +2. **GPU vs. CPU**: + - By default, models are trained on GPU. To train on CPU instead, include the `--device cpu` flag. + +3. **Monitoring Progress**: + - Use the following command to monitor optimization status: + ```bash + orion status --all + ``` + - Ensure that Orion-specific environment variables are set in your bash environment. For example: + ```bash + export ORION_DB_ADDRESS=results/LibriSpeech/ASR/encodec/LSTM/hopt/ASR-encodec-LSTM_hopt.pkl + export ORION_DB_TYPE=pickleddb + ``` + Adjust `ORION_DB_ADDRESS` according to your experiment. + +4. **Resuming Optimization**: + - You can interrupt the script at any point. It will resume from the last completed trial. + +5. **Repetition of Optimization**: + - For multiple repetitions of the same hyperparameter optimization, modify the `--exp_name` parameter. + +6. **System Requirements**: + - The script is designed for Linux-based systems. A bash script is provided instead of Python due to its ability to manage diverse training loops across various subjects and sessions. + +--- + +This protocol ensures fair model comparison across diverse tasks and datasets. All reported results are derived using this standardized hyperparameter tuning methodology, enabling consistent assessments across models. - For generative task, make sure to set the `utmos_path` required for TTS evaluation. # 📝 ‍Incorporating Your Audio Tokenizer Let's now assume you've designed an audio and speech tokenizer in PyTorch and wish to integrate it into our benchmark. You're in luck because we've made this step as simple as possible for you! Here are the steps you should follow: -1. 
Write your model's code in a Python library saved in `benchmarks/DASB/model` (e.g., `benchmarks/MOABB/models/my_model.py`). -2. Create a YAML and py file for each task you want to experiment with. Thankfully, you don't have to start from scratch. For example, if you're working with LibriSpeech/ASR/LSTM, copy `benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml` and save it in the same folder with a different name (e.g., `train_my_model.yaml` and `train_my_model.py`). +1. Write your model's code in a Python library saved in `benchmarks/DASB/model` (e.g., `benchmarks/DASB/models/my_model.py`). + +2. Add the tokenizer to `utils/tokenizer_interface.py` and ensure the `encode` and `decode` functions are consistent in functionality and output shape with the other tokenizers. + +3. Create a YAML and Python file for each task you want to experiment with. Thankfully, you don't have to start from scratch. For example, you can copy `LibriSpeech/extraction/hparams/encodec.yaml`, adapt it based on your needs, and save it in the same folder with a different name (e.g., `LibriSpeech/extraction/hparams/{YOUR_TOKENIZER_NAME}.yaml`). -3. Edit the relevant section of your `train_my_model.yaml` and `train_my_model.py`. Redefine the `codec:` to reference your custom model (e.g., `codec: !new:models.my_model.my_model`). +4. Edit the relevant sections of your `{YOUR_TOKENIZER_NAME}.yaml`. Redefine the `tokenizer:` field to reference your custom model (e.g., `tokenizer: !new:tokenizer_interface.your_tokenizer`). -4. Ensure you include the hyperparameters specific to your model. +5. Ensure you include the hyperparameters specific to your model. -5. Now, follow the instructions above to run an experiments across tasks. +6. Now, follow the instructions provided earlier to run experiments across tasks. **Note**: If you're not familiar with YAML, you can refer to our [HyperPyYAML tutorial](https://speechbrain.github.io/tutorial_basics.html) on the SpeechBrain website for guidance. # 📈 Results diff --git a/benchmarks/DASB/run_extraction.sh b/benchmarks/DASB/run_extraction.sh index e121c35cb..92cc81381 100644 --- a/benchmarks/DASB/run_extraction.sh +++ b/benchmarks/DASB/run_extraction.sh @@ -4,7 +4,7 @@ # Script to extracts and save tokens from dataset. # # Usage: -# ./ $run_extraction.sh --data_folder LibriSpeech --output_folder results/LibriSpeech/ASR/encodec/LSTM --tokenizer encidec --dataset LibriSpeech +# ./ $run_extraction.sh --data_folder LibriSpeech --output_folder results/LibriSpeech/ASR/encodec/LSTM --tokenizer encodec --dataset LibriSpeech # Authors: # - Pooneh Mousavi (2024) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 5cbde3b20..2ad1dddf3 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -8,16 +8,20 @@ # This script facilitates hyperparameter tuning for a given audio tokenizer, dowsnteram model and dataset using Orion. 
# Usage: -# ./run_hparam_optimization.sh --exp_name 'EEGNet_BNCI2014001_hopt' \ -# --output_folder results/MotorImagery/BNCI2014001/EEGNet/hopt \ -# --data_folder eeg_data/ \ -# --hparams hparams/MotorImagery/BNCI2014001/EEGNet.yaml \ -# --nruns 1 --nruns_eval 10 \ -# --eval_metric acc \ -# --exp_max_trials 50 \ -# --store_all True \ -# --device 'cpu' -# +# ./run_hparam_optimization.sh --exp_name 'ASR-encodec-LSTM_hopt' \ + # --hparams LibriSpeech/ASR/hparams/LSTM/train.yaml \ + # --data_folder path/LibriSpeech \ + # --cached_data_folder path/cache/ \ + # --output_folder results/LibriSpeech/ASR/encodec/LSTM \ + # --task ASR \ + # --dataset LibriSpeech \ + # --seed 1986 \ + # --nruns 1 \ + # --nruns_eval 5 \ + # --eval_metric WER \ + # --exp_max_trials 50 \ + # --tokens_folder results/LibriSpeech/extraction-emb/encodec/save/librispeech/ \ + # --run_name encodec # Optimization Steps: # The script supports multiple hyperparameter optimization steps. diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index f63ddd6aa..ff1194968 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -21,25 +21,101 @@ class BaseTokenizer(ABC): + """ + Abstract base class for tokenizers that encode signals into discrete tokens + and decode tokens back into signals. + + This class defines the essential methods that any tokenizer must implement, + including encoding, decoding, and retrieving pretrained embeddings. + + Naming Convenstion + ------------------ + B : int + Batch size. + T : int + Sequence length in the time domain. + N : int + Sequence length in the token domain. + C : int + Vocabulary size, assuming each codebook has the same number of tokens. + K : int + Number of codebooks. + """ + def __init__(self): + """ + Initialize the BaseTokenizer. + + This is a base constructor that other tokenizers can extend. + """ super().__init__() @abstractmethod @torch.no_grad() def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): - """Encode signal into tokens.""" + """ + Encode a signal into discrete tokens. + + Arguments + --------- + signal : torch.Tensor + Input signal with shape [B, T]. + lengths : torch.Tensor + Lengths of each sequence in the batch, with shape [B]. + num_codebooks : int, optional + Number of codebooks to use for encoding. If None, all codebooks are used (default: None). + If specified as an int, the tokens will be truncated to include only the first `num_codebooks` codebooks. If specified as a list, + the tokens will include only the codebooks at the specified indices. + **kwargs : dict + Additional arguments for the tokenizer. + + Returns + ------- + tokens : torch.Tensor + Discretized tokens with shape [B, N, K]. + """ pass @abstractmethod @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - """Decode tokens to signal.""" + """ + Decode discrete tokens back into a signal. + + Arguments + --------- + tokens : torch.Tensor + Input tokens with shape [B, N, K]. + **kwargs : dict + Additional arguments for the tokenizer. + + Returns + ------- + signal : torch.Tensor + Reconstructed signal with shape [B, T]. + """ pass @abstractmethod @torch.no_grad() def get_pretrained_embeddings(self, vocab_size, num_codebooks, **kwargs): - """Get codebook embeddings.""" + """ + Retrieve pretrained embeddings for the tokenizer. + + Arguments + --------- + vocab_size : int + Number of tokens in each codebook. + num_codebooks : int + Number of codebooks. 
+ **kwargs : dict + Additional arguments for embedding retrieval. + + Returns + ------- + embeddings : torch.Tensor + Pretrained embedding weights with shape [K, C, H], where H is the embedding dimension. + """ pass From 279e48b001fedc21ba69acc1150a9099b114d5a7 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:35:57 -0500 Subject: [PATCH 046/270] fix link in readme --- README.md | 2 +- benchmarks/DASB/README.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a6defc05b..fc0b33c4d 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ The SpeechBrain Benchmarks currently include the following: - [MOABB](https://github.com/speechbrain/benchmarks/tree/main/benchmarks/MOABB) - A benchmark designed for evaluating neural models in well-known EEG tasks like motor imagery, P300, and SSVEP. -- [DASB](https://github.com/speechbrain/benchmarks/tree/main/benchmarks/DASB) - A benchmark designed for evaluating discrete audio tokens across a wide range of discriminative +- [DASB](https://github.com/speechbrain/benchmarks/tree/DASB/benchmarks/DASB) - A benchmark designed for evaluating discrete audio tokens across a wide range of discriminative and generative tasks. diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index 78d780739..445232337 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -30,10 +30,10 @@ For detailed information, refer to [paper](https://arxiv.org/pdf/2406.14294): - [Discrete Audio Encoder](#-Discrete-Audio-Encoder) - [Datasets and Recipes](#-Datasets-and-Recipes) - [Training Scenarios](#-Training-Scenarios) - - [On-the-FlybToken Extraction](#On-the-Fly-Token-Extraction) - - [Offline-Token-Extraction](#Offline-Token-Extraction) + - [On-the-Fly Token Extraction](#-On-the-Fly-Token-Extraction) + - [Offline-Token-Extraction](#-Offline-Token-Extraction) - [‍Incorporating Your Audio Tokenizer](#-Incorporating-Your-Audio-Tokenizer) -- [Hyperparameter Tuning](#Hyperparameter-Tuning) +- [Hyperparameter Tuning](#-Hyperparameter-Tuning) - [Results](#-results) - [Contact](#-contact) - [Citing](#-citing) From 7f32f1bf3f3c7c7e844b5134f82845ce29cdbea7 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:40:01 -0500 Subject: [PATCH 047/270] update table of contnet --- benchmarks/DASB/README.md | 43 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index 445232337..684459083 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -25,18 +25,31 @@ For detailed information, refer to [paper](https://arxiv.org/pdf/2406.14294): # Table of Contents -- [Table of Contents](#table-of-contents) -- [Installation](#-installation) -- [Discrete Audio Encoder](#-Discrete-Audio-Encoder) -- [Datasets and Recipes](#-Datasets-and-Recipes) -- [Training Scenarios](#-Training-Scenarios) - - [On-the-Fly Token Extraction](#-On-the-Fly-Token-Extraction) - - [Offline-Token-Extraction](#-Offline-Token-Extraction) -- [‍Incorporating Your Audio Tokenizer](#-Incorporating-Your-Audio-Tokenizer) -- [Hyperparameter Tuning](#-Hyperparameter-Tuning) -- [Results](#-results) -- [Contact](#-contact) -- [Citing](#-citing) +Here’s the updated **Table of Contents** for your GitHub README with corrections and better alignment: + +--- + +# 📑 Table of Contents + +- [DASB - Discrete Audio and Speech Benchmark](#dasb---discrete-audio-and-speech-benchmark) +- [🛠️ Installation](#-installation) 
+- [🎌 Discrete Audio Encoder](#-discrete-audio-encoder) +- [⚡ Datasets and Recipes](#-datasets-and-recipes) +- [📖 Training Scenarios](#-training-scenarios) + - [On-the-Fly Token Extraction](#on-the-fly-token-extraction) + - [Offline Token Extraction](#offline-token-extraction) +- [🎛️ Hyperparameter Tuning](#-hyperparameter-tuning) +- [📝 Incorporating Your Audio Tokenizer](#-incorporating-your-audio-tokenizer) +- [📈 Results](#-results) + - [Ranking](#ranking) + - [Benchmarking Results for Discriminative Tasks](#benchmarking-results-for-discriminative-tasks) + - [Benchmarking Results for Generative Tasks](#benchmarking-results-for-generative-tasks) +- [📧 Contact](#-contact) +- [📖 Citing](#-citing) + +--- + +This structure provides a clear and logical flow, ensuring users can easily navigate the document. Each major section is linked appropriately, with sub-sections for detailed content. Let me know if additional adjustments are required! # 🛠️ Installation @@ -103,7 +116,7 @@ To set up SpeechBrain-DASB, follow these steps: We offer two different training scenarios: **on-the-fly token extraction** and **offline token extraction**. -## 1. On-the-Fly Token Extraction +## On-the-Fly Token Extraction In this scenario, audio tokens are extracted dynamically during training. To enhance efficiency, we use a caching mechanism where tokens are saved in memory during the first epoch and retrieved for subsequent epochs. However, this approach has some limitations: - It works best when the dataset is small, the bitrate is low, and batching is sorted (not random). - It is unsuitable when data augmentation is required. @@ -121,9 +134,8 @@ python LibriSpeech/ASR-on-the-fly/train.py LibriSpeech/ASR-on-the-fly/hparams/LS > **Note:** On-the-fly extraction can be time-consuming, which is why we also provide an alternative approach: **offline token extraction**. ---- -## 2. Offline Token Extraction +## Offline Token Extraction In this scenario, all tokens are pre-extracted in a separate recipe. We recommend using the highest number of codebooks available for token extraction and then choosing the desired settings during training. ### Token Extraction Command @@ -149,7 +161,6 @@ This workflow ensures flexibility, efficiency, and reproducibility for both trai Here's a polished and formatted version for clarity, suitable for a README or documentation: - # 🎛️ Hyperparameter Tuning Efficient hyperparameter tuning is critical when introducing novel models or experimenting with diverse datasets. Our benchmark establishes a standardized protocol for hyperparameter tuning, leveraging [Orion](https://orion.readthedocs.io/en/stable/) to ensure fair and consistent model comparisons. 
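To make the multi-step flag convention more concrete, here is a minimal sketch of how a two-step search could be declared directly in a recipe's YAML file. The `lr_model` line mirrors the `@orion_step1` example shown earlier in this README; the `dropout` parameter, its search range, and the `@orion_step2` flag are illustrative assumptions rather than settings taken from an actual DASB recipe:

```yaml
# Step 1: tune the learning rate on a log-uniform scale
lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)"

# Step 2 (hypothetical): tune dropout once the best learning rate has been fixed
dropout: 0.1 # @orion_step2: --dropout~"uniform(0.0,0.5)"
```

With such a file, `run_hparam_optimization.sh` would run one `orion-hunt` search per step, moving on to the next round only while a flag for the following `@orion_step` exists, consistent with the workflow described earlier in this README.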
From 30fc2d691aef1ff6c9edf4eb97fd1a3bb58d8a77 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:42:14 -0500 Subject: [PATCH 048/270] fix --- benchmarks/DASB/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index 684459083..a3fdedb56 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -38,7 +38,7 @@ Here’s the updated **Table of Contents** for your GitHub README with correctio - [📖 Training Scenarios](#-training-scenarios) - [On-the-Fly Token Extraction](#on-the-fly-token-extraction) - [Offline Token Extraction](#offline-token-extraction) -- [🎛️ Hyperparameter Tuning](#-hyperparameter-tuning) +- [🎛️ Hyperparameter Tuning](#hyperparameter-tuning) - [📝 Incorporating Your Audio Tokenizer](#-incorporating-your-audio-tokenizer) - [📈 Results](#-results) - [Ranking](#ranking) From a576ba7fe63d4b1e93792030210bbb8e6d4f3c1a Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:43:35 -0500 Subject: [PATCH 049/270] fix --- benchmarks/DASB/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index a3fdedb56..0ad632979 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -38,7 +38,7 @@ Here’s the updated **Table of Contents** for your GitHub README with correctio - [📖 Training Scenarios](#-training-scenarios) - [On-the-Fly Token Extraction](#on-the-fly-token-extraction) - [Offline Token Extraction](#offline-token-extraction) -- [🎛️ Hyperparameter Tuning](#hyperparameter-tuning) +- [🎛️ Hyperparameter Tuning](#%EF%B8%8F-hyperparameter-tuning) - [📝 Incorporating Your Audio Tokenizer](#-incorporating-your-audio-tokenizer) - [📈 Results](#-results) - [Ranking](#ranking) From 0fafc1cfcc3685d6382399015dea49d619a97238 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 30 Dec 2024 17:19:26 -0500 Subject: [PATCH 050/270] Tokotron LJSpeech: Update to work with the new tokenizer pipeline --- .../hparams/train_continuous_ssl.yaml | 9 + .../TTS/tokotron/hparams/train_dac.yaml | 42 +-- .../tokotron/hparams/train_discrete_ssl.yaml | 42 ++- .../TTS/tokotron/hparams/train_encodec.yaml | 32 +-- .../hparams/train_speech_tokenizer.yaml | 14 + .../DASB/LJSpeech/TTS/tokotron/train.py | 250 ++++++++---------- .../DASB/LJSpeech/TTS/tokotron/train_dac.py | 4 +- .../TTS/tokotron/train_discrete_ssl.py | 2 +- .../DASB/LJSpeech/extraction/extract.py | 88 ++++++ .../DASB/LJSpeech/extraction/hparams/dac.yaml | 63 +++++ .../extraction/hparams/discrete_ssl.yaml | 100 +++++++ .../LJSpeech/extraction/hparams/encodec.yaml | 62 +++++ .../extraction/hparams/speech_tokenizer.yaml | 52 ++++ .../LJSpeech/extraction/ljspeech_prepare.py | 1 + benchmarks/DASB/LJSpeech/ljspeech_prepare.py | 187 +------------ benchmarks/DASB/model/Tokotron.py | 141 ---------- benchmarks/DASB/utils/audio_tokens.py | 27 +- 17 files changed, 593 insertions(+), 523 deletions(-) create mode 100644 benchmarks/DASB/LJSpeech/extraction/extract.py create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml create mode 120000 benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml index ac80bdac0..087eb6cf9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -257,6 +257,7 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- audio_token_shift: !ref decoder_mode: !ref scale_factor: !ref + audio_dim: !ref representation_mode: continuous @@ -264,6 +265,7 @@ modules: model: !ref vocoder: !ref compute_cost: !ref + ssl_model: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam @@ -306,3 +308,10 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index cd4f338bc..4f50c7ed2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -17,6 +17,9 @@ g2p_src: flexthink/soundchoice-g2p vocoder_type: encodec vocoder_src: "charactr/vocos-encodec-24khz" +# Model type +representation_mode: discrete + # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared/dac @@ -35,6 +38,14 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +token_model_kwargs: + n_quantizers: !ref + splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -101,17 +112,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref -# Token model (pretrained) -dac: !new:speechbrain.lobes.models.discrete.dac.DAC - sample_rate: !ref - model_type: !ref - model_bitrate: !ref - load_pretrained: True - -# Token model (pretrained) -token_model: !new:Tokotron.DACFeatureExtractor - dac: !ref - n_quantizers: !ref # Dataloader options train_dataloader_opts: @@ -143,13 +143,6 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer @@ -174,6 +167,8 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 2 bandwidth: 1.5 +model_shape: BHL +model_needs_channel: True attention_type: regularMHA ############################## models ################################ @@ -198,9 +193,17 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + modules: model: !ref - dac: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam @@ -235,3 +238,4 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + diff --git 
a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index f8a0ee622..e3b549549 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -11,6 +11,7 @@ __set_seed: !apply:torch.manual_seed [!ref ] # Model Type ssl_model_type: wavlm +representation_mode: discrete output_folder: !ref results/tokotron/// save_folder: !ref /save @@ -37,6 +38,11 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + freeze_token_model: True token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref @@ -47,7 +53,9 @@ token_model_src: !apply:speechbrain.utils.hparams.choice g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization -token_model_kmeans_dataset: LibriSpeech-100-360-500 +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref token_offset: 1 @@ -161,14 +169,6 @@ ssl_model: !apply:speechbrain.utils.hparams.choice output_all_hiddens: True -token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - ssl_model: !ref - kmeans_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref - save_path: !ref - layers_num: !ref - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa @@ -206,18 +206,6 @@ sample_dataloader_opts: token_model_kwargs: SSL_layers: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - token_model_kwargs: !ref - ssl_model: !ref - ssl_model_layers: !ref - token_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - ####################### Model parameters ########################### # Transformer @@ -229,7 +217,7 @@ d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU -audio_num_tokens: 1000 +vocab_size: 1000 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False @@ -254,7 +242,7 @@ vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref - audio_num_tokens: !ref + audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref @@ -281,6 +269,7 @@ modules: model: !ref vocoder: !ref compute_cost: !ref + ssl_model: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam @@ -323,3 +312,10 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index f5e82c309..0082e20db 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -17,6 +17,9 @@ g2p_src: 
flexthink/soundchoice-g2p vocoder_type: encodec vocoder_src: "charactr/vocos-encodec-24khz" +# Model type +representation_mode: discrete + # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared/encodec @@ -35,6 +38,11 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -96,13 +104,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref -# Token model (pretrained) -token_model: !new:speechbrain.lobes.models.huggingface_transformers.Encodec - source: !ref - save_path: !ref - bandwidth: !ref - flat_embeddings: True - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -133,13 +134,6 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer @@ -190,7 +184,6 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line modules: model: !ref - token_model: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam @@ -225,3 +218,12 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 103d584ed..ec6de9bb2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -17,6 +17,9 @@ g2p_src: flexthink/soundchoice-g2p vocoder_type: encodec vocoder_src: "charactr/vocos-encodec-24khz" +# Model type +representation_mode: discrete + # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared/st @@ -35,6 +38,12 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -167,6 +176,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 2 bandwidth: 1.5 +model_shape: HBL attention_type: regularMHA ############################## models ################################ @@ -228,3 +238,7 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 3dddf48dc..0c80cc5c2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -59,6 +59,8 @@ def __init__( create_waveform_fn=self.create_waveform, device=self.device, ) + self.representation_mode = RepresentationMode(self.hparams.representation_mode) + def compute_forward(self, batch, stage): """Runs all the computation of the Tokotron TTS @@ -77,7 +79,8 @@ def compute_forward(self, batch, stage): """ batch = batch.to(self.device) tokens, tokens_length = batch.tokens - audio, audio_length = batch.audio_bos + features = self.prepare_features(batch) + audio, audio_length, _, _ = features emb = None if self.use_spk_emb: emb = {"spk": batch.spk_emb.data.squeeze(1)} @@ -90,7 +93,48 @@ def compute_forward(self, batch, stage): emb=emb, ) - return predictions + return predictions, features + + def prepare_features(self, batch): + """Prepares features, depending on the configuration + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation + + Returns + ------- + audio_bos : torch.Tensor + Audio features, with BOS + audio_bos_length : torch.Tensor + Relative lengths of the audio features, with BOS + audio_tgt : torch.Tensor + Target audio features (for loss computation) + audio_tgt_length : torch.Tensor + Relative lengths of the target audio features + """ + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like( + audio[:, :1, :, :] + ).reshape(batch_size, self.hparams.bos_width, heads, dim) + audio_bos = torch.concatenate( + [bos, audio], + dim=1 + ) + audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length @torch.no_grad() def evaluate_batch(self, batch, stage): @@ -140,19 +184,20 @@ def compute_objectives(self, predictions, batch, stage): A one-element tensor used for backpropagating the gradient. 
""" batch = batch.to(self.device) - audio, audio_length = batch.audio_pad + predictions, features = predictions + _, _, audio_tgt, audio_tgt_length = features loss_details = self.hparams.compute_cost( predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, ) self.loss_metric.append( batch.uttid, predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, reduction="batch", @@ -281,11 +326,7 @@ def fit_batch(self, batch): def init_optimizers(self): """Custom optimizer initialization """ - representation_mode = getattr( - self.hparams, "representation_mode", RepresentationMode.DISCRETE - ) - representation_mode = RepresentationMode(representation_mode) - if representation_mode == RepresentationMode.CONTINUOUS: + if self.representation_mode == RepresentationMode.CONTINUOUS: audio_emb_params = self.modules.model.decoder.audio_emb.parameters() audio_emb_params_set = set(audio_emb_params) model_params = [ @@ -368,9 +409,7 @@ def dataio_prepare(hparams): the token used for silence """ - representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) + representation_mode = RepresentationMode(hparams["representation_mode"]) # Define datasets from json data manifest file # Define datasets sorted by ascending lengths for efficiency @@ -407,7 +446,7 @@ def audio_ref_pipeline(wav): Arguments --------- - wav : str + wav : strƒnum_ The file path Returns @@ -422,49 +461,43 @@ def audio_ref_pipeline(wav): if representation_mode == RepresentationMode.DISCRETE: layers_key = "token_model_layers" - model_key = "token_model" - audio_features = "audio_tokens" + model_key = "tokenizer" else: layers_key = "ssl_model_layers" model_key = "ssl_model" - audio_features = "audio_ssl" audio_tokens_per_step = ( len(hparams[layers_key]) if layers_key in hparams else hparams["audio_tokens_per_step"] ) - if use_silence_padding: - silence_token, silence_emb = get_silence_token( + if use_silence_padding and representation_mode == RepresentationMode.DISCRETE: + silence_token, _ = get_silence_token( hparams[model_key], - extract_emb=representation_mode == RepresentationMode.CONTINUOUS, model_kwargs=hparams.get("token_model_kwargs"), + extract_emb=False, + model_shape=hparams.get("model_shape", "BLH"), + unsqueeze=hparams.get("model_needs_channel", False) ) else: silence_token = ( torch.ones(hparams["audio_tokens_per_step"], dtype=torch.int64) * hparams["eos_index"] ) - silence_token = silence_token.cpu() - silence_padding = ( - silence_token - if representation_mode == RepresentationMode.DISCRETE - else silence_emb - ) + silence_padding = silence_token.cpu() + silence_padding = silence_padding[:audio_tokens_per_step] silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) audio_bos_prefix = ( torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] ) - if representation_mode == RepresentationMode.CONTINUOUS: - audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat( - 1, 1, hparams["audio_dim"] - ) - @sb.utils.data_pipeline.takes(audio_features) + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") - def audio_pipeline(audio): - audio = torch.from_numpy(audio) + def 
audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, num_codebooks=audio_tokens_per_step) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -480,21 +513,20 @@ def audio_pipeline(audio): ] init_sequence_encoder(hparams) - use_spk_emb = hparams.get("use_spk_emb", False) - prepared_features = [audio_features] output_keys = [ "uttid", "tokens", - "audio_pad", - "audio_bos", "label_norm_eval", ] - if use_spk_emb: - prepared_features.append("spk_emb") - output_keys.append("spk_emb") + if representation_mode == RepresentationMode.DISCRETE: + output_keys += [ + "audio_pad", + "audio_bos", + ] + else: + output_keys.append("sig") eval_output_keys = [*output_keys, "sig"] - for dataset in data_info: if dataset == "train": dataset_output_keys = output_keys @@ -508,13 +540,6 @@ def audio_pipeline(audio): output_keys=dataset_output_keys, ) - add_prepared_features( - dataset=dynamic_dataset, - save_path=Path(hparams["prepare_save_folder"]) / "features", - id_key="uttid", - features=prepared_features, - ) - datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False @@ -540,50 +565,9 @@ def audio_pipeline(audio): "sorting must be random, ascending or descending" ) - datasets["sample"] = select_sample(hparams, datasets) return datasets, silence_padding -def select_sample(hparams, datasets): - """Selects a sample of files for sample generation, freezing the sample if - requested to persist across multiple experiments - - Arguments - --------- - hparams : dict - experiment hyperparameters - datasets : dict - a dictionary of datasets - - Returns - ------- - dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset - the sample dataset - """ - sample_path = hparams.get("sample_path") - dataset = None - if sample_path is not None: - sample_path = Path(sample_path) - if sample_path.exists(): - with open(sample_path, "r") as sample_file: - data_ids = [line.strip() for line in sample_file] - dataset = FilteredSortedDynamicItemDataset( - datasets["valid"], data_ids - ) - - if dataset is None: - dataset = ( - datasets["valid"] - .batch_shuffle(1) - .filtered_sorted(select_n=hparams["num_audio_samples"]) - ) - if sample_path is not None: - with open(sample_path, "w") as sample_file: - for data_id in dataset.data_ids: - print(data_id, file=sample_file) - return dataset - - def init_sequence_encoder(hparams): """Initialize a sequence encoder @@ -682,7 +666,6 @@ def apply_overfit_test(hparams, dataset): "train": dataset_train, "valid": dataset_eval, "test": dataset_eval, - "sample": dataset_eval, } else: result = dataset.overfit_test( @@ -736,40 +719,25 @@ def run_experiment(brain_cls): from ljspeech_prepare import prepare_ljspeech # Data preparation, to be run on only one process. 
- representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) - audio_features = ( - "audio_tokens" - if representation_mode == RepresentationMode.DISCRETE - else "audio_ssl" - ) - extract_features = [audio_features] - if hparams.get("use_spk_emb", False): - extract_features.append("spk_emb") - if not hparams["skip_prep"]: - with hparams["freezer"]: - run_on_main( - prepare_ljspeech, - kwargs={ - "data_folder": hparams["data_folder"], - "save_folder": hparams["prepare_save_folder"], - "splits": hparams["splits"], - "split_ratio": hparams["split_ratio"], - "seed": hparams["seed"], - "extract_features": extract_features, - "extract_features_opts": hparams["extract_features_opts"], - "extract_phonemes": hparams["input"] == "phonemes", - "model_name": "tokotron", - "g2p_src": hparams["g2p_src"], - "skip_ignore_folders": hparams[ - "prepare_skip_ignore_folders" - ], - "frozen_split_path": hparams.get("frozen_split_path"), - "device": run_opts.get("device", "cpu"), - }, - ) + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams[ + "prepare_skip_ignore_folders" + ], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) # We can now directly create the datasets for training, valid, and test datasets, silence_padding = dataio_prepare(hparams) @@ -786,31 +754,43 @@ def run_experiment(brain_cls): run_opts=run_opts, checkpointer=hparams["checkpointer"], ) - tts_brain.sample_data = datasets["sample"] # The `fit()` method iterates the training loop, calling the methods # necessary to update the parameters of the model. Since all objects # with changing state are managed by the Checkpointer, training can be # stopped at any point, and will be resumed on next call. 
+ + dataloader_opts = [ + hparams[f"{key}_dataloader_opts"] + for key in ["train", "valid", "test"] + ] + representation_mode = RepresentationMode(hparams["representation_mode"]) + if representation_mode == RepresentationMode.DISCRETE: + dataloader_opts = [ + use_silence_padding( + opts, silence_padding, audio_keys + ) + for opts in dataloader_opts + ] + ( + train_dataloader_opts, + valid_dataloader_opts, + test_dataloader_opts + ) = dataloader_opts + tts_brain.fit( tts_brain.hparams.epoch_counter, datasets["train"], datasets["valid"], - train_loader_kwargs=use_silence_padding( - hparams["train_dataloader_opts"], silence_padding, audio_keys - ), - valid_loader_kwargs=use_silence_padding( - hparams["valid_dataloader_opts"], silence_padding, audio_keys - ), + train_loader_kwargs=train_dataloader_opts, + valid_loader_kwargs=valid_dataloader_opts, ) # Load best checkpoint for evaluation tts_brain.evaluate( test_set=datasets["test"], min_key="loss", - test_loader_kwargs=use_silence_padding( - hparams["test_dataloader_opts"], silence_padding, audio_keys - ), + test_loader_kwargs=test_dataloader_opts, ) # Save final checkpoint (fixed name) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py index d0bc9f4f7..83b9ff538 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py @@ -33,10 +33,10 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - z, _, _ = self.modules.dac.quantizer.from_codes( + z, _, _ = self.modules.tokenizer.quantizer.from_codes( audio.transpose(1, 2).int() ) - wav = self.modules.dac.decode(z).squeeze(1) + wav = self.modules.tokenizer.decode(z).squeeze(1) clean_padding_(wav, length) return wav diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py index f9fc764cd..aa2c57681 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py @@ -34,7 +34,7 @@ def compute_offset(self): str(layer) for layer in (layers_set - available_layers_set) ) raise ValueError(f"Layers {unavailable_layers} are not supported") - self.num_units = self.hparams.audio_num_tokens + self.num_units = self.hparams.vocab_size _, layers_idx = torch.where( torch.tensor( self.hparams.vocoder_available_layers, device=self.device diff --git a/benchmarks/DASB/LJSpeech/extraction/extract.py b/benchmarks/DASB/LJSpeech/extraction/extract.py new file mode 100644 index 000000000..556d8a9d0 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/extract.py @@ -0,0 +1,88 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting a discrete tokens with librispeech. 
+ +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +print(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from ljspeech_prepare import prepare_ljspeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["output_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid", "test"]: + json_path = hparams[f"{split}_json"] + name = pl.Path(json_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=json_path, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, + hparams["num_codebooks"], + (save_folder / "ljspeech").as_posix(), + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info(f"Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix(), + vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], + ) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml new file mode 100644 index 000000000..ebf155bb2 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters 
########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..c4c01f527 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,100 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + 
freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..0b07a6b1f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml @@ -0,0 +1,62 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..54da4f210 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,52 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save 
+train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py new file mode 120000 index 000000000..2de5a21a8 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py @@ -0,0 +1 @@ +../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index e88b92eb6..bfd1b3743 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -13,13 +13,11 @@ import json import random import logging -from types import SimpleNamespace import torch import torchaudio import numpy as np import tgt import re -import speechbrain as sb from tqdm import tqdm from pathlib import Path from speechbrain.utils.data_utils import download_file @@ -27,10 +25,6 @@ from speechbrain.inference.text import GraphemeToPhoneme from unidecode import unidecode from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations -from speechbrain.dataio.batch import PaddedData -from speechbrain.dataio.dataset import DynamicItemDataset -from preparation import FeatureExtractor -from torchaudio.functional import resample logger = logging.getLogger(__name__) @@ -179,7 +173,7 @@ def prepare_ljspeech( os.makedirs(duration_folder) # extract pitch for both Fastspeech2 and FastSpeech2WithAligner models - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: pitch_folder = os.path.join(data_folder, "pitch") if not os.path.exists(pitch_folder): os.makedirs(pitch_folder) @@ -200,16 +194,6 @@ def prepare_ljspeech( data_folder, splits, split_ratio, frozen_split_path ) - extract_features_context = None - extract_features_folder = None - if extract_features: - extract_features_context = get_context( - extract_features=extract_features, - extract_features_opts=extract_features_opts or {}, - device=device, - ) - extract_features_folder = Path(save_folder) / "features" - if "train" in splits: prepare_json( model_name, @@ -226,10 +210,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -250,10 +230,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -274,10 +250,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - 
extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -500,7 +472,7 @@ def prepare_json( g2p = GraphemeToPhoneme.from_hparams( g2p_src, run_opts={"device": device} ) - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: logger.info( "Computing pitch as required for FastSpeech2. This may take a while." ) @@ -649,19 +621,6 @@ def prepare_json( # Updates data for the utterance json_dict[id].update({"phonemes": phonemes}) - # Feature Extraction - if extract_features: - extract_features_folder.mkdir(exist_ok=True) - prepare_features( - data=json_dict, - data_folder=data_folder, - save_path=extract_features_folder, - features=extract_features, - context=extract_features_context, - options=extract_features_opts, - device=device, - ) - # Writing the dictionary to the json file with open(json_file, mode="w") as json_f: json.dump(json_dict, json_f, indent=2) @@ -839,145 +798,3 @@ def custom_clean(text, model_name): text = re.sub(regex, replacement, text) return text - -INLINE_FEATURES = ["audio_ssl_len"] - - -def prepare_features( - data, data_folder, save_path, features, context, options=None, device="cpu" -): - """Performs feature extraction - - Arguments - --------- - data: dict - a preprocessed dataset - data_folder : str - the data folder - save_folder : str - the folder where features will be saved - context : dict - context data - features: list - the list of feature extractions to be performed - """ - dataset = DynamicItemDataset(data) - feature_extractor = FeatureExtractor( - save_path=save_path, - src_keys=["sig"], - id_key="uttid", - dataloader_opts=options.get("dataloader_opts", {}), - device=device, - ) - token_model_kwargs = options.get("token_model_kwargs", {}) - ssl_layers = options.get("ssl_model_layers") or options.get( - "token_model_layers" - ) - - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - """Load the audio signal. 
""" - wav = wav.replace("{data_root}", data_folder) - sig = sb.dataio.dataio.read_audio(wav) - - yield sig - - dataset.add_dynamic_item(audio_pipeline) - - @sb.utils.data_pipeline.takes("sig") - @sb.utils.data_pipeline.provides("sig_resampled") - def resample_pipeline(sig): - sig_data = resample( - waveform=sig.data, - orig_freq=options["sample_rate"], - new_freq=options["model_sample_rate"], - ) - return PaddedData(sig_data, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_tokens", "audio_emb") - def token_pipeline(sig): - with torch.no_grad(): - result = context.token_model( - sig.data, sig.lengths, **token_model_kwargs - ) - # TODO: Clean this up - if torch.is_tensor(result): - tokens = result - # Note: Dummy embedding - meaning embeddings are not available - emb = torch.zeros((len(sig.data), 1, 1), device=sig.data.device) - else: - tokens, emb = result[:2] - tokens = tokens.int() - if tokens.dim() < 3: - tokens = tokens.unsqueeze(-1) - yield PaddedData(tokens, sig.lengths) - yield PaddedData(emb, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("spk_emb") - def spk_emb_pipeline(sig): - mel_spec = context.spk_emb_model.mel_spectogram(audio=sig.data) - return context.spk_emb_model.encode_mel_spectrogram_batch( - mel_spec, sig.lengths - ) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_ssl", "audio_ssl_len") - def ssl_pipeline(sig): - ssl_raw = context.ssl_model(sig.data, sig.lengths) - ssl = ssl_raw[ssl_layers].permute(1, 2, 0, 3) - yield PaddedData(ssl, sig.lengths) - yield (sig.lengths * ssl.size(1)).tolist() - - dynamic_items = [ - resample_pipeline, - token_pipeline, - ssl_pipeline, - spk_emb_pipeline, - ] - for dynamic_item in dynamic_items: - feature_extractor.add_dynamic_item(dynamic_item) - - feature_keys = [key for key in features if key not in INLINE_FEATURES] - inline_keys = [key for key in features if key in INLINE_FEATURES] - feature_extractor.set_output_features(feature_keys, inline_keys=inline_keys) - feature_extractor.extract(dataset, data) - - -def get_context(extract_features, extract_features_opts, device): - """ - Gets the context (pretrained models, etc) for feature extraction - - Arguments - --------- - extract_features : list - A list of features to extract - Available features: - audio_tokens - raw tokens - audio_emb - embeddings from the model - extract_features_opts : dict - Options for feature extraction - device : str|torch.Device - The device on which extraction will be run - - Returns - -------- - context: SimpleNamespace - The context object - """ - context = {} - if ( - any(key in extract_features for key in ["audio_tokens", "audio_emb"]) - and "token_model" in extract_features_opts - ): - context["token_model"] = extract_features_opts["token_model"].to(device) - if "audio_ssl" in extract_features: - context["ssl_model"] = extract_features_opts["ssl_model"].to(device) - if "spk_emb" in extract_features: - context["spk_emb_model"] = extract_features_opts["spk_emb_model"]( - run_opts={"device": device} - ) - - return SimpleNamespace(**context) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 5238beacd..804227d55 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -802,147 +802,6 @@ def decode(self, enc_out, length): ) -class TokotronForwardInference(nn.Module): - """A beam search-based inference implementation - - All keyword 
arguments will be passed on to the underlying - beam search - - Arguments - --------- - scale_factor : float - The scaling factor for encoder representations - gate_threshold : float - The threshold for gate activation - min_length : int - The minimum length for generating sequences, in tokens - """ - - def __init__( - self, - scale_factor=5.0, - gate_threshold=0.5, - min_length=16, - eos_mode=EosMode.GATE, - eos_index=0, - representation_mode=RepresentationMode.DISCRETE, - ): - super().__init__() - self.scale_factor = scale_factor - self.gate_threshold = gate_threshold - self.min_length = min_length - self.decoder = None - self.gate = None - self.eos_mode = EosMode(eos_mode) - self.eos_index = eos_index - self.representation_mode = RepresentationMode(representation_mode) - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - self.decoder = model.decoder - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - max_len = enc_out.size(1) - src_key_padding_mask = length_to_mask( - length * max_len, max_len, - ).logical_not() - tgt = scale(enc_out, self.scale_factor) - dec_out = self.decoder( - enc_out=enc_out, - tgt=tgt, - tgt_length=length, - src_length=length, - src_key_padding_mask=src_key_padding_mask, - pos_embs_src=None, - ) - if self.eos_mode == EosMode.GATE: - p_eos, eos = self.get_length_gate(dec_out) - else: - p_eos, eos = self.get_length_token(dec_out) - - infer_length_abs = eos.max(dim=1).indices - infer_length_abs_nonzero = infer_length_abs[infer_length_abs > 0] - if len(infer_length_abs_nonzero) > 0: - infer_length_max = infer_length_abs_nonzero.max() - else: - infer_length_max = 0 - if infer_length_max == 0: - infer_length_max = p_eos.size(1) - infer_length_abs = torch.where( - infer_length_abs == 0, infer_length_max, infer_length_abs - ) - infer_length_abs = infer_length_abs.clip(min=self.min_length) - infer_length = infer_length_abs / infer_length_max - - audio = dec_out.out[:, :infer_length_max].argmax(-1) - if self.representation_mode == RepresentationMode.DISCRETE: - audio = audio.argmax(-1) - return TokotronDecoderInfernceOutput( - audio=audio, - length=infer_length, - dec_self_attn=dec_out.dec_self_attn, - dec_attn=dec_out.dec_attn, - alignments=get_alignments(dec_out.dec_attn), - p_eos=p_eos, - ) - - def get_length_gate(self, dec_out): - """Infers lengths using the gate module - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - - Returns - ------- - p_eos : torch.Tensor - EOS probabilities (as estimated by the gate) - eos : torch.Tensor - a Boolean tensor where positions indicate whether - the gate has activated - """ - p_eos = dec_out.gate_out.sigmoid() - eos = p_eos > self.gate_threshold - return p_eos, eos - - def get_length_token(self, dec_out): - """Infers lengths using an EOS token - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - eos : torch.Tensor - A Boolean tensor indicating whether EOS has been reached - """ - p_seq = dec_out.out[:, :, 0].softmax(dim=-1) - p_eos = p_seq[:, :, self.eos_index].softmax(-1) - eos = p_seq.argmax(dim=-1) == self.eos_index - return p_eos, eos - - class 
TokotronTransformerModel(nn.Module): """An end-to-end Tokotron model receiving characters or phonemes as inputs and outputting audio tokens diff --git a/benchmarks/DASB/utils/audio_tokens.py b/benchmarks/DASB/utils/audio_tokens.py index 9dc4014c4..9dcc922cd 100644 --- a/benchmarks/DASB/utils/audio_tokens.py +++ b/benchmarks/DASB/utils/audio_tokens.py @@ -14,6 +14,8 @@ def get_silence_token( model, sample_length=100000, extract_emb=True, + model_shape="BLH", + unsqueeze=False, device=None, model_kwargs=None, ): @@ -28,6 +30,13 @@ def get_silence_token( The length of the sample extract_emb : bool Whether to extract embeddings + model_shape : str + The shape of tokens output by the model + BLH: Batch x Length x Heads (Discrete SSL, Encodec) + BHL: Batch x Heads x Length (DAC) + HBL: Heads x Batch x Length (SpeechTokenizer) + unsqueeze: bool + Whether to add an extra dimension to the audio (needed for DAC) device : str | torch.Device The device to use model_kwargs : dict @@ -48,10 +57,24 @@ def get_silence_token( model_kwargs = {} audio = torch.zeros(1, sample_length, device=device) + if unsqueeze: + audio = audio.unsqueeze(1) length = torch.ones(1, device=device) + model_training = model.training + model.eval() result = model(audio, length, **model_kwargs) - tokens = result[0] - silence_tokens = tokens.squeeze(0).mode(0).values + if model_training: + model.train() + tokens = result if torch.is_tensor(result) else result[0] + if model_shape == "HBL": + tokens = tokens.permute(1, 2, 0) + elif model_shape == "BHL": + tokens = tokens.transpose(-1, -2) + + tokens = tokens.squeeze(0) + if unsqueeze: + tokens = tokens.squeeze(0) + silence_tokens = tokens.mode(0).values silence_emb = None if extract_emb: if hasattr(model, "embeddings"): From e66a00e5763491173017912fcb9cf64c500fea42 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 11 Jan 2025 20:49:54 -0500 Subject: [PATCH 051/270] Tokotron: Add Tokotron integration for LibriTTS (multi-speaker recipes) --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 33 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 13 +- .../DASB/LibriTTS/TTS/tokotron/Tokotron.py | 1 + .../LibriTTS/TTS/tokotron/custom_model.py | 1 + benchmarks/DASB/LibriTTS/TTS/tokotron/data.py | 1 + benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py | 1 + .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 603 +++++++++ .../LibriTTS/TTS/tokotron/hparams/arpabet.txt | 50 + .../LibriTTS/TTS/tokotron/hparams/char_en.txt | 38 + .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 66 + .../hparams/train_continuous_ssl.yaml | 488 ++++++++ .../TTS/tokotron/hparams/train_dac.yaml | 330 +++++ .../tokotron/hparams/train_discrete_ssl.yaml | 396 ++++++ .../TTS/tokotron/hparams/train_encodec.yaml | 352 ++++++ .../hparams/train_speech_tokenizer.yaml | 329 +++++ .../LibriTTS/TTS/tokotron/libritts_prepare.py | 1 + .../DASB/LibriTTS/TTS/tokotron/train.py | 1077 +++++++++++++++++ .../TTS/tokotron/train_continuous_ssl.py | 47 + .../DASB/LibriTTS/TTS/tokotron/train_dac.py | 47 + .../TTS/tokotron/train_discrete_ssl.py | 79 ++ .../LibriTTS/TTS/tokotron/train_encodec.py | 46 + .../TTS/tokotron/train_speech_tokenizer.py | 46 + .../DASB/LibriTTS/extraction/extract.py | 90 ++ .../DASB/LibriTTS/extraction/hparams/dac.yaml | 63 + .../extraction/hparams/discrete_ssl.yaml | 101 ++ .../LibriTTS/extraction/hparams/encodec.yaml | 63 + .../extraction/hparams/speech_tokenizer.yaml | 53 + .../LibriTTS/extraction/libritts_prepare.py | 1 + benchmarks/DASB/LibriTTS/libritts_prepare.py | 331 +++++ benchmarks/DASB/model/Tokotron.py | 421 +++---- 
benchmarks/DASB/utils/audio_tokens.py | 216 ---- benchmarks/DASB/utils/eval.py | 545 +++++---- 32 files changed, 5266 insertions(+), 663 deletions(-) create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/data.py create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py create mode 100644 benchmarks/DASB/LibriTTS/extraction/extract.py create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml create mode 120000 benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py create mode 100644 benchmarks/DASB/LibriTTS/libritts_prepare.py delete mode 100644 benchmarks/DASB/utils/audio_tokens.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index bdf6c0f75..bad9ce7c1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -9,12 +9,22 @@ eval_asr_source: !apply:speechbrain.utils.hparams.choice whisper: openai/whisper-small evaluations: utmos,asr tmp_folder: null -utmos_batch_size: 8 -utmos_model_path: ./utmos -utmos_ckpt_name: epoch=3-step=7459.ckpt -utmos_ckpt_path: !ref / -utmos_use_python: True -utmos_script: predict.py +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: false + + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref eval_asr: !apply:speechbrain.utils.hparams.choice @@ -31,18 +41,9 @@ eval_asr: !apply:speechbrain.utils.hparams.choice savedir: !ref evaluators: + utmos: !ref asr: !ref -bulk_evaluators: - utmos: !name:eval.UTMOSSpeechEvaluator - model_path: !ref - output_folder: !ref - ckpt_path: 
!ref - batch_size: !ref - script: !ref - use_python: !ref - tmp_folder: !ref - eval_summary: asr: descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 0c80cc5c2..deb8a3236 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -22,10 +22,8 @@ import string from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset from speechbrain.utils.distributed import run_on_main -from preparation import add_prepared_features -from audio_tokens import ( +from Tokotron import ( get_silence_token, use_silence_padding, feature_pad_to, @@ -683,6 +681,14 @@ def apply_overfit_test(hparams, dataset): def run_experiment(brain_cls): + """Starts the experiement + + Arguments + --------- + brain_cls : type + The brain class to instantiate + """ + # Reading command line arguments hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) @@ -789,7 +795,6 @@ def run_experiment(brain_cls): # Load best checkpoint for evaluation tts_brain.evaluate( test_set=datasets["test"], - min_key="loss", test_loader_kwargs=test_dataloader_opts, ) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py new file mode 120000 index 000000000..097a6d488 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py @@ -0,0 +1 @@ +../../../model/Tokotron.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py new file mode 120000 index 000000000..4b3f08ebb --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py @@ -0,0 +1 @@ +../../../model/custom_model.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py new file mode 120000 index 000000000..d65702b6c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py @@ -0,0 +1 @@ +../../../utils/data.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py new file mode 120000 index 000000000..0ca6d4644 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py @@ -0,0 +1 @@ +../../../utils/eval.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py new file mode 100644 index 000000000..d72df92aa --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -0,0 +1,603 @@ +"""Evaluates a checkpoint using an MOS estimation tool + +Authors +* Artem Ploujnikov 2024 +""" + +#TODO: There are too many evaluation scripts. 
Refactor to extract common +# features + +import speechbrain as sb +import json +import logging +import math +import sys +import csv +import torch +import torchaudio +import string +import re +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from types import SimpleNamespace +from torch.nn import ModuleDict +from tqdm.auto import tqdm +from data import undo_batch +from eval import vocoder_to_device +from torch.utils.flop_counter import FlopCounterMode +from contextlib import nullcontext + + +logger = logging.getLogger(__name__) + + +class TokotronEvaluator: + """An evaluator class for the TTS model + + Arguments + --------- + hparams: dict + hyperparameters (as a dictionary) + device : str | torch.device + the device + """ + def __init__(self, hparams, create_waveform_fn, device): + self.hparams = SimpleNamespace(**hparams) + self.create_waveform_fn = create_waveform_fn + self.device = device + modules = self.hparams.modules + self.modules = ModuleDict(modules).to(self.device) + self.spk_emb_model = self.hparams.spk_emb_model( + run_opts={"device": device} + ) + self.modules.model.vocoder = None + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.get("evaluators", {}) + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + bulk_evaluators = getattr(self.hparams, "bulk_evaluators", {}) + if bulk_evaluators: + self.bulk_evaluators = { + key: evaluator_f() + for key, evaluator_f in bulk_evaluators.items() + if key in self.enabled_evaluators + } + else: + self.bulk_evaluators = {} + + if not self.evaluators and not self.bulk_evaluators: + logger.warning("No evaluators were defined - this run will produce samples only") + + self.attention = [] + + def on_evaluate_start(self, stage, epoch): + """Invoked when evaluation starts + + Arguments + --------- + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + self.stage = stage + self.epoch = epoch + self.output_folder = self.get_output_folder(stage, epoch) + self.samples_folder = self.output_folder / "samples" + self.samples_folder.mkdir(parents=True, exist_ok=True) + logger.info( + "Starting evaluation, results will be saved in %s", + self.output_folder, + ) + self.create_reports() + self.modules.model.show_inference_progress = False + self.item_ids = [] + details_keys = list(self.evaluators.keys()) + list( + self.bulk_evaluators.keys() + ) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.sample_text = [] + self.sample_file_names = [] + self.ref_file_names = [] + if hasattr(self.modules, "vocoder"): + vocoder_to_device(self.modules.vocoder, self.device) + + def get_output_folder(self, stage, epoch): + """Computes the output folder of evaluation results + for the specified stage and epoch. + + If the folder does not exists, it will be created. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ + Returns + ------- + """ + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(parents=True, exist_ok=True) + return output_folder + + + def evaluate(self, dataset): + """Runs evaluation on a dataset + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + a dataset + """ + logger.info("Recovering the checkpoint") + ckpt = self.hparams.checkpointer.recover_if_possible() + if not ckpt: + raise ValueError("Unable to recover the checkpoint") + self.modules.model.eval() + if self.hparams.eval_samples is not None: + dataset = dataset.filtered_sorted(select_n=self.hparams.eval_samples) + loader = sb.dataio.dataloader.make_dataloader(dataset, batch_size=self.hparams.batch_size) + loader_it = iter(loader) + self.create_reports() + self.modules.model.show_inference_progress = False + self.item_ids = [] + details_keys = list(self.evaluators.keys()) + list(self.bulk_evaluators.keys()) + self.details = { + evaluator_key: [] + for evaluator_key in details_keys + } + self.read_reports() + self.sample_text = [] + self.sample_file_names = [] + self.ref_file_names = [] + logger.info("Starting evaluation") + batch_count = math.ceil(len(dataset) / self.hparams.batch_size) + for batch in tqdm(loader_it, desc="Evaluation", total=batch_count): + self.evaluate_batch(batch) + self.evaluate_bulk() + self.write_summary() + logger.info("Evaluation done") + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + if self.hparams.eval_perf: + self.perf_file = open(self.output_folder / "perf.csv", "w") + self.perf_writer = csv.DictWriter( + self.perf_file, + [ + "uttid", + "infer_flops", + "steps", + "infer_flops_per_step", + "vocoder_flops", + "total_flops", + "total_flops_per_step", + ] + ) + self.perf_writer.writeheader() + + def infer(self, tokens, tokens_length, emb): + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + infer_out = self.modules.model.infer( + input_tokens=tokens, input_length=tokens_length, emb=emb + ) + if self.hparams.eval_perf: + steps = (infer_out.length * infer_out.audio.size(1)).sum().item() + total_flops = flop_counter.get_total_flops() + stats = { + "infer_flops": total_flops, + "steps": steps, + "infer_flops_per_step": total_flops / steps, + } + return infer_out, stats + + def vocoder(self, infer_out, emb): + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + wav = self.create_waveform_fn( + infer_out.audio, + length=infer_out.length, + emb=emb + ) + if wav.dim() > 2: + wav = wav.squeeze(1) + + if self.hparams.eval_perf: + flops = flop_counter.get_total_flops() + stats = { + "vocoder_flops": flops + } + return wav, stats + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder 
/ f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = {key : handle_number(value) for key, value in row.items()} + self.details[evaluator_key].append(row) + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1., 1.], device=self.device) + if evaluator_key in self.evaluators: + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + else: + bogus_file_name = self.output_folder / "bogus.wav" + evaluator = self.bulk_evaluators[evaluator_key] + sb.dataio.dataio.write_audio( + str(bogus_file_name), + bogus_wavs[0].cpu(), + samplerate=self.hparams.model_sample_rate, + ) + result = evaluator.evaluate_files( + file_names=[bogus_file_name], + text=["BOGUS"], + file_names_ref=[bogus_file_name], + ) + + return ["uttid"] + list(result.details.keys()) + + def evaluate_batch(self, batch): + """Runs evaluation on a single batch of speech + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch to be evaluated""" + with torch.no_grad(): + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + vocoder_to_device(self.modules.vocoder, self.device) + if hasattr(self.modules.vocoder, "device"): + self.modules.vocoder.device = self.device + audio_resampled = torchaudio.functional.resample( + batch.sig.data, + self.hparams.sample_rate, + self.hparams.model_sample_rate, + ) + mel_spec = self.spk_emb_model.mel_spectogram( + audio=audio_resampled + ) + spk_emb = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, batch.sig.lengths + ).squeeze(1) + infer_out, perf_stats = self.infer( + tokens=tokens, tokens_length=tokens_length, + emb={ + "spk": spk_emb + } + ) + wav, vocoder_stats = self.vocoder( + infer_out, spk_emb + ) + perf_stats.update(vocoder_stats) + length = infer_out.length + if wav.dim() > 2: + wav = wav.squeeze(1) + + self.save_samples(batch, wav, infer_out.length) + self.item_ids.extend(batch.uttid) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=batch.label_norm_eval, + wavs_ref=batch.sig.data, + length_ref=batch.sig.lengths, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, batch.uttid, details) + self.details[evaluator_key].extend(details) + + if self.hparams.eval_perf: + perf_stats.update(vocoder_stats) + perf_stats["total_flops"] = perf_stats["vocoder_flops"] + perf_stats["infer_flops"] + perf_stats["total_flops_per_step"] = perf_stats["total_flops"] / perf_stats["steps"] + self.write_perf_stats(batch.uttid, perf_stats) + + + def evaluate_bulk(self): + """Performs bulk evaluation""" + for evaluator_key, evaluator in self.bulk_evaluators.items(): + result = evaluator.evaluate_files( + file_names=self.sample_file_names, + text=self.sample_text, + file_names_ref=self.ref_file_names, + ) + self.details[evaluator_key].append(result.details) + details = 
undo_batch(result.details) + self.write_result(evaluator_key, self.item_ids, details) + + def write_result(self, evaluator_key, uttid, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + batch : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(uttid, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow( + ascii_only(flatten(report_details)) + ) + self.report_files[evaluator_key].flush() + + def save_samples(self, batch, wav, length): + """Saves the samples generated by the TTS system + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch being evaluated + wav : torch.Tensor + the waveform + length: torch.Tensor + relative lengths + """ + wav_length_abs = (length * wav.size(1)).int() + for item_id, infer_wav, wav_length in zip( + batch.uttid, wav, wav_length_abs + ): + file_name = str( + self.samples_folder / f"{item_id}_pred.wav" + ) + infer_wav_cut = infer_wav[:wav_length.item()].cpu() + sb.dataio.dataio.write_audio( + file_name, infer_wav_cut, samplerate=self.hparams.model_sample_rate + ) + self.sample_file_names.append(file_name) + + def write_summary(self): + """Outputs summarized statistics""" + summary = self.compute_summary() + file_name = self.output_folder / "summary.json" + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def write_perf_stats(self, uttid, details): + self.perf_writer.writerow( + { + "uttid": " ".join(uttid), + **details + } + ) + self.perf_file.flush() + + + def compute_summary(self): + """Computes the summarized statistics""" + return { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key]["descriptive"] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], + key=metric_key, + ).items() + } + + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_PUNCTUATION = re.compile( + "|".join( + re.escape(char) for char in string.punctuation + ) +) + +RE_NON_ASCII = re.compile(r'[^\x00-\x7F]+') + + +def ascii_only(values): + return { + key: RE_NON_ASCII.sub('', value) if isinstance(value, str) + else value + for key, value in values.items() + } + + +@sb.utils.data_pipeline.takes("label_norm") +@sb.utils.data_pipeline.provides("label_norm_eval") +def label_norm_pipeline(label): + """Normalizes labels for ASR comparison, converting to uppercase and removing + punctuation + + Arguments + --------- + label : str + The unnormalized label + + Returns + ------- + result : str + The normalized label + """ + label = label.upper() + label = RE_PUNCTUATION.sub("", label) + return label + + +@sb.utils.data_pipeline.takes("wav") +@sb.utils.data_pipeline.provides("sig") +def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : str + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = 
sb.dataio.dataio.read_audio(wav) + return sig + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() + for stat_key, value in stats.items() + } + + +def select_subset(dataset, hparams): + """Selects a subset of the dataset provided, if specified. + The selection is controlled by a hyperparameter named + eval_subset, which is expected to list the IDs of the + data items on which evaluation will take place, one per line + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset + hparams : dict + A hyperparameters file + + Returns + ------- + subset : dataset + The dataset, filtered down if applicable + """ + eval_subset_path = hparams.get("eval_subset") + if eval_subset_path is not None: + eval_subset_path = Path(eval_subset_path) + if not eval_subset_path.exists(): + raise ValueError(f"eval_subset {eval_subset_path} does not exist") + with open(eval_subset_path) as eval_subset_file: + eval_subset_ids = [line.strip() for line in eval_subset_file] + subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) + else: + subset = dataset + return subset + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value + diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml new file mode 100644 index 000000000..a4c8b6b59 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -0,0 +1,66 @@ +eval_dataset: valid +eval_suffix: "" +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech + whisper: openai/whisper-small +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: false + + +eval_asr: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encoder_decoder: !name:eval.EncoderDecoderASRSpeechEvaluator + source: !ref + sample_rate: !ref + overrides: + lm_weight: 0.0 + test_beam_size: !ref + whisper: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml new file mode 100644 index 000000000..3626079ef --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -0,0 +1,488 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder_alignments: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +ssl_model_type: wavlm +representation_mode: discrete +vocoder_model_name: !ref unithifigan-dasb---ms +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current 
+progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large-960h-lv60-self +g2p_src: flexthink/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +token_model_kmeans_dataset: LibriSpeech-100-360-500 +ssl_model_layers: [1, 3, 7, 12, 18, 23] +token_model_layers: !ref +select_layers: null +token_offset: 1 +vocoder_src_discrete: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS + hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS + wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS +vocoder_src_continous: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS + hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS + wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS +vocoder_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: !ref + continuous: !ref +vocoder_available_layers: [1, 3, 7, 12, 18, 23] +vocoder_takes_spk_emb: True +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: flexthink/discrete_wavlm_spk_rec_ecapatdn_lite + hubert: flexthink/discrete_hubert_spk_rec_ecapatdn_lite + wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite +asr_src: speechbrain/asr-transformer-transformerlm-librispeech +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
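+# The gate settings above control end-of-audio detection: gate_threshold is the
+# sigmoid cutoff applied at inference time, while gate_loss_beta, gate_loss_gamma and
+# gate_loss_max_weight parameterize the distance_diff_loss_ramp used for gate_offset
+# and are passed on to the TokotronLoss (compute_cost) defined further below.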
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + +# Guides +guides_enabled: False +guides_start_epoch: 40 +guides_spk: False +guides_spk_discrete: True +guides_spk_loss_weight: 0.2 +guides_asr: True +guides_asr_loss_weight: 0.1 + + + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + + +token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL + ssl_model: !ref + kmeans_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + save_path: !ref + layers_num: !apply:benchmarks.DASB.utils.hparams.as_list + value: !ref + dtype: int + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class + source: !ref + savedir: !ref /ecapa- + pymodule_file: custom_interface.py + classname: DiscreteSpkEmb + overrides: + ssl_layer_num_selected: !ref + +asr_model: !name:benchmarks.DASB.model.Tokotron.TransformerASRGuide + source: !ref + savedir: !ref /asr-transformer + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !apply:benchmarks.DASB.utils.hparams.as_list + value: !ref + dtype: int + deduplicates: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers + layers: !ref + value: False + bpe_tokenizers: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers + layers: !ref + value: null + + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + token_model_kwargs: !ref + ssl_model: !ref + 
ssl_model_layers: !apply:benchmarks.DASB.utils.hparams.as_list + value: !ref + dtype: int + token_model_layers: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + data_folder_alignments: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +layerwise_renorm: True +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +enc_n_dim: 16 +dec_n_dim: 256 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +emb_dropout: 0.0 +activation: !name:torch.nn.GELU +audio_num_tokens: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA + +############################## models ################################ + +vocoder_layers: !apply:benchmarks.DASB.utils.hparams.as_list + value: !apply:speechbrain.utils.hparams.choice + value: !ref + default: !ref + choices: + null: !ref + dtype: int + +vocoder_discrete: !name:benchmarks.DASB.model.custom_model.GumbelUnitVocoderWrapper + model: !name:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams + source: !ref + savedir: !ref + available_layers: !ref + layers: !ref + num_units: !ref + offset: !ref + +vocoder_continuous: !name:benchmarks.DASB.model.custom_model.VocoderWrapper + model: !name:speechbrain.inference.vocoders.HIFIGAN.from_hparams + source: !ref + savedir: !ref + +vocoder: !apply:benchmarks.DASB.utils.hparams.choice + value: !ref + apply: True + choices: + discrete: !ref + continuous: !ref + +inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference + bos_index: !ref + eos_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + using_eos_threshold: False + length_normalization: True + audio_token_shift: !ref + +inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference + scale_factor: !ref + gate_threshold: !ref + eos_mode: !ref + representation_mode: !ref + +inference: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + search: !ref + forward: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: True + injection: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + enc_n_dim: !ref + dec_n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + emb_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + inference: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + representation_mode: !ref + use_position_shift: !ref + max_position_shift: !ref + position_shift_probability: !ref + position_shift_seed: !ref + emb: !ref + layerwise_renorm: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage 
training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + spk_weight: !ref + asr_weight: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +freezer: !new:benchmarks.DASB.utils.preparation.Freezer + save_path: !ref + archive_path: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport + logger: !ref + sample_rate: !ref + eos_threshold: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml new file mode 100644 index 000000000..c6875498c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -0,0 +1,330 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 
+bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# DAC-specific settings +model_type: 24khz +model_bitrate: 8kbps + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +dac: !new:speechbrain.lobes.models.discrete.dac.DAC + sample_rate: !ref + model_type: !ref + model_bitrate: !ref + load_pretrained: True + +token_model: !new:benchmarks.DASB.model.custom_model.DACFeatureExtractor + dac: !ref + n_quantizers: !ref + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +vocoder: !new:benchmarks.DASB.model.custom_model.DACVocoder + dac: !ref + + + +inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference + bos_index: !ref + eos_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + using_eos_threshold: False + length_normalization: True + audio_token_shift: !ref + 
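+# Forward (single-pass) inference alternative to the beam-search implementation above;
+# the inference entry below selects between the two based on inference_mode.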
+inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference + scale_factor: !ref + gate_threshold: !ref + eos_mode: !ref + +inference: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + search: !ref + forward: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + inference: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + emb: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +freezer: !new:benchmarks.DASB.utils.preparation.Freezer + save_path: !ref + archive_path: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport + logger: !ref + sample_rate: !ref + eos_threshold: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..c1c2f9f1c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -0,0 +1,396 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder_alignments: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +ssl_model_type: wavlm +representation_mode: discrete +vocoder_model_name: !ref unithifigan-dasb---ms +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: 
["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large-960h-lv60-self +g2p_src: flexthink/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +ssl_model_layers: [1, 3, 7, 12, 18, 23] +token_model_layers: !ref +select_layers: null +token_offset: 1 +vocoder_src_discrete: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS + hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS + wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS +vocoder_src_continous: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS + hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS + wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS +vocoder_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: !ref + continuous: !ref +vocoder_available_layers: [1, 3, 7, 12, 18, 23] +vocoder_takes_spk_emb: True +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: flexthink/discrete_wavlm_spk_rec_ecapatdn_lite + hubert: flexthink/discrete_hubert_spk_rec_ecapatdn_lite + wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite +asr_src: speechbrain/asr-transformer-transformerlm-librispeech +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
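+# lr and lr_warmup_steps feed the Noam scheduler (lr_annealing) defined at the end of
+# this file; the guided_attention_* and gate_loss_* values above are consumed by the
+# TokotronLoss (compute_cost) defined below.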
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 2000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + +# Guides +guides_enabled: False +guides_start_epoch: 40 +guides_spk: False +guides_spk_discrete: True +guides_spk_loss_weight: 0.2 +guides_asr: True +guides_asr_loss_weight: 0.1 + + + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class + source: !ref + savedir: !ref /ecapa- + pymodule_file: custom_interface.py + classname: DiscreteSpkEmb + overrides: + ssl_layer_num_selected: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + ssl_model: !ref + ssl_model_layers: !ref + token_model_layers: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + data_folder_alignments: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +layerwise_renorm: True +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA + +############################## models ################################ + +vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams + source: !ref + savedir: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: True + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml new file mode 100644 index 000000000..a82d82a2c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -0,0 +1,352 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] 
+ full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +token_model: !new:speechbrain.lobes.models.huggingface_transformers.Encodec + source: !ref + save_path: !ref + bandwidth: !ref + flat_embeddings: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + + 
+####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +enc_n_dim: 16 +dec_n_dim: 256 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +vocoder: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encodec: !new:benchmarks.DASB.model.custom_model.EncodecVocoder + encodec: !ref + vocos: !new:speechbrain.lobes.models.huggingface_transformers.vocos.Vocos + source: !ref + save_path: !ref + + +inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference + bos_index: !ref + eos_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + using_eos_threshold: False + length_normalization: True + audio_token_shift: !ref + +inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference + scale_factor: !ref + gate_threshold: !ref + eos_mode: !ref + +inference: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + search: !ref + forward: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + enc_n_dim: !ref + dec_n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + inference: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + emb: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +freezer: !new:benchmarks.DASB.utils.preparation.Freezer + save_path: !ref + archive_path: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +progress_logger: !new:benchmarks.DASB.utils.train_logger.ArchiveTrainLogger + current_path: !ref + archive_path: !ref + meta_path: !ref + epoch_counter: !ref + +progress_report: 
!new:benchmarks.DASB.utils.tts.TTSProgressReport + logger: !ref + sample_rate: !ref + eos_threshold: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml new file mode 100644 index 000000000..1711b10f4 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -0,0 +1,329 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +freeze_token_model: True +token_model_src: "fnlp/SpeechTokenizer" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
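Since lr_annealing_mode is "step", the NoamScheduler configured in these files is invoked once per optimizer step (see fit_batch in the training script below) rather than once per epoch. A rough sketch of that interaction with a placeholder model, assuming the scheduler call updates the optimizer's learning rate in place:

import torch
from speechbrain.nnet.schedulers import NoamScheduler

# Placeholder model/optimizer; lr values mirror the hyperparameters above.
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
scheduler = NoamScheduler(lr_initial=0.0005, n_warmup_steps=10000)

for _ in range(3):
    optimizer.zero_grad()
    loss = model(torch.randn(4, 8)).sum()
    loss.backward()
    optimizer.step()
    # Same call as in fit_batch(): applies the Noam warmup/decay curve
    # to the optimizer's learning rate after every step.
    curr_lr, next_lr = scheduler(optimizer)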
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +# Token model (pretrained) +token_model: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerInterface + source: !ref + save_path: !ref + shape: compat + codebooks: !ref + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA + +############################## models ################################ + +vocoder: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerVocoder + tokenizer: !ref + +inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference + bos_index: !ref + eos_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + using_eos_threshold: False + length_normalization: True + audio_token_shift: !ref + +inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference + scale_factor: !ref + gate_threshold: !ref + eos_mode: !ref + +inference: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + search: !ref + forward: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + 
audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + inference: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + emb: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +freezer: !new:benchmarks.DASB.utils.preparation.Freezer + save_path: !ref + archive_path: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +progress_logger: !new:benchmarks.DASB.utils.train_logger.ArchiveTrainLogger + current_path: !ref + archive_path: !ref + meta_path: !ref + epoch_counter: !ref + +progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport + logger: !ref + sample_rate: !ref + eos_threshold: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py new file mode 100644 index 000000000..a09a4cc23 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -0,0 +1,1077 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import math +import torch +import sys +from functools import partial +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from Tokotron import ( + RepresentationMode, + get_silence_token, + use_silence_padding, + feature_pad_to, +) +from types import SimpleNamespace +from evaluate import TokotronEvaluator +import re +import string + + +logger = logging.getLogger(__name__) + +SPECIAL_TOKEN_COUNT = 1 + + +# Brain class for speech recognition training +class TokotronBrain(sb.Brain): + """Class that manages the training loop. 
See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluator = TokotronEvaluator( + hparams=hparams, + create_waveform_fn=self.create_waveform, + device=self.device, + ) + self.representation_mode = RepresentationMode(self.hparams.representation_mode) + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + raise NotImplementedError() + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + features = self.prepare_features(batch) + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb + ) = features + + predictions = self.modules.model( + input_tokens=tokens, + input_length=tokens_length, + audio=audio_bos, + audio_length=audio_bos_length, + emb={ + "spk": spk_emb + } + ) + + return predictions, features + + def prepare_features(self, batch): + if self.hparams.spk_emb_shuffle: + wav, wav_length = batch.spk_emb_random_match + else: + wav, wav_length = batch.sig + spk_emb = self._compute_spk(wav, wav_length).squeeze(1) + + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like( + audio[:, :1, :, :] + ).reshape(batch_size, self.hparams.bos_width, heads, dim) + audio_bos = torch.concatenate( + [bos, audio], + dim=1 + ) + audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length, spk_emb + + def _compute_spk(self, wav, wav_length): + mel_spec = self.spk_emb_model.mel_spectogram( + wav.squeeze(1)) + spk_emb_pred = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, wav_length + ) + return spk_emb_pred + + def _get_selected_layer_idx(self): + selected_layers = None + if hasattr(self.hparams, "select_layers") and self.hparams.select_layers: + layers = self.hparams.select_layers + model_layers_map = { + layer: idx + for idx, layer in enumerate( + self.hparams.token_model_layers) + } + selected_layers = [model_layers_map[layer] for layer in layers] + return selected_layers + + # TODO: Move this elsewhere + def select_layers(self, audio_ssl): + """Applies layer squishing, if enabled + + Arguments + --------- + audio_ssl : torch.Tensor + SSL features + + Returns + ------- + audio_ssl : torch.Tensor + SSL features, squished if enabled + """ + if self.layer_idx: + audio_ssl = audio_ssl[:, :, self.layer_idx] + return audio_ssl + + def 
compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + predictions, features = predictions + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb + ) = features + + loss_details = self.hparams.compute_cost( + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + ) + self.loss_metric.append( + batch.uttid, + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + reduction="batch", + ) + return loss_details.loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + if hasattr(self.modules.vocoder, "model"): + self.modules.vocoder.model.device = self.device + self.layer_idx = self._get_selected_layer_idx() + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.hparams.compute_cost, batch_eval=True, + ) + if ( + self.hparams.audio_emb_pretrained + and epoch == 1 + and stage == sb.Stage.TRAIN + ): + # TODO: Clean this up + if hasattr(self.hparams.token_model, "vocabulary"): + vocabulary = self.hparams.token_model.vocabulary + elif hasattr(self.hparams.token_model, "vocabularies"): + vocabulary = torch.stack( + [ + torch.from_numpy(voc) + for voc in self.hparams.token_model.vocabularies + ] + ) + self.modules.model.init_audio_emb(vocabulary) + # Load the compression model only if compression is enables + pretrained_run_opts = {"device": self.device} + self.spk_emb_model = self.hparams.spk_emb_model( + run_opts=pretrained_run_opts + ) + self.representation_mode = RepresentationMode(self.hparams.representation_mode) + # If speaker embedding shuffling is enabled, re-initialize them for the + # epoch + if self.hparams.spk_emb_shuffle: + stage_key = stage.name.lower() + self.resample_fn[stage_key](epoch=epoch) + + # Reset the learning rate - if supported. This is useful when fine-tuning + # a model pre-trained on another dataset + if ( + stage == sb.Stage.TRAIN + and self.hparams.reset_annealing_epoch is not None + and epoch is not None + and epoch == self.hparams.reset_annealing_epoch + ): + self.hparams.lr_annealing.n_steps = 0 + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. 
If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + return epoch % self.hparams.eval_interval == 0 + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + self.evaluator.evaluate_batch(batch) + return loss.detach().cpu() + + def make_dataloader( + self, dataset, stage, ckpt_prefix="dataloader-", **loader_kwargs + ): + """A custom override of make_dataloader that will change the batch + size if guides are enabled to meet GPU memory constraints + + Arguments + --------- + dataset : Dataset + A set of data to use to create data loader. If the Dataset is a + DynamicItemDataset, PaddedBatch is used as the default collate_fn, + unless specified in loader_kwargs. + stage : Stage + The stage of the experiment: Stage.TRAIN, Stage.VALID, Stage.TEST + ckpt_prefix : str, None + Prefix to use for SaveableDataLoader Checkpoint name. The Stage + name is added to this to create the full key. Set to None to not + save the DataLoader. + **loader_kwargs : dict + Additional keyword arguments to the DataLoader. + E.g., batch_size, num_workers, pin_memory. 
+ + Returns + ------- + DataLoader for the input dataset + """ + if stage == sb.Stage.TRAIN and not getattr(self, "_ckpt_recovered", False): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + if self.guides_running(pre_epoch=True): + loader_kwargs["batch_size"] = self.hparams.batch_size_guided + return super().make_dataloader( + dataset=dataset, + stage=stage, + ckpt_prefix=ckpt_prefix, + **loader_kwargs + ) + + def guides_running(self, pre_epoch=False): + """Determines whether guides are currently running + + Arguments + --------- + pre_epoch : bool + If enabled, a correction will be applied to the current epoch + indicating that the current epoch has not yet started""" + epoch = self.hparams.epoch_counter.current + if pre_epoch: + epoch += 1 + return ( + self.hparams.guides_enabled + and epoch >= self.hparams.guides_start_epoch + ) + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + ) + + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams, guide_ctx=None): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + guide_ctx : SimpleNamespace, optional + The guide context with pretrained models + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. 
+ silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + representation_mode = RepresentationMode( + hparams.get("representation_mode", RepresentationMode.DISCRETE) + ) + + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : strƒnum_ + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = sb.dataio.dataio.read_audio(wav) + return sig + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label.upper() + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + @sb.utils.data_pipeline.takes("label_norm") + @sb.utils.data_pipeline.provides("asr_tokens") + def asr_tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return torch.tensor(guide_ctx.asr_model.encode(label)) + + use_silence_padding = hparams.get("use_silence_padding", True) + if "token_model_layers" in hparams: + audio_tokens_per_step = len(hparams["token_model_layers"]) + else: + audio_tokens_per_step = hparams["audio_tokens_per_step"] + if use_silence_padding: + silence_token, silence_emb = get_silence_token( + hparams["tokenizer"], + extract_emb=True, + model_kwargs=hparams.get("token_model_kwargs"), + ) + else: + silence_token = ( + torch.ones(audio_tokens_per_step, dtype=torch.int64) + * hparams["eos_index"] + ) + + silence_padding = silence_token if representation_mode == RepresentationMode.DISCRETE else silence_emb + silence_padding = silence_padding.cpu() + silence_padding_len = int(math.ceil(hparams["silence_padding"])) + bos_width = hparams.get("bos_width", 1) + audio_bos_prefix = ( + torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] + ) + if representation_mode == RepresentationMode.CONTINUOUS: + audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat(1, 1, hparams["audio_dim"]) + + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("uttid") + @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") + def audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, num_codebooks=audio_tokens_per_step) + audio_pad = feature_pad_to( + audio, len(audio) + silence_padding_len, silence_padding + ) + yield audio_pad + audio_bos = torch.cat([audio_bos_prefix, audio_pad], dim=0) + yield audio_bos + + def spk_emb_random_match(uttid, dataset, spk_sample): + # Sample a speaker-matched embedding + selected_idx = spk_sample[uttid] + + # Retrieve the embedding value from the dataset + with dataset.output_keys_as(["sig"]): + spk_emb = dataset[selected_idx]["sig"] + return spk_emb + + dynamic_items = [ + text_pipeline, + tokens_pipeline, + audio_ref_pipeline, 
+ audio_pipeline + ] + output_keys = [ + "uttid", + "tokens", + "audio_pad", + "audio_bos", + "sig", + "spk_emb_random_match", + ] + + init_sequence_encoder(hparams) + + resample_fn = {} + for dataset in data_info: + dataset_output_keys = output_keys if dataset == "train" else output_keys + ["label_norm_eval"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + if hparams["spk_emb_shuffle"]: + spk_idx, spk_samplers = group_by_speaker( + dynamic_dataset, + hparams + ) + spk_sample = {} + spk_emb_random_match_pipeline = partial( + spk_emb_random_match, + spk_sample=spk_sample, + dataset=dynamic_dataset.filtered_sorted(), + ) + dynamic_dataset.add_dynamic_item( + func=spk_emb_random_match_pipeline, + takes=["uttid"], + provides=["spk_emb_random_match"], + ) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers + ) + resample_fn[dataset](epoch=0) + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. + if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + # Exclude samples without phonemes + if hparams["input"] == "phonemes": + for key in datasets: + datasets[key] = datasets[key].filtered_sorted( + key_test={ + "phn": lambda value: value + } + ) + datasets["sample"] = select_sample(hparams, datasets) + return datasets, silence_padding, resample_fn + + +def select_sample(hparams, datasets): + """Selects a sample of files for sample generation, freezing the sample if + requested to persist across multiple experiments + + Arguments + --------- + hparams : dict + experiment hyperparameters + datasets : dict + a dictionary of datasets + + Returns + ------- + dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset + the sample dataset + """ + sample_path = hparams.get("sample_path") + dataset = None + if sample_path is not None: + sample_path = Path(sample_path) + if sample_path.exists(): + with open(sample_path, "r") as sample_file: + data_ids = [line.strip() for line in sample_file] + dataset = FilteredSortedDynamicItemDataset( + datasets["valid"], data_ids + ) + + if dataset is None: + dataset = ( + datasets["valid"] + .batch_shuffle(1) + .filtered_sorted(select_n=hparams["num_audio_samples"]) + ) + if sample_path is not None: + with open(sample_path, "w") as sample_file: + for data_id in dataset.data_ids: + print(data_id, file=sample_file) + return dataset + + +def group_by_speaker(dataset, hparams): + """Groups utterance IDs in a dataset by speaker, for selection. 
The selection + is stable based on the seed - calling this method multiple times will always + result in the same order + + Arguments + --------- + dataset : torch.Tensor + the dataset from which to select items + hparams : dict + hyperparameters + + Returns + ------- + spk_idx : dict + a str -> int dictionary with a list of utterance indexes + for every speaker + spk_samplers : dict + a reproducible sampler for every speaker + spk_samplers_it : dict + an iterator for each sampler + """ + spk_idx = {} + spk_samplers = {} + speakers = [] + generator = torch.Generator() + generator.manual_seed(hparams["seed"]) + + # Group by speaker + with dataset.output_keys_as(["spk_id"]): + for idx, item in enumerate(dataset): + spk_id = item["spk_id"] + if spk_id not in spk_idx: + spk_idx[spk_id] = [] + spk_idx[spk_id].append(idx) + speakers.append(spk_id) + + # Create a reproducible sampler + for spk_id in speakers: + sampler = hparams["spk_sampler"](data_source=spk_idx[spk_id]) + spk_samplers[spk_id] = sampler + + return spk_idx, spk_samplers + + +def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch): + """Selects new samples + + Arguments + --------- + spk_idx : dict + Data item indexes grouped by speaker + spk_samplers : dict + A sampler for each speaker + spk_samplers_it : dict + An iterator for each speaker + epoch : int + The epoch number + + Returns + ------- + sample : dict + a dictionary with uttids as keys and matching + indexes as values + """ + if epoch is None: + epoch = 0 + spk_samplers_it = {} + for spk_id, sampler in spk_samplers.items(): + sampler.set_epoch(epoch) + spk_samplers_it[spk_id] = iter(sampler) + with dataset.output_keys_as(["uttid", "spk_id"]): + for item in dataset: + spk_item_idx = next(spk_samplers_it[item["spk_id"]]) + dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx] + sample[item["uttid"]] = dataset_item_idx + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + return encoder + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + if not Path(file_name).exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, _, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def get_guide_ctx(hparams, run_opts): + """Initializes a context object for guides, + containing pretrained models only for guides that will be + used per hparams + + Arguments + --------- + hparams : dict + Hyperparameters + run_opts : dict + Run options + + Returns + ------- + ctx : SimpleNamespace + The resulting context""" + ctx = {} + if hparams["guides_enabled"]: + pretrained_run_opts = {"device": run_opts.get("device", "cpu")} + if hparams["guides_spk"]: + ctx["spk_emb_model"] = hparams["spk_emb_model"]( + run_opts=pretrained_run_opts + ) + if hparams["guides_asr"]: + ctx["asr_model"] = hparams["asr_model"]( + run_opts=pretrained_run_opts + ) + return SimpleNamespace(**ctx) + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + + +def run_experiment(brain_cls): + """Starts the experiement + + Arguments + --------- + brain_cls : type + The brain class to instantiate + """ + + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with 
open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from libritts_prepare import prepare_libritts + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "model_name": hparams["model"].__class__.__name__, + }, + ) + + # We can now directly create the datasets for training, valid, and test + guide_ctx = get_guide_ctx(hparams, run_opts) + ( + datasets, + silence_padding, + resample_fn + ) = dataio_prepare(hparams, guide_ctx) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_pad", "audio_bos"] + + # Trainer initialization + tts_brain = brain_cls( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + tts_brain.sample_data = datasets["sample"] + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
+ tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=use_silence_padding( + hparams["train_dataloader_opts"], silence_padding, audio_keys + ), + valid_loader_kwargs=use_silence_padding( + hparams["valid_dataloader_opts"], silence_padding, audio_keys + ), + ) + + # Load best checkpoint for evaluation + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=use_silence_padding( + hparams["test_dataloader_opts"], silence_padding, audio_keys + ), + ) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py new file mode 100644 index 000000000..9c8b243be --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py @@ -0,0 +1,47 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio +Continuous SSL verfsion + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronContinuousSSLBrain(TokotronBrain): + """Tokotron implementation for Encodec""" + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + wav = self.modules.vocoder(audio, emb) + wav = wav.squeeze(1) + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronContinuousSSLBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py new file mode 100644 index 000000000..78c584c45 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py @@ -0,0 +1,47 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio - DAC version + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronDACBrain(TokotronBrain): + """Tokotron implementation for Encodec""" + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + z, _, _ = self.modules.tokenizer.quantizer.from_codes( + audio.transpose(1, 2).int() + ) + wav = self.modules.tokenizer.decode(z).squeeze(1) + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronDACBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py new file 
mode 100644 index 000000000..3cc0e2644 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py @@ -0,0 +1,79 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio +Discrete SSL version + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + +import torch +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronDiscreteSSLBrain(TokotronBrain): + """Tokotron implementation for Encodec""" + + def on_stage_start(self, stage, epoch): + self.compute_offset() + return super().on_stage_start(stage, epoch) + + def compute_offset(self): + """Computes per-layer offsets""" + layers_set = set(self.hparams.token_model_layers) + available_layers_set = set(self.hparams.vocoder_available_layers) + if not layers_set.issubset(available_layers_set): + unavailable_layers = ",".join( + str(layer) for layer in (layers_set - available_layers_set) + ) + raise ValueError(f"Layers {unavailable_layers} are not supported") + self.num_units = self.hparams.vocab_size + _, layers_idx = torch.where( + torch.tensor( + self.hparams.vocoder_available_layers, device=self.device + ).unsqueeze(0) + == torch.tensor( + self.hparams.token_model_layers, device=self.device + ).unsqueeze(1) + ) + self.layer_offset = ( + torch.tensor(layers_idx, device=self.device) * self.num_units + )[None, None, :] + self.offset = self.hparams.token_offset + self.modules.vocoder.tokenize = False + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + units_with_offset = ( + audio + self.layer_offset.to(audio.device) + self.offset + ) + wav = self.modules.vocoder(units_with_offset) + wav = wav.squeeze(1) + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronDiscreteSSLBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py new file mode 100644 index 000000000..07edbbd8c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py @@ -0,0 +1,46 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronEncodecBrain(TokotronBrain): + """Tokotron implementation for Encodec""" + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : 
torch.Tensor + """ + wav = self.modules.token_model.decode(audio) + wav = wav.squeeze(1) + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronEncodecBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py new file mode 100644 index 000000000..fdbbb3ed7 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py @@ -0,0 +1,46 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio - SpeechTokenizer version + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronSTBrain(TokotronBrain): + """Tokotron implementation for SpeechTokenizer""" + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + length : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + wav = self.modules.token_model.decode(audio) + if length is not None: + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronSTBrain) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py new file mode 100644 index 000000000..ad2f5bf0c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -0,0 +1,90 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting discrete audio tokens from LibriTTS.
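+
+The script merges the train/valid/test manifests produced by libritts_prepare
+into a single dataset and saves the extracted tokens (and, optionally, the
+pretrained embeddings) under the configured save_folder.
+
+A typical invocation (illustrative paths; any of the provided hparams files
+can be used) is:
+
+    python extract.py hparams/encodec.yaml --data_folder=/path/to/LibriTTS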
+ +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +print(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech + from libritts_prepare import prepare_libritts # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "train_split": hparams["train_splits"], + "valid_split": hparams["dev_splits"], + "test_split": hparams["test_splits"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": hparams["test_json"], + "sample_rate": hparams["sample_rate"], + "skip_prep": hparams["skip_prep"], + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid", "test"]: + json_path = hparams[f"{split}_json"] + name = pl.Path(json_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=json_path, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, + hparams["num_codebooks"], + (save_folder / "libritts").as_posix(), + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix(), + vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], + ) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml new file mode 100644 index 000000000..76870e279 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: 
wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..2b57a7edf --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,101 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 
1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..31211ec75 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..9a53ed27b --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,53 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py new file mode 120000 index 000000000..39f1a78c2 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py @@ -0,0 +1 @@ +../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py new file mode 100644 index 000000000..6d0ca9f0a --- /dev/null +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -0,0 +1,331 @@ +""" +LibriTTS data preparation + +Authors + * Pradnya Kandarkar 2022 +""" + +import json +import os +import random + +import torch +import torchaudio +from tqdm import tqdm + +from speechbrain.inference.text import GraphemeToPhoneme +from speechbrain.utils.data_utils import get_all_files +from speechbrain.utils.logger import get_logger +from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations + +logger = get_logger(__name__) +LIBRITTS_URL_PREFIX = "https://www.openslr.org/resources/60/" + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def prepare_libritts( + data_folder, + save_json_train, + save_json_valid, + save_json_test, + sample_rate, + split_ratio=[80, 10, 10], + libritts_subsets=None, + train_split=None, + valid_split=None, + test_split=None, + seed=1234, + model_name=None, + max_valid_size=500, + skip_prep=False, +): + """ + Prepares the json files for the LibriTTS dataset. + Downloads the dataset if it is not found in the `data_folder` as expected. + + Arguments + --------- + data_folder : str + Path to the folder where the LibriTTS dataset is stored. + save_json_train : str + Path where the train data specification file will be saved. + save_json_valid : str + Path where the validation data specification file will be saved. + save_json_test : str + Path where the test data specification file will be saved. + sample_rate : int + The sample rate to be used for the dataset + split_ratio : list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. 
For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + libritts_subsets: list + List of librispeech subsets to use (e.g., dev-clean, train-clean-100, ...) for the experiment. + This parameter will be ignored if explicit data splits are provided. + Explicit data splits parameters: "train_split", "valid_split", "test_split" + train_split : list + List of librispeech subsets to use (e.g.,train-clean-100, train-clean-360) for the experiment training stage. + valid_split : list + List of librispeech subsets to use (e.g., dev-clean) for the experiment validation stage. + test_split : list + List of librispeech subsets to use (e.g., test-clean) for the experiment testing stage. + seed : int + Seed value + model_name : str + Model name (used to prepare additional model specific data) + skip_prep: Bool + If True, skip preparation. + + Returns + ------- + None + """ + + if skip_prep: + return + + # Setting the seed value + random.seed(seed) + + # Checks if this phase is already done (if so, skips it) + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed in previous run, skipping.") + return + + logger.info( + f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}" + ) + + # If specific splits are provided, creates data manifest files accordingly + if train_split: + wav_list = prepare_split(data_folder, train_split) + create_json(wav_list, save_json_train, sample_rate, model_name) + if valid_split: + wav_list = prepare_split(data_folder, valid_split) + # TODO add better way to speedup evaluation + if len(wav_list) > max_valid_size: + wav_list = random.sample(wav_list, max_valid_size) + create_json(wav_list, save_json_valid, sample_rate, model_name) + if test_split: + wav_list = prepare_split(data_folder, test_split) + create_json(wav_list, save_json_test, sample_rate, model_name) + + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed.") + return + + # If specific splits are not provided, and a list of subsets if provided, creates train, valid, test splits + # Creates data manifest files according to the data splits + if libritts_subsets: + wav_list = prepare_split(data_folder, libritts_subsets) + # Random split the signal list into train, valid, and test sets. + data_split = split_sets(wav_list, split_ratio) + # Creating json files + create_json( + data_split["train"], save_json_train, sample_rate, model_name + ) + create_json( + data_split["valid"], save_json_valid, sample_rate, model_name + ) + create_json(data_split["test"], save_json_test, sample_rate, model_name) + + +def prepare_split(data_folder, split_list): + """ + Processes the provided list of LibriTTS subsets and creates a list of all the .wav files present in the subsets. + Downloads the LibriTTS subsets as required. + + Arguments + --------- + data_folder : str + Path to the folder where the LibriTTS dataset is stored + split_list : list + List of librispeech subsets to process (e.g., dev-clean, train-clean-100, ...) 
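+        For example (an illustrative call), prepare_split(data_folder, ["dev-clean"])
+        returns the paths of all .wav files found under data_folder/dev-clean.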
+ + Returns + ------- + wav_list : list + List of all .wav files to be processed + """ + extension = [".wav"] # The expected extension for audio files + wav_list = list() # Stores all audio file paths for the dataset + + # For every subset of the dataset, if it doesn't exist, downloads it + for subset_name in split_list: + subset_folder = os.path.join(data_folder, subset_name) + subset_archive = os.path.join(subset_folder, subset_name + ".tar.gz") + + if not check_folders(subset_folder): + logger.info( + f"No data found for {subset_name}. Checking for an archive file." + ) + if not os.path.isfile(subset_archive): + logger.info( + f"No archive file found for {subset_name}. Downloading and unpacking." + ) + quit() + # Collects all files matching the provided extension + wav_list.extend(get_all_files(subset_folder, match_and=extension)) + + return wav_list + + +def create_json(wav_list, json_file, sample_rate, model_name=None): + """ + Creates the json file given a list of wav files. + Arguments + --------- + wav_list : list of str + The list of wav files. + json_file : str + The path of the output json file + sample_rate : int + The sample rate to be used for the dataset + model_name : str + Model name (used to prepare additional model specific data) + """ + + # Downloads and initializes the G2P model to compute the phonemes if data is being prepared for Tacotron2 experiments + if model_name == "Tacotron2": + logger.info( + "Computing phonemes for labels using SpeechBrain G2P. This may take a while." + ) + g2p = GraphemeToPhoneme.from_hparams( + "speechbrain/soundchoice-g2p", run_opts={"device": DEVICE} + ) + else: + g2p = None + + json_dict = {} + + # Processes all the wav files in the list + for wav_file in tqdm(wav_list): + # Reads the signal + signal, sig_sr = torchaudio.load(wav_file) + duration = signal.shape[1] / sig_sr + + # TODO add better way to filter short utterances + if duration < 1.0: + continue + + # Manipulates path to get relative path and uttid + path_parts = wav_file.split(os.path.sep) + uttid, _ = os.path.splitext(path_parts[-1]) + # relative_path = os.path.join("{data_root}", *path_parts[-4:]) + + # Gets the path for the text files and extracts the input text + normalized_text_path = os.path.join( + "/", *path_parts[:-1], uttid + ".normalized.txt" + ) + try: + with open(normalized_text_path, encoding="utf-8") as f: + normalized_text = f.read() + if normalized_text.__contains__("{"): + normalized_text = normalized_text.replace("{", "") + if normalized_text.__contains__("}"): + normalized_text = normalized_text.replace("}", "") + except FileNotFoundError: + print(f"Warning: The file {normalized_text_path} does not exist.") + continue + + # Resamples the audio file if required + if sig_sr != sample_rate: + resampled_signal = torchaudio.functional.resample( + signal, sig_sr, sample_rate + ) + os.unlink(wav_file) + torchaudio.save(wav_file, resampled_signal, sample_rate=sample_rate) + + # Gets the speaker-id from the utterance-id + spk_id = uttid.split("_")[0] + + # Creates an entry for the utterance + json_dict[uttid] = { + "uttid": uttid, + "wav": wav_file, + "duration": duration, + "spk_id": spk_id, + "label": normalized_text, + "segment": True if "train" in json_file else False, + } + + # Characters are used for Tacotron2, phonemes may be needed for other models + if model_name not in ["Tacotron2", "HiFi-GAN"] and g2p is not None: + # Computes phoneme labels using SpeechBrain G2P and keeps the punctuations + phonemes = _g2p_keep_punctuations(g2p, normalized_text) + 
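+            # The phoneme sequence is stored under "label_phoneme", next to the grapheme label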
json_dict[uttid].update({"label_phoneme": phonemes}) + + # Writes the dictionary to the json file + with open(json_file, mode="w", encoding="utf-8") as json_f: + json.dump(json_dict, json_f, indent=2) + + logger.info(f"{json_file} successfully created!") + + +def skip(*filenames): + """ + Detects if the data preparation has been already done. + If the preparation has been done, we can skip it. + + Arguments + --------- + *filenames : tuple + Set of filenames to check for existence. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. + """ + for filename in filenames: + if isinstance(filename, list): + if any(not os.path.isfile(item) for item in filename): + return False + else: + if not os.path.isfile(filename): + return False + return True + + +def split_sets(wav_list, split_ratio): + """Randomly splits the wav list into training, validation, and test lists. + + Arguments + --------- + wav_list : list + list of all the signals in the dataset + split_ratio: list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + + Returns + ------- + dictionary containing train, valid, and test splits. + """ + # Random shuffles the list + random.shuffle(wav_list) + tot_split = sum(split_ratio) + tot_snts = len(wav_list) + data_split = {} + splits = ["train", "valid"] + + for i, split in enumerate(splits): + n_snts = int(tot_snts * split_ratio[i] / tot_split) + data_split[split] = wav_list[0:n_snts] + del wav_list[0:n_snts] + data_split["test"] = wav_list + + return data_split + + +def check_folders(*folders): + """Returns False if any passed folder does not exist.""" + for folder in folders: + if not os.path.exists(folder): + return False + return True diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 804227d55..14aa38693 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -20,6 +20,8 @@ PositionalEncoding as TransformerPositionalEncoding, get_lookahead_mask, ) +from speechbrain.dataio.batch import PaddedBatch +from speechbrain.utils.data_utils import batch_pad_right from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear @@ -592,216 +594,6 @@ def forward(self, enc_out, length, emb=None): ) -class TokotronSearchWrapper(nn.Module): - """A wrapper class to facilitate seach-based inference. 
It takes care of re-interpreting - a multi-headed sequence as multiple samples, for compatibility, and for the retention - of attention tensors - - Arguments - --------- - decoder : TokotronTransformerDecoder - the Tokotron transformer decoder - """ - - def __init__(self, decoder): - super().__init__() - self.tokens_per_step = decoder.tokens_per_step - self.decoder = decoder - - def decode(self, memory, enc_states, enc_lens): - """Wraps the decode operation, will all the necessary - reshaping - - Arguments - --------- - memory : torch.Tensor - Characters predicted so far - enc_states : torch.Tensor - Encoder states - enc_lens : torch.Tensor - Encoder state lengths - """ - batch_size = enc_states.size(0) // self.tokens_per_step - _, mem_len = memory.shape - memory = memory.reshape( - self.tokens_per_step, batch_size, mem_len - ).permute(1, 2, 0) - dec_out, dec_self_attn, dec_attn = self.decoder.decode( - enc_out=enc_states[:batch_size], - src_length=enc_lens[:batch_size], - tgt=memory, - ) - self.dec_self_attn = dec_self_attn - self.dec_attn = dec_attn - return dec_out, dec_attn - - -class TokotronTransformerBeamSearcher(S2STransformerBeamSearcher): - """A slight modification of S2STransformerBeamSearcher that uses an - explicit number of tokens instead of trying to infer it from the - weights of the linear layer. This is needed because Tokotron is - multi-header and the final output layer outputs multiple output states - - Arguments - --------- - num_tokens : int - The number of audio tokens available - """ - - def __init__(self, num_tokens, *args, **kwargs): - super().__init__(*args, **kwargs) - self.num_tokens = num_tokens - - def set_n_out(self): - """Set the number of output tokens.""" - return self.num_tokens - - -class SearchLinearWrapper(nn.Module): - """A wrapper for the final linear layer of the Transformer. The goal is to - make it compatible with the SpeechBrain Beam Search implementation, which is - single-headed, by expanding multiple heads along the batch dimensions. 
- - Arguments - --------- - lin : torch.Tensor - A linear layer with an output feature dimensions of - (tokens_per_step x num_tokens) - tokens_per_step : int - the numer of tokens the model outputs for each - time step - """ - - def __init__(self, lin, tokens_per_step): - super().__init__() - self.lin = lin - self.tokens_per_step = tokens_per_step - - def forward(self, x): - """Performs a forward pass with all the required reshape operations - - Arguments - --------- - x : torch.Tensor - The decoder output - - Returns - ------- - result : torch.Tensor - The layer output, reshaped along the batch dimension - """ - x = self.lin(x) - batch_size, max_len, out_dim = x.shape - num_tokens = x.size(-1) // self.tokens_per_step - x = ( - # batch x tokens x length - x.transpose(2, 1) - # batch x heads x tokens x length - .view(batch_size, self.tokens_per_step, num_tokens, max_len) - # heads x batch x tokens x length - .transpose(0, 1) - # heads * batch x tokens x length - .reshape(self.tokens_per_step * batch_size, num_tokens, max_len) - # heads * batch x length x tokens - .transpose(1, 2) - ) - return x - - -class TokotronSearchInference(nn.Module): - """A beam search-based inference implementation - - All keyword arguments will be passed on to the underlying - beam search - """ - - def __init__(self, audio_token_shift=1, **kwargs): - super().__init__() - self.search_kwargs = kwargs - self.audio_token_shift = audio_token_shift - self.decoder, self.search, self.tokens_per_step = None, None, None - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - decoder = model.decoder - self.tokens_per_step = decoder.tokens_per_step - self.decoder = TokotronSearchWrapper(decoder) - self.search = TokotronTransformerBeamSearcher( - modules=[ - self.decoder, - SearchLinearWrapper(decoder.out_proj, self.tokens_per_step), - ], - num_tokens=decoder.num_tokens + self.audio_token_shift, - **self.search_kwargs, - ) - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - device = enc_out.device - # The search does not support multiple heads. "Trick" it by expanding encoded - # representations along the batch dimension so that the beam searcher - # treats it as if they were separate, independent samples. 
- batch_size, max_len, enc_dim = enc_out.shape - enc_out_search = ( - enc_out.unsqueeze(0) - .expand(self.tokens_per_step, batch_size, max_len, enc_dim) - .reshape(self.tokens_per_step * batch_size, max_len, enc_dim) - ) - length_search = ( - length.unsqueeze(0) - .expand(self.tokens_per_step, batch_size) - .reshape(self.tokens_per_step * batch_size) - ) - hyps, audio_length, scores, log_probs = self.search( - enc_out_search, length_search - ) - tokens_batch = PaddedBatch( - [ - {"hyps": torch.tensor(item, device=enc_out.device)} - for item in hyps - ] - ).to(device) - - audio_tokens, length = tokens_batch.hyps - _, audio_max_len = audio_tokens.shape - audio_tokens = audio_tokens.reshape( - self.tokens_per_step, batch_size, audio_max_len - ).permute(1, 2, 0) - length = ( - length.reshape(self.tokens_per_step, batch_size).min(dim=0) - ).values - audio_tokens = audio_tokens - self.audio_token_shift - - return TokotronDecoderInfernceOutput( - audio_tokens=audio_tokens, - length=length, - dec_self_attn=self.decoder.dec_self_attn, - dec_attn=self.decoder.dec_attn, - alignments=get_alignments(self.decoder.dec_attn), - p_eos=None, - ) - - class TokotronTransformerModel(nn.Module): """An end-to-end Tokotron model receiving characters or phonemes as inputs and outputting audio tokens @@ -2263,3 +2055,212 @@ def decode(self, codes): """ codes = codes.permute(2, 0, 1) return self.speech_tokenizer.decode(codes) + + +def get_silence_token( + model, + sample_length=100000, + extract_emb=True, + model_shape="BLH", + unsqueeze=False, + device=None, + model_kwargs=None, +): + """Attempts to find out the silence tokens for a given model, + if applicable + + Arguments + --------- + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + extract_emb : bool + Whether to extract embeddings + model_shape : str + The shape of tokens output by the model + BLH: Batch x Length x Heads (Discrete SSL, Encodec) + BHL: Batch x Heads x Length (DAC) + HBL: Heads x Batch x Length (SpeechTokenizer) + unsqueeze: bool + Whether to add an extra dimension to the audio (needed for DAC) + device : str | torch.Device + The device to use + model_kwargs : dict + Additional arguments to pass to the model + + Returns + ------- + silence_tokens : torch.Tensor + The token(s) corresponding to silence + + silece_emb : torch.Tensor + The embedding(s) corresponding to silence + + """ + if device is None: + device = next(model.parameters()).device + if model_kwargs is None: + model_kwargs = {} + + audio = torch.zeros(1, sample_length, device=device) + if unsqueeze: + audio = audio.unsqueeze(1) + length = torch.ones(1, device=device) + model_training = model.training + model.eval() + if hasattr(model, "encode"): + result = model.encode(audio, length, **model_kwargs) + else: + result = model(audio, length, **model_kwargs) + if model_training: + model.train() + tokens = result if torch.is_tensor(result) else result[0] + if model_shape == "HBL": + tokens = tokens.permute(1, 2, 0) + elif model_shape == "BHL": + tokens = tokens.transpose(-1, -2) + + tokens = tokens.squeeze(0) + if unsqueeze: + tokens = tokens.squeeze(0) + silence_tokens = tokens.mode(0).values + silence_emb = None + if extract_emb: + if hasattr(model, "embeddings"): + silence_emb = model.embeddings( + silence_tokens[None, None, :] + ).squeeze() + else: + heads = tokens.shape[-1] + embs = result[1] + mode_idx = [ + (tokens[:, head] == silence_tokens[head]).nonzero()[0].item() + for head in range(heads) + ] + 
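+            # For each head, take the embedding of one frame where that head emitted its silence token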
silence_emb = torch.stack( + [embs[0, idx, head] for head, idx in enumerate(mode_idx)] + ) + return silence_tokens, silence_emb + + +def feature_pad_to(tensor, length, padding=None): + """Pads feature dimensions to the specified length with the specified padding, + assuming a (Batch x Length x Features..) tensor + + Arguments + --------- + tensor : torch.Tensor + The tensor to be padded + + length : int + The length to which the tensor will be padded + + padding : torch.Tensor, optional + The padding tensor - if omitted, zero padding + will be used + + Returns + ------- + result : torch.Tensor + The padded tensor + """ + if padding is None: + padding = torch.zeros(tensor.shape[1:]) + padding = padding[None, ...].expand( + (length - tensor.size(0),) + tensor.shape[1:] + ) + return torch.cat([tensor, padding], dim=0) + + +def batch_feature_pad(tensors, padding=None): + """Similar to batch_pad_right but pads with the specified padding, whcih + can be a vector or a tensor + + Arguments + --------- + tensors : list + The list of tensors to be padded + padding : torch.Tensor + The padding tensor + + Returns + ------- + result : torch.Tensor + the padded tensor + """ + lengths_abs = torch.tensor( + [len(item) for item in tensors], device=tensors[0].device + ) + max_length = lengths_abs.max() + data = torch.stack( + [feature_pad_to(item, max_length, padding) for item in tensors] + ) + lengths = lengths_abs / max_length + return data, lengths + + +def token_collate_fn(examples, silence_token, token_keys): + """A customized collation function for audio tokens where + the specified silence token will be used as padding - instead of + zeros + + Arguments + --------- + examples : list + A list of examples + + silence_token : torch.Tensor + The token(s) representing silence + + token_keys : list + The list of keys to which special padding will be applied + + Returns + ------- + result : speechbrain.dataio.batch.PaddedBatch + A padded batch + """ + token_tensor_ids = {id(examples[0][key]) for key in token_keys} + return PaddedBatch( + examples, + padding_func=_silence_padding, + padding_kwargs={ + "silence_token": silence_token, + "token_tensor_ids": token_tensor_ids, + }, + ) + + +def _silence_padding(values, silence_token, token_tensor_ids): + return ( + batch_feature_pad(values, silence_token) + if id(values[0]) in token_tensor_ids + else batch_pad_right(values) + ) + + +def use_silence_padding(dataloader_opts, silence_token, token_keys): + """Overrides the collation function to add silence padding to + audio token features + + Arguments + --------- + dataloder_opts : dict + Dataloader options + silence_token : torch.Tensor + The tensor to be used as silence padding + token_keys : torch.Tensor + The keys to apply silence padding to + + Returns + ------- + dataloader_opts : dict + Updated data loader options + """ + return { + **dataloader_opts, + "collate_fn": partial( + token_collate_fn, silence_token=silence_token, token_keys=token_keys + ), + } diff --git a/benchmarks/DASB/utils/audio_tokens.py b/benchmarks/DASB/utils/audio_tokens.py deleted file mode 100644 index 9dcc922cd..000000000 --- a/benchmarks/DASB/utils/audio_tokens.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Utilities for discrete audio token models - - -Authors - * Artem Ploujnikov 2023 -""" -import torch -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.utils.data_utils import batch_pad_right -from functools import partial - - -def get_silence_token( - model, - sample_length=100000, - extract_emb=True, - model_shape="BLH", 
- unsqueeze=False, - device=None, - model_kwargs=None, -): - """Attempts to find out the silence tokens for a given model, - if applicable - - Arguments - --------- - model : nn.Module - A discrete token model, taking (wav, lengths) as arguments - sample_length : int - The length of the sample - extract_emb : bool - Whether to extract embeddings - model_shape : str - The shape of tokens output by the model - BLH: Batch x Length x Heads (Discrete SSL, Encodec) - BHL: Batch x Heads x Length (DAC) - HBL: Heads x Batch x Length (SpeechTokenizer) - unsqueeze: bool - Whether to add an extra dimension to the audio (needed for DAC) - device : str | torch.Device - The device to use - model_kwargs : dict - Additional arguments to pass to the model - - Returns - ------- - silence_tokens : torch.Tensor - The token(s) corresponding to silence - - silece_emb : torch.Tensor - The embedding(s) corresponding to silence - - """ - if device is None: - device = next(model.parameters()).device - if model_kwargs is None: - model_kwargs = {} - - audio = torch.zeros(1, sample_length, device=device) - if unsqueeze: - audio = audio.unsqueeze(1) - length = torch.ones(1, device=device) - model_training = model.training - model.eval() - result = model(audio, length, **model_kwargs) - if model_training: - model.train() - tokens = result if torch.is_tensor(result) else result[0] - if model_shape == "HBL": - tokens = tokens.permute(1, 2, 0) - elif model_shape == "BHL": - tokens = tokens.transpose(-1, -2) - - tokens = tokens.squeeze(0) - if unsqueeze: - tokens = tokens.squeeze(0) - silence_tokens = tokens.mode(0).values - silence_emb = None - if extract_emb: - if hasattr(model, "embeddings"): - silence_emb = model.embeddings( - silence_tokens[None, None, :] - ).squeeze() - else: - heads = tokens.shape[-1] - embs = result[1] - mode_idx = [ - (tokens[0, :, head] == silence_tokens[head]).nonzero()[0].item() - for head in range(heads) - ] - silence_emb = torch.stack( - [embs[0, idx, head] for head, idx in enumerate(mode_idx)] - ) - return silence_tokens, silence_emb - - -def feature_pad_to(tensor, length, padding=None): - """Pads feature dimensions to the specified length with the specified padding, - assuming a (Batch x Length x Features..) 
tensor - - Arguments - --------- - tensor : torch.Tensor - The tensor to be padded - - length : int - The length to which the tensor will be padded - - padding : torch.Tensor, optional - The padding tensor - if omitted, zero padding - will be used - - Returns - ------- - result : torch.Tensor - The padded tensor - """ - if padding is None: - padding = torch.zeros(tensor.shape[1:]) - padding = padding[None, ...].expand( - (length - tensor.size(0),) + tensor.shape[1:] - ) - return torch.cat([tensor, padding], dim=0) - - -def batch_feature_pad(tensors, padding=None): - """Similar to batch_pad_right but pads with the specified padding, whcih - can be a vector or a tensor - - Arguments - --------- - tensors : list - The list of tensors to be padded - padding : torch.Tensor - The padding tensor - - Returns - ------- - result : torch.Tensor - the padded tensor - """ - lengths_abs = torch.tensor( - [len(item) for item in tensors], device=tensors[0].device - ) - max_length = lengths_abs.max() - data = torch.stack( - [feature_pad_to(item, max_length, padding) for item in tensors] - ) - lengths = lengths_abs / max_length - return data, lengths - - -def token_collate_fn(examples, silence_token, token_keys): - """A customized collation function for audio tokens where - the specified silence token will be used as padding - instead of - zeros - - Arguments - --------- - examples : list - A list of examples - - silence_token : torch.Tensor - The token(s) representing silence - - token_keys : list - The list of keys to which special padding will be applied - - Returns - ------- - result : speechbrain.dataio.batch.PaddedBatch - A padded batch - """ - token_tensor_ids = {id(examples[0][key]) for key in token_keys} - return PaddedBatch( - examples, - padding_func=_silence_padding, - padding_kwargs={ - "silence_token": silence_token, - "token_tensor_ids": token_tensor_ids, - }, - ) - - -def _silence_padding(values, silence_token, token_tensor_ids): - return ( - batch_feature_pad(values, silence_token) - if id(values[0]) in token_tensor_ids - else batch_pad_right(values) - ) - - -def use_silence_padding(dataloader_opts, silence_token, token_keys): - """Overrides the collation function to add silence padding to - audio token features - - Arguments - --------- - dataloder_opts : dict - Dataloader options - silence_token : torch.Tensor - The tensor to be used as silence padding - token_keys : torch.Tensor - The keys to apply silence padding to - - Returns - ------- - dataloader_opts : dict - Updated data loader options - """ - return { - **dataloader_opts, - "collate_fn": partial( - token_collate_fn, silence_token=silence_token, token_keys=token_keys - ), - } diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index c0e14f867..1cf092a46 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -9,30 +9,49 @@ from speechbrain.inference.interfaces import Pretrained from speechbrain.inference.ASR import EncoderDecoderASR from speechbrain.lobes.models.huggingface_transformers import Whisper +from speechbrain.lobes.models.huggingface_transformers.wav2vec2 import Wav2Vec2 from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import length_to_mask from speechbrain.decoders.seq2seq import S2SWhisperGreedySearcher from speechbrain.dataio.batch import PaddedBatch from speechbrain.utils.metric_stats import ErrorRateStats from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.data_utils import pad_right_to 
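+# fetch is used below to download the pretrained UTMOS checkpoint from HuggingFace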
+from speechbrain.utils.fetching import fetch from collections import namedtuple from pathlib import Path -import os +from torch import nn import torch import torchaudio import re import string import logging -import shutil -import shlex -import subprocess + logger = logging.getLogger(__name__) + +has_transformers = False +try: + from transformers import AutoModelForAudioXVector + has_transformers = True +except ImportError: + logger.warning("transformers library not found - some evaluators may be disabled") + + RE_PUNCTUATION = re.compile( "|".join(re.escape(char) for char in string.punctuation) ) +SAMPLE_RATE = 16000 +DEFAULT_ENCODER_HUB = "chaanks/wav2vec2-small" +DEFAULT_MODEL_URL = "https://huggingface.co/chaanks/UTMOS/resolve/main" +DEFAULT_MODEL_NAME = "utmos.ckpt" +DEFAULT_SAVE_DIR = "./pretrained_models" +DEFAULT_JUDGE_ID = 288 +DEFAULT_DOMAIN_ID = 0 + SpeechEvaluationResult = namedtuple( "SpeechEvaluationResult", ["score", "details"] ) @@ -217,77 +236,6 @@ def __call__(self, wavs, length): return self.mods.model(wavs, length) -class RegressionModelSpeechEvaluator(SpeechEvaluator): - """A speech evaluator that uses a regression model - that produces a quality score (e.g. SSL fine-tuning) - for a sample of speech - - Arguments - --------- - source : str - The source model path or HuggingFace hub name - sample_rate : int - The audio sample rate this evaluator expects - """ - - def __init__(self, source, sample_rate=None, *args, **kwargs): - super().__init__(sample_rate=sample_rate) - self.model = SpeechEvaluationRegressionModel.from_hparams( - source, *args, **kwargs - ) - - def evaluate( - self, - wavs, - length, - text=None, - wavs_ref=None, - length_ref=None, - sample_rate=None, - sample_rate_ref=None, - ): - """Evaluates a batch of waveforms - - Arguments - --------- - Arguments - --------- - wavs: torch.Tensor - the waveforms to evaluate - - length: torch.Tensor - relative lengths (a 1-D tensor) - - text : list, optional - Ground truth text - - wavs_ref : torch.Tensor - the reference waveforms - - length_ref : torch.Tensor - the reference waveform lengths - - sample_rate : int, optional - The sample rate of the audio. 
If not provided, - the audio is assumed to be at the same sample - rate as the model - - sample_rate_ref : int, optional - The sample rate of the reference samples - - Returns - ------- - result : SpeechEvaluationResult - an aggregated speech evaluation result with a score - for each item - """ - wavs = self.resample(wavs, sample_rate) - scores = self.model(wavs, length) - while scores.dim() > 1 and scores.size(-1) == 1: - scores = scores.squeeze(-1) - return SpeechEvaluationResult(score=scores, details={"score": scores}) - - class ASRSpeechEvaluator(SpeechEvaluator): """A superclass for ASR speech evaluators""" @@ -743,171 +691,334 @@ def evaluate_files(self, file_names, text=None, file_names_ref=None): raise NotImplementedError() -UTMOS_REPO = "https://huggingface.co/spaces/sarulab-speech/UTMOS-demo" +class UTMOSModel(nn.Module): + """The UTMOS model wrapper + Arguments + --------- + source : str + The WavLM source + save_path : str | path-like + The path where the model will be saved + features_dim : int, optional + The features dimension + num_domains : int, optional + The number of domains + domain_dim : int, optional + The dimension of each domain + num_judges : int, optional + The number of "judges" + judge_dim : int, optional + The dimension of each judge + decoder_hidden_size : int, optional + The size of the decoder hidden state + multiplier : float, optional + The number that the raw model output is multiplied by + to compute the score + offset : float, optional + The number that (raw output * multiplier) will be added + to in order to get the score + """ + + def __init__( + self, + source, + save_path, + features_dim=768, + num_domains=3, + domain_dim=128, + num_judges=3000, + judge_dim=128, + decoder_hidden_size=512, + multiplier=2.0, + offset=3.0, + ): + super().__init__() + + self.ssl_encoder = Wav2Vec2( + source, + save_path, + freeze=True, + output_norm=False, + freeze_feature_extractor=True, + output_all_hiddens=False, + ) + + self.domain_embedding = nn.Embedding(num_domains, domain_dim) + self.judge_embedding = nn.Embedding(num_judges, judge_dim) + + self.decoder = nn.LSTM( + input_size=features_dim + domain_dim + judge_dim, + hidden_size=decoder_hidden_size, + num_layers=1, + batch_first=True, + bidirectional=True, + ) + + self.classifier = nn.Sequential( + nn.Linear(decoder_hidden_size * 2, 2048), + torch.nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(2048, 1), + ) + self.multiplier = multiplier + self.offset = offset + + def forward(self, wav, domain_id=None, judge_id=None): + """Computes the forward pass + + Arguments + --------- + wav : torch.Tensor + The raw waveforms + domain_id : torch.Tensor + The domain identifiers + judge_id : torch.Tensor + The judge identifier + + Returns + ------- + result : torch.Tensor + The predicted rating(s) + """ + + if domain_id is None: + domain_id = torch.zeros( + len(wav), dtype=torch.int, device=wav.device + ) + if judge_id is None: + judge_id = ( + torch.ones(len(wav), dtype=torch.int, device=wav.device) + * DEFAULT_JUDGE_ID + ) -class UTMOSSpeechEvaluator(BulkSpeechEvaluator): - """An evaluation wrapper for UTMOS + ssl_features = self.ssl_encoder(wav) + domain_emb = self.domain_embedding(domain_id) + judge_emb = self.judge_embedding(judge_id) + + domain_emb = domain_emb.unsqueeze(1).expand( + -1, ssl_features.size(1), -1 + ) + judge_emb = judge_emb.unsqueeze(1).expand(-1, ssl_features.size(1), -1) + concatenated_feature = torch.cat( + [ssl_features, domain_emb, judge_emb], dim=2 + ) + + decoder_output, _ = 
self.decoder(concatenated_feature) + pred = self.classifier(decoder_output) + + return pred.mean(dim=1).squeeze(1) * self.multiplier + self.offset + + +class UTMOSSpeechEvaluator(SpeechEvaluator): + """The UTMOS speech evaluator wrapper Github: https://github.com/sarulab-speech/UTMOS22 HuggingFace: https://huggingface.co/spaces/sarulab-speech/UTMOS-demo + Arguments --------- - model_path : str | path-like - The path where the HuggingFace repository was extracted - output_folder : str | path-like - The folder where results will be output - ckpt_path : str | path-like - The path to the checkpoint to be used - script : str | path-like - The path to the evaluation script, defaults to the bundled - predict.py - python : str | path-like, optional - The path to the Python interpreter to be used, defaults to - "python". Depending on the environment, it might need to be - changed (e.g. to "python3" or an absolute path to the interpreter) - use_python : bool - Whether to launch the script using python. This flag will need to be - set to False in environments where running UTMOS requires a wrapper shell - script (e.g. to initialize a different Python virtual environment from - the one in which SpeechBrain is running) - tmp_folder : str | path-like, optional - The temporary folder where files will be copied for evaluation. If - omitted, it will be set to output_folder. This can be useful on - compute environments that provide fast local storage (e.g. certain - compute clusters) - repo : str - The repor + source : str, optional + The WavLM source + save_path : str | path-like, optional + The path where the model will be saved + model_name : str + The name of the model hub + model_url : str + The model URL (if applicable) + domain_id : int + The domain ID of the underlying model + judge_id : int + The judge ID to use (given UTMOS was trained as an ensemble + of judges) + run_opts: dict, optional + The run options + sample_rate : int + The sample rate of the underlying model """ def __init__( self, - model_path, - output_folder, - ckpt_path, - script="predict.py", - python="python", - use_python=True, - batch_size=8, - tmp_folder=None, - repo=UTMOS_REPO, + source=None, + save_path=None, + model_name=None, + model_url=None, + domain_id=None, + judge_id=None, + run_opts=None, + sample_rate=16000, ): - self.output_folder = Path(output_folder) - rand = torch.randint(1, 999999999, (1,)).item() - if tmp_folder is None: - tmp_folder = self.output_folder - else: - tmp_folder = Path(tmp_folder) - self.eval_path = (tmp_folder / f"eval_{rand}").absolute() - self.model_path = Path(model_path).absolute() - script = self.model_path / script - self.script = script - self.ckpt_path = Path(ckpt_path).absolute() - self.batch_size = batch_size - self.python = python - self.use_python = use_python - self.repo = repo - self.install() - - def install(self): - if self.model_path.exists(): - logger.info("UTMOS is already installed in %s", self.model_path) - return - logger.info( - "Attempting to install UTMOS from %s to %s", - self.repo, - self.model_path, - ) - cmd = shlex.join( - [ - "git", - "-C", - str(self.model_path.parent), - "clone", - self.repo, - str(self.model_path.name), - ] + super().__init__(sample_rate=sample_rate) + self.model = UTMOSModel( + source=source, + save_path=save_path, ) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - logger.info("Repository clone successful, performing an LFS fetch") - cwd = Path.cwd() - try: - 
os.chdir(self.model_path) - cmd = shlex.join(["git", "lfs", "fetch"]) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - finally: - os.chdir(cwd) - if not self.ckpt_path.exists(): - raise ValueError("ckpt_path {ckpt_path} does not exist") - - def evaluate_files(self, file_names, text, file_names_ref=None): - """Evaluates multiple files + if run_opts is not None: + device = run_opts.get("device") + if device: + self.model = self.model.to(device) + fetch(model_name, model_url, save_path) + model_path = Path(save_path) / model_name + state_dict = torch.load(model_path) + self.model.load_state_dict(state_dict) + self.model.eval() + + self.domain_id = domain_id + self.judge_id = judge_id + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + """Evaluates a batch of waveforms using UTMOS Arguments --------- - file_names : list - A list of files - - text : list - File transcripts (not required for all evaluators) - Not used in this evaluator - - file_names_ref : list, optional - A list of reference files / ground truths (if applicable) - Not used in this evaluator + wavs: torch.Tensor + the waveforms to evaluate + length: torch.Tensor + relative lengths (a 1-D tensor) + text : list, optional + Ground truth text. Ignored for UTMOS. + wavs_ref : torch.Tensor + the reference waveforms. Ignored for UTMOS. + length_ref : torch.Tensor + the reference waveform lengths. Ignored for UTMOS. + sample_rate : int, optional + The sample rate of the audio. If not provided, + the audio is assumed to be at the same sample + rate as the model + sample_rate_ref : int, optional + The sample rate of the reference samples. Ignored for UTMOS. 
Returns ------- result : SpeechEvaluationResult - a consolidated evaluation result + an aggregated speech evaluation result with a score + for each item """ - current_path = os.getcwd() - try: - self.eval_path.mkdir(parents=True, exist_ok=True) - logger.info("Copying the files to '%s'", self.eval_path) - for file_name in file_names: - target_file_name = self.eval_path / Path(file_name).name - shutil.copy(file_name, target_file_name) - - logger.info("Running evaluation") - result_path = self.eval_path / "result.txt" - os.chdir(self.model_path) - cmd = [ - str(self.script), - "--mode", - "predict_dir", - "--bs", - str(self.batch_size), - "--inp_dir", - str(self.eval_path), - "--out_path", - str(result_path), - "--ckpt_path", - str(self.ckpt_path), - ] - if self.use_python: - cmd = [self.python] + cmd - - output = subprocess.check_output(cmd) - logger.info("Evaluation finished, output: %s", output) - file_names = [path.name for path in self.eval_path.glob("*.wav")] - with open(result_path) as result_path: - scores = [float(line.strip()) for line in result_path] - score_map = dict(zip(file_names, scores)) - scores_ordered = [ - score_map[Path(file_name).name] for file_name in file_names - ] - return SpeechEvaluationResult( - scores_ordered, {"utmos": scores_ordered} + wavs = self.resample(wavs, sample_rate=sample_rate) + domain_id, judge_id = None, None + if self.domain_id is not None: + domain_id = ( + torch.ones(len(wavs), device=wavs.device) * self.domain_id + ) + if self.judge_id is not None: + judge_id = torch.ones(len(wavs), device=wavs.device) * self.judge_id + + scores = self.model(wav=wavs, domain_id=domain_id, judge_id=judge_id) + return SpeechEvaluationResult(score=scores, details={"utmos": scores}) + + +class SpkSimWavLM(SpeechEvaluator): + """A speaker similarity evaluator based on WavLM / XVector + + Arguments + --------- + source : str + The model hub to use + savedir : str + The path where the model will be saved + model_sample_rate : int, optional + The sample rate to which all samples will be resampled + before being processed + """ + def __init__( + self, + source, + savedir, + model_sample_rate=16000, + run_opts=None, + *args, + **kwargs + ): + if not has_transformers: + raise ValueError( + "Unable to use the SpkSimWavLM evaluator because the " + "transformers library is not enabled" + ) + if run_opts is None: + run_opts = {} + device = run_opts.get("device") + self.model = AutoModelForAudioXVector.from_pretrained( + source, cache_dir=savedir, + *args, + **kwargs + ) + if device is not None: + self.model = self.model.to(device) + + self.model.eval() + self.model_sample_rate = model_sample_rate + self.device = next(self.model.parameters()).device + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + # Resample + if sample_rate is not None: + wavs = torchaudio.functional.resample( + wavs, + orig_freq=sample_rate, + new_freq=self.model_sample_rate ) - finally: - os.chdir(current_path) - shutil.rmtree(self.eval_path) + if sample_rate_ref is not None: + wavs_ref = torchaudio.functional.resample( + wavs_ref, + orig_freq=sample_rate_ref, + new_freq=self.model_sample_rate + ) + + # Concatenate + batch_size, wavs_max_len = wavs.shape + _, wavs_ref_max_len = wavs_ref.shape + length_abs = length * wavs_max_len + length_ref_abs = length_ref * wavs_ref_max_len + max_len = max(wavs_max_len, wavs_ref_max_len) + wavs, _ = pad_right_to( + wavs, + (batch_size, max_len) + ) + wavs_ref, _ = pad_right_to( 
+ wavs_ref, + (batch_size, max_len) + ) + audio = torch.cat([wavs, wavs_ref]) + + length_cat_abs = torch.cat([length_abs, length_ref_abs]) + # Attention mask + attention_mask = length_to_mask( + length_cat_abs.int() + ).long() # 0 for masked tokens + # Forward + embs = self.model( + input_values=audio, + attention_mask=attention_mask, + output_attentions=False, + ).embeddings + hyp_embs, ref_embs = embs.split([len(wavs), len(wavs_ref)]) + scores = torch.nn.functional.cosine_similarity( + hyp_embs, ref_embs, dim=-1 + ) + + return SpeechEvaluationResult( + scores, + {"score": scores} + ) def vocoder_to_device(vocoder, device): From 252f1d738145ade2251f1bd6070595a6ae1c2c49 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 11 Jan 2025 23:12:38 -0500 Subject: [PATCH 052/270] add new tokenziers andadopt to SB main repo --- .../hparams/LSTM/speech_tokenizer.yaml | 2 +- .../hparams/contextnet/speech_tokenizer.yaml | 2 +- .../extraction/hparams/speech_tokenizer.yaml | 2 +- benchmarks/DASB/extra_requirements.txt | 1 + benchmarks/DASB/model/sq_codec.py | 1356 +++++++++++++++++ benchmarks/DASB/utils/tokenizer_interface.py | 287 +++- 6 files changed, 1634 insertions(+), 16 deletions(-) create mode 100644 benchmarks/DASB/model/sq_codec.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml index d0e9aae5b..9607dab79 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml @@ -126,7 +126,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml index 7fdbf8d51..615777a99 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml @@ -120,7 +120,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 5d897a782..161d4e870 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -42,7 +42,7 @@ freeze_embedding: False save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio 
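The SpkSimWavLM evaluator above reduces speaker similarity to a cosine between the x-vector embeddings of the generated and reference waveforms. A minimal sketch of that final scoring step, with random tensors standing in for the model embeddings (illustration only, not part of the patch; the embedding size is arbitrary):

import torch

hyp_embs = torch.randn(4, 512)   # embeddings of the generated utterances (size is a placeholder)
ref_embs = torch.randn(4, 512)   # embeddings of the reference utterances
scores = torch.nn.functional.cosine_similarity(hyp_embs, ref_embs, dim=-1)
print(scores.shape)  # torch.Size([4]): one similarity score per hypothesis/reference pair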
save_path: !ref diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index e97e16b28..dffb3cd07 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -2,6 +2,7 @@ beartype jsonlines kaldiio librosa>=0.9.2 +omegaconf onnxruntime>=1.16.3 orion orion[profet] diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py new file mode 100644 index 000000000..f04c094d4 --- /dev/null +++ b/benchmarks/DASB/model/sq_codec.py @@ -0,0 +1,1356 @@ +"""This lobe enables the integration of the speech codec model (SQ-Codec) with scalar quantization. + +SQ-Codec effectively maps the complex speech signal into a finite and compact latent space, named scalar latent space. + +Repository: https://github.com/yangdongchao/SimpleSpeech +Paper: https://arxiv.org/abs/2406.02328, https://arxiv.org/abs/2408.13893 + +Authors + * Pooneh Mousavi 2024 +""" + +import logging +import os + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio +from omegaconf import OmegaConf +from torch.autograd import Function +from torch.nn.utils import remove_weight_norm, weight_norm + + +class SQCodec(nn.Module): + """ + Speech codec model (SQ-Codec) with scalar quantization. It maps the complex speech signal into a finite and compact latent space. + The model consists of an encoder-decoder architecture with optional causal convolutions, downsampling, and upsampling layers. + It uses scalar quantization and various convolutional blocks for processing. + + Make sure that you download and extract SQ-Codec.zip into save_path from the following Hugging Face repo: + - HF repo: https://huggingface.co/Dongchao/UniAudio/blob/main/SQ-Codec.zip + + Repository: https://github.com/yangdongchao/SimpleSpeech + Paper: https://arxiv.org/abs/2406.02328, https://arxiv.org/abs/2408.13893 + + Arguments + --------- + save_path : str, optional + Directory where the model and configuration files are saved (default is None). + config : str, optional + Configuration filename for the model. It is extracted from the zip file (default is 'config.yaml'). + checkpoint : str, optional + Model checkpoint filename. It is extracted from the zip file (default is 'ckpt_00190000.pth'). + sample_rate : int, optional + Sample rate for input audio (default is 16000). + dim_codebook : int, optional + Dimension of each codebook (default is 19683). + n_codebook : int, optional + Number of codebooks used (default is 4). + bw : float, optional + Bandwidth parameter (default is 2). + clip_length : int, optional + Maximum clip length for processing (default is 450). + + Example + ------- + >>> save_path = "savedir" + >>> config = "config.yaml" + >>> checkpoint = "ckpt_00190000.pth" + >>> model = SQCodec(save_path, config, checkpoint) + >>> audio = torch.randn(3, 16000) + >>> tokens, emb = model.encode(audio) + >>> tokens.shape + torch.Size([3, 200]) + >>> emb.shape + torch.Size([3, 36, 50]) + >>> rec = model.decode(tokens) + >>> rec.shape + torch.Size([3, 1, 16000]) + """ + + def __init__( + self, + save_path, + config, + checkpoint, + sample_rate=16000, + dim_codebook=19683, + n_codebook=4, + bw=2, + clip_length=450, + ): + super(SQCodec, self).__init__() + self.config_path = os.path.join(save_path, config) + self.ckpt_path = os.path.join(save_path, checkpoint) + if not os.path.exists(self.config_path) or not os.path.exists( + self.ckpt_path + ): + err_msg = ( + "the files %s or %s do not exist."
+ "(make sure that you download and extract the SQ-codec.zip in save_path from following Huggingface repo:" + " https://huggingface.co/Dongchao/UniAudio/blob/main/SQ-Codec.zip)" + % (self.ckpt_path, self.config_path) + ) + raise FileNotFoundError(err_msg) + self.clip_length = clip_length + + logging.info( + f"Using config {self.config_path} and model {self.ckpt_path}" + ) + + self.scalar_codec = self.build_codec_model(self.config_path) + self.sr = sample_rate + self.dim_codebook = dim_codebook + self.n_codebook = n_codebook + self.bw = bw + self.mask_id = self.dim_codebook * self.n_codebook + + def build_codec_model(self, config): + """ + Loads and builds the scalar codec model from the given configuration. + + Parameters + ---------- + config : str + Path to the configuration file. + + Returns + ------- + ScalarModel + The built scalar codec model loaded with weights from the checkpoint. + """ + exp_model_config = OmegaConf.load(config) + scalar_codec = ScalarModel(**exp_model_config.generator.config) + parameter_dict = torch.load(self.ckpt_path) + scalar_codec.load_state_dict(parameter_dict["codec_model"]) + return scalar_codec + + def _flatten_codebooks(self, arr, offset_size=None): + """ + Flattens a 3D array (B, N, D) to a 1D array while applying an offset to each codebook if specified. + + Parameters + ---------- + arr : numpy.ndarray + A 3D array of shape (B, N, D). + offset_size : int or None, optional + The offset size to be applied to each codebook slice (default is None). + + Returns + ------- + numpy.ndarray + A 1D array representing the flattened codebooks. + """ + assert ( + len(arr.shape) == 3 + ), "Input array must have 3 dimensions [B, N, D]" + N, B, D = arr.shape + arr = arr.copy() + if offset_size is not None: + for n in range(N): + arr[n, :, :] += offset_size * n + flattened_arr = arr.transpose(1, 2, 0).reshape(B, N * D) + return flattened_arr + + def encode(self, inputs): + """ + Encodes the input audio tensor using the scalar codec and quantizes the output. + + Parameters + ---------- + inputs : torch.Tensor + Input audio tensor of shape (B, T) or (B, 1, T), where B is the batch size + and T is the length of the audio sequence. + + Returns + ------- + tuple + A tuple containing: + - torch.Tensor: The flattened and quantized encoded representation of the input. + - torch.Tensor: Quantized embedding. + """ + if inputs.dim() == 2: + inputs = inputs.unsqueeze(1) + compressed = self.scalar_codec.encode(inputs) + chunks = compressed.chunk(self.n_codebook, dim=1) + codec_ls = [] + for i, chunk in enumerate(chunks): + chunk = chunk.detach().cpu().numpy().astype(np.int32) + 1 + tmp_codec = ternary_matrix_to_decimal(chunk) + codec_ls.append(tmp_codec) + codec_ls = np.array(codec_ls) + flat_codec = self._flatten_codebooks(codec_ls, self.dim_codebook) + flat_codec = torch.from_numpy(flat_codec).to(torch.int32) + return flat_codec.to(inputs.device), compressed.to(inputs.device) + + def decode(self, codes): + """ + Decodes the quantized codes back into an audio tensor. + + Parameters + ---------- + codes : torch.Tensor + Quantized codes with shape (B, T). + + Returns + ------- + torch.Tensor + Reconstructed audio signal. 
+ """ + assert codes.dim() == 2 + B, T = codes.shape + assert ( + T % self.n_codebook == 0 + ), "Length T must be divisible by n_codebook" + codes = codes.view(B, -1, self.n_codebook).permute(2, 0, 1) + for i in range(self.n_codebook): + codes[i, :, :] -= i * self.dim_codebook + emb_quant = [] + for i in range(self.n_codebook): + tmp_list = decimal_to_ternary_matrix(codes[i, :, :], D=9) - 1 + emb_quant.append(tmp_list) + emb_quant = torch.cat(emb_quant, dim=1) + out = self.scalar_codec.decode(emb_quant.float().to(codes.device)) + return out.detach().cpu().squeeze(0) + + def reconstruct(self, wav_root): + """ + Processes a given waveform file by encoding and decoding it through the scalar codec. + + Parameters + ---------- + wav_root : str + Path to the waveform file. + + Returns + ------- + torch.Tensor or None + Processed waveform tensor or None if the file is empty. + """ + wav, sr = torchaudio.load(wav_root) + if wav.numel() == 0: + return None + if sr != self.sr: + wav = torchaudio.transforms.Resample(sr, self.sr)(wav) + wav = wav.unsqueeze(1) + emb, emb_quant, x = self.scalar_codec.inference(wav) + return x.detach().cpu().squeeze(0) + + @property + def is_discrete(self): + """Indicates whether the codec works with discrete values.""" + return True + + @property + def codebook_length(self): + """Returns the total length of the codebook.""" + return self.dim_codebook * self.n_codebook + 1 + + def find_length(self, x): + """ + Finds the length of the tokenized version of the input tensor. + + Parameters + ---------- + x : torch.Tensor + Input tensor. + + Returns + ------- + int + The length of the tokenized input. + """ + return self.tokenize(x).shape[0] // self.n_codebook + + +class ScalarModel(nn.Module): + """ + A custom neural network model for encoding and decoding audio signals. + + The model consists of an encoder-decoder architecture with optional + causal convolutions, downsampling, and upsampling layers. It uses + vector quantization and various convolutional blocks for processing. + + + Arguments + --------- + num_bands : int + Number of input bands (or channels). + sample_rate : int + Sample rate of the input signal. + causal : bool + If True, uses causal convolutions for processing. + num_samples : int + Number of samples to process for downsampling or upsampling. + downsample_factors : list of int + List of factors to downsample the input. + downsample_kernel_sizes : list of int + List of kernel sizes for downsampling layers. + upsample_factors : list of int + List of factors to upsample the input. + upsample_kernel_sizes : list of int + List of kernel sizes for upsampling layers. + latent_hidden_dim : int + Dimension of the latent representation. + default_kernel_size : int + Default kernel size for convolutional layers. + delay_kernel_size : int + Kernel size used for the delay convolutional layer. + init_channel : int + Number of initial channels for the encoder and decoder. + res_kernel_size : int + Kernel size used for the residual convolutional blocks. 
+ + Example + ------- + >>> model = ScalarModel(num_bands=1, sample_rate=16000,causal=True,num_samples=2,downsample_factors=[2,4,4,5],downsample_kernel_sizes=[4,8,8,10],upsample_factors=[5,4,4,2],upsample_kernel_sizes=[10,8,8,4],latent_hidden_dim=36,default_kernel_size=7,delay_kernel_size=5,init_channel=48,res_kernel_size=7) # doctest: +SKIP + >>> audio = torch.randn(3, 1, 16000) + >>> quant_emb = model.encode(audio) # doctest: +SKIP + >>> quant_emb.shape + torch.Size([3, 36, 50]) + >>> rec = model.decode(quant_emb) # doctest: +SKIP + >>> rec.shap) # doctest: +SKIP + torch.Size([3, 1, 16000]) + """ + + def __init__( + self, + num_bands, + sample_rate, + causal, + num_samples, + downsample_factors, + downsample_kernel_sizes, + upsample_factors, + upsample_kernel_sizes, + latent_hidden_dim, + default_kernel_size, + delay_kernel_size, + init_channel, + res_kernel_size, + ): + super(ScalarModel, self).__init__() + self.sample_rate = sample_rate + self.encoder = [] + self.decoder = [] + self.vq = lambda x: CustomRoundingFunction.apply(x, "binary") + + # Encoder layers + self.encoder.append( + weight_norm( + Conv1d( + num_bands, + init_channel, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + ) + if num_samples > 1: + # Downsampling layer + self.encoder.append( + PreProcessor( + init_channel, + init_channel, + num_samples, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + for i, down_factor in enumerate(downsample_factors): + self.encoder.append( + ResEncoderBlock( + init_channel * np.power(2, i), + init_channel * np.power(2, i + 1), + down_factor, + downsample_kernel_sizes[i], + res_kernel_size, + causal=causal, + ) + ) + self.encoder.append( + weight_norm( + Conv1d( + init_channel * np.power(2, len(downsample_factors)), + latent_hidden_dim, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + ) + + # Decoder layers + self.decoder.append( + weight_norm( + Conv1d( + latent_hidden_dim, + init_channel * np.power(2, len(upsample_factors)), + kernel_size=delay_kernel_size, + ) + ) + ) + for i, upsample_factor in enumerate(upsample_factors): + self.decoder.append( + ResDecoderBlock( + init_channel * np.power(2, len(upsample_factors) - i), + init_channel * np.power(2, len(upsample_factors) - i - 1), + upsample_factor, + upsample_kernel_sizes[i], + res_kernel_size, + causal=causal, + ) + ) + if num_samples > 1: + self.decoder.append( + PostProcessor( + init_channel, + init_channel, + num_samples, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + self.decoder.append( + weight_norm( + Conv1d( + init_channel, + num_bands, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + ) + + self.encoder = nn.ModuleList(self.encoder) + self.decoder = nn.ModuleList(self.decoder) + + def forward(self, x): + """ + Performs a forward pass through the encoder and decoder. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, length). + + Returns + ------- + torch.Tensor + Reconstructed output tensor. + """ + for i, layer in enumerate(self.encoder): + if i != len(self.encoder) - 1: + x = layer(x) + else: + x = F.tanh(layer(x)) + x = self.vq(x) # Quantization step + for i, layer in enumerate(self.decoder): + x = layer(x) + return x + + def inference(self, x): + """ + Encodes input tensor `x` and decodes the quantized embeddings. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, length). 
+ + Returns + ------- + tuple + A tuple (emb, emb_quant, x), where `emb` is the latent embedding, + `emb_quant` is the quantized embedding, and `x` is the decoded output. + """ + for i, layer in enumerate(self.encoder): + if i != len(self.encoder) - 1: + x = layer(x) + else: + x = F.tanh(layer(x)) + emb = x + emb_quant = self.vq(emb) + x = emb_quant + for i, layer in enumerate(self.decoder): + x = layer(x) + return emb, emb_quant, x + + def encode(self, x): + """ + Encodes the input tensor `x` into a quantized embedding. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, length). + + Returns + ------- + torch.Tensor + Quantized embedding. + """ + for i, layer in enumerate(self.encoder): + if i != len(self.encoder) - 1: + x = layer(x) + else: + x = F.tanh(layer(x)) + emb = x + emb_quant = self.vq(emb) + return emb_quant + + def decode(self, emb_quant): + """ + Decodes the quantized embeddings back into a tensor. + + Parameters + ---------- + emb_quant : torch.Tensor + Quantized embedding tensor. + + Returns + ------- + torch.Tensor + Reconstructed output tensor. + """ + x = emb_quant + for i, layer in enumerate(self.decoder): + x = layer(x) + return x + + +class CustomRoundingFunction(Function): + """ + A customizable rounding function for various rounding operations, including: + - Rounding to the nearest multiple of a specified divisor. + - Rounding to the nearest integer. + - Applying the Heaviside step function. + + Arguments + --------- + mode : str + The mode of the operation. Can be 'round', 'binary', or 'heaviside'. + divisor : float, optional + The divisor for rounding. Only used in 'round' mode. + """ + + @staticmethod + def forward(ctx, input, mode="round", divisor=1.0): + """ + Forward pass for the custom rounding function. + + Arguments + --------- + ctx : context object + Context object used to store information for the backward computation. + input : torch.Tensor + The input tensor to be processed. + mode : str + The mode of the operation ('round', 'binary', 'heaviside'). + divisor : float + The divisor for rounding. Only used in 'round' mode. + + Returns + ------- + torch.Tensor + The processed tensor after applying the operation. + """ + ctx.mode = mode + ctx.divisor = divisor + + if mode == "round": + return torch.round(divisor * input) / divisor + elif mode == "binary": + return torch.round(input) + elif mode == "heaviside": + values = torch.tensor([0.0]).type_as(input) + return torch.heaviside(input, values) + else: + raise ValueError( + f"Invalid mode '{mode}'. Supported modes: 'round', 'binary', 'heaviside'." + ) + + @staticmethod + def backward(ctx, grad_output): + """ + Backward pass for the custom rounding function. + + Arguments + --------- + ctx : context object + Context object containing information saved during the forward pass. + grad_output : torch.Tensor + The gradient of the output with respect to the loss. + + Returns + ------- + torch.Tensor + The gradient of the input with respect to the loss. + """ + # For all modes, the gradient is propagated unchanged. + return grad_output.clone(), None, None + + +class PreProcessor(nn.Module): + """ + A module for preprocessing input data through convolution and pooling operations. + It is used as an initial step before the encoder blocks in the ScalarModel, particularly when the kernel_size for average pooling operation exceeds 1. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. 
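Note that CustomRoundingFunction above acts as a straight-through estimator: the forward pass quantizes, while the backward pass treats the operation as the identity so gradients can flow through the scalar quantizer. A small sketch of that behaviour, assuming the class exactly as defined above:

import torch

x = torch.tensor([0.2, 0.7, -0.4], requires_grad=True)
y = CustomRoundingFunction.apply(x, "binary")  # forward: torch.round(x)
y.sum().backward()
print(y)       # quantized values, e.g. tensor([ 0.,  1., -0.])
print(x.grad)  # tensor([1., 1., 1.]): the gradient of round() is bypassed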
+ num_samples : int + Number of samples for pooling. + kernel_size : int, optional + Size of the convolutional kernel (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__(self, n_in, n_out, num_samples, kernel_size=7, causal=False): + super(PreProcessor, self).__init__() + self.pooling = torch.nn.AvgPool1d(kernel_size=num_samples) + self.conv = Conv1d(n_in, n_out, kernel_size=kernel_size, causal=causal) + self.activation = nn.PReLU() + + def forward(self, x): + """ + Applies convolution, activation, and pooling to the input data. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + output = self.activation(self.conv(x)) + output = self.pooling(output) + return output + + +class PostProcessor(nn.Module): + """ + A module for postprocessing data through convolution and reshaping. + It is used as an initial step after the decoder blocks in the ScalarModel, particularly when the kernel_size for average pooling operation exceeds 1. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + num_samples : int + Number of samples for repetition. + kernel_size : int, optional + Size of the convolutional kernel (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__(self, n_in, n_out, num_samples, kernel_size=7, causal=False): + super(PostProcessor, self).__init__() + self.num_samples = num_samples + self.conv = Conv1d(n_in, n_out, kernel_size=kernel_size, causal=causal) + self.activation = nn.PReLU() + + def forward(self, x): + """ + Applies reshaping, repetition, and convolution to the input data. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = torch.transpose(x, 1, 2) + B, T, C = x.size() + x = x.repeat(1, 1, self.num_samples).view(B, -1, C) + x = torch.transpose(x, 1, 2) + output = self.activation(self.conv(x)) + return output + + +class DownsampleLayer(nn.Module): + """ + A downsampling layer that applies convolution, optional pooling, and activation. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the convolution (default is 1). + causal : bool, optional + If True, applies causal convolution (default is False). + activation : nn.Module, optional + Activation function (default is PReLU). + use_weight_norm : bool, optional + If True, applies weight normalization to the convolution (default is True). + pooling : bool, optional + If True, applies an average pooling operation (default is False). 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + causal: bool = False, + activation=nn.PReLU(), + use_weight_norm: bool = True, + pooling: bool = False, + ): + super(DownsampleLayer, self).__init__() + self.pooling = pooling + self.stride = stride + self.activation = activation + self.use_weight_norm = use_weight_norm + if pooling: + self.layer = Conv1d( + in_channels, out_channels, kernel_size, causal=causal + ) + self.pooling = nn.AvgPool1d(kernel_size=stride) + else: + self.layer = Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + causal=causal, + ) + if use_weight_norm: + self.layer = weight_norm(self.layer) + + def forward(self, x): + """ + Applies convolution, optional pooling, and activation to the input data. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = self.layer(x) + x = self.activation(x) if self.activation is not None else x + if self.pooling: + x = self.pooling(x) + return x + + def remove_weight_norm(self): + """ + Removes weight normalization from the convolutional layer. + """ + if self.use_weight_norm: + remove_weight_norm(self.layer) + + +class UpsampleLayer(nn.Module): + """ + An upsampling layer that applies transposed convolution or repetition, with activation. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the transposed convolution (default is 1). + causal : bool, optional + If True, applies causal convolution (default is False). + activation : nn.Module, optional + Activation function (default is PReLU). + use_weight_norm : bool, optional + If True, applies weight normalization to the convolution (default is True). + repeat : bool, optional + If True, applies repetition instead of transposed convolution (default is False). + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + causal: bool = False, + activation=nn.PReLU(), + use_weight_norm: bool = True, + repeat: bool = False, + ): + super(UpsampleLayer, self).__init__() + self.repeat = repeat + self.stride = stride + self.activation = activation + self.use_weight_norm = use_weight_norm + if repeat: + self.layer = Conv1d( + in_channels, out_channels, kernel_size, causal=causal + ) + else: + self.layer = ConvTranspose1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + causal=causal, + ) + if use_weight_norm: + self.layer = weight_norm(self.layer) + + def forward(self, x): + """ + Applies upsampling through transposed convolution or repetition, followed by activation. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = self.layer(x) + x = self.activation(x) if self.activation is not None else x + if self.repeat: + x = torch.transpose(x, 1, 2) + B, T, C = x.size() + x = x.repeat(1, 1, self.stride).view(B, -1, C) + x = torch.transpose(x, 1, 2) + return x + + def remove_weight_norm(self): + """ + Removes weight normalization from the convolutional layer. + """ + if self.use_weight_norm: + remove_weight_norm(self.layer) + + +class ResidualUnit(nn.Module): + """ + A residual unit with two convolutional layers and activation functions. 
+ This module is commonly used in the encoder and decoder blocks of the ScalarModel + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + dilation : int + Dilation factor for the first convolutional layer. + res_kernel_size : int, optional + Size of the convolutional kernel for residual connections (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__(self, n_in, n_out, dilation, res_kernel_size=7, causal=False): + super(ResidualUnit, self).__init__() + self.conv1 = weight_norm( + Conv1d( + n_in, + n_out, + kernel_size=res_kernel_size, + dilation=dilation, + causal=causal, + ) + ) + self.conv2 = weight_norm( + Conv1d(n_in, n_out, kernel_size=1, causal=causal) + ) + self.activation1 = nn.PReLU() + self.activation2 = nn.PReLU() + + def forward(self, x): + """ + Applies two convolutional layers with activations and adds the input for a residual connection. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Output tensor with residual connection applied. + """ + output = self.activation1(self.conv1(x)) + output = self.activation2(self.conv2(output)) + return output + x + + +class ResEncoderBlock(nn.Module): + """ + A residual encoder block with multiple residual units and a downsampling layer. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + stride : int + Stride for the downsampling layer. + down_kernel_size : int + Kernel size for the downsampling layer. + res_kernel_size : int, optional + Size of the convolutional kernel for residual connections (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__( + self, + n_in, + n_out, + stride, + down_kernel_size, + res_kernel_size=7, + causal=False, + ): + super(ResEncoderBlock, self).__init__() + self.convs = nn.ModuleList( + [ + ResidualUnit( + n_in, + n_out // 2, + dilation=1, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=3, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=5, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=7, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=9, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ] + ) + self.down_conv = DownsampleLayer( + n_in, n_out, down_kernel_size, stride=stride, causal=causal + ) + + def forward(self, x): + """ + Applies a series of residual units and a downsampling layer. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + for conv in self.convs: + x = conv(x) + x = self.down_conv(x) + return x + + +class ResDecoderBlock(nn.Module): + """ + A residual decoder block with upsampling and multiple residual units. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + stride : int + Stride for the upsampling layer. + up_kernel_size : int + Kernel size for the upsampling layer. + res_kernel_size : int, optional + Size of the convolutional kernel for residual connections (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). 
+ """ + + def __init__( + self, + n_in, + n_out, + stride, + up_kernel_size, + res_kernel_size=7, + causal=False, + ): + super(ResDecoderBlock, self).__init__() + self.up_conv = UpsampleLayer( + n_in, + n_out, + kernel_size=up_kernel_size, + stride=stride, + causal=causal, + activation=None, + ) + self.convs = nn.ModuleList( + [ + ResidualUnit( + n_out, + n_out, + dilation=1, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=3, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=5, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=7, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=9, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ] + ) + + def forward(self, x): + """ + Applies upsampling followed by a series of residual units. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = self.up_conv(x) + for conv in self.convs: + x = conv(x) + return x + + +class Conv1d(nn.Conv1d): + """ + Custom 1D convolution layer with an optional causal mode. + + This class extends PyTorch's `nn.Conv1d` and allows for causal convolutions + by automatically applying the correct amount of padding to ensure that the output + does not depend on future inputs, which is useful for sequential data processing. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the convolution (default is 1). + dilation : int, optional + Dilation factor for the convolution (default is 1). + groups : int, optional + Number of blocked connections from input channels to output channels (default is 1). + padding_mode : str, optional + Padding mode to use ('zeros', 'reflect', 'replicate', or 'circular') (default is 'zeros'). + bias : bool, optional + If True, adds a learnable bias to the output (default is True). + padding : int, optional + Explicit padding value. If not provided, it will be computed automatically. + causal : bool, optional + If True, applies causal convolution where the output depends only on the past and current inputs (default is False). + w_init_gain : str, optional + Gain value used for Xavier initialization (e.g., 'relu', 'tanh', etc.). If provided, applies Xavier uniform initialization to the convolutional weights. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + groups: int = 1, + padding_mode: str = "zeros", + bias: bool = True, + padding=None, + causal: bool = False, + w_init_gain=None, + ): + self.causal = causal + if padding is None: + if causal: + padding = 0 + self.left_padding = dilation * (kernel_size - 1) + else: + padding = get_padding(kernel_size, dilation) + super(Conv1d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + bias=bias, + ) + if w_init_gain is not None: + torch.nn.init.xavier_uniform_( + self.weight, gain=torch.nn.init.calculate_gain(w_init_gain) + ) + + def forward(self, x): + """ + Applies the forward pass of the convolutional layer. 
+ + Arguments + --------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, sequence_length). + + Returns + ------- + torch.Tensor + The output tensor after applying the convolution operation. + If `causal` is True, the input tensor is padded to ensure that + the output at each timestep only depends on the current and previous inputs. + """ + if self.causal: + x = F.pad(x.unsqueeze(2), (self.left_padding, 0, 0, 0)).squeeze(2) + + return super(Conv1d, self).forward(x) + + +class ConvTranspose1d(nn.ConvTranspose1d): + """ + Custom transposed 1D convolution layer with causal option. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the convolution (default is 1). + output_padding : int, optional + Additional size added to one side of the output (default is 0). + groups : int, optional + Number of blocked connections (default is 1). + bias : bool, optional + If True, adds a learnable bias (default is True). + dilation : int, optional + Dilation factor (default is 1). + padding : int, optional + Explicit padding value (default is None). + padding_mode : str, optional + Padding mode (default is 'zeros'). + causal : bool, optional + If True, applies causal convolution. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + output_padding: int = 0, + groups: int = 1, + bias: bool = True, + dilation: int = 1, + padding=None, + padding_mode: str = "zeros", + causal: bool = False, + ): + if padding is None: + padding = 0 if causal else (kernel_size - stride) // 2 + if causal: + assert ( + padding == 0 + ), "padding is not allowed in causal ConvTranspose1d." + assert ( + kernel_size == 2 * stride + ), "kernel_size must be equal to 2*stride is not allowed in causal ConvTranspose1d." + super(ConvTranspose1d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + dilation=dilation, + padding_mode=padding_mode, + ) + self.causal = causal + self.stride = stride + + def forward(self, x): + """ + Applies the transposed convolution operation. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Transposed convolved output tensor. + """ + x = super(ConvTranspose1d, self).forward(x) + if self.causal: + x = x[:, :, : -self.stride] + return x + + +def decimal_to_ternary_matrix(decimals, D): + """ + Convert a tensor of decimal numbers to a D*T ternary matrix for each batch. + + Arguments + --------- + decimals : torch.Tensor + A 2D tensor of decimal numbers with shape (B, T), where B is the batch size + and T is the number of elements in each batch. + D : int + Number of ternary digits to represent each number (depth). + + Returns + ------- + torch.Tensor + A 3D tensor of shape (B, D, T) where each slice along the first dimension + corresponds to a batch, and each column is represented as a ternary number. + """ + B, T = decimals.shape + ternary_matrix = torch.zeros((B, D, T), dtype=torch.long) + for pos in range(D): + ternary_matrix[:, pos, :] = decimals % 3 # Modulo operation + decimals //= 3 # Floor division for next ternary digit + + return ternary_matrix + + +def ternary_matrix_to_decimal(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. 
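As a concrete illustration of decimal_to_ternary_matrix() above and ternary_matrix_to_decimal() below (and of why dim_codebook defaults to 19683, i.e. 3 ** 9): each token index packs nine ternary digits, and the conversion round-trips. Values here are illustrative only:

import torch

decimals = torch.tensor([[42, 0, 19682]])                   # [B=1, T=3]
ternary = decimal_to_ternary_matrix(decimals.clone(), D=9)  # [1, 9, 3]; clone() because the input is modified in place
assert ternary[0, :, 0].tolist() == [0, 2, 1, 1, 0, 0, 0, 0, 0]  # 42 = 2*3 + 1*9 + 1*27, least-significant digit first
assert ternary_matrix_to_decimal(ternary.numpy()).tolist() == [[42, 0, 19682]]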
+ + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. + """ + B, D, N = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** np.arange(D) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, np.newaxis] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = np.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + +def get_padding(kernel_size, dilation=1): + """ + Computes the padding size for a given kernel size and dilation. + + Arguments + --------- + kernel_size : int + Size of the convolutional kernel. + dilation : int, optional + Dilation factor for convolution (default is 1). + + Returns + ------- + int + Calculated padding size. + """ + return int((kernel_size * dilation - dilation) / 2) diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index ff1194968..c8e81eb7a 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -7,7 +7,8 @@ --------- * Pooneh Mousavi, 2024 """ - +import sys +import os import torch from abc import ABC, abstractmethod from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec @@ -15,9 +16,16 @@ DiscreteSSL, ) from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import ( - SpeechTokenizer_interface, -) +from speechbrain.lobes.models.discrete.speechtokenizer import SpeechTokenizer +from speechbrain.lobes.models.discrete.wavtokenizer import WavTokenizer +from speechbrain.lobes.models.huggingface_transformers.mimi import Mimi + +base_dir = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) # noqa: E402 +sys.path.append(base_dir) # noqa: E402 + +from model.sq_codec import SQCodec # noqa: E402 class BaseTokenizer(ABC): @@ -52,7 +60,7 @@ def __init__(self): @abstractmethod @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): """ Encode a signal into discrete tokens. @@ -114,18 +122,40 @@ def get_pretrained_embeddings(self, vocab_size, num_codebooks, **kwargs): Returns ------- embeddings : torch.Tensor - Pretrained embedding weights with shape [K, C, H], where H is the embedding dimension. + Pretrained embedding weights with shape [K * C, H], where H is the embedding dimension. """ pass class EncodecTokenizer(Encodec, BaseTokenizer): + """This is a wrapper for the Encodec implemented in the SpeechBrain main repository. 
+ + Source paper: + https://arxiv.org/abs/2210.13438 + Example + ------- + >>> model_hub = "facebook/encodec_24khz" + >>> save_path = "savedir" + >>> model = EncodecTokenizer(model_hub, save_path) + >>> emb=model.get_pretrained_embeddings() + >>> emb.shape + torch.Size([2048, 128]) + >>> audio = torch.randn(4, 1000) + >>> length = torch.tensor([1.0, .5, .75, 1.0]) + >>> tokens= model.sig_to_tokens(audio, length) + >>> tokens.shape + torch.Size([4, 4, 2]) + >>> rec = model.tokens_to_sig(tokens, length=length) + >>> rec.shape + torch.Size([4, 1280]) + """ + def __init__(self, *args, **kwargs): Encodec.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens, _ = self.encode(signal, lengths) if num_codebooks: @@ -151,12 +181,31 @@ def get_pretrained_embeddings( class DACTokenizer(DAC, BaseTokenizer): + """This is a wrapper for the DAC implemented in the SpeechBrain main repository. + + Source paper: + http://arxiv.org/abs/2306.06546 + Example + ------- + >>> model = DACTokenizer(load_pretrained=True, model_type="24KHz", model_bitrate="8kbps", tag="latest") + >>> audio = torch.randn(4, 16000) + >>> emb=model.get_pretrained_embeddings(vocab_size=1024, num_codebooks=8) + >>> emb.shape + torch.Size([8192, 1024]) + >>> tokens= model.sig_to_tokens(audio) + >>> tokens.shape + torch.Size([4, 50, 32]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([4, 15992]) + """ + def __init__(self, *args, **kwargs): DAC.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens, _ = self(signal[:, None], n_quantizers=num_codebooks) return tokens.movedim(-1, -2) @@ -185,14 +234,35 @@ def get_pretrained_embeddings( return torch.cat(z_qs)[:, :, 0] -class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): +class SpeechTokenizerWrapper(SpeechTokenizer, BaseTokenizer): + """This is a wrapper for the SpeechTokenizer implemented in the SpeechBrain main repository. + + Source paper: + https://arxiv.org/abs/2308.16692 + Example + ------- + >>> audio = torch.rand([10, 600]) + >>> model_hub = "fnlp/SpeechTokenizer" + >>> save_path = "savedir" + >>> model = SpeechTokenizerWrapper(model_hub, save_path) + >>> emb=model.get_pretrained_embeddings(vocab_size=1024, num_codebooks=8) + >>> emb.shape + torch.Size([8192, 1024]) + >>> tokens= model.sig_to_tokens(audio) + >>> tokens.shape + torch.Size([10, 2, 8]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([10, 640]) + """ + def __init__(self, *args, **kwargs): - SpeechTokenizer_interface.__init__(self, *args, **kwargs) + SpeechTokenizer.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) self.sample_rate = 16000 @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens = self(signal) if num_codebooks: @@ -223,12 +293,41 @@ def get_pretrained_embeddings( class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): + """This is a wrapper for the DiscreteSSL implemented in the SpeechBrain main repository.
+ + Source paper: + https://arxiv.org/abs/2210.13438 + Example + ------- + >>> from speechbrain.lobes.models.huggingface_transformers.wavlm import (WavLM) + >>> inputs = torch.rand([3, 2000]) + >>> model_hub = "microsoft/wavlm-large" + >>> save_path = "savedir" + >>> ssl_layer_num = [7,23] + >>> deduplicate =[False, True] + >>> bpe_tokenizers=[None, None] + >>> vocoder_repo_id = "speechbrain/hifigan-wavlm-k1000-LibriTTS" + >>> kmeans_dataset = "LibriSpeech" + >>> num_clusters = 1000 + >>> ssl_model = WavLM(model_hub, save_path,output_all_hiddens=True) + >>> model = DiscreteSSLTokenizer(save_path, ssl_model, vocoder_repo_id=vocoder_repo_id, kmeans_dataset=kmeans_dataset,num_clusters=num_clusters) + >>> emb=model.get_pretrained_embeddings(num_codebooks=ssl_layer_num) + >>> emb.shape + torch.Size([2000, 1024]) + >>> tokens= model.sig_to_tokens(inputs,num_codebooks=ssl_layer_num, deduplicates=deduplicate, bpe_tokenizers=bpe_tokenizers) + >>> tokens.shape + torch.Size([3, 6, 2]) + >>> sig = model.tokens_to_sig(tokens, SSL_layers=ssl_layer_num) + >>> sig.shape + torch.Size([3, 1920]) + """ + def __init__(self, *args, **kwargs): DiscreteSSL.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens, _, _ = self.encode( signal, lengths, SSL_layers=num_codebooks, **kwargs @@ -238,7 +337,7 @@ def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): self.eval() - return self.decode(tokens, **kwargs) + return self.decode(tokens, **kwargs).squeeze(1) @torch.no_grad() def get_pretrained_embeddings( @@ -253,3 +352,165 @@ def get_pretrained_embeddings( embs.append(torch.as_tensor(vocabulary, dtype=torch.float32)) embs = torch.cat(embs) return embs + + +class MimiTokenizer(Mimi, BaseTokenizer): + """This is a wrapper for the Mimi implemented in the SpeechBrain main repository. 
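A note on get_pretrained_embeddings(): the wrappers return one flattened [K * C, H] table with the per-codebook vocabularies stacked along the first axis, so a downstream model is expected to offset token indices by codebook before lookup. A hedged sketch of that usage (shapes and offsets here are illustrative assumptions, not code from this patch):

import torch

K, C, H = 2, 1024, 128                     # codebooks, vocab size per codebook, embedding dim
emb_table = torch.randn(K * C, H)          # stands in for get_pretrained_embeddings() output
tokens = torch.randint(0, C, (4, 10, K))   # [B, N, K] tokens as returned by sig_to_tokens()
offsets = torch.arange(K) * C              # codebook k occupies rows [k*C, (k+1)*C)
embs = torch.nn.functional.embedding(tokens + offsets, emb_table)
print(embs.shape)  # torch.Size([4, 10, 2, 128])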
+ + Source paper: + https://kyutai.org/Moshi.pdf + Example + ------- + >>> model_hub = "kyutai/mimi" + >>> save_path = "savedir" + >>> model = MimiTokenizer(model_hub, save_path) + >>> emb=model.get_pretrained_embeddings() + >>> emb.shape + torch.Size([16384, 256]) + >>> audio = torch.randn(4, 48000) + >>> length = torch.tensor([1.0, .5, .75, 1.0]) + >>> tokens = model.sig_to_tokens(audio, length) + >>> tokens.shape + torch.Size([4, 25, 8]) + >>> rec = model.tokens_to_sig(tokens, length=length) + >>> rec.shape + torch.Size([4, 48000]) + """ + + def __init__(self, *args, **kwargs): + Mimi.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal, lengths) + if num_codebooks: + if tokens.shape[-1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[:, :num_codebooks, :] + return tokens.movedim(-1, -2) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens.movedim(-1, -2), **kwargs)[:, 0] + return signal + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + return self.embeddings.view(-1, self.embeddings.size(-1)) + + +class WavTokenizerWrapper(WavTokenizer, BaseTokenizer): + """This is a wrapper for the WavTokenizer implemented in the SpeechBrain main repository. + + Source paper: + https://arxiv.org/abs/2408.16532 + + Example + ------- + >>> model_hub = "novateur/WavTokenizer" + >>> save_path = "savedir" + >>> config="wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml" + >>> checkpoint="WavTokenizer_small_600_24k_4096.ckpt" + >>> model = WavTokenizerWrapper(model_hub, save_path,config=config,checkpoint=checkpoint) + >>> emb=model.get_pretrained_embeddings() + >>> emb.shape + torch.Size([4096, 512]) + >>> audio = torch.randn(4, 48000) + >>> length = torch.tensor([1.0, .5, .75, 1.0]) + >>> tokens= model.sig_to_tokens(audio, length) + >>> tokens.shape + torch.Size([4, 80, 1]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([4, 48000]) + """ + + def __init__(self, *args, **kwargs): + WavTokenizer.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal) + if num_codebooks: + if tokens.shape[1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[:, :num_codebooks, :] + + return tokens.movedim(-2, -1) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens.movedim(-1, -2)) + return signal + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + return self.embeddings + + +class SQCodecTokenizer(SQCodec, BaseTokenizer): + """This is a wrapper for the SQCoced implemented in the model folder. 
+ + Source paper: + https://arxiv.org/abs/2406.02328, https://arxiv.org/abs/2408.13893 + + + Make sure that you download and extract the SQ-codec.zip in save_path from following Huggingface repo: + - HF repo: https://huggingface.co/Dongchao/UniAudio/blob/main/SQ-Codec.zip + + Example + ------- + >>> save_path = "savedir" + >>> config = "config.yaml" + >>> checkpoint = "ckpt_00190000.pth" + >>> model = SQCodecTokenizer(save_path, config, checkpoint) + >>> audio = torch.randn(3, 48000) + >>> tokens = model.sig_to_tokens(audio) + >>> tokens.shape + torch.Size([3, 150, 4]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([3, 48000] + """ + + def __init__(self, *args, **kwargs): + SQCodec.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal) + return tokens.view(tokens.shape[0], -1, self.n_codebook) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens.view(tokens.shape[0], -1), **kwargs) + return signal.squeeze(1) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + """ + This method is not implemented for SQCodec, as it uses scalar quantization + and does not have any trainable quantizer or embedding. + """ + raise ValueError( + "SQCodec does not have any trainable quantizer or embedding since it uses scalar quantization." + ) From f534bdfdd9c630e4a467cded91b529e0ce9f1225 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 11 Jan 2025 23:17:31 -0500 Subject: [PATCH 053/270] fix precommit --- benchmarks/DASB/model/sq_codec.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index f04c094d4..6057a5f73 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1323,7 +1323,11 @@ def ternary_matrix_to_decimal(matrix): A 2D numpy array of shape (B, N), where each value represents the decimal equivalent of the corresponding ternary number in the input matrix. 
""" - B, D, N = ( + ( + B, + D, + N, + ) = ( matrix.shape ) # B is the batch size, D is the number of digits, N is the number of ternary numbers powers_of_three = 3 ** np.arange(D) # [3^0, 3^1, ..., 3^(D-1)] From 54dab6782b7da1adc7eb41eab11f77b9db5da326 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 13 Jan 2025 00:25:02 -0500 Subject: [PATCH 054/270] Tokotron: Fixes --- .../DASB/LJSpeech/TTS/tokotron/evaluate.py | 42 +------- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 52 ++------- .../TTS/tokotron/hparams/train_encodec.yaml | 102 ++++-------------- .../DASB/LibriTTS/TTS/tokotron/train.py | 2 +- .../LibriTTS/TTS/tokotron/train_encodec.py | 2 +- benchmarks/DASB/model/Tokotron.py | 7 +- 6 files changed, 38 insertions(+), 169 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py index bcb2670a6..e40e9bb31 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py @@ -51,17 +51,7 @@ def __init__(self, hparams, create_waveform_fn, device): else: self.evaluators = {} - bulk_evaluators = getattr(self.hparams, "bulk_evaluators", {}) - if bulk_evaluators: - self.bulk_evaluators = { - key: evaluator_f() - for key, evaluator_f in bulk_evaluators.items() - if key in self.enabled_evaluators - } - else: - self.bulk_evaluators = {} - - if not self.evaluators and not self.bulk_evaluators: + if not self.evaluators: logger.warn( "No evaluators were defined - this run will produce samples only" ) @@ -98,9 +88,7 @@ def on_evaluate_start(self, stage, epoch): self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] - details_keys = list(self.evaluators.keys()) + list( - self.bulk_evaluators.keys() - ) + details_keys = list(self.evaluators.keys()) self.details = {evaluator_key: [] for evaluator_key in details_keys} self.sample_text = [] self.sample_file_names = [] @@ -141,7 +129,6 @@ def on_evaluate_end(self): dataset : speechbrain.dataio.dataset.DynamicItemDataset a dataset """ - self.evaluate_bulk() self.write_summary() logger.info("Evaluation done") @@ -182,19 +169,6 @@ def get_report_columns(self, evaluator_key): wavs_ref=bogus_wavs, length_ref=bogus_length, ) - else: - bogus_file_name = self.output_folder / "bogus.wav" - evaluator = self.bulk_evaluators[evaluator_key] - sb.dataio.dataio.write_audio( - str(bogus_file_name), - bogus_wavs[0].cpu(), - samplerate=self.hparams.model_sample_rate, - ) - result = evaluator.evaluate_files( - file_names=[bogus_file_name], - text=["BOGUS"], - file_names_ref=[bogus_file_name], - ) return ["uttid"] + list(result.details.keys()) @@ -228,18 +202,6 @@ def evaluate_batch(self, batch): self.write_result(evaluator_key, batch.uttid, details) self.details[evaluator_key].extend(details) - def evaluate_bulk(self): - """Runs all configured bulk evaluators, which evaluate a directory - of files - rather than one file at a time""" - for evaluator_key, evaluator in self.bulk_evaluators.items(): - result = evaluator.evaluate_files( - file_names=self.sample_file_names, - text=self.sample_text, - file_names_ref=self.ref_file_names, - ) - self.details[evaluator_key].append(result.details) - details = undo_batch(result.details) - self.write_result(evaluator_key, self.item_ids, details) def write_result(self, evaluator_key, uttid, details): """Outputs the result details to the report for the specified evaluator diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py 
b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index d72df92aa..3d5320fdb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -62,17 +62,7 @@ def __init__(self, hparams, create_waveform_fn, device): else: self.evaluators = {} - bulk_evaluators = getattr(self.hparams, "bulk_evaluators", {}) - if bulk_evaluators: - self.bulk_evaluators = { - key: evaluator_f() - for key, evaluator_f in bulk_evaluators.items() - if key in self.enabled_evaluators - } - else: - self.bulk_evaluators = {} - - if not self.evaluators and not self.bulk_evaluators: + if not self.evaluators: logger.warning("No evaluators were defined - this run will produce samples only") self.attention = [] @@ -101,9 +91,7 @@ def on_evaluate_start(self, stage, epoch): self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] - details_keys = list(self.evaluators.keys()) + list( - self.bulk_evaluators.keys() - ) + details_keys = list(self.evaluators.keys()) self.details = {evaluator_key: [] for evaluator_key in details_keys} self.sample_text = [] self.sample_file_names = [] @@ -157,7 +145,7 @@ def evaluate(self, dataset): self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] - details_keys = list(self.evaluators.keys()) + list(self.bulk_evaluators.keys()) + details_keys = list(self.evaluators.keys()) self.details = { evaluator_key: [] for evaluator_key in details_keys @@ -170,7 +158,6 @@ def evaluate(self, dataset): batch_count = math.ceil(len(dataset) / self.hparams.batch_size) for batch in tqdm(loader_it, desc="Evaluation", total=batch_count): self.evaluate_batch(batch) - self.evaluate_bulk() self.write_summary() logger.info("Evaluation done") @@ -285,19 +272,6 @@ def get_report_columns(self, evaluator_key): wavs_ref=bogus_wavs, length_ref=bogus_length, ) - else: - bogus_file_name = self.output_folder / "bogus.wav" - evaluator = self.bulk_evaluators[evaluator_key] - sb.dataio.dataio.write_audio( - str(bogus_file_name), - bogus_wavs[0].cpu(), - samplerate=self.hparams.model_sample_rate, - ) - result = evaluator.evaluate_files( - file_names=[bogus_file_name], - text=["BOGUS"], - file_names_ref=[bogus_file_name], - ) return ["uttid"] + list(result.details.keys()) @@ -311,9 +285,10 @@ def evaluate_batch(self, batch): with torch.no_grad(): batch = batch.to(self.device) tokens, tokens_length = batch.tokens - vocoder_to_device(self.modules.vocoder, self.device) - if hasattr(self.modules.vocoder, "device"): - self.modules.vocoder.device = self.device + if hasattr(self.modules, "vocoder"): + vocoder_to_device(self.modules.vocoder, self.device) + if hasattr(self.modules.vocoder, "device"): + self.modules.vocoder.device = self.device audio_resampled = torchaudio.functional.resample( batch.sig.data, self.hparams.sample_rate, @@ -361,19 +336,6 @@ def evaluate_batch(self, batch): perf_stats["total_flops_per_step"] = perf_stats["total_flops"] / perf_stats["steps"] self.write_perf_stats(batch.uttid, perf_stats) - - def evaluate_bulk(self): - """Performs bulk evaluation""" - for evaluator_key, evaluator in self.bulk_evaluators.items(): - result = evaluator.evaluate_files( - file_names=self.sample_file_names, - text=self.sample_text, - file_names_ref=self.ref_file_names, - ) - self.details[evaluator_key].append(result.details) - details = undo_batch(result.details) - self.write_result(evaluator_key, self.item_ids, details) - def write_result(self, evaluator_key, uttid, details): """Outputs the result 
details to the report for the specified evaluator diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index a82d82a2c..0fed45124 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -40,6 +40,11 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + freeze_token_model: True token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p @@ -125,7 +130,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp +gate_offset: !apply:Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -137,14 +142,6 @@ guides_enabled: False silence_padding: !ref use_silence_padding: True - -# Token model (pretrained) -token_model: !new:speechbrain.lobes.models.huggingface_transformers.Encodec - source: !ref - save_path: !ref - bandwidth: !ref - flat_embeddings: True - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa @@ -180,15 +177,6 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - num_workers: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - ####################### Model parameters ########################### # Transformer @@ -197,11 +185,6 @@ nhead: 4 enc_num_layers: 6 dec_num_layers: 12 d_ffn: 2048 -z_dim: 128 -hidden_dim: 2048 -enc_n_dim: 16 -dec_n_dim: 256 -decoder_chunk_size: -1 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU @@ -221,38 +204,6 @@ bandwidth: 1.5 attention_type: regularMHA ############################## models ################################ - -vocoder: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encodec: !new:benchmarks.DASB.model.custom_model.EncodecVocoder - encodec: !ref - vocos: !new:speechbrain.lobes.models.huggingface_transformers.vocos.Vocos - source: !ref - save_path: !ref - - -inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference - bos_index: !ref - eos_index: !ref - min_decode_ratio: !ref - max_decode_ratio: !ref - beam_size: !ref - using_eos_threshold: False - length_normalization: True - audio_token_shift: !ref - -inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference - scale_factor: !ref - gate_threshold: !ref - eos_mode: !ref - -inference: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - search: !ref - forward: !ref - emb: spk: kind: "pretrained" @@ -260,17 +211,12 @@ emb: vocoder: !ref injection: !ref -model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref - z_dim: !ref - hidden_dim: !ref - enc_n_dim: !ref - dec_n_dim: !ref - decoder_chunk_size: !ref nhead: !ref enc_num_layers: !ref dec_num_layers: !ref @@ -278,13 +224,11 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d target_dropout: !ref 
activation: !ref attention_type: !ref - vocoder: !ref gate_threshold: !ref gate_offset: !ref audio_emb_size: !ref audio_emb_freeze: !ref max_audio_length: !ref - inference: !ref eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref @@ -292,16 +236,27 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d scale_factor: !ref emb: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + modules: model: !ref - vocoder: !ref compute_cost: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss +compute_cost: !new:Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -327,26 +282,11 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:benchmarks.DASB.utils.preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref -progress_logger: !new:benchmarks.DASB.utils.train_logger.ArchiveTrainLogger - current_path: !ref - archive_path: !ref - meta_path: !ref - epoch_counter: !ref - -progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport - logger: !ref - sample_rate: !ref - eos_threshold: !ref - spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref \ No newline at end of file + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index a09a4cc23..e19cf3eba 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -246,7 +246,7 @@ def on_stage_start(self, stage, epoch): The currently-starting epoch. This is passed `None` during the test stage. 
""" - if hasattr(self.modules.vocoder, "model"): + if hasattr(self.modules, "vocoder") and hasattr(self.modules.vocoder, "model"): self.modules.vocoder.model.device = self.device self.layer_idx = self._get_selected_layer_idx() self.loss_metric = sb.utils.metric_stats.MultiMetricStats( diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py index 07edbbd8c..98f1b27cc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py @@ -36,7 +36,7 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ - wav = self.modules.token_model.decode(audio) + wav = self.modules.tokenizer.decode(audio) wav = wav.squeeze(1) clean_padding_(wav, length) return wav diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 14aa38693..266090be4 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -12,6 +12,7 @@ import math import torch +import inspect from torch import nn from torch.nn import functional as F from speechbrain.lobes.models.transformer.Transformer import ( @@ -2110,7 +2111,11 @@ def get_silence_token( model_training = model.training model.eval() if hasattr(model, "encode"): - result = model.encode(audio, length, **model_kwargs) + spec = inspect.getfullargspec(model.encode) + if "length" in spec.args: + result = model.encode(audio, length, **model_kwargs) + else: + result = model.encode(audio, **model_kwargs) else: result = model(audio, length, **model_kwargs) if model_training: From 2552b06264c93686001c90ea3d07bec04b3dc1b4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 14 Jan 2025 11:42:09 -0500 Subject: [PATCH 055/270] DASB: Tokotron: Cosmetic changes --- .../DASB/LJSpeech/TTS/tokotron/evaluate.py | 1 - .../TTS/tokotron/hparams/train_dac.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 43 +++---- benchmarks/DASB/LJSpeech/ljspeech_prepare.py | 1 - .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 110 ++++++++--------- .../hparams/train_continuous_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_dac.yaml | 4 +- .../tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../hparams/train_speech_tokenizer.yaml | 4 +- .../DASB/LibriTTS/TTS/tokotron/train.py | 114 ++++++++++-------- .../DASB/LibriTTS/extraction/extract.py | 2 +- benchmarks/DASB/utils/eval.py | 38 ++---- 13 files changed, 151 insertions(+), 174 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py index e40e9bb31..52b7e1817 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py @@ -202,7 +202,6 @@ def evaluate_batch(self, batch): self.write_result(evaluator_key, batch.uttid, details) self.details[evaluator_key].extend(details) - def write_result(self, evaluator_key, uttid, details): """Outputs the result details to the report for the specified evaluator diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 4f50c7ed2..240b57a7d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -199,7 +199,7 @@ tokenizer: !new:utils.tokenizer_interface.DACTokenizer n_codebooks: !ref load_pretrained: True tag: latest - + modules: model: !ref diff --git 
a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index deb8a3236..8da11247b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -57,8 +57,9 @@ def __init__( create_waveform_fn=self.create_waveform, device=self.device, ) - self.representation_mode = RepresentationMode(self.hparams.representation_mode) - + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) def compute_forward(self, batch, stage): """Runs all the computation of the Tokotron TTS @@ -97,7 +98,7 @@ def prepare_features(self, batch): """Prepares features, depending on the configuration Arguments - --------- + --------- batch : PaddedBatch This batch object contains all the relevant tensors for computation @@ -122,13 +123,10 @@ def prepare_features(self, batch): 1, 2, 0, 3 ) batch_size, _, heads, dim = audio.shape - bos = torch.zeros_like( - audio[:, :1, :, :] - ).reshape(batch_size, self.hparams.bos_width, heads, dim) - audio_bos = torch.concatenate( - [bos, audio], - dim=1 + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim ) + audio_bos = torch.concatenate([bos, audio], dim=1) audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) audio_tgt = audio audio_tgt_length = audio_length @@ -469,13 +467,16 @@ def audio_ref_pipeline(wav): if layers_key in hparams else hparams["audio_tokens_per_step"] ) - if use_silence_padding and representation_mode == RepresentationMode.DISCRETE: + if ( + use_silence_padding + and representation_mode == RepresentationMode.DISCRETE + ): silence_token, _ = get_silence_token( hparams[model_key], model_kwargs=hparams.get("token_model_kwargs"), extract_emb=False, model_shape=hparams.get("model_shape", "BLH"), - unsqueeze=hparams.get("model_needs_channel", False) + unsqueeze=hparams.get("model_needs_channel", False), ) else: silence_token = ( @@ -495,7 +496,9 @@ def audio_ref_pipeline(wav): @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") def audio_pipeline(id): - audio = tokens_loader.tokens_by_uttid(id, num_codebooks=audio_tokens_per_step) + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=audio_tokens_per_step + ) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -737,9 +740,7 @@ def run_experiment(brain_cls): "extract_phonemes": hparams["input"] == "phonemes", "model_name": "tokotron", "g2p_src": hparams["g2p_src"], - "skip_ignore_folders": hparams[ - "prepare_skip_ignore_folders" - ], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], "frozen_split_path": hparams.get("frozen_split_path"), "device": run_opts.get("device", "cpu"), }, @@ -767,21 +768,18 @@ def run_experiment(brain_cls): # stopped at any point, and will be resumed on next call. 
dataloader_opts = [ - hparams[f"{key}_dataloader_opts"] - for key in ["train", "valid", "test"] + hparams[f"{key}_dataloader_opts"] for key in ["train", "valid", "test"] ] representation_mode = RepresentationMode(hparams["representation_mode"]) if representation_mode == RepresentationMode.DISCRETE: dataloader_opts = [ - use_silence_padding( - opts, silence_padding, audio_keys - ) + use_silence_padding(opts, silence_padding, audio_keys) for opts in dataloader_opts ] ( train_dataloader_opts, valid_dataloader_opts, - test_dataloader_opts + test_dataloader_opts, ) = dataloader_opts tts_brain.fit( @@ -794,8 +792,7 @@ def run_experiment(brain_cls): # Load best checkpoint for evaluation tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=test_dataloader_opts, + test_set=datasets["test"], test_loader_kwargs=test_dataloader_opts, ) # Save final checkpoint (fixed name) diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index bfd1b3743..06292fd34 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -797,4 +797,3 @@ def custom_clean(text, model_name): for regex, replacement in _abbreviations: text = re.sub(regex, replacement, text) return text - diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 3d5320fdb..0a75a3482 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -4,7 +4,7 @@ * Artem Ploujnikov 2024 """ -#TODO: There are too many evaluation scripts. Refactor to extract common +# TODO: There are too many evaluation scripts. Refactor to extract common # features import speechbrain as sb @@ -33,7 +33,7 @@ class TokotronEvaluator: """An evaluator class for the TTS model - + Arguments --------- hparams: dict @@ -41,6 +41,7 @@ class TokotronEvaluator: device : str | torch.device the device """ + def __init__(self, hparams, create_waveform_fn, device): self.hparams = SimpleNamespace(**hparams) self.create_waveform_fn = create_waveform_fn @@ -63,7 +64,9 @@ def __init__(self, hparams, create_waveform_fn, device): self.evaluators = {} if not self.evaluators: - logger.warning("No evaluators were defined - this run will produce samples only") + logger.warning( + "No evaluators were defined - this run will produce samples only" + ) self.attention = [] @@ -91,7 +94,7 @@ def on_evaluate_start(self, stage, epoch): self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] - details_keys = list(self.evaluators.keys()) + details_keys = list(self.evaluators.keys()) self.details = {evaluator_key: [] for evaluator_key in details_keys} self.sample_text = [] self.sample_file_names = [] @@ -124,7 +127,6 @@ def get_output_folder(self, stage, epoch): output_folder.mkdir(parents=True, exist_ok=True) return output_folder - def evaluate(self, dataset): """Runs evaluation on a dataset @@ -139,17 +141,18 @@ def evaluate(self, dataset): raise ValueError("Unable to recover the checkpoint") self.modules.model.eval() if self.hparams.eval_samples is not None: - dataset = dataset.filtered_sorted(select_n=self.hparams.eval_samples) - loader = sb.dataio.dataloader.make_dataloader(dataset, batch_size=self.hparams.batch_size) + dataset = dataset.filtered_sorted( + select_n=self.hparams.eval_samples + ) + loader = sb.dataio.dataloader.make_dataloader( + dataset, batch_size=self.hparams.batch_size + ) loader_it = iter(loader) 
self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] details_keys = list(self.evaluators.keys()) - self.details = { - evaluator_key: [] - for evaluator_key in details_keys - } + self.details = {evaluator_key: [] for evaluator_key in details_keys} self.read_reports() self.sample_text = [] self.sample_file_names = [] @@ -187,7 +190,7 @@ def create_reports(self): "vocoder_flops", "total_flops", "total_flops_per_step", - ] + ], ) self.perf_writer.writeheader() @@ -221,18 +224,14 @@ def vocoder(self, infer_out, emb): with flop_counter: wav = self.create_waveform_fn( - infer_out.audio, - length=infer_out.length, - emb=emb + infer_out.audio, length=infer_out.length, emb=emb ) if wav.dim() > 2: wav = wav.squeeze(1) if self.hparams.eval_perf: flops = flop_counter.get_total_flops() - stats = { - "vocoder_flops": flops - } + stats = {"vocoder_flops": flops} return wav, stats def read_reports(self): @@ -245,7 +244,10 @@ def read_reports(self): reader = csv.DictReader(report_file) for row in reader: del row["uttid"] - row = {key : handle_number(value) for key, value in row.items()} + row = { + key: handle_number(value) + for key, value in row.items() + } self.details[evaluator_key].append(row) def get_report_columns(self, evaluator_key): @@ -262,7 +264,7 @@ def get_report_columns(self, evaluator_key): a list of column headers """ bogus_wavs = torch.randn(2, 10000, device=self.device) - bogus_length = torch.tensor([1., 1.], device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) if evaluator_key in self.evaluators: evaluator = self.evaluators[evaluator_key] result = evaluator.evaluate( @@ -294,21 +296,14 @@ def evaluate_batch(self, batch): self.hparams.sample_rate, self.hparams.model_sample_rate, ) - mel_spec = self.spk_emb_model.mel_spectogram( - audio=audio_resampled - ) + mel_spec = self.spk_emb_model.mel_spectogram(audio=audio_resampled) spk_emb = self.spk_emb_model.encode_mel_spectrogram_batch( mel_spec, batch.sig.lengths ).squeeze(1) infer_out, perf_stats = self.infer( - tokens=tokens, tokens_length=tokens_length, - emb={ - "spk": spk_emb - } - ) - wav, vocoder_stats = self.vocoder( - infer_out, spk_emb + tokens=tokens, tokens_length=tokens_length, emb={"spk": spk_emb} ) + wav, vocoder_stats = self.vocoder(infer_out, spk_emb) perf_stats.update(vocoder_stats) length = infer_out.length if wav.dim() > 2: @@ -324,7 +319,7 @@ def evaluate_batch(self, batch): wavs_ref=batch.sig.data, length_ref=batch.sig.lengths, sample_rate_ref=self.hparams.sample_rate, - sample_rate=self.hparams.model_sample_rate + sample_rate=self.hparams.model_sample_rate, ) details = undo_batch(result.details) self.write_result(evaluator_key, batch.uttid, details) @@ -332,8 +327,12 @@ def evaluate_batch(self, batch): if self.hparams.eval_perf: perf_stats.update(vocoder_stats) - perf_stats["total_flops"] = perf_stats["vocoder_flops"] + perf_stats["infer_flops"] - perf_stats["total_flops_per_step"] = perf_stats["total_flops"] / perf_stats["steps"] + perf_stats["total_flops"] = ( + perf_stats["vocoder_flops"] + perf_stats["infer_flops"] + ) + perf_stats["total_flops_per_step"] = ( + perf_stats["total_flops"] / perf_stats["steps"] + ) self.write_perf_stats(batch.uttid, perf_stats) def write_result(self, evaluator_key, uttid, details): @@ -354,9 +353,7 @@ def write_result(self, evaluator_key, uttid, details): "uttid": uttid, **details_item, } - writer.writerow( - ascii_only(flatten(report_details)) - ) + writer.writerow(ascii_only(flatten(report_details))) 
self.report_files[evaluator_key].flush() def save_samples(self, batch, wav, length): @@ -375,12 +372,12 @@ def save_samples(self, batch, wav, length): for item_id, infer_wav, wav_length in zip( batch.uttid, wav, wav_length_abs ): - file_name = str( - self.samples_folder / f"{item_id}_pred.wav" - ) - infer_wav_cut = infer_wav[:wav_length.item()].cpu() + file_name = str(self.samples_folder / f"{item_id}_pred.wav") + infer_wav_cut = infer_wav[: wav_length.item()].cpu() sb.dataio.dataio.write_audio( - file_name, infer_wav_cut, samplerate=self.hparams.model_sample_rate + file_name, + infer_wav_cut, + samplerate=self.hparams.model_sample_rate, ) self.sample_file_names.append(file_name) @@ -392,28 +389,22 @@ def write_summary(self): json.dump(summary, output_file, indent=4) def write_perf_stats(self, uttid, details): - self.perf_writer.writerow( - { - "uttid": " ".join(uttid), - **details - } - ) + self.perf_writer.writerow({"uttid": " ".join(uttid), **details}) self.perf_file.flush() - def compute_summary(self): """Computes the summarized statistics""" return { f"{evaluator_key}_{stat_key}": value for evaluator_key in self.enabled_evaluators if evaluator_key in self.details - for metric_key in self.hparams.eval_summary[evaluator_key]["descriptive"] + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] for stat_key, value in descriptive_statistics( - items=self.details[evaluator_key], - key=metric_key, + items=self.details[evaluator_key], key=metric_key, ).items() } - def flatten(value): @@ -436,18 +427,15 @@ def flatten(value): RE_PUNCTUATION = re.compile( - "|".join( - re.escape(char) for char in string.punctuation - ) + "|".join(re.escape(char) for char in string.punctuation) ) -RE_NON_ASCII = re.compile(r'[^\x00-\x7F]+') +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") def ascii_only(values): return { - key: RE_NON_ASCII.sub('', value) if isinstance(value, str) - else value + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value for key, value in values.items() } @@ -494,7 +482,7 @@ def audio_ref_pipeline(wav): def descriptive_statistics(items, key): """Computes descriptive statistics for the summary - + Arguments --------- items : list @@ -515,8 +503,7 @@ def descriptive_statistics(items, key): "iqr": q3 - q1, } return { - f"{key}_{stat_key}": value.item() - for stat_key, value in stats.items() + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() } @@ -562,4 +549,3 @@ def handle_number(value): elif RE_FLOAT.match(value): value = float(value) return value - diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml index 3626079ef..2cbca90fb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -233,7 +233,7 @@ spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class asr_model: !name:benchmarks.DASB.model.Tokotron.TransformerASRGuide source: !ref savedir: !ref /asr-transformer - + # Dataloader options train_dataloader_opts: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index c6875498c..2d91a521e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -143,7 +143,7 @@ token_model: 
!new:benchmarks.DASB.model.custom_model.DACFeatureExtractor spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - + # Dataloader options train_dataloader_opts: @@ -257,7 +257,7 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d z_dim: !ref hidden_dim: !ref n_dim: !ref - decoder_chunk_size: !ref + decoder_chunk_size: !ref nhead: !ref enc_num_layers: !ref dec_num_layers: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index c1c2f9f1c..2ecb72a84 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -224,7 +224,7 @@ spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class pymodule_file: custom_interface.py classname: DiscreteSpkEmb overrides: - ssl_layer_num_selected: !ref + ssl_layer_num_selected: !ref # Dataloader options train_dataloader_opts: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 0fed45124..1f3764ceb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -145,7 +145,7 @@ use_silence_padding: True spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - + # Dataloader options train_dataloader_opts: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 1711b10f4..2de6e121e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -139,7 +139,7 @@ token_model: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerInterface spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - + # Dataloader options train_dataloader_opts: @@ -250,7 +250,7 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d z_dim: !ref hidden_dim: !ref n_dim: !ref - decoder_chunk_size: !ref + decoder_chunk_size: !ref nhead: !ref enc_num_layers: !ref dec_num_layers: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index e19cf3eba..3df858844 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -28,7 +28,7 @@ get_silence_token, use_silence_padding, feature_pad_to, -) +) from types import SimpleNamespace from evaluate import TokotronEvaluator import re @@ -60,7 +60,9 @@ def __init__( create_waveform_fn=self.create_waveform, device=self.device, ) - self.representation_mode = RepresentationMode(self.hparams.representation_mode) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) def create_waveform(self, audio, length, emb): """Creates a waveform from a discrete or continuous audio @@ -104,7 +106,7 @@ def compute_forward(self, batch, stage): audio_bos_length, audio_tgt, audio_tgt_length, - spk_emb + spk_emb, ) = features predictions = self.modules.model( @@ -112,9 +114,7 @@ def compute_forward(self, batch, stage): input_length=tokens_length, 
audio=audio_bos, audio_length=audio_bos_length, - emb={ - "spk": spk_emb - } + emb={"spk": spk_emb}, ) return predictions, features @@ -136,13 +136,10 @@ def prepare_features(self, batch): 1, 2, 0, 3 ) batch_size, _, heads, dim = audio.shape - bos = torch.zeros_like( - audio[:, :1, :, :] - ).reshape(batch_size, self.hparams.bos_width, heads, dim) - audio_bos = torch.concatenate( - [bos, audio], - dim=1 + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim ) + audio_bos = torch.concatenate([bos, audio], dim=1) audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) audio_tgt = audio audio_tgt_length = audio_length @@ -150,8 +147,7 @@ def prepare_features(self, batch): return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length, spk_emb def _compute_spk(self, wav, wav_length): - mel_spec = self.spk_emb_model.mel_spectogram( - wav.squeeze(1)) + mel_spec = self.spk_emb_model.mel_spectogram(wav.squeeze(1)) spk_emb_pred = self.spk_emb_model.encode_mel_spectrogram_batch( mel_spec, wav_length ) @@ -159,12 +155,14 @@ def _compute_spk(self, wav, wav_length): def _get_selected_layer_idx(self): selected_layers = None - if hasattr(self.hparams, "select_layers") and self.hparams.select_layers: + if ( + hasattr(self.hparams, "select_layers") + and self.hparams.select_layers + ): layers = self.hparams.select_layers model_layers_map = { layer: idx - for idx, layer in enumerate( - self.hparams.token_model_layers) + for idx, layer in enumerate(self.hparams.token_model_layers) } selected_layers = [model_layers_map[layer] for layer in layers] return selected_layers @@ -214,7 +212,7 @@ def compute_objectives(self, predictions, batch, stage): audio_bos_length, audio_tgt, audio_tgt_length, - spk_emb + spk_emb, ) = features loss_details = self.hparams.compute_cost( @@ -246,7 +244,9 @@ def on_stage_start(self, stage, epoch): The currently-starting epoch. This is passed `None` during the test stage. 
""" - if hasattr(self.modules, "vocoder") and hasattr(self.modules.vocoder, "model"): + if hasattr(self.modules, "vocoder") and hasattr( + self.modules.vocoder, "model" + ): self.modules.vocoder.model.device = self.device self.layer_idx = self._get_selected_layer_idx() self.loss_metric = sb.utils.metric_stats.MultiMetricStats( @@ -273,7 +273,9 @@ def on_stage_start(self, stage, epoch): self.spk_emb_model = self.hparams.spk_emb_model( run_opts=pretrained_run_opts ) - self.representation_mode = RepresentationMode(self.hparams.representation_mode) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) # If speaker embedding shuffling is enabled, re-initialize them for the # epoch if self.hparams.spk_emb_shuffle: @@ -370,7 +372,7 @@ def evaluate_batch(self, batch, stage): loss = self.compute_objectives(out, batch, stage=stage) if self.is_evaluating: self.evaluator.evaluate_batch(batch) - return loss.detach().cpu() + return loss.detach().cpu() def make_dataloader( self, dataset, stage, ckpt_prefix="dataloader-", **loader_kwargs @@ -398,7 +400,9 @@ def make_dataloader( ------- DataLoader for the input dataset """ - if stage == sb.Stage.TRAIN and not getattr(self, "_ckpt_recovered", False): + if stage == sb.Stage.TRAIN and not getattr( + self, "_ckpt_recovered", False + ): self.checkpointer.recover_if_possible() self._ckpt_recovered = True if self.guides_running(pre_epoch=True): @@ -407,7 +411,7 @@ def make_dataloader( dataset=dataset, stage=stage, ckpt_prefix=ckpt_prefix, - **loader_kwargs + **loader_kwargs, ) def guides_running(self, pre_epoch=False): @@ -477,7 +481,6 @@ def fit_batch(self, batch): return loss - INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} @@ -491,7 +494,7 @@ def dataio_prepare(hparams, guide_ctx=None): hparams : dict This dictionary is loaded from the `train.yaml` file, and it includes all the hyperparameters needed for dataset construction and loading. 
- + guide_ctx : SimpleNamespace, optional The guide context with pretrained models @@ -546,19 +549,18 @@ def text_pipeline(label): yield label.upper() label_norm_eval = RE_PUNCTUATION.sub("", label_norm) yield label_norm_eval - @sb.utils.data_pipeline.takes(input_feature) @sb.utils.data_pipeline.provides("tokens") def tokens_pipeline(label): """Processes the transcriptions to generate proper labels""" return label_encoder.encode_sequence_torch(label) - + @sb.utils.data_pipeline.takes("label_norm") @sb.utils.data_pipeline.provides("asr_tokens") def asr_tokens_pipeline(label): """Processes the transcriptions to generate proper labels""" - return torch.tensor(guide_ctx.asr_model.encode(label)) + return torch.tensor(guide_ctx.asr_model.encode(label)) use_silence_padding = hparams.get("use_silence_padding", True) if "token_model_layers" in hparams: @@ -577,7 +579,11 @@ def asr_tokens_pipeline(label): * hparams["eos_index"] ) - silence_padding = silence_token if representation_mode == RepresentationMode.DISCRETE else silence_emb + silence_padding = ( + silence_token + if representation_mode == RepresentationMode.DISCRETE + else silence_emb + ) silence_padding = silence_padding.cpu() silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) @@ -585,14 +591,18 @@ def asr_tokens_pipeline(label): torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] ) if representation_mode == RepresentationMode.CONTINUOUS: - audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat(1, 1, hparams["audio_dim"]) + audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat( + 1, 1, hparams["audio_dim"] + ) tokens_loader = hparams.get("tokens_loader") @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") def audio_pipeline(id): - audio = tokens_loader.tokens_by_uttid(id, num_codebooks=audio_tokens_per_step) + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=audio_tokens_per_step + ) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -613,7 +623,7 @@ def spk_emb_random_match(uttid, dataset, spk_sample): text_pipeline, tokens_pipeline, audio_ref_pipeline, - audio_pipeline + audio_pipeline, ] output_keys = [ "uttid", @@ -628,7 +638,11 @@ def spk_emb_random_match(uttid, dataset, spk_sample): resample_fn = {} for dataset in data_info: - dataset_output_keys = output_keys if dataset == "train" else output_keys + ["label_norm_eval"] + dataset_output_keys = ( + output_keys + if dataset == "train" + else output_keys + ["label_norm_eval"] + ) dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( json_path=data_info[dataset], replacements={"data_root": data_folder}, @@ -639,10 +653,7 @@ def spk_emb_random_match(uttid, dataset, spk_sample): datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False if hparams["spk_emb_shuffle"]: - spk_idx, spk_samplers = group_by_speaker( - dynamic_dataset, - hparams - ) + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) spk_sample = {} spk_emb_random_match_pipeline = partial( spk_emb_random_match, @@ -659,7 +670,7 @@ def spk_emb_random_match(uttid, dataset, spk_sample): spk_idx=spk_idx, sample=spk_sample, dataset=dynamic_dataset, - spk_samplers=spk_samplers + spk_samplers=spk_samplers, ) resample_fn[dataset](epoch=0) @@ -689,9 +700,7 @@ def spk_emb_random_match(uttid, dataset, spk_sample): if hparams["input"] == "phonemes": for key in datasets: datasets[key] = 
datasets[key].filtered_sorted( - key_test={ - "phn": lambda value: value - } + key_test={"phn": lambda value: value} ) datasets["sample"] = select_sample(hparams, datasets) return datasets, silence_padding, resample_fn @@ -748,7 +757,7 @@ def group_by_speaker(dataset, hparams): the dataset from which to select items hparams : dict hyperparameters - + Returns ------- spk_idx : dict @@ -929,14 +938,14 @@ def get_guide_ctx(hparams, run_opts): """Initializes a context object for guides, containing pretrained models only for guides that will be used per hparams - + Arguments --------- - hparams : dict + hparams : dict Hyperparameters run_opts : dict Run options - + Returns ------- ctx : SimpleNamespace @@ -960,7 +969,6 @@ def get_guide_ctx(hparams, run_opts): ) - def run_experiment(brain_cls): """Starts the experiement @@ -1014,14 +1022,16 @@ def run_experiment(brain_cls): "save_json_train": hparams["train_json"], "save_json_valid": hparams["valid_json"], "save_json_test": ( - hparams["test_json"] if "test" in hparams["splits"] + hparams["test_json"] + if "test" in hparams["splits"] else None ), "sample_rate": hparams["sample_rate"], "train_split": hparams["train_split"], "valid_split": hparams["valid_split"], "test_split": ( - hparams["test_split"] if "test" in hparams["splits"] + hparams["test_split"] + if "test" in hparams["splits"] else None ), "seed": hparams["seed"], @@ -1031,11 +1041,9 @@ def run_experiment(brain_cls): # We can now directly create the datasets for training, valid, and test guide_ctx = get_guide_ctx(hparams, run_opts) - ( - datasets, - silence_padding, - resample_fn - ) = dataio_prepare(hparams, guide_ctx) + (datasets, silence_padding, resample_fn) = dataio_prepare( + hparams, guide_ctx + ) # Apply overfit test settings datasets = apply_overfit_test(hparams, datasets) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py index ad2f5bf0c..87de6f84b 100644 --- a/benchmarks/DASB/LibriTTS/extraction/extract.py +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -35,7 +35,7 @@ overrides=overrides, ) - # Dataset prep (parsing Librispeech + # Dataset prep (parsing Librispeech from libritts_prepare import prepare_libritts # noqa # multi-gpu (ddp) save data preparation diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 1cf092a46..ecc5a7e34 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -34,9 +34,12 @@ has_transformers = False try: from transformers import AutoModelForAudioXVector + has_transformers = True except ImportError: - logger.warning("transformers library not found - some evaluators may be disabled") + logger.warning( + "transformers library not found - some evaluators may be disabled" + ) RE_PUNCTUATION = re.compile( @@ -850,10 +853,7 @@ def __init__( sample_rate=16000, ): super().__init__(sample_rate=sample_rate) - self.model = UTMOSModel( - source=source, - save_path=save_path, - ) + self.model = UTMOSModel(source=source, save_path=save_path,) if run_opts is not None: device = run_opts.get("device") if device: @@ -930,6 +930,7 @@ class SpkSimWavLM(SpeechEvaluator): The sample rate to which all samples will be resampled before being processed """ + def __init__( self, source, @@ -937,7 +938,7 @@ def __init__( model_sample_rate=16000, run_opts=None, *args, - **kwargs + **kwargs, ): if not has_transformers: raise ValueError( @@ -948,9 +949,7 @@ def __init__( run_opts = {} device = run_opts.get("device") self.model = 
AutoModelForAudioXVector.from_pretrained( - source, cache_dir=savedir, - *args, - **kwargs + source, cache_dir=savedir, *args, **kwargs ) if device is not None: self.model = self.model.to(device) @@ -972,15 +971,13 @@ def evaluate( # Resample if sample_rate is not None: wavs = torchaudio.functional.resample( - wavs, - orig_freq=sample_rate, - new_freq=self.model_sample_rate + wavs, orig_freq=sample_rate, new_freq=self.model_sample_rate ) if sample_rate_ref is not None: wavs_ref = torchaudio.functional.resample( wavs_ref, orig_freq=sample_rate_ref, - new_freq=self.model_sample_rate + new_freq=self.model_sample_rate, ) # Concatenate @@ -989,14 +986,8 @@ def evaluate( length_abs = length * wavs_max_len length_ref_abs = length_ref * wavs_ref_max_len max_len = max(wavs_max_len, wavs_ref_max_len) - wavs, _ = pad_right_to( - wavs, - (batch_size, max_len) - ) - wavs_ref, _ = pad_right_to( - wavs_ref, - (batch_size, max_len) - ) + wavs, _ = pad_right_to(wavs, (batch_size, max_len)) + wavs_ref, _ = pad_right_to(wavs_ref, (batch_size, max_len)) audio = torch.cat([wavs, wavs_ref]) length_cat_abs = torch.cat([length_abs, length_ref_abs]) @@ -1015,10 +1006,7 @@ def evaluate( hyp_embs, ref_embs, dim=-1 ) - return SpeechEvaluationResult( - scores, - {"score": scores} - ) + return SpeechEvaluationResult(scores, {"score": scores}) def vocoder_to_device(vocoder, device): From f982325bff464753f376094e95c7285a99590f69 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 15 Jan 2025 14:37:50 -0500 Subject: [PATCH 056/270] DASB: More cosmetic changes from linters --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 15 ++- .../hparams/train_continuous_ssl.yaml | 10 +- .../TTS/tokotron/hparams/train_dac.yaml | 13 +-- .../tokotron/hparams/train_discrete_ssl.yaml | 55 ++------- .../TTS/tokotron/hparams/train_encodec.yaml | 16 +-- .../hparams/train_speech_tokenizer.yaml | 6 +- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 33 ------ .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 15 ++- .../hparams/train_continuous_ssl.yaml | 105 ++++-------------- .../TTS/tokotron/hparams/train_dac.yaml | 20 ++-- .../tokotron/hparams/train_discrete_ssl.yaml | 53 ++++----- .../TTS/tokotron/hparams/train_encodec.yaml | 36 +++--- .../hparams/train_speech_tokenizer.yaml | 18 +-- .../LibriTTS/extraction/hparams/encodec.yaml | 1 - benchmarks/DASB/model/Tokotron.py | 2 - benchmarks/DASB/utils/eval.py | 1 - 16 files changed, 127 insertions(+), 272 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index bad9ce7c1..98b2bb00d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -15,17 +15,16 @@ eval_utmos_model_name: utmos.ckpt eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main eval_utmos_domain_id: null eval_utmos_judge_id: null -eval_perf: false +eval_perf: False eval_utmos: !name:eval.UTMOSSpeechEvaluator - source: !ref - save_path: !ref - model_name: !ref - model_url: !ref - domain_id: !ref - judge_id: !ref - + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref eval_asr: !apply:speechbrain.utils.hparams.choice value: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml index 087eb6cf9..9c0b98d3b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -310,8 +310,8 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 240b57a7d..4c4f03689 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -41,7 +41,7 @@ samples_interval: 5 tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref token_model_kwargs: n_quantizers: !ref @@ -194,11 +194,11 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line infer_max_audio_length: !ref tokenizer: !new:utils.tokenizer_interface.DACTokenizer - model_type: !ref - model_bitrate: !ref - n_codebooks: !ref - load_pretrained: True - tag: latest + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest modules: @@ -238,4 +238,3 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index e3b549549..e14c1ce9d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -4,23 +4,18 @@ # ############################################################################ experiment_name: tokotron/discrete_ssl - # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] - # Model Type ssl_model_type: wavlm representation_mode: discrete - output_folder: !ref results/tokotron/// save_folder: !ref /save train_log: !ref /train_log.txt - - # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/discrete- +data_folder: !PLACEHOLDER +prepare_save_folder: !ref /prepared/discrete- # e.g., /path/to/LibriSpeech pretrained_model_save_folder: !ref vocoder_model_name: !ref unithifigan-dasb--discrete vocoder_model_path: !ref / @@ -37,12 +32,9 @@ progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 - -tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
- +tokens_folder: !PLACEHOLDER tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref - + data_path: !ref freeze_token_model: True token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref @@ -50,7 +42,6 @@ token_model_src: !apply:speechbrain.utils.hparams.choice wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k wav2vec2: facebook/wav2vec2-large-960h-lv60-self - g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint @@ -67,15 +58,10 @@ vocoder_src: !apply:speechbrain.utils.hparams.choice wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False - vocoder_available_layers: [1, 3, 7, 12, 18, 23] - splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - ckpt_interval_minutes: 30 # save checkpoint every N min - # Training parameters input: text number_of_epochs: 50 @@ -88,8 +74,6 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - - # index pad_index: 0 bos_index: 0 @@ -97,7 +81,6 @@ bos_width: 1 eos_index: 0 eos_width: 1 audio_token_shift: 0 - # stages related parameters lr: 0.0005 lr_warmup_steps: 10000 @@ -109,25 +92,20 @@ gate_threshold: 0.5 gate_loss_beta: 0.2 gate_loss_gamma: 0.01 gate_loss_max_weight: 1. - # Inference parameters eos_mode: gate decoder_mode: autoregressive scale_factor: 4 - # Beam Search-specific parameters min_decode_ratio: 1.0 max_decode_ratio: 10.0 beam_size: 5 - - # Feature parameters sample_rate: 22050 model_sample_rate: 16000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 - # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder token_list_file_text: ./hparams/char_en.txt @@ -137,17 +115,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref - # Gate offset gate_offset: !apply:Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref - silence_padding: !ref use_silence_padding: True - - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -167,12 +141,9 @@ ssl_model: !apply:speechbrain.utils.hparams.choice save_path: !ref freeze: !ref output_all_hiddens: True - - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -181,32 +152,26 @@ train_dataloader_opts: collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - valid_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - test_dataloader_opts: batch_size: 1 num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - sample_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - token_model_kwargs: SSL_layers: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 @@ -264,7 +229,6 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- decoder_mode: !ref scale_factor: !ref representation_mode: discrete - modules: model: !ref vocoder: !ref @@ -272,6 +236,7 @@ modules: ssl_model: !ref # define two optimizers here for two-stage training + opt_class: !name:torch.optim.Adam 
lr: !ref @@ -314,8 +279,8 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 0082e20db..7ccd9d716 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -41,7 +41,7 @@ samples_interval: 5 tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -220,10 +220,10 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ec6de9bb2..39c394d71 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -41,7 +41,7 @@ samples_interval: 5 tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref splits: ["train", "valid", "test"] @@ -240,5 +240,5 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 0a75a3482..439869651 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -11,14 +11,12 @@ import json import logging import math -import sys import csv import torch import torchaudio import string import re from pathlib import Path -from hyperpyyaml import load_hyperpyyaml from types import SimpleNamespace from torch.nn import ModuleDict from tqdm.auto import tqdm @@ -507,37 +505,6 @@ def descriptive_statistics(items, key): } -def select_subset(dataset, hparams): - """Selects a subset of the dataset provided, if specified. 
- The selection is controlled by a hyperparameter named - eval_subset, which is expected to list the IDs of the - data items on which evaluation will take place, one per line - - Arguments - --------- - dataset : speechbrain.dataio.dataset.DynamicItemDataset - A dataset - hparams : dict - A hyperparameters file - - Returns - ------- - subset : dataset - The dataset, filtered down if applicable - """ - eval_subset_path = hparams.get("eval_subset") - if eval_subset_path is not None: - eval_subset_path = Path(eval_subset_path) - if not eval_subset_path.exists(): - raise ValueError(f"eval_subset {eval_subset_path} does not exist") - with open(eval_subset_path) as eval_subset_file: - eval_subset_ids = [line.strip() for line in eval_subset_file] - subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) - else: - subset = dataset - return subset - - RE_INTEGER = re.compile(r"^-?\d+$") RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index a4c8b6b59..94fb319c8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -21,7 +21,7 @@ eval_utmos_model_name: utmos.ckpt eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main eval_utmos_domain_id: null eval_utmos_judge_id: null -eval_perf: false +eval_perf: False eval_asr: !apply:speechbrain.utils.hparams.choice @@ -39,12 +39,12 @@ eval_asr: !apply:speechbrain.utils.hparams.choice savedir: !ref eval_utmos: !name:eval.UTMOSSpeechEvaluator - source: !ref - save_path: !ref - model_name: !ref - model_url: !ref - domain_id: !ref - judge_id: !ref + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref eval_spk_sim: !name:eval.SpkSimWavLM source: !ref @@ -63,4 +63,3 @@ eval_summary: descriptive: ["utmos"] spk_sim: descriptive: ["score"] - diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml index 2cbca90fb..08ddc0984 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -9,10 +9,9 @@ __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt - # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -data_folder_alignments: null +data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref ssl_model_type: wavlm @@ -26,11 +25,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -41,15 +40,12 @@ progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 - # Position shift use_position_shift: True max_position_shift: 1000 
position_shift_seed: 42 position_shift_probability: 1.0 - freeze_token_model: True - token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -91,13 +87,9 @@ spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite asr_src: speechbrain/asr-transformer-transformerlm-librispeech spk_emb_shuffle: True - splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - ckpt_interval_minutes: 30 # save checkpoint every N min - # Training parameters input: text number_of_epochs: 1000 @@ -113,8 +105,6 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - - # index pad_index: 0 bos_index: 0 @@ -122,7 +112,6 @@ bos_width: 1 eos_index: 0 eos_width: 1 audio_token_shift: 0 - # stages related parameters lr: 0.0005 lr_warmup_steps: 10000 @@ -134,29 +123,23 @@ gate_threshold: 0.5 gate_loss_beta: 0.2 gate_loss_gamma: 0.01 gate_loss_max_weight: 1. - # Inference parameters inference_mode: autoregressive eos_mode: gate decoder_mode: autoregressive scale_factor: 4 - # Embedding Injection spk_emb_injection: null - # Beam Search-specific parameters min_decode_ratio: 1.0 max_decode_ratio: 10.0 beam_size: 5 - - # Feature parameters sample_rate: 24000 model_sample_rate: 16000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 - # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder token_list_file_text: ./hparams/char_en.txt @@ -166,16 +149,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref - # Gate offset gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref - silence_padding: !ref use_silence_padding: True - # Guides guides_enabled: False guides_start_epoch: 40 @@ -184,9 +164,6 @@ guides_spk_discrete: True guides_spk_loss_weight: 0.2 guides_asr: True guides_asr_loss_weight: 0.1 - - - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -206,8 +183,6 @@ ssl_model: !apply:speechbrain.utils.hparams.choice save_path: !ref freeze: !ref output_all_hiddens: True - - token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL ssl_model: !ref kmeans_repo_id: !ref @@ -217,11 +192,9 @@ token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl layers_num: !apply:benchmarks.DASB.utils.hparams.as_list value: !ref dtype: int - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class source: !ref savedir: !ref /ecapa- @@ -229,13 +202,12 @@ spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class classname: DiscreteSpkEmb overrides: ssl_layer_num_selected: !ref - asr_model: !name:benchmarks.DASB.model.Tokotron.TransformerASRGuide source: !ref - savedir: !ref /asr-transformer + savedir: !ref /asr-transformer +# Dataloader options -# Dataloader options train_dataloader_opts: batch_size: !ref shuffle: True @@ -243,28 +215,24 @@ train_dataloader_opts: collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - valid_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - test_dataloader_opts: batch_size: 1 num_workers: !ref collate_fn: 
!name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - sample_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - token_model_kwargs: SSL_layers: !apply:benchmarks.DASB.utils.hparams.as_list value: !ref @@ -275,8 +243,6 @@ token_model_kwargs: bpe_tokenizers: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers layers: !ref value: null - - extract_features_opts: dataloader_opts: batch_size: !ref @@ -292,8 +258,6 @@ extract_features_opts: model_sample_rate: !ref spk_emb_model: !ref data_folder_alignments: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 @@ -331,9 +295,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 6 attention_type: regularMHA - ############################## models ################################ - vocoder_layers: !apply:benchmarks.DASB.utils.hparams.as_list value: !apply:speechbrain.utils.hparams.choice value: !ref @@ -341,28 +303,24 @@ vocoder_layers: !apply:benchmarks.DASB.utils.hparams.as_list choices: null: !ref dtype: int - vocoder_discrete: !name:benchmarks.DASB.model.custom_model.GumbelUnitVocoderWrapper - model: !name:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref - available_layers: !ref - layers: !ref - num_units: !ref - offset: !ref - + model: !name:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams + source: !ref + savedir: !ref + available_layers: !ref + layers: !ref + num_units: !ref + offset: !ref vocoder_continuous: !name:benchmarks.DASB.model.custom_model.VocoderWrapper model: !name:speechbrain.inference.vocoders.HIFIGAN.from_hparams source: !ref savedir: !ref - vocoder: !apply:benchmarks.DASB.utils.hparams.choice value: !ref apply: True choices: discrete: !ref continuous: !ref - inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference bos_index: !ref eos_index: !ref @@ -372,28 +330,24 @@ inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference using_eos_threshold: False length_normalization: True audio_token_shift: !ref - inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference scale_factor: !ref gate_threshold: !ref eos_mode: !ref representation_mode: !ref - inference: !apply:speechbrain.utils.hparams.choice value: !ref choices: search: !ref forward: !ref - emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: True - injection: !ref - -model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: True + injection: !ref +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref @@ -430,16 +384,13 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d position_shift_seed: !ref emb: !ref layerwise_renorm: !ref - modules: model: !ref vocoder: !ref compute_cost: !ref - # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref - compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref @@ -456,33 +407,25 @@ compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss spk_weight: !ref asr_weight: !ref representation_mode: !ref - - lr_annealing: 
!new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref - checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref - freezer: !new:benchmarks.DASB.utils.preparation.Freezer save_path: !ref archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref - train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport logger: !ref sample_rate: !ref eos_threshold: !ref - spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref \ No newline at end of file + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 2d91a521e..3333cfceb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -22,11 +22,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -219,8 +219,6 @@ attention_type: regularMHA vocoder: !new:benchmarks.DASB.model.custom_model.DACVocoder dac: !ref - - inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference bos_index: !ref eos_index: !ref @@ -243,10 +241,10 @@ inference: !apply:speechbrain.utils.hparams.choice forward: !ref emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 2ecb72a84..3ba568d94 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -9,10 +9,9 @@ __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt - # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -data_folder_alignments: null +data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref ssl_model_type: wavlm @@ -26,11 +25,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -42,10 +41,10 @@ progress_meta: 
!ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref # Position shift use_position_shift: True @@ -192,7 +191,6 @@ guides_asr: True guides_asr_loss_weight: 0.1 - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -270,8 +268,6 @@ extract_features_opts: model_sample_rate: !ref spk_emb_model: !ref data_folder_alignments: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 @@ -303,22 +299,18 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 6 attention_type: regularMHA - ############################## models ################################ - vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams source: !ref savedir: !ref - emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: True - injection: !ref - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: True + injection: !ref +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref @@ -342,16 +334,13 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- scale_factor: !ref representation_mode: !ref emb: !ref - modules: model: !ref vocoder: !ref compute_cost: !ref - # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref - compute_cost: !new:Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref @@ -389,8 +378,8 @@ spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 1f3764ceb..e766267e7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -24,11 +24,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -43,7 +43,7 @@ samples_interval: 5 tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
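# Hypothetical example (the path is an assumption; tokens are expected to be
# produced beforehand by the extraction recipes under
# benchmarks/DASB/LibriTTS/extraction):
# tokens_folder: /path/to/LibriTTS/extracted_tokens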
tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref freeze_token_model: True token_model_src: "facebook/encodec_24khz" @@ -205,11 +205,11 @@ attention_type: regularMHA ############################## models ################################ emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: !ref - injection: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref @@ -238,13 +238,13 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False modules: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 2de6e121e..97ab94275 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -24,11 +24,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -236,10 +236,10 @@ inference: !apply:speechbrain.utils.hparams.choice forward: !ref emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml index 31211ec75..d3cd83c3e 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -22,7 +22,6 @@ valid_json: !ref /dev-clean.json test_json: !ref /test.json - batch_size: 8 num_workers: 8 src_key: wav diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 266090be4..949840380 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -28,8 +28,6 @@ from speechbrain.nnet.linear import Linear from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss from speechbrain.dataio.dataio import length_to_mask -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.decoders.seq2seq import S2STransformerBeamSearcher from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index ecc5a7e34..9d5e8642f 100644 --- a/benchmarks/DASB/utils/eval.py +++ 
b/benchmarks/DASB/utils/eval.py @@ -15,7 +15,6 @@ from speechbrain.decoders.seq2seq import S2SWhisperGreedySearcher from speechbrain.dataio.batch import PaddedBatch from speechbrain.utils.metric_stats import ErrorRateStats -from speechbrain.utils.superpowers import run_shell from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.fetching import fetch from collections import namedtuple From 1357ff146a27cf83b34ab4f47b10e41d117f8376 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 16 Jan 2025 11:10:24 -0500 Subject: [PATCH 057/270] DASB: Tokotron: Relative paths --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 3 +++ benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 8da11247b..1b1dd4795 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -31,6 +31,9 @@ from Tokotron import RepresentationMode from evaluate import TokotronEvaluator +base_dir = str(Path(__file__).parent.parent.parent.parent) +sys.path.append(base_dir) + logger = logging.getLogger(__name__) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 3df858844..943727635 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -34,6 +34,8 @@ import re import string +base_dir = str(Path(__file__).parent.parent.parent.parent) +sys.path.append(base_dir) logger = logging.getLogger(__name__) From 958ee870bc83de36ec0d2db07f8abf76113c8b65 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 16 Jan 2025 11:30:44 -0500 Subject: [PATCH 058/270] DASB: Tokotron: Add choices for the model type --- .../LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 8 +++++++- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 7 ++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index e14c1ce9d..f96681a3a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -46,7 +46,13 @@ g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech -vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref token_offset: 1 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 3ba568d94..5b1a06b46 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -63,7 +63,12 @@ token_model_src: !apply:speechbrain.utils.hparams.choice g2p_src: flexthink/soundchoice-g2p kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech -vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +vocoder_repo_id: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref select_layers: null From 043eb9ca15d4c175a873747c0c8f0c04d4d6c546 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 16 Jan 2025 16:59:25 -0500 Subject: [PATCH 059/270] DASB: Tokotron: more clean-up --- .../DASB/LJSpeech/TTS/tokotron/Tokotron.py | 1 - .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 4 + .../hparams/train_continuous_ssl.yaml | 317 ------------- .../TTS/tokotron/hparams/train_dac.yaml | 12 +- .../tokotron/hparams/train_discrete_ssl.yaml | 70 ++- .../TTS/tokotron/hparams/train_encodec.yaml | 32 +- .../hparams/train_speech_tokenizer.yaml | 38 +- .../DASB/LJSpeech/TTS/tokotron/preparation.py | 1 - .../DASB/LJSpeech/TTS/tokotron/train.py | 46 +- .../TTS/tokotron/train_continuous_ssl.py | 45 -- .../DASB/LJSpeech/TTS/tokotron/train_dac.py | 45 -- .../TTS/tokotron/train_discrete_ssl.py | 77 ---- .../LJSpeech/TTS/tokotron/train_encodec.py | 44 -- .../TTS/tokotron/train_speech_tokenizer.py | 44 -- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 40 +- .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 5 + .../hparams/train_continuous_ssl.yaml | 431 ------------------ .../TTS/tokotron/hparams/train_dac.yaml | 47 +- .../tokotron/hparams/train_discrete_ssl.yaml | 57 +-- .../hparams/train_speech_tokenizer.yaml | 47 +- .../DASB/LibriTTS/TTS/tokotron/train.py | 92 ++-- .../TTS/tokotron/train_continuous_ssl.py | 47 -- .../DASB/LibriTTS/TTS/tokotron/train_dac.py | 47 -- .../TTS/tokotron/train_discrete_ssl.py | 79 ---- .../LibriTTS/TTS/tokotron/train_encodec.py | 46 -- .../TTS/tokotron/train_speech_tokenizer.py | 46 -- 26 files changed, 154 insertions(+), 1606 deletions(-) delete mode 120000 benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml delete mode 120000 benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py deleted file mode 120000 index 097a6d488..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py +++ /dev/null @@ -1 +0,0 @@ -../../../model/Tokotron.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index 98b2bb00d..8ca3fb8dd 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -48,3 +48,7 @@ eval_summary: descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] utmos: descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml deleted file mode 100644 index 9c0b98d3b..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ /dev/null @@ -1,317 +0,0 @@ -# ############################################################################ -# Model: Tokenized TTS (WhisperSpeech-inspired) -# Authors: Artem Ploujnikov -# ############################################################################ - -experiment_name: tokotron/continuous_ssl - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 74443 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/// -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Model type -ssl_model_type: wavlm -representation_mode: continuous - -# Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/continuous- -pretrained_model_save_folder: !ref -vocoder_model_name: !ref unithifigan-dasb--continuous -vocoder_model_path: !ref / -prepare_archive_path: null -prepare_skip_ignore_folders: False -train_json: !ref /train.json -valid_json: !ref /valid.json -test_json: !ref /test.json -frozen_split_path: null -sample_path: null -progress_folder: !ref /progress -progress_archive: !ref /progress.tar -progress_current: !ref /current -progress_meta: !ref /meta.yaml -num_audio_samples: 32 -samples_interval: 5 - -freeze_ssl_model: True -ssl_model_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: microsoft/wavlm-large - hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self - -g2p_src: speechbrain/soundchoice-g2p -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS -spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec -use_spk_emb: False - -vocoder_available_layers: [1, 3, 7, 12, 18, 23] - -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] - - -ckpt_interval_minutes: 30 # save checkpoint every N min - -# Training parameters -input: text -number_of_epochs: 50 -batch_size: 16 -grad_accumulation_factor: 1 -max_grad_norm: 0.01 -sorting: random -num_workers: 4 -skip_prep: False -overfit_test: False -overfit_test_sample_count: !ref -overfit_test_epoch_data_count: 1000 - - -# index -pad_index: 0 -bos_index: 0 -bos_width: 1 -eos_index: 0 -eos_width: 1 -audio_token_shift: 0 - -# stages related parameters -lr: 0.0005 -lr_warmup_steps: 10000 -lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. 
- -# Inference parameters -inference_mode: autoregressive -eos_mode: gate -decoder_mode: autoregressive -scale_factor: 4 - -# Beam Search-specific parameters -min_decode_ratio: 1.0 -max_decode_ratio: 10.0 -beam_size: 5 - - -# Feature parameters -sample_rate: 22050 -model_sample_rate: 16000 -max_audio_length: 1000 -infer_max_audio_length: !ref -debug_infer_max_audio_length: 10 - -# Label encoder -label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt -token_list_file: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref - phonemes: !ref - -# Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp - beta: !ref - gamma: !ref - max_weight: !ref - -silence_padding: !ref -use_silence_padding: True - - -# Token model (pretrained) -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - -spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams - source: !ref - savedir: !ref /ecapa - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref - -valid_dataloader_opts: - batch_size: !ref - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref - -test_dataloader_opts: - batch_size: 1 - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref - -sample_dataloader_opts: - batch_size: !ref - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref - -extract_features_opts: - dataloader_opts: - batch_size: !ref - ssl_model: !ref - ssl_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - - -####################### Model parameters ########################### -# Transformer -d_model: 512 -nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 -d_ffn: 2048 -transformer_dropout: 0.2 -target_dropout: 0.2 -activation: !name:torch.nn.GELU -audio_num_tokens: 1000 -audio_dim: 1024 -audio_emb_size: 128 -audio_emb_freeze: False -audio_emb_pretrained: False -audio_emb_lr: 0.00001 -audio_emb_weight_decay: 0.001 -text_num_tokens: 39 -phn_num_tokens: 52 -input_num_tokens: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref - phonemes: !ref -audio_tokens_per_step: 6 -attention_type: regularMHA - -############################## models ################################ - -vocoder: !apply:speechbrain.inference.vocoders.HIFIGAN.from_hparams - source: !ref - savedir: !ref - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref - audio_num_tokens: !ref - audio_tokens_per_step: !ref - d_model: !ref - d_ffn: !ref - nhead: !ref - enc_num_layers: !ref - dec_num_layers: !ref - dropout: !ref - target_dropout: !ref - activation: !ref - attention_type: !ref - gate_threshold: !ref - gate_offset: !ref - 
audio_emb_size: !ref - audio_emb_freeze: !ref - max_audio_length: !ref - eos_mode: !ref - infer_max_audio_length: !ref - audio_token_shift: !ref - decoder_mode: !ref - scale_factor: !ref - audio_dim: !ref - representation_mode: continuous - - -modules: - model: !ref - vocoder: !ref - compute_cost: !ref - ssl_model: !ref - -# define two optimizers here for two-stage training -opt_class: !name:torch.optim.Adam - lr: !ref - -compute_cost: !new:Tokotron.TokotronLoss - guided_attention_weight: !ref - guided_attention_sigma: !ref - gate_weight: !ref - gate_beta: !ref - gate_gamma: !ref - gate_max_weight: !ref - silence_padding: !ref - eos_mode: !ref - eos_index: !ref - eos_width: !ref - audio_tokens_per_step: !ref - audio_token_shift: !ref - representation_mode: continuous - - -lr_annealing: !new:Tokotron.TargetedNoamScheduler - lr_initial: [!ref , !ref ] - n_warmup_steps: !ref - param_group: 0 - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - lr_scheduler: !ref - counter: !ref - -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 4c4f03689..be20bfa63 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -14,8 +14,6 @@ train_log: !ref /train_log.txt token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" # Model type representation_mode: discrete @@ -105,7 +103,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -173,7 +171,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -209,7 +207,7 @@ modules: opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.Tokotron.oss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -229,10 +227,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index f96681a3a..555878c24 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -7,12 +7,14 @@ experiment_name: tokotron/discrete_ssl # 
Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] + # Model Type ssl_model_type: wavlm representation_mode: discrete output_folder: !ref results/tokotron/// save_folder: !ref /save train_log: !ref /train_log.txt + # Data files data_folder: !PLACEHOLDER prepare_save_folder: !ref /prepared/discrete- # e.g., /path/to/LibriSpeech @@ -47,27 +49,21 @@ token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech vocoder_repo_id: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - hubert: speechbrain/hifigan-hubert-k1000-LibriTTS - wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS - wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False -vocoder_available_layers: [1, 3, 7, 12, 18, 23] splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] ckpt_interval_minutes: 30 # save checkpoint every N min + # Training parameters input: text number_of_epochs: 50 @@ -80,6 +76,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 + # index pad_index: 0 bos_index: 0 @@ -87,6 +84,7 @@ bos_width: 1 eos_index: 0 eos_width: 1 audio_token_shift: 0 + # stages related parameters lr: 0.0005 lr_warmup_steps: 10000 @@ -98,20 +96,19 @@ gate_threshold: 0.5 gate_loss_beta: 0.2 gate_loss_gamma: 0.01 gate_loss_max_weight: 1. 
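# Descriptive note on the stop mechanism: the gate head predicts end-of-audio,
# and with eos_mode: gate decoding stops once the gate output exceeds
# gate_threshold. The beta/gamma/max_weight values above are assumed to
# parameterize the distance_diff_loss_ramp used to weight the gate loss.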
+ # Inference parameters eos_mode: gate decoder_mode: autoregressive scale_factor: 4 -# Beam Search-specific parameters -min_decode_ratio: 1.0 -max_decode_ratio: 10.0 -beam_size: 5 + # Feature parameters sample_rate: 22050 model_sample_rate: 16000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 + # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder token_list_file_text: ./hparams/char_en.txt @@ -121,13 +118,15 @@ token_list_file: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref + # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref silence_padding: !ref use_silence_padding: True + # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -150,6 +149,7 @@ ssl_model: !apply:speechbrain.utils.hparams.choice spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa + # Dataloader options train_dataloader_opts: batch_size: !ref @@ -178,6 +178,7 @@ sample_dataloader_opts: value: !ref token_model_kwargs: SSL_layers: !ref + ####################### Model parameters ########################### # Transformer d_model: 512 @@ -206,12 +207,7 @@ attention_type: regularMHA ############################## models ################################ -vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref - - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -235,18 +231,23 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- decoder_mode: !ref scale_factor: !ref representation_mode: discrete + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref - ssl_model: !ref - -# define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -262,7 +263,7 @@ compute_cost: !new:Tokotron.TokotronLoss representation_mode: discrete -lr_annealing: !new:Tokotron.TargetedNoamScheduler +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler lr_initial: [!ref , !ref ] n_warmup_steps: !ref param_group: 0 @@ -274,19 +275,8 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - -tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 7ccd9d716..3ab6eb770 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -14,9 +14,6 @@ train_log: !ref /train_log.txt token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" - # Model type representation_mode: discrete @@ -97,7 +94,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -162,7 +159,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -182,14 +179,24 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + modules: model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -209,21 +216,8 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - -tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 39c394d71..568f8c13e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -14,8 +14,6 @@ train_log: !ref /train_log.txt token_model_src: "fnlp/SpeechTokenizer" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" # Model type representation_mode: discrete @@ -99,7 +97,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -107,14 +105,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref # Token model (pretrained) -speech_tokenizer: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: !ref - save_path: !ref - -token_model: !new:Tokotron.SpeechTokenizerFeatureExtractor - speech_tokenizer: !ref - codebooks: !ref - # Dataloader options train_dataloader_opts: batch_size: !ref @@ 
-145,14 +135,6 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 @@ -181,7 +163,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -201,15 +183,19 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + modules: model: !ref - token_model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -229,16 +215,8 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py deleted file mode 120000 index 08621a288..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py +++ /dev/null @@ -1 +0,0 @@ -../../../utils/preparation.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 1b1dd4795..8c571babd 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -22,18 +22,19 @@ import string from pathlib import Path from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import clean_padding_ from speechbrain.utils.distributed import run_on_main -from Tokotron import ( - get_silence_token, - use_silence_padding, - feature_pad_to, -) -from Tokotron import RepresentationMode -from evaluate import TokotronEvaluator -base_dir = str(Path(__file__).parent.parent.parent.parent) +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) +from model.Tokotron import ( + get_silence_token, + use_silence_padding, + feature_pad_to, + RepresentationMode, +) # noqa: E402 +from evaluate import TokotronEvaluator # noqa: E402 logger = logging.getLogger(__name__) @@ -268,6 +269,16 @@ def on_stage_end(self, stage, stage_loss, epoch): stage_stats = {"loss": stage_loss, **loss_stats} if stage == sb.Stage.TRAIN: self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for 
key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) # Perform end-of-iteration things, like annealing, logging, etc. if stage == sb.Stage.VALID: @@ -292,9 +303,6 @@ def on_stage_end(self, stage, stage_loss, epoch): meta={"loss": stage_stats["loss"]}, min_keys=["loss"], ) - if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): - self.evaluator.on_evaluate_end() - def fit_batch(self, batch): """Fit one batch, override to do multiple updates. @@ -363,7 +371,9 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - raise NotImplementedError() + wav = self.modules.tokenizer.tokens_to_sig(audio) + clean_padding_(wav, length) + return wav def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed @@ -686,15 +696,7 @@ def apply_overfit_test(hparams, dataset): ) -def run_experiment(brain_cls): - """Starts the experiement - - Arguments - --------- - brain_cls : type - The brain class to instantiate - """ - +if __name__ == "__main__": # Reading command line arguments hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) @@ -757,7 +759,7 @@ def run_experiment(brain_cls): audio_keys = ["audio_pad", "audio_bos"] # Trainer initialization - tts_brain = brain_cls( + tts_brain = TokotronBrain( modules=hparams["modules"], opt_class=hparams["opt_class"], hparams=hparams, diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py deleted file mode 100644 index f3495eaca..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Continuous SSL verfsion - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronContinuousSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.vocoder(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronContinuousSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py deleted file mode 100644 index 83b9ff538..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - DAC version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDACBrain(TokotronBrain): - 
"""Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - z, _, _ = self.modules.tokenizer.quantizer.from_codes( - audio.transpose(1, 2).int() - ) - wav = self.modules.tokenizer.decode(z).squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDACBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py deleted file mode 100644 index aa2c57681..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Discrete SSL version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -import torch -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDiscreteSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def on_stage_start(self, stage, epoch): - self.compute_offset() - return super().on_stage_start(stage, epoch) - - def compute_offset(self): - """Computes per-layer offsets""" - layers_set = set(self.hparams.token_model_layers) - available_layers_set = set(self.hparams.vocoder_available_layers) - if not layers_set.issubset(available_layers_set): - unavailable_layers = ",".join( - str(layer) for layer in (layers_set - available_layers_set) - ) - raise ValueError(f"Layers {unavailable_layers} are not supported") - self.num_units = self.hparams.vocab_size - _, layers_idx = torch.where( - torch.tensor( - self.hparams.vocoder_available_layers, device=self.device - ).unsqueeze(0) - == torch.tensor( - self.hparams.token_model_layers, device=self.device - ).unsqueeze(1) - ) - self.layer_offset = ( - torch.tensor(layers_idx, device=self.device) * self.num_units - )[None, None, :] - self.offset = self.hparams.token_offset - self.modules.vocoder.tokenize = False - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - units_with_offset = ( - audio + self.layer_offset.to(audio.device) + self.offset - ) - wav = self.modules.vocoder(units_with_offset) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDiscreteSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py deleted file mode 100644 index 2168f970d..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - 
-However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronEncodecBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronEncodecBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py deleted file mode 100644 index bc51db78c..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronSTBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - if length is not None: - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronSTBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 439869651..99d547cc5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -124,41 +124,19 @@ def get_output_folder(self, stage, epoch): output_folder = output_folder / str(epoch) output_folder.mkdir(parents=True, exist_ok=True) return output_folder - - def evaluate(self, dataset): - """Runs evaluation on a dataset + + def on_evaluate_end(self): + """Invoked when evaluation starts Arguments --------- - dataset : speechbrain.dataio.dataset.DynamicItemDataset - a dataset + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
""" - logger.info("Recovering the checkpoint") - ckpt = self.hparams.checkpointer.recover_if_possible() - if not ckpt: - raise ValueError("Unable to recover the checkpoint") - self.modules.model.eval() - if self.hparams.eval_samples is not None: - dataset = dataset.filtered_sorted( - select_n=self.hparams.eval_samples - ) - loader = sb.dataio.dataloader.make_dataloader( - dataset, batch_size=self.hparams.batch_size - ) - loader_it = iter(loader) - self.create_reports() - self.modules.model.show_inference_progress = False - self.item_ids = [] - details_keys = list(self.evaluators.keys()) - self.details = {evaluator_key: [] for evaluator_key in details_keys} - self.read_reports() - self.sample_text = [] - self.sample_file_names = [] - self.ref_file_names = [] - logger.info("Starting evaluation") - batch_count = math.ceil(len(dataset) / self.hparams.batch_size) - for batch in tqdm(loader_it, desc="Evaluation", total=batch_count): - self.evaluate_batch(batch) self.write_summary() logger.info("Evaluation done") diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index 94fb319c8..bafd769cc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -63,3 +63,8 @@ eval_summary: descriptive: ["utmos"] spk_sim: descriptive: ["score"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + spk_sim: spk_sim_score_mean \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml deleted file mode 100644 index 08ddc0984..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ /dev/null @@ -1,431 +0,0 @@ -# ############################################################################ -# Model: Tokenized TTS (WhisperSpeech-inspired) -# Authors: Artem Ploujnikov -# ############################################################################ -# Seed needs to be set at top of yaml, before objects with parameters are made - -seed: 74443 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/transformer/ -save_folder: !ref /save -train_log: !ref /train_log.txt -# Data files -data_folder: !PLACEHOLDER -data_folder_alignments: null # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared -pretrained_model_save_folder: !ref -ssl_model_type: wavlm -representation_mode: discrete -vocoder_model_name: !ref unithifigan-dasb---ms -vocoder_model_path: !ref / -prepare_archive_path: null -prepare_skip_ignore_folders: False -data_mode: lite -train_json: !ref /train.json -valid_json: !ref /valid.json -test_json: !ref /test.json -train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] -valid_split: ["dev-clean"] -test_split: ["test-clean"] -frozen_split_path: null -sample_path: null -progress_folder: !ref /progress -progress_archive: !ref /progress.tar -progress_current: !ref /current -progress_meta: !ref /meta.yaml -num_audio_samples: 32 -samples_interval: 5 -# Position shift -use_position_shift: True -max_position_shift: 1000 -position_shift_seed: 42 -position_shift_probability: 1.0 -freeze_token_model: True -token_model_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: 
microsoft/wavlm-large - hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self -g2p_src: flexthink/soundchoice-g2p -token_model_kmeans_src: poonehmousavi/SSL_Quantization -token_model_kmeans_dataset: LibriSpeech-100-360-500 -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_model_layers: !ref -select_layers: null -token_offset: 1 -vocoder_src_discrete: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS -vocoder_src_continous: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - discrete: !ref - continuous: !ref -vocoder_available_layers: [1, 3, 7, 12, 18, 23] -vocoder_takes_spk_emb: True -spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec -spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: flexthink/discrete_wavlm_spk_rec_ecapatdn_lite - hubert: flexthink/discrete_hubert_spk_rec_ecapatdn_lite - wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite -asr_src: speechbrain/asr-transformer-transformerlm-librispeech -spk_emb_shuffle: True -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] -ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters -input: text -number_of_epochs: 1000 -reset_annealing_epoch: null -batch_size: 16 -batch_size_guided: 2 -extract_features_batch_size: 32 -grad_accumulation_factor: 1 -max_grad_norm: 0.01 -sorting: random -num_workers: 4 -skip_prep: False -overfit_test: False -overfit_test_sample_count: !ref -overfit_test_epoch_data_count: 1000 -# index -pad_index: 0 -bos_index: 0 -bos_width: 1 -eos_index: 0 -eos_width: 1 -audio_token_shift: 0 -# stages related parameters -lr: 0.0005 -lr_warmup_steps: 10000 -lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. 
-# Inference parameters -inference_mode: autoregressive -eos_mode: gate -decoder_mode: autoregressive -scale_factor: 4 -# Embedding Injection -spk_emb_injection: null -# Beam Search-specific parameters -min_decode_ratio: 1.0 -max_decode_ratio: 10.0 -beam_size: 5 -# Feature parameters -sample_rate: 24000 -model_sample_rate: 16000 -max_audio_length: 1000 -infer_max_audio_length: !ref -debug_infer_max_audio_length: 10 -# Label encoder -label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt -token_list_file: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref - phonemes: !ref -# Gate offset -gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp - beta: !ref - gamma: !ref - max_weight: !ref -silence_padding: !ref -use_silence_padding: True -# Guides -guides_enabled: False -guides_start_epoch: 40 -guides_spk: False -guides_spk_discrete: True -guides_spk_loss_weight: 0.2 -guides_asr: True -guides_asr_loss_weight: 0.1 -# Token model (pretrained) -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True -token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - ssl_model: !ref - kmeans_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref - save_path: !ref - layers_num: !apply:benchmarks.DASB.utils.hparams.as_list - value: !ref - dtype: int -spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams - source: !ref - savedir: !ref /ecapa -spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class - source: !ref - savedir: !ref /ecapa- - pymodule_file: custom_interface.py - classname: DiscreteSpkEmb - overrides: - ssl_layer_num_selected: !ref -asr_model: !name:benchmarks.DASB.model.Tokotron.TransformerASRGuide - source: !ref - savedir: !ref /asr-transformer -# Dataloader options - - -train_dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref -valid_dataloader_opts: - batch_size: !ref - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref -test_dataloader_opts: - batch_size: 1 - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref -sample_dataloader_opts: - batch_size: !ref - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref -token_model_kwargs: - SSL_layers: !apply:benchmarks.DASB.utils.hparams.as_list - value: !ref - dtype: int - deduplicates: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers - layers: !ref - value: False - bpe_tokenizers: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers - layers: !ref - value: null -extract_features_opts: - dataloader_opts: - batch_size: !ref - num_workers: !ref - token_model: !ref - token_model_kwargs: !ref - ssl_model: !ref - ssl_model_layers: !apply:benchmarks.DASB.utils.hparams.as_list 
- value: !ref - dtype: int - token_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - data_folder_alignments: !ref -####################### Model parameters ########################### -# Transformer -d_model: 512 -nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 -layerwise_renorm: True -d_ffn: 2048 -z_dim: 128 -hidden_dim: 2048 -enc_n_dim: 16 -dec_n_dim: 256 -decoder_chunk_size: -1 -transformer_dropout: 0.2 -target_dropout: 0.2 -emb_dropout: 0.0 -activation: !name:torch.nn.GELU -audio_num_tokens: 1000 -audio_dim: 1024 -audio_emb_size: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - discrete: 1024 - continuous: 128 -audio_emb_freeze: False -audio_emb_lr: 0.00001 -audio_emb_weight_decay: 0.001 -audio_emb_pretrained: False -text_num_tokens: 39 -phn_num_tokens: 52 -input_num_tokens: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref - phonemes: !ref -audio_tokens_per_step: 6 -attention_type: regularMHA -############################## models ################################ -vocoder_layers: !apply:benchmarks.DASB.utils.hparams.as_list - value: !apply:speechbrain.utils.hparams.choice - value: !ref - default: !ref - choices: - null: !ref - dtype: int -vocoder_discrete: !name:benchmarks.DASB.model.custom_model.GumbelUnitVocoderWrapper - model: !name:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref - available_layers: !ref - layers: !ref - num_units: !ref - offset: !ref -vocoder_continuous: !name:benchmarks.DASB.model.custom_model.VocoderWrapper - model: !name:speechbrain.inference.vocoders.HIFIGAN.from_hparams - source: !ref - savedir: !ref -vocoder: !apply:benchmarks.DASB.utils.hparams.choice - value: !ref - apply: True - choices: - discrete: !ref - continuous: !ref -inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference - bos_index: !ref - eos_index: !ref - min_decode_ratio: !ref - max_decode_ratio: !ref - beam_size: !ref - using_eos_threshold: False - length_normalization: True - audio_token_shift: !ref -inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference - scale_factor: !ref - gate_threshold: !ref - eos_mode: !ref - representation_mode: !ref -inference: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - search: !ref - forward: !ref -emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: True - injection: !ref -model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel - input_num_tokens: !ref # yamllint disable-line rule:line-length - audio_num_tokens: !ref - audio_tokens_per_step: !ref - d_model: !ref - d_ffn: !ref - z_dim: !ref - hidden_dim: !ref - enc_n_dim: !ref - dec_n_dim: !ref - decoder_chunk_size: !ref - nhead: !ref - enc_num_layers: !ref - dec_num_layers: !ref - dropout: !ref - target_dropout: !ref - emb_dropout: !ref - activation: !ref - attention_type: !ref - vocoder: !ref - gate_threshold: !ref - gate_offset: !ref - audio_emb_size: !ref - audio_emb_freeze: !ref - max_audio_length: !ref - inference: !ref - eos_mode: !ref - infer_max_audio_length: !ref - audio_token_shift: !ref - decoder_mode: !ref - scale_factor: !ref - representation_mode: !ref - use_position_shift: !ref - max_position_shift: !ref - position_shift_probability: !ref - position_shift_seed: !ref - emb: !ref - layerwise_renorm: !ref -modules: - model: !ref - vocoder: !ref - compute_cost: !ref -# define two optimizers here for two-stage training -opt_class: !name:torch.optim.Adam - lr: !ref -compute_cost: 
!new:benchmarks.DASB.model.Tokotron.TokotronLoss - guided_attention_weight: !ref - guided_attention_sigma: !ref - gate_weight: !ref - gate_beta: !ref - gate_gamma: !ref - gate_max_weight: !ref - silence_padding: !ref - eos_mode: !ref - eos_index: !ref - eos_width: !ref - audio_tokens_per_step: !ref - audio_token_shift: !ref - spk_weight: !ref - asr_weight: !ref - representation_mode: !ref -lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler - lr_initial: !ref - n_warmup_steps: !ref -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - lr_scheduler: !ref - counter: !ref -freezer: !new:benchmarks.DASB.utils.preparation.Freezer - save_path: !ref - archive_path: !ref -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref -progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport - logger: !ref - sample_rate: !ref - eos_threshold: !ref -spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 3333cfceb..ba05d6f2c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -88,9 +88,7 @@ gate_loss_gamma: 0.01 gate_loss_max_weight: 1. # Inference parameters -inference_mode: autoregressive eos_mode: gate -decoder_mode: autoregressive scale_factor: 4 # Beam Search-specific parameters @@ -216,30 +214,6 @@ attention_type: regularMHA ############################## models ################################ -vocoder: !new:benchmarks.DASB.model.custom_model.DACVocoder - dac: !ref - -inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference - bos_index: !ref - eos_index: !ref - min_decode_ratio: !ref - max_decode_ratio: !ref - beam_size: !ref - using_eos_threshold: False - length_normalization: True - audio_token_shift: !ref - -inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference - scale_factor: !ref - gate_threshold: !ref - eos_mode: !ref - -inference: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - search: !ref - forward: !ref - emb: spk: kind: "pretrained" @@ -269,17 +243,22 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d audio_emb_size: !ref audio_emb_freeze: !ref max_audio_length: !ref - inference: !ref eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref emb: !ref +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref # define two optimizers here for two-stage training @@ -312,17 +291,11 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:benchmarks.DASB.utils.preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref -progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport - logger: !ref - sample_rate: !ref - eos_threshold: !ref +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref 
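Note (editor's illustration, not part of the patch): the train_dac.yaml change above drops the codec-specific `vocoder` and inference wrappers in favour of a single `tokenizer` module built on the unified tokenizer interface, which the recipe then consumes through `tokens_to_sig` plus `clean_padding_` (see the train.py diff in this same series). A minimal, hedged sketch of that usage is below; the constructor keys mirror the YAML keys added in the diff, but the concrete values and the exact `sig_to_tokens` signature are assumptions to be checked against utils/tokenizer_interface.py.

    import torch
    from speechbrain.dataio.dataio import clean_padding_
    from utils.tokenizer_interface import DACTokenizer

    # Constructor keys follow the YAML above; concrete values are assumed examples.
    tokenizer = DACTokenizer(
        model_type="24khz",      # assumed value
        model_bitrate="8kbps",
        n_codebooks=2,           # assumed value
        load_pretrained=True,
        tag="latest",
    )

    sig = torch.randn(2, 48000)                   # [B, T] dummy waveforms
    lens = torch.ones(2)                          # relative lengths
    tokens = tokenizer.sig_to_tokens(sig, lens)   # [B, N, K] discrete codes
    wav = tokenizer.tokens_to_sig(tokens)         # [B, T] reconstructed audio
    clean_padding_(wav, lens)                     # zero out the padded tail
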
diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 5b1a06b46..b335425e6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -16,8 +16,6 @@ prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref ssl_model_type: wavlm representation_mode: discrete -vocoder_model_name: !ref unithifigan-dasb---ms -vocoder_model_path: !ref / prepare_archive_path: null prepare_skip_ignore_folders: False data_mode: lite @@ -64,34 +62,15 @@ g2p_src: flexthink/soundchoice-g2p kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech vocoder_repo_id: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - hubert: speechbrain/hifigan-hubert-k1000-LibriTTS - wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS - wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref select_layers: null token_offset: 1 -vocoder_src_discrete: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS -vocoder_src_continous: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - discrete: !ref - continuous: !ref -vocoder_available_layers: [1, 3, 7, 12, 18, 23] -vocoder_takes_spk_emb: True spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice value: !ref @@ -146,9 +125,7 @@ gate_loss_gamma: 0.01 gate_loss_max_weight: 1. 
# Inference parameters -inference_mode: autoregressive eos_mode: gate -decoder_mode: autoregressive scale_factor: 4 # Embedding Injection @@ -305,15 +282,12 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice audio_tokens_per_step: 6 attention_type: regularMHA ############################## models ################################ -vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref emb: spk: kind: "pretrained" dim: 192 - vocoder: True injection: !ref + model: !new:Tokotron.TokotronTransformerModel input_num_tokens: !ref # yamllint disable-line rule:line-length audio_num_tokens: !ref @@ -335,14 +309,22 @@ model: !new:Tokotron.TokotronTransformerModel eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref representation_mode: !ref emb: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref + # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref @@ -381,10 +363,3 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref - -tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 97ab94275..ecc1a754c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -210,31 +210,6 @@ audio_tokens_per_step: 6 attention_type: regularMHA ############################## models ################################ - -vocoder: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerVocoder - tokenizer: !ref - -inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference - bos_index: !ref - eos_index: !ref - min_decode_ratio: !ref - max_decode_ratio: !ref - beam_size: !ref - using_eos_threshold: False - length_normalization: True - audio_token_shift: !ref - -inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference - scale_factor: !ref - gate_threshold: !ref - eos_mode: !ref - -inference: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - search: !ref - forward: !ref - emb: spk: kind: "pretrained" @@ -264,7 +239,6 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d audio_emb_size: !ref audio_emb_freeze: !ref max_audio_length: !ref - inference: !ref eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref @@ -272,9 +246,13 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d scale_factor: !ref emb: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref # define two optimizers here for two-stage training @@ -307,23 +285,8 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:benchmarks.DASB.utils.preparation.Freezer - save_path: !ref - 
archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - -progress_logger: !new:benchmarks.DASB.utils.train_logger.ArchiveTrainLogger - current_path: !ref - archive_path: !ref - meta_path: !ref - epoch_counter: !ref - -progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport - logger: !ref - sample_rate: !ref - eos_threshold: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 943727635..198b35fec 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -22,21 +22,22 @@ from pathlib import Path from hyperpyyaml import load_hyperpyyaml from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import clean_padding_ from speechbrain.utils.distributed import run_on_main -from Tokotron import ( - RepresentationMode, - get_silence_token, - use_silence_padding, - feature_pad_to, -) -from types import SimpleNamespace -from evaluate import TokotronEvaluator import re import string -base_dir = str(Path(__file__).parent.parent.parent.parent) +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) +from model.Tokotron import ( + RepresentationMode, + get_silence_token, + use_silence_padding, + feature_pad_to, +) # noqa: E402 +from evaluate import TokotronEvaluator # noqa: E402 + logger = logging.getLogger(__name__) SPECIAL_TOKEN_COUNT = 1 @@ -83,7 +84,9 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ - raise NotImplementedError() + wav = self.modules.tokenizer.tokens_to_sig(audio) + clean_padding_(wav, length) + return wav def compute_forward(self, batch, stage): """Runs all the computation of the Tokotron TTS @@ -451,6 +454,16 @@ def on_stage_end(self, stage, stage_loss, epoch): stage_stats = {"loss": stage_loss, **loss_stats} if stage == sb.Stage.TRAIN: self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) # Perform end-of-iteration things, like annealing, logging, etc. if stage == sb.Stage.VALID: @@ -473,9 +486,6 @@ def on_stage_end(self, stage, stage_loss, epoch): meta={"loss": stage_stats["loss"]}, min_keys=["loss"], ) - if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): - self.evaluator.on_evaluate_end() - def fit_batch(self, batch): loss = super().fit_batch(batch) if self.hparams.lr_annealing_mode == "step": @@ -486,7 +496,7 @@ def fit_batch(self, batch): INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} -def dataio_prepare(hparams, guide_ctx=None): +def dataio_prepare(hparams): """This function prepares the datasets to be used in the brain class. It also defines the data processing pipeline through user-defined functions. @@ -497,9 +507,6 @@ def dataio_prepare(hparams, guide_ctx=None): This dictionary is loaded from the `train.yaml` file, and it includes all the hyperparameters needed for dataset construction and loading. 
- guide_ctx : SimpleNamespace, optional - The guide context with pretrained models - Returns ------- datasets : dict @@ -558,12 +565,6 @@ def tokens_pipeline(label): """Processes the transcriptions to generate proper labels""" return label_encoder.encode_sequence_torch(label) - @sb.utils.data_pipeline.takes("label_norm") - @sb.utils.data_pipeline.provides("asr_tokens") - def asr_tokens_pipeline(label): - """Processes the transcriptions to generate proper labels""" - return torch.tensor(guide_ctx.asr_model.encode(label)) - use_silence_padding = hparams.get("use_silence_padding", True) if "token_model_layers" in hparams: audio_tokens_per_step = len(hparams["token_model_layers"]) @@ -936,50 +937,12 @@ def apply_overfit_test(hparams, dataset): return result -def get_guide_ctx(hparams, run_opts): - """Initializes a context object for guides, - containing pretrained models only for guides that will be - used per hparams - - Arguments - --------- - hparams : dict - Hyperparameters - run_opts : dict - Run options - - Returns - ------- - ctx : SimpleNamespace - The resulting context""" - ctx = {} - if hparams["guides_enabled"]: - pretrained_run_opts = {"device": run_opts.get("device", "cpu")} - if hparams["guides_spk"]: - ctx["spk_emb_model"] = hparams["spk_emb_model"]( - run_opts=pretrained_run_opts - ) - if hparams["guides_asr"]: - ctx["asr_model"] = hparams["asr_model"]( - run_opts=pretrained_run_opts - ) - return SimpleNamespace(**ctx) - - RE_PUNCTUATION = re.compile( "|".join(re.escape(char) for char in string.punctuation) ) -def run_experiment(brain_cls): - """Starts the experiement - - Arguments - --------- - brain_cls : type - The brain class to instantiate - """ - +if __name__ == "__main__": # Reading command line arguments hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) @@ -1042,9 +1005,8 @@ def run_experiment(brain_cls): ) # We can now directly create the datasets for training, valid, and test - guide_ctx = get_guide_ctx(hparams, run_opts) (datasets, silence_padding, resample_fn) = dataio_prepare( - hparams, guide_ctx + hparams ) # Apply overfit test settings @@ -1052,7 +1014,7 @@ def run_experiment(brain_cls): audio_keys = ["audio_pad", "audio_bos"] # Trainer initialization - tts_brain = brain_cls( + tts_brain = TokotronBrain( modules=hparams["modules"], opt_class=hparams["opt_class"], hparams=hparams, diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py deleted file mode 100644 index 9c8b243be..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Continuous SSL verfsion - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronContinuousSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - 
Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.vocoder(audio, emb) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronContinuousSSLBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py deleted file mode 100644 index 78c584c45..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - DAC version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDACBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - z, _, _ = self.modules.tokenizer.quantizer.from_codes( - audio.transpose(1, 2).int() - ) - wav = self.modules.tokenizer.decode(z).squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDACBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py deleted file mode 100644 index 3cc0e2644..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Discrete SSL version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -import torch -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDiscreteSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def on_stage_start(self, stage, epoch): - self.compute_offset() - return super().on_stage_start(stage, epoch) - - def compute_offset(self): - """Computes per-layer offsets""" - layers_set = set(self.hparams.token_model_layers) - available_layers_set = set(self.hparams.vocoder_available_layers) - if not layers_set.issubset(available_layers_set): - unavailable_layers = ",".join( - str(layer) for layer in (layers_set - available_layers_set) - ) - raise ValueError(f"Layers {unavailable_layers} are not supported") - self.num_units = self.hparams.vocab_size - _, layers_idx = torch.where( - torch.tensor( - self.hparams.vocoder_available_layers, device=self.device - ).unsqueeze(0) - == torch.tensor( - self.hparams.token_model_layers, device=self.device - ).unsqueeze(1) - ) - self.layer_offset = ( - torch.tensor(layers_idx, device=self.device) * self.num_units - )[None, None, :] - self.offset = self.hparams.token_offset - self.modules.vocoder.tokenize = False - - 
def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - units_with_offset = ( - audio + self.layer_offset.to(audio.device) + self.offset - ) - wav = self.modules.vocoder(units_with_offset) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDiscreteSSLBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py deleted file mode 100644 index 98f1b27cc..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronEncodecBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.tokenizer.decode(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronEncodecBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py deleted file mode 100644 index fdbbb3ed7..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronSTBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - if length is not None: - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronSTBrain) From 900481d4d1c323a9b78d9b89a569535d58f06499 Mon Sep 17 00:00:00 2001 
From: flexthink Date: Fri, 17 Jan 2025 01:00:19 -0500 Subject: [PATCH 060/270] DASB: Tokotron: Updates for hyperparameter fitting --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 2 +- .../TTS/tokotron/hparams/train_dac.yaml | 18 +++++++++++------- .../tokotron/hparams/train_discrete_ssl.yaml | 16 ++++++++++------ .../TTS/tokotron/hparams/train_encodec.yaml | 17 ++++++++++------- .../hparams/train_speech_tokenizer.yaml | 16 ++++++++++------ .../DASB/LJSpeech/TTS/tokotron/train.py | 16 +++++++++++----- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 2 +- .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 2 +- .../TTS/tokotron/hparams/train_dac.yaml | 9 +++++---- .../tokotron/hparams/train_discrete_ssl.yaml | 9 ++++++--- .../TTS/tokotron/hparams/train_encodec.yaml | 8 +++++--- .../hparams/train_speech_tokenizer.yaml | 8 +++++--- .../DASB/LibriTTS/TTS/tokotron/train.py | 19 +++++++++---------- 13 files changed, 85 insertions(+), 57 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index 8ca3fb8dd..e7ffe2576 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -51,4 +51,4 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean - dwer: asr_dwer_median \ No newline at end of file + dwer: asr_dwer_median diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index be20bfa63..8e74cbedb 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -8,9 +8,12 @@ experiment_name: tokotron/dac # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p @@ -19,8 +22,9 @@ g2p_src: flexthink/soundchoice-g2p representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/dac +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -70,7 +74,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -94,8 +98,8 @@ model_bitrate: 8kbps # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -146,8 +150,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 555878c24..2205ecf94 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -7,6 +7,7 @@ experiment_name: tokotron/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER # Model Type ssl_model_type: wavlm @@ -14,10 +15,13 @@ representation_mode: discrete output_folder: !ref results/tokotron/// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ # Data files data_folder: !PLACEHOLDER -prepare_save_folder: !ref /prepared/discrete- # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref vocoder_model_name: !ref unithifigan-dasb--discrete vocoder_model_path: !ref / @@ -86,7 +90,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -111,8 +115,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -183,8 +187,8 @@ token_model_kwargs: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 3ab6eb770..d166f29ed 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -8,9 +8,11 @@ experiment_name: tokotron/encodec # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p @@ -18,8 +20,9 @@ g2p_src: flexthink/soundchoice-g2p representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/encodec +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -65,7 +68,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -85,8 +88,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -136,8 +139,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 568f8c13e..38927d216 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -8,9 +8,12 @@ experiment_name: tokotron/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ token_model_src: "fnlp/SpeechTokenizer" g2p_src: flexthink/soundchoice-g2p @@ -20,7 +23,8 @@ representation_mode: discrete # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/st +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -68,7 +72,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -88,8 +92,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -139,8 +143,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 8c571babd..cf7918e3a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -269,7 +269,7 @@ def on_stage_end(self, stage, stage_loss, epoch): stage_stats = {"loss": stage_loss, **loss_stats} if stage == sb.Stage.TRAIN: self.train_stats = stage_stats - + # End evaluation and report stats if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): self.evaluator.on_evaluate_end() @@ -623,7 +623,10 @@ def read_token_list(file_name): result: list a list of tokens """ - if not Path(file_name).exists(): + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): raise ValueError(f"Token file {file_name} not found") with open(file_name) as token_file: return [line.strip("\r\n") for line in token_file if line] @@ -709,6 +712,8 @@ def apply_overfit_test(hparams, dataset): # Load evaluation hyperparameters eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" if eval_hparams_file.exists(): logger.info( "Using evaluation hyperparameters from %s", eval_hparams_file @@ -796,9 +801,10 @@ def apply_overfit_test(hparams, dataset): ) # Load best checkpoint for evaluation - tts_brain.evaluate( - test_set=datasets["test"], test_loader_kwargs=test_dataloader_opts, - ) + if hparams["testing"]: + tts_brain.evaluate( + test_set=datasets["test"], test_loader_kwargs=test_dataloader_opts, + ) # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 99d547cc5..377b5955c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -124,7 +124,7 @@ def get_output_folder(self, stage, epoch): output_folder = output_folder / str(epoch) output_folder.mkdir(parents=True, exist_ok=True) return output_folder - + def on_evaluate_end(self): """Invoked 
when evaluation starts diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index bafd769cc..18e39ba42 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -67,4 +67,4 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean dwer: asr_dwer_median - spk_sim: spk_sim_score_mean \ No newline at end of file + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index ba05d6f2c..0bc9099a8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -9,6 +9,7 @@ __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -55,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 @orion_step1: --enc_num_layers~"uniform(2, 32,discrete=True) extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -76,7 +77,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -187,8 +188,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index b335425e6..e4dfe96d1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -6,9 +6,12 @@ seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ # Data files data_folder: !PLACEHOLDER data_folder_alignments: null # e.g., /path/to/LibriSpeech @@ -113,7 +116,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -254,8 +257,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" layerwise_renorm: True d_ffn: 2048 transformer_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index e766267e7..a454802c3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -6,9 +6,11 @@ seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -86,7 +88,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -182,8 +184,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ecc1a754c..470503e4e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -6,9 +6,11 @@ seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
# Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -78,7 +80,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -185,8 +187,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 198b35fec..3d7d9cd55 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -454,7 +454,7 @@ def on_stage_end(self, stage, stage_loss, epoch): stage_stats = {"loss": stage_loss, **loss_stats} if stage == sb.Stage.TRAIN: self.train_stats = stage_stats - + # End evaluation and report stats if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): self.evaluator.on_evaluate_end() @@ -1005,9 +1005,7 @@ def apply_overfit_test(hparams, dataset): ) # We can now directly create the datasets for training, valid, and test - (datasets, silence_padding, resample_fn) = dataio_prepare( - hparams - ) + (datasets, silence_padding, resample_fn) = dataio_prepare(hparams) # Apply overfit test settings datasets = apply_overfit_test(hparams, datasets) @@ -1041,9 +1039,10 @@ def apply_overfit_test(hparams, dataset): ) # Load best checkpoint for evaluation - tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=use_silence_padding( - hparams["test_dataloader_opts"], silence_padding, audio_keys - ), - ) + if hparams["testing"]: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=use_silence_padding( + hparams["test_dataloader_opts"], silence_padding, audio_keys + ), + ) From 4dcd1d3ea005a2952cabe596bd5d64f495cf1f2b Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 17 Jan 2025 02:12:27 -0500 Subject: [PATCH 061/270] DASB: Batch size updates, device fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml | 2 +- .../LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 4 ++++ benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml | 2 +- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 4 ++++ 10 files changed, 16 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 8e74cbedb..75cbae717 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -57,7 +57,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git 
a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 2205ecf94..68e54fa83 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -71,7 +71,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index d166f29ed..0e923ffc9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -52,7 +52,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 38927d216..76e3c72e3 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -55,7 +55,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index cf7918e3a..7c1b00083 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -371,6 +371,10 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.codec_vocoder.device = self.device wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 0bc9099a8..8c05d2499 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 @orion_step1: --enc_num_layers~"uniform(2, 32,discrete=True) +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index e4dfe96d1..5c8db0bc4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -94,7 +94,7 @@ ckpt_interval_minutes: 30 # 
save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" batch_size_guided: 2 extract_features_batch_size: 32 grad_accumulation_factor: 1 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index a454802c3..30d2cbfe4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -67,7 +67,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 470503e4e..e3f34fdf8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -59,7 +59,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 3d7d9cd55..f0153271d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -84,6 +84,10 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.codec_vocoder.device = self.device wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav From fc08f58d5962ac8ce3ec35bfd607508cfb81f5f1 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 17 Jan 2025 02:15:47 -0500 Subject: [PATCH 062/270] DASB: Tokotron: Fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 2 +- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 7c1b00083..8945607f9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -374,7 +374,7 @@ def create_waveform(self, audio, length): self.modules.tokenizer.device = self.device if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) - self.codec_vocoder.device = self.device + self.modules.tokenizer.codec_vocoder.device = self.device wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index f0153271d..da228d6ae 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -87,7 +87,7 @@ def create_waveform(self, audio, length, emb): self.modules.tokenizer.device = self.device if 
hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) - self.codec_vocoder.device = self.device + self.modules.tokenizer.codec_vocoder.device = self.device wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav From fcb37c74b091188ad231af24776dee1138d2e6ee Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 17 Jan 2025 14:59:31 -0500 Subject: [PATCH 063/270] DASB: Ensure UTMOS is maximized rather than minimized! --- benchmarks/DASB/utils/aggregate_results.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index 0df315b7e..e11046ade 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -144,6 +144,8 @@ def aggregate_metrics(prototype, metrics): # Report final metric to Orion # Remember: orion expects metrics to be minimized! - if eval_metric == "acc" or eval_metric == "f1": + if eval_metric in ["acc", "f1"]: final_metric = 1 - final_metric + elif eval_metric == "utmos": + final_metric = -final_metric report_objective(final_metric) From 9563cd54e766594b84a2c8c47399a97809d00dec Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 14:46:43 -0500 Subject: [PATCH 064/270] DASB: Tokotron: Fixes --- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index e3f34fdf8..c307ed0bf 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -36,7 +36,6 @@ test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 @@ -248,7 +247,7 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d scale_factor: !ref emb: !ref -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref From 4442b4473c1e572cac62c2f4c98f904a9a33f3ac Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 14:53:52 -0500 Subject: [PATCH 065/270] DASB: Tokotron: Fixes --- .../LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 76e3c72e3..ce4e6edaa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -34,7 +34,6 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 @@ -187,7 +186,7 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul max_audio_length: !ref infer_max_audio_length: !ref -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: 
!new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref From d31ad9ce97234f995e4f6312b09b50a15c2557de Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Mon, 20 Jan 2025 15:10:56 -0500 Subject: [PATCH 066/270] Update tokenizer_interface.py add sampling rate for mimi and wavtokenizer --- benchmarks/DASB/utils/tokenizer_interface.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index c8e81eb7a..a6103de4c 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -380,6 +380,7 @@ class MimiTokenizer(Mimi, BaseTokenizer): def __init__(self, *args, **kwargs): Mimi.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) + self.sample_rate= self.sampling_rate @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): @@ -435,6 +436,7 @@ class WavTokenizerWrapper(WavTokenizer, BaseTokenizer): def __init__(self, *args, **kwargs): WavTokenizer.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) + self.sample_rate = 24000 @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): From fbebd2e0fecad130d72a4f70aeabf08670b4f765 Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Mon, 20 Jan 2025 15:12:26 -0500 Subject: [PATCH 067/270] Update sq_codec.py dix sampling rate name for SQCodec --- benchmarks/DASB/model/sq_codec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 6057a5f73..4ac4b74ad 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -101,7 +101,7 @@ def __init__( ) self.scalar_codec = self.build_codec_model(self.config_path) - self.sr = sample_rate + self.sample_rate = sample_rate self.dim_codebook = dim_codebook self.n_codebook = n_codebook self.bw = bw @@ -232,8 +232,8 @@ def reconstruct(self, wav_root): wav, sr = torchaudio.load(wav_root) if wav.numel() == 0: return None - if sr != self.sr: - wav = torchaudio.transforms.Resample(sr, self.sr)(wav) + if sr != self.sample_rate: + wav = torchaudio.transforms.Resample(sr, self.sample_rate)(wav) wav = wav.unsqueeze(1) emb, emb_quant, x = self.scalar_codec.inference(wav) return x.detach().cpu().squeeze(0) From 9f64966ca6f0467dc140b7047ac2d908038fefc1 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 20 Jan 2025 20:25:46 +0000 Subject: [PATCH 068/270] add sq-codec, mimi and wavtokenizer for librispeech --- .../DASB/LibriSpeech/extraction/extract.py | 2 +- .../LibriSpeech/extraction/hparams/mimi.yaml | 58 ++++++++++++++++++ .../extraction/hparams/sqcodec.yaml | 57 ++++++++++++++++++ .../extraction/hparams/wavtokenizer.yaml | 60 +++++++++++++++++++ 4 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 3979ba731..5a54f72df 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -52,7 +52,7 @@ "skip_prep": hparams["skip_prep"], }, ) - + tokens_extractor = 
hparams["tokens_extractor"] data_folder = hparams["data_folder"] datasets = [] diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..e2dad7f95 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml @@ -0,0 +1,58 @@ +# ############################################################################ +# Auido Tokenizer: Mimi +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..fe202c90d --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: SQCodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks : 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + 
sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..bc1b56ddb --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,60 @@ +# ############################################################################ +# Auido Tokenizer: wavtokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks : 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref From 1e18ead6eec8ddfb295c174ecd5f0147a5fd2786 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 16:41:10 -0500 Subject: [PATCH 069/270] DASB: VALL-E: Initial import --- .../DASB/LJSpeech/TTS/valle/evaluation.py | 360 +++++++ .../LJSpeech/TTS/valle/hparams/arpabet.txt | 50 + .../LJSpeech/TTS/valle/hparams/char_en.txt | 38 + .../DASB/LJSpeech/TTS/valle/hparams/eval.yaml | 54 + .../TTS/valle/hparams/train_discrete_ssl.yaml | 268 +++++ .../LJSpeech/TTS/valle/ljspeech_prepare.py | 1 + benchmarks/DASB/LJSpeech/TTS/valle/train.py | 854 ++++++++++++++++ benchmarks/DASB/model/valle.py | 924 ++++++++++++++++++ 8 files changed, 2549 insertions(+) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml create mode 120000 benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/train.py create mode 100644 benchmarks/DASB/model/valle.py diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py new file mode 100644 index 000000000..152db4c87 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -0,0 +1,360 @@ +import json +import torch +import logging +import re 
+import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.evaluators + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + if not self.evaluators: + logger.warn( + "No evaluators were defined - this run will produce samples only" + ) + + self.attention = [] + + def on_evaluation_start(self, output_folder="eval"): + """Invoked at the beginning of the evaluation cycle. + + Arguments + --------- + output_folder : str | path-like + The folder to which results will be output + + """ + logger.info("Starting evaluation") + output_folder = Path(output_folder) + self.output_folder = ( + output_folder + if output_folder.is_absolute() + else self.hparams.output_folder / output_folder + ) + self.output_folder.mkdir(parents=True, exist_ok=True) + + self.files = [] + details_keys = list(self.evaluators.keys()) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.read_reports() + self.create_reports() + self.item_ids = [] + + def on_evaluation_end(self): + """Invoked at the beginning of the evaluation cycle. 
The default + implementation is a no-op + """ + logger.info("Ending evaluation") + self.write_summary() + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + self.files.append(file_name) + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_tracker_file_name(self): + """Determines the file name of the tracker file""" + suffix = ( + f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else "" + ) + file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt" + return self.output_folder / file_name + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def append(self, ids, wav, length, text, wav_ref, length_ref): + """Appends the result of a single item + + Arguments + --------- + ids : str + Utterance IDs + wav : torch.Tensor + Synthesized waveforms + length : torch.Tensor + Relative lengths of the synthesized waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def 
write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], + key=metric_key, + ).items() + } + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : the standard deviation + _min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. 
+ + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? + \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..b80347c82 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml @@ -0,0 +1,54 @@ +eval_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_asr_type: whisper +eval_asr_source: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech + whisper: openai/whisper-small +evaluations: utmos,asr +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_asr: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encoder_decoder: !name:utils.eval.EncoderDecoderASRSpeechEvaluator + source: !ref + sample_rate: !ref + overrides: + lm_weight: 0.0 + whisper: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +evaluators: + utmos: !ref + asr: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..f0127973c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,268 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/discrete_ssl +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +ssl_model_type: wavlm +output_folder: !ref results/tokotron/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the 
test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +vocoder_model_name: !ref unithifigan-dasb--discrete +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large-960h-lv60-self +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +ssl_model_layers: [1, 3, 7, 12, 18, 23] +flip_layers: True +token_model_layers: !ref +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Inference parameters +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1000 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 6 + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + 
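+# Illustrative note (added for clarity, not part of the original recipe): with
+# the hyperparameters above and text input, the combined vocabulary works out as
+#   model_vocab_size = text_num_tokens + (vocab_size * audio_tokens_per_step) + special_num_tokens
+#                    = 39 + (1000 * 6) + 4 = 6043
+# and audio_token_shift = text_num_tokens + special_num_tokens = 39 + 4 = 43,
+# so the codes of audio track k (k = 0..5) occupy ids 43 + k*1000 .. 43 + (k+1)*1000 - 1,
+# matching the per-track offsets and inference masks applied in train.py.
+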
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py new file mode 120000 index 000000000..2f703273c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py @@ -0,0 +1 @@ +../../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py new file mode 100644 index 000000000..e0ae084a3 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -0,0 +1,854 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio +from speechbrain.dataio.dataio import write_audio +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats + +logger = logging.getLogger(__name__) + +SPECIAL_TOKEN_COUNT = 1 + + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() + wav = self.modules.tokenizer.tokens_to_sig(audio) + clean_padding_(wav, length) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. 
+ + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + prompt, prompt_length = batch.prompt + batch_size, prompt_max_len, num_tracks = prompt.shape + nar_track = torch.randint( + 1, num_tracks, (batch_size,), + device=self.device + ) + logits_ar, logits_nar = self.modules.model( + dec_seq=batch.prompt.data, + dec_seq_lengths=batch.prompt.lengths, + prefix_len=batch.prefix_length / prompt_max_len, + nar_level_idx=nar_track + ) + return logits_ar, logits_nar, nar_track + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + logits_ar_sm = self.hparams.log_softmax(logits_ar) + logits_nar_sm = self.hparams.log_softmax(logits_nar) + batch_size, max_len, _ = prompt.shape + targets_ar = prompt[:, 1:, 0] + batch_idx = torch.arange(batch_size, device=prompt.device) + targets_nar = prompt[batch_idx, 1:, nar_track] + prompt_max_len = prompt.size(1) + length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len) + prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_ar = self.hparams.compute_cost( + log_probabilities=logits_ar_sm, + targets=targets_ar, + mask=mask + ) + self.loss_metric_ar.append( + ids=batch.uttid, + log_probabilities=logits_ar_sm, + targets=targets_ar, + mask=mask, + reduction="batch", + ) + loss_nar = self.hparams.compute_cost( + log_probabilities=logits_nar_sm, + targets=targets_nar, + mask=mask, + ) + self.loss_metric_nar.append( + ids=batch.uttid, + log_probabilities=logits_nar_sm, + targets=targets_nar, + mask=mask, + reduction="batch", + ) + loss = loss_ar + loss_nar + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ """ + self.offsets = get_offsets( + self.hparams.vocab_size, + self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.hparams.compute_cost, batch_eval=True, + ) + self.loss_metric_ar = sb.utils.metric_stats.MetricStats( + metric=self.hparams.compute_cost, + batch_eval=True, + ) + self.loss_metric_nar = sb.utils.metric_stats.MetricStats( + metric=self.hparams.compute_cost, + batch_eval=True, + ) + + # TOOO: Reestablish evaluation + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + return epoch % self.hparams.eval_interval == 0 + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, + wav=wav, + length=audio_length, + stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. 
+ + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference_results = [ + self.modules.model.inference( + prefix=prefix_item.unsqueeze(0), + opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[None, :] + tracks = torch.arange(self.hparams.audio_tokens_per_step, device=self.device)[:, None] + track_start = ( + self.hparams.text_num_tokens + + self.hparams.special_num_tokens + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + return self.hparams.inference_opts( + masks={ + self.hparams.bos_index: mask + }, + device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio(file_name, sample, self.hparams.model_sample_rate) + + def save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, 
sb.Stage.VALID, or sb.Stage.TEST. + """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], + hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + @sb.utils.data_pipeline.takes("uttid", "tokens") + @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") + def prompt_pipeline(id, tokens): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=hparams["audio_tokens_per_step"] + ) + + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = 
sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [text_pipeline, tokens_pipeline, prompt_pipeline] + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length" + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_dynamic_items.append(sig_pipeline) + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. + if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + return datasets + + +def get_offsets(vocab_size, tracks): + """Adds offsets to each track to treat the tokens as distinct + + Arguments + --------- + vocab_size : int + The vocabulary size, for each track + tracks : int + The number of tracks + """ + return torch.arange(tracks) * vocab_size + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + return encoder + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, _, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. 
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from ljspeech_prepare import prepare_ljspeech + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) + + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
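+    # Illustrative usage (an assumption, not stated in the recipe itself): like
+    # other SpeechBrain recipes, the script is launched with the hparams file
+    # plus overrides for the !PLACEHOLDER entries, for example:
+    #   python train.py hparams/train_discrete_ssl.yaml \
+    #       --data_folder /path/to/LJSpeech-1.1 \
+    #       --cached_data_folder /path/to/cache \
+    #       --tokens_folder /path/to/extracted_tokens \
+    #       --run_name my_run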
+ tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py new file mode 100644 index 000000000..11311d9d8 --- /dev/null +++ b/benchmarks/DASB/model/valle.py @@ -0,0 +1,924 @@ +"""An adaptation of ESPNET VALL-E +Originally by Jinchuan Tian + +https://github.com/espnet/espnet + +Authors + * Artem Ploujnikov 2024 (adaptation only) +""" + +# Copyright 2024 Jinchuan Tian +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Implementation of Vall-E: https://arxiv.org/abs/2301.02111 + +import logging +import torch +from typing import Dict, Tuple, Optional +from speechbrain.dataio.dataio import length_to_mask + +from torch import Tensor +from torch import nn +from torch.nn import functional as F +from dataclasses import dataclass + +from speechbrain.nnet.losses import reduce_loss +from speechbrain.nnet.losses import truncate + + +@dataclass +class SpeechLMInferenceOptions: + """Inference options + """ + + device: str = None + search_algo: str = "topk_sampling" + nbest: int = 1 + sampling_temperature: float = 1.0 + top_k: int = 20 + maxlenratio: float = 0.0 + minlenratio: float = 0.0 + eos: int = 5 + start: int = 1 + masks: torch.Tensor = None + nq: int = None + allow_invalid: bool = True + + +class ValleLM(nn.Module): + """The Vall-E TTS model (decoder-only transformer), adopted from + ESPNET2 + + Arguments + --------- + vocab_size : int + Dimention of vocabulary. + nq : int + Number of codes for each token / frame, usually for speech codec. + share_emb : bool + If true, share the embedding and lm_head weight. + qk_norm : bool + If true, apply LayerNorm to q and k in atention. + dropout : float + dropout rate for attention layers. + att_unit: int + Dimention of Transformer attention. + head : int + Number of heads in Transformer attention. + ar_layer : int + Number of layers in AR Transformer. + nar_layer : int + Number of layers in NAR Transformer. + n_ctx : int + maximum context length of AR & NAR Transformer. + """ + + def __init__( + self, + vocab_size: int, + nq: int, + pad_id: int = 0, + share_emb: bool = True, + qk_norm: bool = False, + dropout: float = 0.0, + att_unit: int = 256, + head: int = 2, + ar_layer: int = 4, + nar_layer: int = 4, + n_ctx: int = 3000, + ): + super().__init__() + + self.emb = torch.nn.Embedding(vocab_size, att_unit) + self.lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) + if share_emb: + self.lm_head.weight = self.emb.weight + + self.ar_decoder = TransformerDecoder( + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=ar_layer, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.nar_decoder = ValleNARDecoder( + n_level=nq - 1, + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=nar_layer, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.nq = nq + self.n_ctx = n_ctx + self.pad_id = pad_id + self._initialize() + + def forward( + self, + dec_seq: torch.Tensor, + dec_seq_lengths: torch.Tensor = None, + prefix_len: torch.Tensor = None, + conti_feats: Tuple = None, + nar_level_idx=1, + ) -> Tuple[torch.Tensor, torch.Tensor, Dict]: + """Vall-E forward for training + + Args: + dec_seq (LongTensor): Batch of decoder sequences (B, T, nq). 
+ dec_seq_lengths (LongTensor): Lengths of batched decoder sequences (B,). + enc_seq (LongTensor): Batch of encoder sequences (B, T, nq), keep + the interface, may not be used. + enc_seq_lengths (LongTensor): Lengths of batched encoder sequences (B,), + keep the interface, may not be used. + prefix_len (LongTensor): Lengths of condition part in dec_seq (B,). + compute_loss (bool): whether to compute loss or just logits. + """ + + assert dec_seq.dim() == 3 + + dec_seq_emb = self.emb(dec_seq) # [B, T, nq, D] + dec_seq_emb, _ = install_continuous_features( + dec_seq_emb, None, conti_feats + ) + + # Auto-Regressive part + input_ar_emb = self.prepare_input(dec_seq_emb, prefix_len, 1)[ + :, :-1 + ] # [B, T, D] + h_ar = self.ar_decoder(input_ar_emb) + + # Non-Auto-Regressive part + input_nar_emb = self.prepare_input( + dec_seq_emb, prefix_len, nar_level_idx + )[ + :, 1: + ] # [B, T, V] + max_len = dec_seq.size(1) + mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) + + logits_ar = self.lm_head(h_ar) + logits_nar = self.lm_head(h_nar) + + return logits_ar, logits_nar + + def prepare_input(self, dec_seq_emb, prefix_len, level): + # NOTE(Jinchuan): have to use "expand" here but maybe lead to extra memory usage. + # This is because both prefix_mask and level_mask are broadcastable and will + # trigger user warning. + + # (1) level mask, [B, 1, nq, 1], True is to include + if isinstance(level, int): + level = torch.ones_like(dec_seq_emb[:, 0, 0, 0]) * level + level_mask = length_to_mask(level, self.nq).bool() + level_mask = ( + level_mask.unsqueeze(1).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (2) prefix mask, [B, T, 1, 1], True is the prefix + prefix_mask = length_to_mask( + prefix_len * dec_seq_emb.size(1), dec_seq_emb.size(1) + ).bool() + prefix_mask = ( + prefix_mask.unsqueeze(2).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (3) mask and then sum in nq-axis. + mask = torch.logical_or(level_mask, prefix_mask) + return dec_seq_emb.masked_fill(~mask, 0.0).sum(2) + + @torch.no_grad() + def inference( + self, + prefix: torch.Tensor, + opts: SpeechLMInferenceOptions, + enc_seq: torch.Tensor = None, + suffix: torch.Tensor = None, + ): + """Vall-E Inference. + + Args: + prefix (LongTensor): Prefix part of dec_seq (B, T, nq). + opts (SpeechLMInferenceOptions): inference options. + enc_seq (LongTensor): Encoder token sequence (B, T, nq). + suffix (LongTensor): suffix part of dec_seq (B, T, nq), + usually the target sequence for teacher-forcing. 
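+
+        Returns:
+            gen_tokens_list (List[LongTensor]): generated token sequences,
+                one entry per valid hypothesis, each of shape (T, nq).
+            gen_scores_list (List[Tensor]): log-probability scores of the
+                generated tokens, with matching shapes.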
+ """ + + # (1) initialization + cache = self.ar_decoder.init() + + # (2) auto-regressive prefix forward on first code layer + prefix = prefix.expand(opts.nbest, -1, -1) + if opts.search_algo == "teacher_force": + suffix = suffix.expand(opts.nbest, -1, -1) + prefix_emb = self.emb(prefix).sum(dim=2) # [B, T, D] + _ = self.ar_decoder(prefix_emb, kv_cache=cache) + + # (3) auto-regressive loop on first code layer + # (3.1) AR initialization + minlen = ( + int(prefix.size(1) * opts.minlenratio) + if opts.minlenratio > 0 + else 0 + ) + maxlen = int(prefix.size(1) * opts.maxlenratio) + if opts.search_algo == "teacher_force": + assert suffix is not None + minlen = suffix.size(1) + maxlen = suffix.size(1) + if maxlen + prefix.size(1) > self.n_ctx: + maxlen = self.n_ctx - prefix.size(1) + logging.info(f"maxlen={maxlen}, minlen={minlen}") + + generated = {"token": [], "score": []} + finish_idx = ( + torch.Tensor([-1]).expand(opts.nbest).long().to(opts.device) + ) + prev_tok = ( + torch.Tensor([opts.start]) + .tile(opts.nbest, 1) + .long() + .to(opts.device) + ) + modality_index = prev_tok.flatten() + mask = modality_index_to_mask(modality_index, opts) + mask_cache = [] + + for step in range(maxlen): + # (3.2) AR loop + prev_emb = self.emb(prev_tok) # [B, 1, D] + h_ar = self.ar_decoder(prev_emb, kv_cache=cache) + logits = self.lm_head(h_ar) # [B, 1, V] + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + mask, + allow_eos=step >= minlen, + nq_level=0, + ) + # [B, 1, 1] -> [B, 1] + gen_tok, gen_score = gen_tok.squeeze(2), gen_tok.squeeze(2) + + generated["token"].append(gen_tok) + generated["score"].append(gen_score) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, step : step + 1, 0] + else: + prev_tok = gen_tok # [B, 1] + + # (3.3) detect modality swtich + mask_cache.append(mask.clone()) + modality_change_mask = torch.logical_and( + prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, + ) + if torch.any(modality_change_mask): + modality_index = torch.where( + modality_change_mask, prev_tok[:, 0], modality_index, + ) + mask = modality_index_to_mask(modality_index, opts) + logging.warning( + f"Step {step}: change modality index {modality_index}" + ) + + # (3.4) detect ended hypotheses. + finish_idx = torch.where( + torch.logical_and(prev_tok[:, 0] == opts.eos, finish_idx == -1), + step, + finish_idx, + ) + + if torch.all(torch.ge(finish_idx, 0)): + break + + if step == maxlen - 1: + logging.warning( + f"Some examples cannot finish in {maxlen} steps: {finish_idx}" + f"Consider increasing the maxlenratio" + ) + + logging.info(f"Terminate at steps: {finish_idx.cpu().tolist()}") + + # (3.4) finalize auto-regressive + if opts.allow_invalid: + valid_idx = torch.arange(len(finish_idx), device=finish_idx.device) + finish_idx = torch.where(finish_idx == -1, step, finish_idx) + else: + valid_idx = finish_idx.ne(-1).nonzero(as_tuple=True)[0] + if len(valid_idx) == 0: + self.ar_decoder.reset() + logging.warning(f"No valid examples. 
Return None") + return [], [] + elif len(valid_idx) < prefix.size(0): + logging.info(f"Only {len(valid_idx)} of {prefix.size(0)} are valid") + + finish_idx = finish_idx[valid_idx] + prefix_emb = prefix_emb[valid_idx] + if opts.search_algo == "teacher_force": + suffix = suffix[valid_idx] + gen_tokens_ar = torch.cat(generated["token"], dim=1)[ + valid_idx + ].unsqueeze( + 2 + ) # [B, T, 1] + gen_scores_ar = torch.cat(generated["score"], dim=1)[ + valid_idx + ].unsqueeze(2) + gen_tokens_ar = gen_tokens_ar[:, : finish_idx.max() + 1] # idx -> count + gen_scores_ar = gen_scores_ar[:, : finish_idx.max() + 1] + + self.ar_decoder.reset() + + # (4) non-auto-regressive loop on the remained code layers + # (4.1) NAR initialization + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, 0] + else: + prev_tok = gen_tokens_ar[:, :, 0] + start_emb = self.emb.weight[opts.start].tile( + len(valid_idx), 1, 1 + ) # [B, 1, D] + prev_emb = torch.cat( + [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 + ) # [B, T, D] + + ones = torch.ones_like(valid_idx) + mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) + generated = {"token": [], "score": []} + + mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache + vocab_mask = torch.cat(mask_cache, dim=1) + + # (4.2) NAR loop + for step in range(1, opts.nq): + h_nar = self.nar_decoder( + prev_emb, ones * step - 1, mask=mask + ) # [B, T, D] + logits = self.lm_head(h_nar) + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + vocab_mask, + search_algo="greedy_search", + allow_eos=False, + nq_level=step, + ) + gen_tok, gen_score = ( + gen_tok.squeeze(2), + gen_score.squeeze(2), + ) # [B, T] + + generated["token"].append(gen_tok[:, prefix.size(1) :]) + generated["score"].append(gen_score[:, prefix.size(1) :]) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, step] + else: + prev_tok = generated["token"][-1] + prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] + prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb + + # (5) combine AR and NAR results + gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] + gen_scores_nar = torch.stack(generated["score"], dim=2) + + gen_tokens = torch.cat( + [gen_tokens_ar, gen_tokens_nar], dim=2 + ) # [B, T, nq] + gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + + gen_tokens_list, gen_scores_list = [], [] + for b in range(len(valid_idx)): + gen_tokens_list.append(gen_tokens[b][: finish_idx[b]]) + gen_scores_list.append(gen_scores[b][: finish_idx[b]]) + + return gen_tokens_list, gen_scores_list + + def _initialize(self): + for m in self.modules(): + if isinstance(m, torch.nn.Linear): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + if m.bias is not None: + torch.nn.init.zeros_(m.bias) + elif isinstance(m, torch.nn.Embedding): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + + +class ResidualAttentionBlock(nn.Module): + def __init__( + self, + n_state: int, + n_head: int, + cross_attention: bool = False, + causal: bool = False, + qk_norm: bool = False, + dropout: float = 0.0, + ): + super().__init__() + + self.attn = MultiHeadAttention( + n_state, n_head, causal=causal, qk_norm=qk_norm, dropout=dropout, + ) + self.attn_ln = LayerNorm(n_state) + self.attn_dropout = nn.Dropout(p=dropout) + + self.cross_attn = ( + MultiHeadAttention( + n_state, n_head, causal=False, qk_norm=qk_norm, dropout=dropout, + ) + if cross_attention + else None + ) + self.cross_attn_ln = 
LayerNorm(n_state) if cross_attention else None + self.cross_attn_dropout = ( + nn.Dropout(p=dropout) if cross_attention else None + ) + + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state) + ) + self.mlp_ln = LayerNorm(n_state) + self.mlp_dropout = nn.Dropout(p=dropout) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + x = x + self.attn_dropout( + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x))) + return x + + +class TransformerDecoder(nn.Module): + def __init__( + self, + n_ctx: int, + n_state: int, + n_head: int, + n_layer: int, + causal: bool = True, + qk_norm: bool = False, + dropout: float = 0.0, + layer_class=ResidualAttentionBlock, + ): + super().__init__() + + self.pos_emb = nn.Embedding(n_ctx, n_state) + + self.blocks = nn.ModuleList( + [ + layer_class( + n_state=n_state, + n_head=n_head, + cross_attention=False, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + for _ in range(n_layer) + ] + ) + self.ln = LayerNorm(n_state) + + self.causal = causal + self.kv_cache = None + + def forward( + self, + x: Tensor, + mask: torch.Tensor = None, + kv_cache: Optional[dict] = None, + ): + if self.causal and mask is not None: + raise ValueError("Causal Transformer dones't allow mask") + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + + for block in self.blocks: + x = block(x, mask=mask, kv_cache=kv_cache) + + x = self.ln(x) + return x + + def init(self): + self.kv_cache, self.hooks = install_kv_cache_hook(self, self.kv_cache) + return self.kv_cache + + def reset(self,): + for hook in self.hooks: + hook.remove() + self.kv_cache = None + + +class LayerNorm(nn.LayerNorm): + def forward(self, x: Tensor) -> Tensor: + return super().forward(x.float()).type(x.dtype) + + +class Linear(nn.Linear): + def forward(self, x: Tensor) -> Tensor: + return F.linear( + x, + self.weight.to(x.dtype), + None if self.bias is None else self.bias.to(x.dtype), + ) + + +class ResidualAttentionBlockAdaLN(ResidualAttentionBlock): + def __init__( + self, + n_state: int, + n_head: int, + cross_attention: bool = False, + causal: bool = False, + qk_norm: bool = False, + dropout: float = 0.0, + ): + super(ResidualAttentionBlockAdaLN, self).__init__( + n_state=n_state, + n_head=n_head, + cross_attention=cross_attention, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.attn_ln = AdaLN(n_state) + self.mlp_ln = AdaLN(n_state) + + def forward( + self, + x: Tensor, + level: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + x = x + self.attn_dropout( + self.attn(self.attn_ln(x, level), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn( + self.cross_attn_ln(x, level), xa, kv_cache=kv_cache + ) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x, level))) + return x + + +class ValleNARDecoder(TransformerDecoder): + def __init__( + self, + n_level: int, + n_ctx: int, + n_state: int, + n_head: int, + n_layer: int, + causal: bool = False, + qk_norm: bool = False, + dropout: float = 0.0, + layer_class=ResidualAttentionBlockAdaLN, + ): + + super().__init__( 
+ n_ctx=n_ctx, + n_state=n_state, + n_head=n_head, + n_layer=n_layer, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + layer_class=layer_class, + ) + + self.level_emb = nn.Embedding(n_level, n_state) + self.ln = AdaLN(n_state) + + def forward( + self, + x: Tensor, + level: Tensor, + mask: Tensor = None, + kv_cache: Optional[dict] = None, + ): + if self.causal and mask is not None: + raise ValueError("mask is not allowed when causal") + + level = self.level_emb(level) + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + + for block in self.blocks: + x = block(x, level=level, mask=mask, kv_cache=kv_cache) + + x = self.ln(x, level) + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + n_state: int, + n_head: int, + causal: bool = False, + qk_norm: bool = False, + dropout: float = 0.0, + ): + super().__init__() + assert n_state % n_head == 0 + self.n_head = n_head + self.query = Linear(n_state, n_state) + self.key = Linear(n_state, n_state, bias=False) + self.value = Linear(n_state, n_state) + self.out = Linear(n_state, n_state) + self.causal = causal + self.dropout = dropout + + self.qk_norm = qk_norm + if qk_norm: + self.q_norm = LayerNorm(n_state // n_head) + self.k_norm = LayerNorm(n_state // n_head) + + if not hasattr(F, "scaled_dot_product_attention"): + raise ValueError("Install torch 2.0.1+ to support Flash Attention") + + try: + from flash_attn import flash_attn_func + + self.flash_attn_func = flash_attn_func + except ImportError: + self.flash_attn_func = None + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. + k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. 
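+            # NOTE: the cache is keyed by the projection modules themselves
+            # (see install_kv_cache_hook), so the previously computed key and
+            # value projections can be looked up directly.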
+ k = kv_cache[self.key] + v = kv_cache[self.value] + + wv = self.qkv_attention(q, k, v, mask) + + return self.out(wv) + + def qkv_attention( + self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None + ): + if self.causal and mask is not None: + raise ValueError("mask is not allowed when the attention is causal") + + if self.causal and q.size(1) == k.size(1): + causal = True + else: + causal = False + + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + + if self.qk_norm: + q = self.q_norm(q) + k = self.k_norm(k) + + if self.flash_attn_func is not None and mask is None and self.training: + wv = self.flash_attn_func( + q.transpose(1, 2), + k.transpose(1, 2), + v.transpose(1, 2), + dropout_p=self.dropout, + causal=causal, + ).flatten(start_dim=2) + else: + wv = ( + F.scaled_dot_product_attention( + q, k, v, mask, is_causal=causal, dropout_p=self.dropout + ) + .permute(0, 2, 1, 3) + .flatten(start_dim=2) + ) + + return wv + + +class AdaLN(nn.Module): + def __init__(self, n_state, eps=1e-5): + super().__init__() + self.weight = nn.Linear(n_state, n_state, bias=False) + self.bias = nn.Linear(n_state, n_state, bias=False) + nn.init.constant_(self.weight.weight, 1.0) + nn.init.constant_(self.bias.weight, 0.0) + + self.n_state = n_state + self.eps = eps + + def forward(self, x: Tensor, level_emb: Tensor): + w = self.weight(level_emb).unsqueeze(1) + b = self.bias(level_emb).unsqueeze(1) + x = nn.functional.layer_norm(x, (self.n_state,), eps=self.eps) + x = w * x + b + return x + + +def install_kv_cache_hook(model, cache): + cache = {**cache} if cache is not None else {} + hooks = [] + + def save_to_cache(module, _, output): + if module not in cache: + # save as-is, for the first token or cross attention + cache[module] = output + else: + cache[module] = torch.cat([cache[module], output], dim=1).detach() + return cache[module] + + def install_hooks(layer: torch.nn.Module): + if isinstance(layer, MultiHeadAttention): + hooks.append(layer.key.register_forward_hook(save_to_cache)) + hooks.append(layer.value.register_forward_hook(save_to_cache)) + + model.apply(install_hooks) + return cache, hooks + + +def logits_to_tokens( + logits: torch.Tensor, + opts: SpeechLMInferenceOptions, + mask: torch.Tensor, + search_algo: str = None, + allow_eos: bool = True, + nq_level: int = None, +): + """ + Select the generated tokens and their scores based on logits prediction. + + logits (torch.Tensor), predicted logits, of size [B, T, nq, V] + opts (SpeechLMInferenceOptions): search options + mask (torch.Tensor): mask to specify valid tokens, of size [B, 1, nq, V] + search_algo (str): search algorithm + allow_eos (bool): whether to allow end-of-sentence prediction + nq_level (int or None): if not None, only conpute the specified codec level nq. 
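+
+    Returns:
+        gen_token_idx (torch.Tensor): selected token indices, of size [B, T, nq]
+            (matching the first three dimensions of the input logits)
+        gen_token_score (torch.Tensor): log-probabilities of the selected
+            tokens, of the same size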
+ + """ + + assert logits.dim() == 4 + search_algo = search_algo if search_algo is not None else opts.search_algo + neg_inf = torch.finfo(logits.dtype).min + + # (1) Apply mask + if nq_level is not None: + mask = mask[:, :, nq_level : nq_level + 1] + + if allow_eos: + mask = mask.clone() + mask[:, :, 0, opts.eos] = False + + logits.masked_fill_(mask, neg_inf) + + # (2) token selection + if search_algo in ["topk_sampling"]: + topk_values, topk_indices = torch.topk(logits, opts.top_k, dim=-1) + probs = torch.softmax(topk_values / opts.sampling_temperature, dim=-1) + inner_indices = torch.multinomial( + probs.flatten(end_dim=-2), num_samples=1 + ).view(probs[..., :1].size()) + gen_token_idx = torch.gather(topk_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["topp_sampling"]: + probs = torch.softmax(logits / opts.sampling_temperature, dim=-1) + sorted_probs, sorted_indices = torch.sort(probs, descending=True) + accum_probs = torch.cumsum(sorted_probs, dim=-1) + clip_probs = torch.where(accum_probs <= opts.top_p, sorted_probs, 0.0) + # always keep at least one candidate no matter what value it is + if torch.any(clip_probs[..., 0] == 0.0): + clip_probs[..., 0] = sorted_probs[..., 0] + clip_probs = clip_probs / clip_probs.sum(dim=-1, keepdim=True) + inner_indices = torch.multinomial( + clip_probs.flatten(end_dim=-2), num_samples=1 + ).view(clip_probs[..., :1].size()) + gen_token_idx = torch.gather(sorted_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(clip_probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["greedy_search", "teacher_force"]: + probs = logits.softmax(dim=-1) + topk_values, topk_indices = torch.topk(logits, 1, dim=-1) + gen_token_idx = topk_indices[:, :, :, 0] + gen_token_score = topk_values[:, :, :, 0].log() + + else: + raise NotImplementedError(f"opts.search_algo={opts.search_algo}") + + return gen_token_idx, gen_token_score + + +@torch.no_grad() +def install_continuous_features( + dec_emb: torch.Tensor, + enc_emb: Optional[torch.Tensor] = None, + conti_feats: Tuple = None, +): + if conti_feats is None: + return dec_emb, enc_emb + + assert dec_emb.size(0) == len(conti_feats) + if enc_emb is not None: + assert enc_emb.size(0) == len(conti_feats) + + for b, conti_feat in enumerate(conti_feats): + for conti_emb, start, end, part in conti_feat: + if part == "dec": + assert conti_emb.size(1) == dec_emb.size(2) + dec_emb[b, start:end] = conti_emb + else: + assert conti_emb.size(1) == enc_emb.size(2) + enc_emb[b, start:end] = conti_emb + + return dec_emb, enc_emb + + +def modality_index_to_mask( + modality_index: torch.Tensor, inference_opts: SpeechLMInferenceOptions, +): + assert modality_index.dim() == 1 + modality_index = modality_index.cpu().tolist() + mask = torch.stack( + [inference_opts.masks[idx] for idx in modality_index], dim=0 + ).unsqueeze( + 1 + ) # [B, 1, nq, V] + + return mask + + +def masked_nll_loss( + log_probabilities, targets, mask, allowed_len_diff=3, reduction="mean" +): + """Similar to the standard nll_loss from SpeechBrain + but applies a custom mask + + Arguments + --------- + log_probabilities : torch.Tensor + The probabilities after log has been applied. + Format is [batch, log_p] or [batch, frames, log_p]. + targets : torch.Tensor + The targets, of shape [batch] or [batch, frames]. 
+ mask : torch.Tensor + The mask for loss calculation + allowed_len_diff : int + Length difference that will be tolerated before raising an exception. + reduction : str + Options are 'mean', 'batch', 'batchmean', 'sum'. + See pytorch for 'mean', 'sum'. The 'batch' option returns + one loss per item in the batch, 'batchmean' returns sum / batch size. + """ + log_probabilities, targets = truncate( + log_probabilities, targets, allowed_len_diff + ) + log_probabilities = log_probabilities.transpose(1, -1) + loss = torch.nn.functional.nll_loss( + input=log_probabilities, target=targets.long(), reduction="none" + ) + loss *= mask + loss = reduce_loss(loss, mask, reduction, 0.0, log_probabilities, targets) + return loss From 3ce4d4daaa4bb284f113185399af30ff76fe68f1 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 17:30:43 -0500 Subject: [PATCH 070/270] DASB: Fixes --- .../LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ce4e6edaa..9027a945b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -151,7 +151,7 @@ activation: !name:torch.nn.GELU audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False -audio_emb_pretrained: True +audio_emb_pretrained: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice From 57d68cf97c9eb5e3c8113b6542eb6dd5877c6366 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 18:15:24 -0500 Subject: [PATCH 071/270] DASB: Fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 75cbae717..e88469b3b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -34,7 +34,6 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 @@ -211,7 +210,7 @@ modules: opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:model.Tokotron.Tokotron.oss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref From c1c3b52fcf05667970c8a7ea9fbd3e43ddbaf94f Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 21 Jan 2025 14:00:27 -0500 Subject: [PATCH 072/270] DASB: Add a "brokenness check" to ensure that tokens runs that produce no samples at all (non-intelligble speech) are not picked by the hyperparameter optimizer --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 6 +++ .../DASB/LJSpeech/TTS/tokotron/train.py | 53 ++++++++++++++++--- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index e7ffe2576..717532724 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -52,3 +52,9 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean 
dwer: asr_dwer_median + +eval_threshold: + dwer_max: 90.0 + +eval_threshold_set: + utmos: 0.0 \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 8945607f9..05f81b341 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -273,11 +273,7 @@ def on_stage_end(self, stage, stage_loss, epoch): # End evaluation and report stats if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): self.evaluator.on_evaluate_end() - eval_summary = self.evaluator.compute_summary() - eval_summary_stats = { - key: eval_summary.get(value) - for key, value in self.hparams.eval_summary_log.items() - } + eval_summary_stats = self.get_summary_stats() stage_stats.update(eval_summary_stats) # Perform end-of-iteration things, like annealing, logging, etc. @@ -303,6 +299,51 @@ def on_stage_end(self, stage, stage_loss, epoch): meta={"loss": stage_stats["loss"]}, min_keys=["loss"], ) + def get_summary_stats(self): + """Retrieves the stats that needs to be reported on every trial + in the train log, as indicated in eval_summary_log in eval.yaml + + Returns + ------- + eval_summary_stats : dict + A dict with stats""" + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + self._check_threshold(eval_summary_stats) + return eval_summary_stats + + def _check_threshold(self, eval_summary_stats): + """Checks threshold values for the defined stats and terminates + the trials if the parameters are not met. This is necessary because + some metrics produce bogus high values when the speech samples + do not contain any speech at all (e.g. UTMOS can be above 3 for + silence). + + Classic usage: dWER > 0.9 - treat the whole run as "garbage", set + UTMOS to 0 + + Arguments + --------- + eval_summary_stats : dict + Summary statistics + """ + for key, threshold_value in self.hparams.eval_threshold.items(): + key, threshold_type = key.split("_") + value = eval_summary_stats[key] + if threshold_type == "min": + meets = value >= threshold_value + elif threshold_type == "max": + meets = value <= threshold_value + else: + raise ValueError(f"Invalid threshold definition: {key}, check eval_threshold") + if not meets: + eval_summary_stats["broken"] = True + for key, value in self.hparams.eval_threshold_set.items(): + eval_summary_stats[key] = value + def fit_batch(self, batch): """Fit one batch, override to do multiple updates. 
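For reference, the brokenness check introduced above can be reproduced in isolation. The sketch below is not part of the patch; it plugs the `eval_threshold` and `eval_threshold_set` entries from eval.yaml into the same key-parsing logic as `_check_threshold`, with made-up metric values and the error branch omitted:

    # Standalone illustration of the threshold check (hypothetical stats values)
    eval_threshold = {"dwer_max": 90.0}        # keys follow "<metric>_<min|max>"
    eval_threshold_set = {"utmos": 0.0}        # values forced when a run is flagged
    eval_summary_stats = {"utmos": 3.2, "dwer": 95.0}

    for key, threshold_value in eval_threshold.items():
        metric, threshold_type = key.split("_")
        value = eval_summary_stats[metric]
        meets = (
            value >= threshold_value
            if threshold_type == "min"
            else value <= threshold_value
        )
        if not meets:
            eval_summary_stats["broken"] = True
            eval_summary_stats.update(eval_threshold_set)

    # eval_summary_stats -> {"utmos": 0.0, "dwer": 95.0, "broken": True}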
@@ -371,7 +412,7 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device + self.modules.tokenizer.device = self.device if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device From 123e12454d4466343da74075bb62daf90a8cabd4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 21 Jan 2025 14:48:09 -0500 Subject: [PATCH 073/270] DASB: Tokotron/VALL-E: Work in progress --- .../TTS/tokotron/hparams/train_encodec.yaml | 1 - .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 2 - .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 2 +- .../TTS/tokotron/hparams/train_dac.yaml | 11 - .../tokotron/hparams/train_discrete_ssl.yaml | 1 - .../TTS/tokotron/hparams/train_encodec.yaml | 1 - .../hparams/train_speech_tokenizer.yaml | 7 - .../DASB/LibriTTS/TTS/tokotron/train.py | 58 +- .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 0 .../TTS/valle/hparams/train_discrete_ssl.yaml | 280 ++++++ .../TTS/valle/hparams/train_encodec.yaml | 225 +++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 866 ++++++++++++++++++ benchmarks/DASB/model/valle.py | 103 ++- 13 files changed, 1434 insertions(+), 123 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/train.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 0e923ffc9..39b28b437 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -32,7 +32,6 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 377b5955c..aa7ee2c4b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -10,7 +10,6 @@ import speechbrain as sb import json import logging -import math import csv import torch import torchaudio @@ -19,7 +18,6 @@ from pathlib import Path from types import SimpleNamespace from torch.nn import ModuleDict -from tqdm.auto import tqdm from data import undo_batch from eval import vocoder_to_device from torch.utils.flop_counter import FlopCounterMode diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index 18e39ba42..bafd769cc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -67,4 +67,4 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean dwer: asr_dwer_median - spk_sim: spk_sim_score_mean + spk_sim: spk_sim_score_mean \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 8c05d2499..5670aa208 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -33,7 +33,6 
@@ test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 @@ -129,16 +128,6 @@ use_silence_padding: True # Token model (pretrained) -dac: !new:speechbrain.lobes.models.discrete.dac.DAC - sample_rate: !ref - model_type: !ref - model_bitrate: !ref - load_pretrained: True - -token_model: !new:benchmarks.DASB.model.custom_model.DACFeatureExtractor - dac: !ref - n_quantizers: !ref - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 5c8db0bc4..88bc91aef 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -36,7 +36,6 @@ test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 30d2cbfe4..be49b69f6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -36,7 +36,6 @@ test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index c307ed0bf..aac74070a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -130,13 +130,6 @@ use_silence_padding: True # Token model (pretrained) -# Token model (pretrained) -token_model: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerInterface - source: !ref - save_path: !ref - shape: compat - codebooks: !ref - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index da228d6ae..3ab03d0a3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -84,7 +84,7 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device + self.modules.tokenizer.device = self.device if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device @@ -383,62 +383,6 @@ def evaluate_batch(self, batch, stage): self.evaluator.evaluate_batch(batch) return loss.detach().cpu() - def make_dataloader( - self, dataset, stage, ckpt_prefix="dataloader-", **loader_kwargs - ): - """A custom override of make_dataloader that will change the batch - size if guides are enabled to meet GPU memory constraints - - Arguments - --------- - dataset : Dataset - A set of 
data to use to create data loader. If the Dataset is a - DynamicItemDataset, PaddedBatch is used as the default collate_fn, - unless specified in loader_kwargs. - stage : Stage - The stage of the experiment: Stage.TRAIN, Stage.VALID, Stage.TEST - ckpt_prefix : str, None - Prefix to use for SaveableDataLoader Checkpoint name. The Stage - name is added to this to create the full key. Set to None to not - save the DataLoader. - **loader_kwargs : dict - Additional keyword arguments to the DataLoader. - E.g., batch_size, num_workers, pin_memory. - - Returns - ------- - DataLoader for the input dataset - """ - if stage == sb.Stage.TRAIN and not getattr( - self, "_ckpt_recovered", False - ): - self.checkpointer.recover_if_possible() - self._ckpt_recovered = True - if self.guides_running(pre_epoch=True): - loader_kwargs["batch_size"] = self.hparams.batch_size_guided - return super().make_dataloader( - dataset=dataset, - stage=stage, - ckpt_prefix=ckpt_prefix, - **loader_kwargs, - ) - - def guides_running(self, pre_epoch=False): - """Determines whether guides are currently running - - Arguments - --------- - pre_epoch : bool - If enabled, a correction will be applied to the current epoch - indicating that the current epoch has not yet started""" - epoch = self.hparams.epoch_counter.current - if pre_epoch: - epoch += 1 - return ( - self.hparams.guides_enabled - and epoch >= self.hparams.guides_start_epoch - ) - def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..b5eb30f62 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,280 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/discrete_ssl +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +ssl_model_type: wavlm +representation_mode: discrete +output_folder: !ref results/tokotron/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +vocoder_model_name: !ref unithifigan-dasb--discrete +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large-960h-lv60-self +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +ssl_model_layers: [1, 3, 7, 12, 18, 23] +token_model_layers: !ref +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref +silence_padding: !ref +use_silence_padding: True + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + representation_mode: discrete + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: 
!ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: discrete + + +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler + lr_initial: [!ref , !ref ] + n_warmup_steps: !ref + param_group: 0 + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..39b28b437 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,225 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git 
a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py new file mode 100644 index 000000000..ebcc78015 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -0,0 +1,866 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio +from speechbrain.dataio.dataio import write_audio +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats + +logger = logging.getLogger(__name__) + +SPECIAL_TOKEN_COUNT = 1 + + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() + wav = self.modules.tokenizer.tokens_to_sig(audio) + clean_padding_(wav, length) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + prompt, prompt_length = batch.prompt + batch_size, prompt_max_len, num_tracks = prompt.shape + nar_track = torch.randint( + 1, num_tracks, (batch_size,), + device=self.device + ) + logits_ar, logits_nar = self.modules.model( + dec_seq=batch.prompt.data, + dec_seq_lengths=batch.prompt.lengths, + prefix_len=batch.prefix_length / prompt_max_len, + nar_level_idx=nar_track + ) + return logits_ar, logits_nar, nar_track + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. 
+ batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + logits_ar_sm = self.hparams.log_softmax(logits_ar) + logits_nar_sm = self.hparams.log_softmax(logits_nar) + batch_size, max_len, _ = prompt.shape + targets_ar = prompt[:, 1:, 0] + batch_idx = torch.arange(batch_size, device=prompt.device) + targets_nar = prompt[batch_idx, 1:, nar_track] + prompt_max_len = prompt.size(1) + length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len) + prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_ar = self.hparams.compute_cost( + log_probabilities=logits_ar_sm, + targets=targets_ar, + mask=mask + ) + self.loss_metric_ar.append( + ids=batch.uttid, + log_probabilities=logits_ar_sm, + targets=targets_ar, + mask=mask, + reduction="batch", + ) + loss_nar = self.hparams.compute_cost( + log_probabilities=logits_nar_sm, + targets=targets_nar, + mask=mask, + ) + self.loss_metric_nar.append( + ids=batch.uttid, + log_probabilities=logits_nar_sm, + targets=targets_nar, + mask=mask, + reduction="batch", + ) + loss = loss_ar + loss_nar + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + self.offsets = get_offsets( + self.hparams.vocab_size, + self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.hparams.compute_cost, batch_eval=True, + ) + self.loss_metric_ar = sb.utils.metric_stats.MetricStats( + metric=self.hparams.compute_cost, + batch_eval=True, + ) + self.loss_metric_nar = sb.utils.metric_stats.MetricStats( + metric=self.hparams.compute_cost, + batch_eval=True, + ) + + # TOOO: Reestablish evaluation + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + return epoch % self.hparams.eval_interval == 0 + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. 
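Stepping back briefly to the objective above: the AR and NAR cross-entropies are only evaluated on audio positions, i.e. steps that fall inside the valid (non-padded) length but past the text prefix. A minimal sketch of that mask in plain PyTorch, assuming relative lengths in [0, 1] as produced by PaddedBatch (illustrative only; the recipe itself relies on speechbrain's length_to_mask):

import torch

def prompt_loss_mask(rel_lengths, prefix_lengths, max_len):
    # rel_lengths: [B] relative lengths in [0, 1] (PaddedBatch convention)
    # prefix_lengths: [B] absolute text-prefix lengths, in tokens
    # max_len: padded prompt length
    positions = torch.arange(max_len)[None, :]                     # [1, T]
    abs_lengths = (rel_lengths * max_len).round().long()[:, None]  # [B, 1]
    inside_length = positions < abs_lengths                        # valid, non-padded steps
    after_prefix = positions >= prefix_lengths[:, None]            # drop the text-prompt positions
    # The one-step shift mirrors targets = prompt[:, 1:] in compute_objectives
    return (inside_length & after_prefix)[:, 1:]

# One 10-step prompt with a 4-token text prefix and no padding
mask = prompt_loss_mask(torch.tensor([1.0]), torch.tensor([4]), 10)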
+ self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, + wav=wav, + length=audio_length, + stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. 
+ self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference_results = [ + self.modules.model.inference( + prefix=prefix_item.unsqueeze(0), + opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[None, :] + tracks = torch.arange(self.hparams.audio_tokens_per_step, device=self.device)[:, None] + track_start = ( + self.hparams.text_num_tokens + + self.hparams.special_num_tokens + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + return self.hparams.inference_opts( + masks={ + self.hparams.bos_index: mask + }, + device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio(file_name, sample, self.hparams.model_sample_rate) + + def save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. 
+ silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], + hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + @sb.utils.data_pipeline.takes("uttid", "tokens") + @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") + def prompt_pipeline(id, tokens): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=hparams["audio_tokens_per_step"] + ) + + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [text_pipeline, tokens_pipeline, prompt_pipeline] + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length" + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_dynamic_items.append(sig_pipeline) + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. 
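Before the sorting logic below, it is worth spelling out the prompt layout built by prompt_pipeline above: text tokens, the BOS/EOT/EOS markers and the codec tokens all share a single vocabulary, and each codec track k is shifted by audio_token_shift + k * vocab_size so the tracks occupy disjoint ranges. A small sketch with made-up sizes (audio_token_shift and vocab_size come from the hparams file in the actual recipe):

import torch

# Made-up sizes for illustration; the real values come from the hparams file
text_num_tokens, special_num_tokens, vocab_size, tracks = 39, 3, 1024, 2
audio_token_shift = text_num_tokens + special_num_tokens

offsets = torch.arange(tracks) * vocab_size       # same idea as get_offsets(vocab_size, tracks)

# Raw codec tokens: 4 frames x 2 tracks, values in [0, vocab_size)
audio = torch.randint(0, vocab_size, (4, tracks))
shifted = audio + audio_token_shift + offsets     # track 0 -> [42, 1066), track 1 -> [1066, 2090)

# Undoing the shift recovers the codec indices, as create_waveform() does before vocoding
recovered = (shifted - audio_token_shift - offsets).clip(min=0)
assert torch.equal(recovered, audio)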
+ if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + return datasets + + +def get_offsets(vocab_size, tracks): + """Adds offsets to each track to treat the tokens as distinct + + Arguments + --------- + vocab_size : int + The vocabulary size, for each track + tracks : int + The number of tracks + """ + return torch.arange(tracks) * vocab_size + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + return encoder + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. 
+ + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, _, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. + + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from ljspeech_prepare import prepare_ljspeech + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + from libritts_prepare import prepare_libritts + + # Data preparation, to be run on only one process. 
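One detail worth illustrating before the data-preparation call below: because the ESPNET-style VALL-E model does not support batched inference, the per-utterance outputs are re-batched with batch_pad_right and later unpadded again with the undo_padding_tensor helper defined above. The lengths are relative, so the round trip looks roughly like this (shapes are arbitrary):

import torch
from speechbrain.utils.data_utils import batch_pad_right

# Two variable-length token sequences, e.g. [T, n_tracks]
seqs = [torch.ones(5, 2), torch.ones(8, 2)]
padded, rel_lengths = batch_pad_right(seqs)           # padded: [2, 8, 2]; rel_lengths are fractions of the max length
restored = undo_padding_tensor(padded, rel_lengths)   # helper defined earlier in this file
assert [s.shape[0] for s in restored] == [5, 8]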
+ if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] + if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] + if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "model_name": hparams["model"].__class__.__name__, + }, + ) + + + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 11311d9d8..245ac0fd9 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -75,17 +75,17 @@ class ValleLM(nn.Module): def __init__( self, - vocab_size: int, - nq: int, - pad_id: int = 0, - share_emb: bool = True, - qk_norm: bool = False, - dropout: float = 0.0, - att_unit: int = 256, - head: int = 2, - ar_layer: int = 4, - nar_layer: int = 4, - n_ctx: int = 3000, + vocab_size, + nq, + pad_id=0, + share_emb=True, + qk_norm=False, + dropout=0.0, + att_unit=256, + head=2, + ar_layer=4, + nar_layer=4, + n_ctx=3000, ): super().__init__() @@ -120,23 +120,30 @@ def __init__( def forward( self, - dec_seq: torch.Tensor, - dec_seq_lengths: torch.Tensor = None, - prefix_len: torch.Tensor = None, - conti_feats: Tuple = None, + dec_seq, + dec_seq_lengths=None, + prefix_len=None, + conti_feats=None, nar_level_idx=1, - ) -> Tuple[torch.Tensor, torch.Tensor, Dict]: + ): """Vall-E forward for training - Args: - dec_seq (LongTensor): Batch of decoder sequences (B, T, nq). - dec_seq_lengths (LongTensor): Lengths of batched decoder sequences (B,). - enc_seq (LongTensor): Batch of encoder sequences (B, T, nq), keep - the interface, may not be used. - enc_seq_lengths (LongTensor): Lengths of batched encoder sequences (B,), - keep the interface, may not be used. - prefix_len (LongTensor): Lengths of condition part in dec_seq (B,). - compute_loss (bool): whether to compute loss or just logits. + Arguments + --------- + dec_seq : torch.Tensor + Batch of decoder sequences (B, T, nq). + dec_seq_lengths : torch.Tensor + Lengths of batched decoder sequences (B,). + enc_seq : torch.Tensor + Batch of encoder sequences (B, T, nq), keep + the interface, may not be used. 
+ enc_seq_lengths : torch.Tensor + Lengths of batched encoder sequences (B,), + keep the interface, may not be used. + prefix_len : torch.Tensor + Lengths of condition part in dec_seq (B,). + nar_level_idx : int + the index of the non-autoregressive level to train """ assert dec_seq.dim() == 3 @@ -196,19 +203,24 @@ def prepare_input(self, dec_seq_emb, prefix_len, level): @torch.no_grad() def inference( self, - prefix: torch.Tensor, - opts: SpeechLMInferenceOptions, - enc_seq: torch.Tensor = None, - suffix: torch.Tensor = None, + prefix, + opts, + enc_seq=None, + suffix=None, ): """Vall-E Inference. - Args: - prefix (LongTensor): Prefix part of dec_seq (B, T, nq). - opts (SpeechLMInferenceOptions): inference options. - enc_seq (LongTensor): Encoder token sequence (B, T, nq). - suffix (LongTensor): suffix part of dec_seq (B, T, nq), - usually the target sequence for teacher-forcing. + Arguments + --------- + prefix : torch.Tensor + Prefix part of dec_seq (B, T, nq). + opts : SpeechLMInferenceOptions + inference options. + enc_seq : torch.Tensor + Encoder token sequence (B, T, nq). + suffix : torch.Tensor + suffix part of dec_seq (B, T, nq), + usually the target sequence for teacher-forcing. """ # (1) initialization @@ -783,13 +795,20 @@ def logits_to_tokens( """ Select the generated tokens and their scores based on logits prediction. - logits (torch.Tensor), predicted logits, of size [B, T, nq, V] - opts (SpeechLMInferenceOptions): search options - mask (torch.Tensor): mask to specify valid tokens, of size [B, 1, nq, V] - search_algo (str): search algorithm - allow_eos (bool): whether to allow end-of-sentence prediction - nq_level (int or None): if not None, only conpute the specified codec level nq. - + Arguments + --------- + logits : torch.Tensor + predicted logits, of size [B, T, nq, V] + opts : SpeechLMInferenceOptions + search options + mask : torch.Tensor + mask to specify valid tokens, of size [B, 1, nq, V] + search_algo : str + search algorithm + allow_eos : bool + whether to allow end-of-sentence prediction + nq_level : int, optional + if not None, only conpute the specified codec level nq. 
""" assert logits.dim() == 4 From b1ca7adfdd5360905c41f4a6a9d74401a5e32d1f Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 21 Jan 2025 18:08:59 -0500 Subject: [PATCH 074/270] DASB: Tokotron: Implement SQCodec, Mimi and WavTokenizer (single-speaker) --- .../TTS/tokotron/hparams/train_dac.yaml | 3 +- .../tokotron/hparams/train_discrete_ssl.yaml | 1 + .../TTS/tokotron/hparams/train_encodec.yaml | 1 + .../TTS/tokotron/hparams/train_mimi.yaml | 225 +++++++++++++++++ .../hparams/train_speech_tokenizer.yaml | 2 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 228 +++++++++++++++++ .../tokotron/hparams/train_wavtokenizer.yaml | 229 ++++++++++++++++++ .../DASB/LJSpeech/TTS/tokotron/train.py | 39 ++- .../DASB/LJSpeech/extraction/hparams/dac.yaml | 2 +- .../extraction/hparams/discrete_ssl.yaml | 2 +- .../LJSpeech/extraction/hparams/encodec.yaml | 2 +- .../LJSpeech/extraction/hparams/mimi.yaml | 56 +++++ .../extraction/hparams/speech_tokenizer.yaml | 2 +- .../LJSpeech/extraction/hparams/sqcodec.yaml | 55 +++++ .../extraction/hparams/wavtokenizer.yaml | 58 +++++ .../DASB/LibriTTS/TTS/tokotron/train.py | 22 +- benchmarks/DASB/model/Tokotron.py | 66 ++--- benchmarks/DASB/model/sq_codec.py | 3 +- 18 files changed, 930 insertions(+), 66 deletions(-) create mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml create mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml create mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index e88469b3b..946c6d8c1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -159,6 +159,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -168,8 +169,6 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 2 bandwidth: 1.5 -model_shape: BHL -model_needs_channel: True attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 68e54fa83..827da9a25 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -197,6 +197,7 @@ vocab_size: 1000 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False audio_emb_lr: 0.00001 audio_emb_weight_decay: 0.001 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 0e923ffc9..3bbe8468f 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -149,6 +149,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False 
+audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..ebcb2d17f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,225 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
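The lr and lr_warmup_steps values above are consumed by the NoamScheduler defined later in this file. As a rough reminder of the usual Noam-style schedule (a sketch, not necessarily SpeechBrain's exact implementation), the learning rate warms up roughly linearly for lr_warmup_steps and then decays with the inverse square root of the step:

def noam_lr(step, lr_initial=0.001, n_warmup_steps=10000):
    # Approximate Noam schedule: linear warmup, then ~1/sqrt(step) decay
    step = max(step, 1)
    scale = n_warmup_steps ** 0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)
    return lr_initial * scale

# The peak learning rate is reached around step == n_warmup_steps
rates = [noam_lr(s) for s in (1, 1000, 10000, 100000)]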
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 2048 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 9027a945b..ff51f8e32 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -152,6 +152,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -161,7 +162,6 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 2 bandwidth: 1.5 -model_shape: HBL attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..7ec88ba7d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -0,0 +1,228 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/sqcodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 19683 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: True +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 1 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..79fed90fe --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,229 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
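The guided_attention_weight and guided_attention_sigma values above control a guided-attention regularizer that pushes the decoder-encoder attention towards a diagonal alignment. A sketch of the classic formulation (Tachibana et al., 2017) is shown below; the actual TokotronLoss implementation may differ in detail:

import torch

def guided_attention_penalty(attn, sigma=0.5):
    # attn: [B, T_dec, T_enc] attention weights; returns a scalar that is
    # small when the alignment is roughly diagonal
    _, t_dec, t_enc = attn.shape
    n = torch.arange(t_dec, dtype=torch.float32)[:, None] / t_dec
    t = torch.arange(t_enc, dtype=torch.float32)[None, :] / t_enc
    soft_mask = 1.0 - torch.exp(-((n - t) ** 2) / (2 * sigma ** 2))  # [T_dec, T_enc]
    return (attn * soft_mask[None]).mean()

penalty = guided_attention_penalty(torch.softmax(torch.randn(2, 50, 30), dim=-1))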
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 4096 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 1 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 05f81b341..44c9804ec 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -22,7 +22,7 @@ import string from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataio import clean_padding_ +from speechbrain.dataio.dataio import clean_padding, clean_padding_ from speechbrain.utils.distributed import run_on_main base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) @@ -120,6 +120,17 @@ def prepare_features(self, batch): if self.representation_mode == RepresentationMode.DISCRETE: audio_bos, audio_bos_length = batch.audio_bos audio_tgt, audio_tgt_length = batch.audio_pad + if self.audio_token_offsets is not None: + audio_bos = torch.cat( + [ + audio_bos[:, :self.hparams.bos_width], + audio_bos[:, self.hparams.bos_width:] - self.audio_token_offsets, + ], + dim=1 + ) + clean_padding_(audio_bos, audio_bos_length) + audio_tgt = audio_tgt - self.audio_token_offsets + clean_padding_(audio_tgt, audio_tgt_length) else: wav, audio_length = batch.sig audio = self.modules.ssl_model(wav) @@ -136,6 +147,16 @@ def prepare_features(self, batch): audio_tgt_length = audio_length return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length + def get_token_offsets(self): + """Computes token offsets for tokenizers that require them""" + token_offsets = None + if self.hparams.audio_token_offsets: + token_offsets = (torch.arange( + self.hparams.audio_tokens_per_step, + device=self.device + ) * self.hparams.audio_num_tokens)[None, None, :] + return token_offsets + @torch.no_grad() def evaluate_batch(self, batch, stage): """Evaluate one batch, override for different procedure than train. @@ -249,6 +270,8 @@ def on_stage_start(self, stage, epoch): elif stage == sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True + + self.audio_token_offsets = self.get_token_offsets() def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. 
@@ -416,8 +439,11 @@ def create_waveform(self, audio, length): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) - clean_padding_(wav, length) + with torch.no_grad(): + if self.audio_token_offsets is not None: + audio = clean_padding(audio + self.audio_token_offsets, length) + wav = self.modules.tokenizer.tokens_to_sig(audio) + wav = clean_padding(wav, length) return wav def is_eval_epoch(self, epoch): @@ -529,13 +555,12 @@ def audio_ref_pipeline(wav): use_silence_padding and representation_mode == RepresentationMode.DISCRETE ): - silence_token, _ = get_silence_token( + silence_token = get_silence_token( hparams[model_key], model_kwargs=hparams.get("token_model_kwargs"), - extract_emb=False, - model_shape=hparams.get("model_shape", "BLH"), - unsqueeze=hparams.get("model_needs_channel", False), ) + if silence_token.dim() == 2: + silence_token = silence_token.squeeze(-1) else: silence_token = ( torch.ones(hparams["audio_tokens_per_step"], dtype=torch.int64) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml index ebf155bb2..b90054db6 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml @@ -12,7 +12,7 @@ save_folder: !ref /save train_log: !ref /extraction_log.txt # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] frozen_split_path: null diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml index c4c01f527..d50cb85ef 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml @@ -12,7 +12,7 @@ save_folder: !ref /save train_log: !ref /extraction_log.txt # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] frozen_split_path: null diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml index 0b07a6b1f..6de95de73 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml @@ -12,7 +12,7 @@ save_folder: !ref /save train_log: !ref /extraction_log.txt # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] frozen_split_path: null diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..22e15ef75 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml @@ -0,0 +1,56 @@ +# ############################################################################ +# Auido Tokenizer: Mimi +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/mimi +save_folder: 
!ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 23 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml index 54da4f210..f91d34908 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -12,7 +12,7 @@ save_folder: !ref /save train_log: !ref /extraction_log.txt # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] frozen_split_path: null diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..4f633cee4 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -0,0 +1,55 @@ +# ############################################################################ +# Auido Tokenizer: SQCodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks : 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..d23c25f96 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,58 @@ +# 
############################################################################ +# Auido Tokenizer: wavtokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavtokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks : 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index da228d6ae..600ee5f39 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -32,6 +32,7 @@ from model.Tokotron import ( RepresentationMode, + get_silence_repr, get_silence_token, use_silence_padding, feature_pad_to, @@ -575,22 +576,21 @@ def tokens_pipeline(label): else: audio_tokens_per_step = hparams["audio_tokens_per_step"] if use_silence_padding: - silence_token, silence_emb = get_silence_token( - hparams["tokenizer"], - extract_emb=True, - model_kwargs=hparams.get("token_model_kwargs"), - ) + if representation_mode == RepresentationMode.DISCRETE: + silence_padding = get_silence_token( + hparams["tokenizer"], + model_kwargs=hparams.get("token_model_kwargs"), + ) + else: + silence_padding = get_silence_repr( + hparams["ssl_model"], + ) else: - silence_token = ( + silence_padding = ( torch.ones(audio_tokens_per_step, dtype=torch.int64) * hparams["eos_index"] ) - silence_padding = ( - silence_token - if representation_mode == RepresentationMode.DISCRETE - else silence_emb - ) silence_padding = silence_padding.cpu() silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 949840380..1c76a2440 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -2059,8 +2059,6 @@ def decode(self, codes): def get_silence_token( model, sample_length=100000, - extract_emb=True, - model_shape="BLH", unsqueeze=False, device=None, model_kwargs=None, @@ -2074,13 +2072,6 @@ def get_silence_token( A discrete token model, taking (wav, lengths) as arguments sample_length : int The length of the sample - extract_emb : bool - Whether to extract embeddings - model_shape : str - The shape of tokens output by the model - BLH: Batch x Length x Heads 
(Discrete SSL, Encodec) - BHL: Batch x Heads x Length (DAC) - HBL: Heads x Batch x Length (SpeechTokenizer) unsqueeze: bool Whether to add an extra dimension to the audio (needed for DAC) device : str | torch.Device @@ -2108,43 +2099,38 @@ def get_silence_token( length = torch.ones(1, device=device) model_training = model.training model.eval() - if hasattr(model, "encode"): - spec = inspect.getfullargspec(model.encode) - if "length" in spec.args: - result = model.encode(audio, length, **model_kwargs) - else: - result = model.encode(audio, **model_kwargs) - else: - result = model(audio, length, **model_kwargs) + tokens = model.sig_to_tokens(audio, length) if model_training: model.train() - tokens = result if torch.is_tensor(result) else result[0] - if model_shape == "HBL": - tokens = tokens.permute(1, 2, 0) - elif model_shape == "BHL": - tokens = tokens.transpose(-1, -2) - tokens = tokens.squeeze(0) if unsqueeze: tokens = tokens.squeeze(0) silence_tokens = tokens.mode(0).values - silence_emb = None - if extract_emb: - if hasattr(model, "embeddings"): - silence_emb = model.embeddings( - silence_tokens[None, None, :] - ).squeeze() - else: - heads = tokens.shape[-1] - embs = result[1] - mode_idx = [ - (tokens[:, head] == silence_tokens[head]).nonzero()[0].item() - for head in range(heads) - ] - silence_emb = torch.stack( - [embs[0, idx, head] for head, idx in enumerate(mode_idx)] - ) - return silence_tokens, silence_emb + return silence_tokens + + +def get_silence_repr(model, sample_length=100000, device=None): + """Gets continuous silence + + Arguments + --------- + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + device : str | torch.Device + The device to use + + Returns + ------- + silence : torch.Tensor + A silecnce tensor + """ + audio = torch.zeros(1, sample_length, device=device) + length = torch.ones(1, device=device) + audio_repr = model(audio, length) + silence = audio_repr.mean(dim=1)[0] + return silence def feature_pad_to(tensor, length, padding=None): diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 4ac4b74ad..99a38c9bd 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -123,7 +123,8 @@ def build_codec_model(self, config): """ exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) - parameter_dict = torch.load(self.ckpt_path) + device = next(iter(scalar_codec.parameters())).device + parameter_dict = torch.load(self.ckpt_path, map_location=device) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec From 2daeaa5192ee113fbf144132d52cee87713c395c Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 21 Jan 2025 18:22:00 -0500 Subject: [PATCH 075/270] DASB: Cosmetic changes (pre-commit hooks) --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 2 +- .../TTS/tokotron/hparams/train_mimi.yaml | 6 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 6 +- .../tokotron/hparams/train_wavtokenizer.yaml | 10 +-- .../DASB/LJSpeech/TTS/tokotron/train.py | 30 ++++--- .../DASB/LJSpeech/TTS/valle/evaluation.py | 3 +- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 87 ++++++++++--------- .../LJSpeech/extraction/hparams/sqcodec.yaml | 4 +- .../extraction/hparams/wavtokenizer.yaml | 4 +- .../DASB/LibriSpeech/extraction/extract.py | 2 +- .../extraction/hparams/sqcodec.yaml | 4 +- .../extraction/hparams/wavtokenizer.yaml | 4 +- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 2 - 
.../DASB/LibriTTS/TTS/tokotron/train.py | 6 +- .../LibriTTS/extraction/hparams/mimi.yaml | 0 benchmarks/DASB/model/Tokotron.py | 3 +- 16 files changed, 87 insertions(+), 86 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index 717532724..f805e23f6 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -57,4 +57,4 @@ eval_threshold: dwer_max: 90.0 eval_threshold_set: - utmos: 0.0 \ No newline at end of file + utmos: 0.0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index ebcb2d17f..0d08747cc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -186,9 +186,9 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul infer_max_audio_length: !ref tokenizer: !new:utils.tokenizer_interface.MimiTokenizer - source: !ref - save_path: !ref - num_codebooks: !ref + source: !ref + save_path: !ref + num_codebooks: !ref modules: model: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 7ec88ba7d..0143ef65b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -189,9 +189,9 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul infer_max_audio_length: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer - save_path: !ref - checkpoint: !ref - config: !ref + save_path: !ref + checkpoint: !ref + config: !ref modules: model: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index 79fed90fe..df0a82050 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -188,11 +188,11 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul infer_max_audio_length: !ref tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper - source: !ref - save_path: !ref - checkpoint: !ref - config: !ref - freeze: True + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True modules: model: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 44c9804ec..e35635b2b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -123,10 +123,11 @@ def prepare_features(self, batch): if self.audio_token_offsets is not None: audio_bos = torch.cat( [ - audio_bos[:, :self.hparams.bos_width], - audio_bos[:, self.hparams.bos_width:] - self.audio_token_offsets, + audio_bos[:, : self.hparams.bos_width], + audio_bos[:, self.hparams.bos_width :] + - self.audio_token_offsets, ], - dim=1 + dim=1, ) clean_padding_(audio_bos, audio_bos_length) audio_tgt = audio_tgt - self.audio_token_offsets @@ -151,10 +152,12 @@ def get_token_offsets(self): """Computes token offsets for tokenizers that require them""" token_offsets = None if self.hparams.audio_token_offsets: - token_offsets = 
(torch.arange( - self.hparams.audio_tokens_per_step, - device=self.device - ) * self.hparams.audio_num_tokens)[None, None, :] + token_offsets = ( + torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + ) + * self.hparams.audio_num_tokens + )[None, None, :] return token_offsets @torch.no_grad() @@ -270,7 +273,7 @@ def on_stage_start(self, stage, epoch): elif stage == sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True - + self.audio_token_offsets = self.get_token_offsets() def on_stage_end(self, stage, stage_loss, epoch): @@ -325,7 +328,7 @@ def on_stage_end(self, stage, stage_loss, epoch): def get_summary_stats(self): """Retrieves the stats that needs to be reported on every trial in the train log, as indicated in eval_summary_log in eval.yaml - + Returns ------- eval_summary_stats : dict @@ -337,7 +340,7 @@ def get_summary_stats(self): } self._check_threshold(eval_summary_stats) return eval_summary_stats - + def _check_threshold(self, eval_summary_stats): """Checks threshold values for the defined stats and terminates the trials if the parameters are not met. This is necessary because @@ -361,7 +364,9 @@ def _check_threshold(self, eval_summary_stats): elif threshold_type == "max": meets = value <= threshold_value else: - raise ValueError(f"Invalid threshold definition: {key}, check eval_threshold") + raise ValueError( + f"Invalid threshold definition: {key}, check eval_threshold" + ) if not meets: eval_summary_stats["broken"] = True for key, value in self.hparams.eval_threshold_set.items(): @@ -556,8 +561,7 @@ def audio_ref_pipeline(wav): and representation_mode == RepresentationMode.DISCRETE ): silence_token = get_silence_token( - hparams[model_key], - model_kwargs=hparams.get("token_model_kwargs"), + hparams[model_key], model_kwargs=hparams.get("token_model_kwargs"), ) if silence_token.dim() == 2: silence_token = silence_token.squeeze(-1) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py index 152db4c87..6c2dd1c8d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -240,8 +240,7 @@ def summarize(self, field=None): "descriptive" ] for stat_key, value in descriptive_statistics( - items=self.details[evaluator_key], - key=metric_key, + items=self.details[evaluator_key], key=metric_key, ).items() } if field is not None: diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index e0ae084a3..92ea570da 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -20,8 +20,11 @@ import shutil from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio -from speechbrain.dataio.dataio import write_audio +from speechbrain.dataio.dataio import ( + clean_padding_, + length_to_mask, + write_audio, +) from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right import re @@ -30,7 +33,7 @@ base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) -from evaluation import SpeechEvaluationMetricStats +from evaluation import SpeechEvaluationMetricStats # noqa: E402 logger = logging.getLogger(__name__) @@ -55,7 +58,7 @@ def __init__( self.evaluation_metric = SpeechEvaluationMetricStats( self.hparams, self.device ) - + def create_waveform(self, audio, 
length): """Creates a waveform from a discrete or continuous audio representation @@ -75,7 +78,11 @@ def create_waveform(self, audio, length): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() + audio = ( + (audio - hparams["audio_token_shift"] - self.offsets) + .clip(min=0.0) + .int() + ) wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav @@ -99,14 +106,13 @@ def compute_forward(self, batch, stage): prompt, prompt_length = batch.prompt batch_size, prompt_max_len, num_tracks = prompt.shape nar_track = torch.randint( - 1, num_tracks, (batch_size,), - device=self.device + 1, num_tracks, (batch_size,), device=self.device ) logits_ar, logits_nar = self.modules.model( dec_seq=batch.prompt.data, dec_seq_lengths=batch.prompt.lengths, prefix_len=batch.prefix_length / prompt_max_len, - nar_level_idx=nar_track + nar_level_idx=nar_track, ) return logits_ar, logits_nar, nar_track @@ -142,14 +148,16 @@ def compute_objectives(self, predictions, batch, stage): batch_idx = torch.arange(batch_size, device=prompt.device) targets_nar = prompt[batch_idx, 1:, nar_track] prompt_max_len = prompt.size(1) - length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len) - prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not() + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, - targets=targets_ar, - mask=mask + log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask ) self.loss_metric_ar.append( ids=batch.uttid, @@ -159,9 +167,7 @@ def compute_objectives(self, predictions, batch, stage): reduction="batch", ) loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, - targets=targets_nar, - mask=mask, + log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, ) self.loss_metric_nar.append( ids=batch.uttid, @@ -185,20 +191,17 @@ def on_stage_start(self, stage, epoch): `None` during the test stage. 
""" self.offsets = get_offsets( - self.hparams.vocab_size, - self.hparams.audio_tokens_per_step, + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.hparams.compute_cost, batch_eval=True, ) self.loss_metric_ar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, - batch_eval=True, + metric=self.hparams.compute_cost, batch_eval=True, ) self.loss_metric_nar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, - batch_eval=True, + metric=self.hparams.compute_cost, batch_eval=True, ) # TOOO: Reestablish evaluation @@ -288,10 +291,7 @@ def evaluate_batch(self, batch, stage): wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) self.save_samples( - batch=batch, - wav=wav, - length=audio_length, - stage=stage + batch=batch, wav=wav, length=audio_length, stage=stage ) self.evaluation_metric.append( ids=batch.uttid, @@ -375,13 +375,14 @@ def inference(self, batch): prefix_items = undo_padding_tensor(prefix.int(), prefix_length) inference_results = [ self.modules.model.inference( - prefix=prefix_item.unsqueeze(0), - opts=self._get_inference_opts() - ) + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) for prefix_item in prefix_items ] inferred_tokens = [ - result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step) + result[0][0] + if result[0] + else torch.zeros(1000, self.hparams.audio_tokens_per_step) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) @@ -389,8 +390,12 @@ def inference(self, batch): return audio, audio_length def _get_inference_opts(self): - idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[None, :] - tracks = torch.arange(self.hparams.audio_tokens_per_step, device=self.device)[:, None] + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] track_start = ( self.hparams.text_num_tokens + self.hparams.special_num_tokens @@ -404,10 +409,7 @@ def _get_inference_opts(self): | (idx == self.hparams.bos_index) ).logical_not() return self.hparams.inference_opts( - masks={ - self.hparams.bos_index: mask - }, - device=self.device, + masks={self.hparams.bos_index: mask}, device=self.device, ) def save_samples(self, batch, wav, length, stage): @@ -435,7 +437,7 @@ def _get_eval_output_folder(self, stage): Path(self.hparams.output_folder) / "eval" / stage.name.lower() ) if epoch is not None: - output_folder = output_folder / str(epoch) + output_folder = output_folder / str(epoch) output_folder.mkdir(exist_ok=True, parents=True) return output_folder @@ -481,12 +483,11 @@ def dataio_prepare(hparams): label_encoder = hparams["label_encoder"] input_feature = INPUT_FEATURE_MAP[hparams["input"]] offsets = get_offsets( - hparams["vocab_size"], - hparams["audio_tokens_per_step"] + hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) if hparams["flip_layers"]: offsets = offsets.flip(-1) - + tokens_loader = hparams.get("tokens_loader") @sb.utils.data_pipeline.takes("label") @@ -505,7 +506,9 @@ def tokens_pipeline(label): return label_encoder.encode_sequence_torch(label) @sb.utils.data_pipeline.takes("uttid", "tokens") - @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", 
"prefix_length", "length" + ) def prompt_pipeline(id, tokens): audio = tokens_loader.tokens_by_uttid( id, num_codebooks=hparams["audio_tokens_per_step"] @@ -553,7 +556,7 @@ def sig_pipeline(wav): "audio", "prompt", "prefix_length", - "length" + "length", ] if use_spk_emb: prepared_features.append("spk_emb") diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml index 4f633cee4..0117d9afe 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -37,10 +37,10 @@ config: config.yaml checkpoint: ckpt_00190000.pth sample_rate: 16000 save_embedding: False -num_codebooks : 4 +num_codebooks: 4 save_path: /home/ubuntu/sq-codec/SQ-Codec -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref checkpoint: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml index d23c25f96..5fe91bbce 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -38,10 +38,10 @@ config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmean checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt sample_rate: 24000 save_embedding: False -num_codebooks : 1 +num_codebooks: 1 vocab_size: 4096 -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 5a54f72df..3979ba731 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -52,7 +52,7 @@ "skip_prep": hparams["skip_prep"], }, ) - + tokens_extractor = hparams["tokens_extractor"] data_folder = hparams["data_folder"] datasets = [] diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml index fe202c90d..44b4388c2 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml @@ -39,10 +39,10 @@ config: config.yaml checkpoint: ckpt_00190000.pth sample_rate: 16000 save_embedding: False -num_codebooks : 4 +num_codebooks: 4 save_path: /home/ubuntu/sq-codec/SQ-Codec -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref checkpoint: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml index bc1b56ddb..d1bb576a7 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -40,10 +40,10 @@ config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmean checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt sample_rate: 24000 save_embedding: False -num_codebooks : 1 +num_codebooks: 1 vocab_size: 4096 -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref save_path: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 377b5955c..aa7ee2c4b 100644 --- 
a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -10,7 +10,6 @@ import speechbrain as sb import json import logging -import math import csv import torch import torchaudio @@ -19,7 +18,6 @@ from pathlib import Path from types import SimpleNamespace from torch.nn import ModuleDict -from tqdm.auto import tqdm from data import undo_batch from eval import vocoder_to_device from torch.utils.flop_counter import FlopCounterMode diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 600ee5f39..31f8ae33a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -85,7 +85,7 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device + self.modules.tokenizer.device = self.device if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device @@ -582,9 +582,7 @@ def tokens_pipeline(label): model_kwargs=hparams.get("token_model_kwargs"), ) else: - silence_padding = get_silence_repr( - hparams["ssl_model"], - ) + silence_padding = get_silence_repr(hparams["ssl_model"],) else: silence_padding = ( torch.ones(audio_tokens_per_step, dtype=torch.int64) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 1c76a2440..010f3b26b 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -12,7 +12,6 @@ import math import torch -import inspect from torch import nn from torch.nn import functional as F from speechbrain.lobes.models.transformer.Transformer import ( @@ -2127,7 +2126,7 @@ def get_silence_repr(model, sample_length=100000, device=None): A silecnce tensor """ audio = torch.zeros(1, sample_length, device=device) - length = torch.ones(1, device=device) + length = torch.ones(1, device=device) audio_repr = model(audio, length) silence = audio_repr.mean(dim=1)[0] return silence From 99395f838f3157afe14b605c7f56f598f094613e Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 22 Jan 2025 01:36:40 -0500 Subject: [PATCH 076/270] DASB: Update sample rates --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 0d08747cc..04adb7926 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -84,7 +84,7 @@ gate_loss_max_weight: 1. # Feature parameters sample_rate: 22050 -model_sample_rate: 16000 +model_sample_rate: 24000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index df0a82050..a2b90e83a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -86,7 +86,7 @@ gate_loss_max_weight: 1. 
# Feature parameters sample_rate: 22050 -model_sample_rate: 16000 +model_sample_rate: 24000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 From 0971b8e50203b89d2b72841643c8b53674eb3548 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Thu, 23 Jan 2025 00:36:16 +0000 Subject: [PATCH 077/270] fix bug and update LibriSPeech recepie --- .../DASB/LibriSpeech/extraction/extract.py | 4 ++-- .../LibriSpeech/extraction/hparams/dac.yaml | 11 +++++---- .../extraction/hparams/discrete_ssl.yaml | 24 ++++++++++--------- .../extraction/hparams/encodec.yaml | 11 +++++---- .../LibriSpeech/extraction/hparams/mimi.yaml | 10 ++++---- .../extraction/hparams/speech_tokenizer.yaml | 12 ++++++---- .../extraction/hparams/sqcodec.yaml | 20 +++++++++------- .../extraction/hparams/wavtokenizer.yaml | 16 +++++++------ benchmarks/DASB/model/sq_codec.py | 13 +++++----- benchmarks/DASB/utils/tokenizer_interface.py | 7 ++---- speechbrain | 2 +- 11 files changed, 70 insertions(+), 60 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 5a54f72df..814d252be 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -46,13 +46,13 @@ "tr_splits": hparams["train_splits"], "dev_splits": hparams["dev_splits"], "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], + "save_folder": hparams["cached_data_folder"], "merge_lst": hparams["train_splits"], "merge_name": "train.csv", "skip_prep": hparams["skip_prep"], }, ) - + tokens_extractor = hparams["tokens_extractor"] data_folder = hparams["data_folder"] datasets = [] diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml index d2d935ed0..349597c55 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 7d4938625..cd8ae126e 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 
+batch_size: 1 num_workers: 8 src_key: wav id_key: id @@ -35,13 +36,14 @@ dataloader_opts: num_workers: !ref ### Configuration for discrete SSL model -# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | -# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| -# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | -# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | -# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|---------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-hubert-k1000-LibriTTS | +# | Wav2Vec2 | facebook/wav2vec2-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wav2vec2-k1000-LibriTTS | -# ssl_model_type: hubert, wavlm, wav2vec2 + +# ssl_model_type: HuBERT, WavLM, Wav2Vec2 # ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large ssl_model_type: WavLM ssl_hub: microsoft/wavlm-large diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index ee0a7e910..9f6c8b4ed 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml index e2dad7f95..f9720b170 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml @@ -13,15 +13,16 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv batch_size: 1 num_workers: 8 @@ -48,6 +49,7 @@ tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref save_path: !ref 
num_codebooks: !ref + sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 161d4e870..3090e9f79 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id @@ -45,6 +46,7 @@ save_embedding: False tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref + sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml index fe202c90d..9d5a6c24e 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id @@ -39,14 +40,15 @@ config: config.yaml checkpoint: ckpt_00190000.pth sample_rate: 16000 save_embedding: False -num_codebooks : 4 -save_path: /home/ubuntu/sq-codec/SQ-Codec +num_codebooks: 4 +tokenizer_save_path: !PLACEHOLDER -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref + sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml index bc1b56ddb..976614a3d 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv 
test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id @@ -40,15 +41,16 @@ config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmean checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt sample_rate: 24000 save_embedding: False -num_codebooks : 1 +num_codebooks: 1 vocab_size: 4096 -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref save_path: !ref checkpoint: !ref config: !ref + sample_rate: !ref freeze: True diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 4ac4b74ad..0e1ffe3f8 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -123,7 +123,8 @@ def build_codec_model(self, config): """ exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) - parameter_dict = torch.load(self.ckpt_path) + device = next(iter(scalar_codec.parameters())).device + parameter_dict = torch.load(self.ckpt_path, map_location=device) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec @@ -148,9 +149,9 @@ def _flatten_codebooks(self, arr, offset_size=None): ), "Input array must have 3 dimensions [B, N, D]" N, B, D = arr.shape arr = arr.copy() - if offset_size is not None: - for n in range(N): - arr[n, :, :] += offset_size * n + # if offset_size is not None: + # for n in range(N): + # arr[n, :, :] += offset_size * n flattened_arr = arr.transpose(1, 2, 0).reshape(B, N * D) return flattened_arr @@ -205,8 +206,8 @@ def decode(self, codes): T % self.n_codebook == 0 ), "Length T must be divisible by n_codebook" codes = codes.view(B, -1, self.n_codebook).permute(2, 0, 1) - for i in range(self.n_codebook): - codes[i, :, :] -= i * self.dim_codebook + # for i in range(self.n_codebook): + # codes[i, :, :] -= i * self.dim_codebook emb_quant = [] for i in range(self.n_codebook): tmp_list = decimal_to_ternary_matrix(codes[i, :, :], D=9) - 1 diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index a6103de4c..be73fda74 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -259,7 +259,6 @@ class SpeechTokenizerWrapper(SpeechTokenizer, BaseTokenizer): def __init__(self, *args, **kwargs): SpeechTokenizer.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) - self.sample_rate = 16000 @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): @@ -380,16 +379,15 @@ class MimiTokenizer(Mimi, BaseTokenizer): def __init__(self, *args, **kwargs): Mimi.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) - self.sample_rate= self.sampling_rate @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens, _ = self.encode(signal, lengths) if num_codebooks: - if tokens.shape[-1] < num_codebooks: + if tokens.shape[1] < num_codebooks: raise ValueError( - f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" + f"Model only outputs {tokens.shape[1]} codebooks, but {num_codebooks} requested" ) tokens = tokens[:, :num_codebooks, :] return tokens.movedim(-1, -2) @@ -436,7 +434,6 @@ class WavTokenizerWrapper(WavTokenizer, BaseTokenizer): def __init__(self, *args, **kwargs): WavTokenizer.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) - 
self.sample_rate = 24000 @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): diff --git a/speechbrain b/speechbrain index e602161f4..f07cfc76b 160000 --- a/speechbrain +++ b/speechbrain @@ -1 +1 @@ -Subproject commit e602161f4d305e13a26fc71b7dbe4a4cfeaa8847 +Subproject commit f07cfc76bd4b864c598a9ed5948caa3fe3176516 From 3d3e04ca233785633e4c0f0ecc5cc21206408345 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 22 Jan 2025 21:02:58 -0500 Subject: [PATCH 078/270] DASB: Tokotron: Add validation batch size customization (to avoid OOM) --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 04adb7926..3b7d1d5e8 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -55,6 +55,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 150 batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -118,7 +119,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: From 85353929e67da96a028a5d634ac3745736234e9a Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Thu, 23 Jan 2025 00:03:03 -0500 Subject: [PATCH 079/270] Update README.md --- benchmarks/DASB/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index 0ad632979..1c3f78818 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -158,8 +158,6 @@ bash run_experiments.sh --hparams benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/t This workflow ensures flexibility, efficiency, and reproducibility for both training scenarios. Adapt the recipes as needed for your specific requirements! 
-Here's a polished and formatted version for clarity, suitable for a README or documentation: - # 🎛️ Hyperparameter Tuning From 16912d56a3ca4a49ef88ab10dbc407df48aa3b85 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 23 Jan 2025 09:54:38 -0500 Subject: [PATCH 080/270] DASB: Tokotron: Minor fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index e35635b2b..1880b3049 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -449,6 +449,7 @@ def create_waveform(self, audio, length): audio = clean_padding(audio + self.audio_token_offsets, length) wav = self.modules.tokenizer.tokens_to_sig(audio) wav = clean_padding(wav, length) + wav = wav.to(self.device) return wav def is_eval_epoch(self, epoch): @@ -742,14 +743,17 @@ def apply_overfit_test(hparams, dataset): """ if hparams["overfit_test"]: if isinstance(dataset, tuple): - dataset_train, _, _ = dataset + dataset_train, dataset_valid, _ = dataset dataset_train = apply_overfit_test(hparams, dataset_train) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys(list(dataset_valid.pipeline.output_mapping.keys())) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) @@ -757,6 +761,7 @@ def apply_overfit_test(hparams, dataset): "train": dataset_train, "valid": dataset_eval, "test": dataset_eval, + "sample": dataset_eval, } else: result = dataset.overfit_test( From ec47b0de911d372c76b000f5619715ad6d7c2b4b Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 23 Jan 2025 11:55:39 -0500 Subject: [PATCH 081/270] DASB: Fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 1880b3049..1ac700742 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -752,11 +752,10 @@ def apply_overfit_test(hparams, dataset): result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) - dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) - dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) result = { "train": dataset_train, "valid": dataset_eval, From 5dba59d717c7495cdb1c102b1ae8af44c28366f3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 23 Jan 2025 21:42:34 -0500 Subject: [PATCH 082/270] DASB: Tokotron: Update priors --- .../TTS/tokotron/hparams/train_dac.yaml | 8 +- .../tokotron/hparams/train_discrete_ssl.yaml | 6 +- .../TTS/tokotron/hparams/train_encodec.yaml | 8 +- .../TTS/tokotron/hparams/train_mimi.yaml | 8 +- .../hparams/train_speech_tokenizer.yaml | 8 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 8 +- .../tokotron/hparams/train_wavtokenizer.yaml | 8 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- 
.../TTS/tokotron/hparams/train_dac.yaml | 4 +- .../tokotron/hparams/train_discrete_ssl.yaml | 4 +- .../TTS/tokotron/hparams/train_encodec.yaml | 4 +- .../hparams/train_speech_tokenizer.yaml | 4 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 124 ++++++++---------- .../TTS/valle/hparams/train_encodec.yaml | 8 +- 14 files changed, 93 insertions(+), 111 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 946c6d8c1..3cdaf3c84 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -73,7 +73,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -149,8 +149,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 827da9a25..40890f6a2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -71,7 +71,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random @@ -187,8 +187,8 @@ token_model_kwargs: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index a34a5b2eb..ccd736e9b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -51,7 +51,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -67,7 +67,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)"
lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -138,8 +138,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 3b7d1d5e8..0c9ae43f8 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -54,7 +54,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 5.0 @@ -72,7 +72,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -143,8 +143,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ff51f8e32..f6f2d756a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -54,7 +54,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -71,7 +71,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -142,8 +142,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 0143ef65b..014e0d707 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -56,7 +56,7 @@ 
ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -73,7 +73,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -144,8 +144,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index a2b90e83a..e02457ae8 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -73,7 +73,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -144,8 +144,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index f0127973c..3dc005074 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -70,7 +70,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 5670aa208..46076fe1f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -177,8 +177,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: 
--enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 88bc91aef..83f4fd6e7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -256,8 +256,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" layerwise_renorm: True d_ffn: 2048 transformer_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index be49b69f6..b5696d7a6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -183,8 +183,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index aac74070a..0ba67441b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -179,8 +179,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index b5eb30f62..6a0d31fe8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/discrete_ssl +experiment_name: valle/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] @@ -70,7 +70,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random @@ -82,28 +82,23 @@ overfit_test_epoch_data_count: 1000 # index pad_index: 0 -bos_index: 
0 -bos_width: 1 -eos_index: 0 -eos_width: 1 -audio_token_shift: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 # stages related parameters lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. # Feature parameters sample_rate: 22050 model_sample_rate: 16000 -max_audio_length: 1000 +max_audio_length: 2000 +text_max_length: 500 +n_ctx: !ref + infer_max_audio_length: !ref debug_infer_max_audio_length: 10 @@ -117,14 +112,6 @@ token_list_file: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref -# Gate offset -gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp - beta: !ref - gamma: !ref - max_weight: !ref -silence_padding: !ref -use_silence_padding: True - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -179,14 +166,13 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 512 -nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" -d_ffn: 2048 -transformer_dropout: 0.2 -target_dropout: 0.2 -activation: !name:torch.nn.GELU +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 vocab_size: 1000 audio_emb_size: 1024 audio_emb_freeze: False @@ -200,35 +186,41 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + audio_tokens_per_step: 6 -attention_type: regularMHA ############################## models ################################ -model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref - audio_num_tokens: !ref - audio_tokens_per_step: !ref - d_model: !ref - d_ffn: !ref - nhead: !ref - enc_num_layers: !ref - dec_num_layers: !ref - dropout: !ref - target_dropout: !ref - activation: !ref - attention_type: !ref - gate_threshold: !ref - gate_offset: !ref - audio_emb_size: !ref - audio_emb_freeze: !ref - max_audio_length: !ref - eos_mode: !ref - infer_max_audio_length: !ref - audio_token_shift: !ref - decoder_mode: !ref - scale_factor: !ref - representation_mode: discrete +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer save_path: !ref @@ -240,25 +232,15 @@ tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer modules: model: !ref tokenizer: !ref - compute_cost: !ref opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:model.Tokotron.TokotronLoss - guided_attention_weight: !ref - guided_attention_sigma: !ref - gate_weight: !ref - gate_beta: !ref - gate_gamma: !ref - 
gate_max_weight: !ref - silence_padding: !ref - eos_mode: !ref - eos_index: !ref - eos_width: !ref - audio_tokens_per_step: !ref - audio_token_shift: !ref - representation_mode: discrete + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True lr_annealing: !new:model.Tokotron.TargetedNoamScheduler diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 39b28b437..4db913d00 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -51,7 +51,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -67,7 +67,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -138,8 +138,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 From f7116a8cd26eeeb0fe414113c5a87b95a14fcad6 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 23 Jan 2025 22:13:17 -0500 Subject: [PATCH 083/270] DASB: Fixes --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml | 2 +- .../LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 4 ++-- .../DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml | 2 +- .../LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml | 2 +- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml | 6 +++--- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 6 +++--- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 6 +++--- .../TTS/tokotron/hparams/train_speech_tokenizer.yaml | 6 +++--- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 +- 14 files changed, 23 insertions(+), 23 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 3cdaf3c84..1ae232aca 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -150,7 +150,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 40890f6a2..83c2017fc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -90,7 +90,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -188,7 +188,7 @@ token_model_kwargs: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index ccd736e9b..3c7284821 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -139,7 +139,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 0c9ae43f8..eac124447 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -144,7 +144,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index f6f2d756a..6b8888153 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -143,7 +143,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 014e0d707..e7af427ad 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -145,7 +145,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 
target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index e02457ae8..07e63e45b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -145,7 +145,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 3dc005074..c6ec91445 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -89,7 +89,7 @@ special_tokens: ["", "", ""] special_num_tokens: 4 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 46076fe1f..805384b8d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -55,7 +55,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -76,7 +76,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -178,7 +178,7 @@ extract_features_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 83f4fd6e7..11b7e5af6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -93,7 +93,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 batch_size_guided: 2 extract_features_batch_size: 32 grad_accumulation_factor: 1 @@ -115,7 +115,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -257,7 +257,7 @@ extract_features_opts: d_model: 512 nhead: 4 
enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" layerwise_renorm: True d_ffn: 2048 transformer_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index b5696d7a6..9a5838923 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -66,7 +66,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -87,7 +87,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -184,7 +184,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 0ba67441b..703878092 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -58,7 +58,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -79,7 +79,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -180,7 +180,7 @@ extract_features_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 6a0d31fe8..31ed1cf23 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -89,7 +89,7 @@ special_tokens: ["", "", ""] special_num_tokens: 4 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 
4db913d00..e541f4ae0 100644
--- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml
+++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml
@@ -139,7 +139,7 @@ sample_dataloader_opts:
 d_model: 512
 nhead: 4
 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])"
-dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])"
+dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])"
 d_ffn: 2048
 transformer_dropout: 0.2
 target_dropout: 0.2

From 199a37ca608dac0064272033029453e4192cd439 Mon Sep 17 00:00:00 2001
From: flexthink
Date: Mon, 27 Jan 2025 00:02:58 -0500
Subject: [PATCH 084/270] DASB: Tokotron: Fixes

---
 benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
index 1ac700742..881d973c4 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
@@ -546,17 +546,11 @@ def audio_ref_pipeline(wav):
     use_silence_padding = hparams.get("use_silence_padding", True)
     if representation_mode == RepresentationMode.DISCRETE:
-        layers_key = "token_model_layers"
         model_key = "tokenizer"
     else:
-        layers_key = "ssl_model_layers"
         model_key = "ssl_model"
-    audio_tokens_per_step = (
-        len(hparams[layers_key])
-        if layers_key in hparams
-        else hparams["audio_tokens_per_step"]
-    )
+    audio_tokens_per_step = hparams["audio_tokens_per_step"]
     if (
         use_silence_padding
         and representation_mode == RepresentationMode.DISCRETE

From dd7f3d3cff19cbfef7fabaeaaba55c980caa9e7b Mon Sep 17 00:00:00 2001
From: flexthink
Date: Mon, 27 Jan 2025 11:44:57 -0500
Subject: [PATCH 085/270] DASB: Tokotron: Fix layer selection for Discrete SSL

---
 .../tokotron/hparams/train_discrete_ssl.yaml |  6 ++--
 .../DASB/LJSpeech/TTS/tokotron/train.py      | 32 +++++++++++++++++--
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml
index 83c2017fc..a1be07c07 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml
@@ -59,8 +59,8 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice
         wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS
         wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS

-ssl_model_layers: [1, 3, 7, 12, 18, 23]
-token_model_layers: !ref
+available_speech_model_layers: [1, 3, 7, 12, 18, 23]
+speech_model_layers: !ref
 token_offset: 1
 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec
 use_spk_emb: False
@@ -181,7 +181,7 @@ sample_dataloader_opts:
     padding_kwargs:
         value: !ref
 token_model_kwargs:
-    SSL_layers: !ref
+    SSL_layers: !ref

 ####################### Model parameters ###########################
 # Transformer
diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
index 881d973c4..05d8805c6 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
@@ -275,6 +275,7 @@ def on_stage_start(self, stage, epoch):
             self.is_evaluating = True
         self.audio_token_offsets = self.get_token_offsets()
+        self.token_model_kwargs = getattr(self.hparams, "token_model_kwargs", {})

     def on_stage_end(self, stage, stage_loss, epoch):
         """Gets called at the end of an epoch.
@@ -447,7 +448,7 @@ def create_waveform(self, audio, length):
         with torch.no_grad():
             if self.audio_token_offsets is not None:
                 audio = clean_padding(audio + self.audio_token_offsets, length)
-            wav = self.modules.tokenizer.tokens_to_sig(audio)
+            wav = self.modules.tokenizer.tokens_to_sig(audio, **self.token_model_kwargs)
             wav = clean_padding(wav, length)
             wav = wav.to(self.device)
             return wav
@@ -574,12 +575,20 @@ def audio_ref_pipeline(wav):
     )

     tokens_loader = hparams.get("tokens_loader")
+    if "speech_model_layers" in hparams:
+        tokens_loader_kwargs = {
+            "num_codebooks": get_selected_layer_indexes(hparams)
+        }
+    else:
+        tokens_loader_kwargs = {
+            "num_codebooks": audio_tokens_per_step
+        }

     @sb.utils.data_pipeline.takes("uttid")
     @sb.utils.data_pipeline.provides("audio_pad", "audio_bos")
     def audio_pipeline(id):
         audio = tokens_loader.tokens_by_uttid(
-            id, num_codebooks=audio_tokens_per_step
+            id, **tokens_loader_kwargs
         )
         audio_pad = feature_pad_to(
             audio, len(audio) + silence_padding_len, silence_padding
@@ -678,6 +687,25 @@ def init_sequence_encoder(hparams):
     return encoder


+def get_selected_layer_indexes(hparams):
+    """Finds the layers of selected layers
+
+    Arguments
+    ---------
+    hparams : dict
+        Hyperparameters
+    """
+    selected_layers = hparams.get("speech_model_layers")
+    available_layers = hparams.get("available_speech_model_layers")
+    if not (selected_layers and available_layers):
+        return None
+    layer_idx = [
+        available_layers.index(layer)
+        for layer in selected_layers
+    ]
+    return layer_idx
+
+
 def read_token_list(file_name):
     """Reads a simple text file with tokens (e.g. characters or phonemes)
     listed one per line

From 46c8ba4434693ad89f71655a1cea04f6460a459a Mon Sep 17 00:00:00 2001
From: flexthink
Date: Mon, 27 Jan 2025 22:31:03 -0500
Subject: [PATCH 086/270] DASB: VALL-E: Add LibriTTS

---
 .../LibriTTS/TTS/tokotron/hparams/eval.yaml   |  23 +-
 .../DASB/LibriTTS/TTS/valle/evaluation.py     | 357 ++++++++++++++++++
 .../LibriTTS/TTS/valle/hparams/arpabet.txt    |  50 +++
 .../LibriTTS/TTS/valle/hparams/char_en.txt    |  38 ++
 .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml |  57 +++
 .../TTS/valle/hparams/train_discrete_ssl.yaml |  19 +-
 .../TTS/valle/hparams/train_encodec.yaml      | 136 +++----
 benchmarks/DASB/LibriTTS/TTS/valle/train.py   |  10 +-
 benchmarks/DASB/model/valle.py                |  13 +-
 9 files changed, 609 insertions(+), 94 deletions(-)
 create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py
 create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt
 create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt

diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml
index bafd769cc..9e9d91dc3 100644
--- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml
+++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml
@@ -7,11 +7,7 @@ eval_interval: 1
 eval_subset: null
 eval_asr_beam_size: 66
 eval_asr_type: encoder_decoder
-eval_asr_source: !apply:speechbrain.utils.hparams.choice
-    value: !ref
-    choices:
-        encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech
-        whisper: openai/whisper-small
+eval_asr_source: openai/whisper-small
 eval_spk_sim_source: microsoft/wavlm-base-sv
 evaluations: utmos,asr,spk_sim
 tmp_folder: null
@@ -24,19 +20,10 @@ eval_utmos_judge_id: null
 eval_perf: False

-eval_asr: !apply:speechbrain.utils.hparams.choice
-    value: !ref
-    choices:
-        encoder_decoder: !name:eval.EncoderDecoderASRSpeechEvaluator
-            source: !ref
-            sample_rate: !ref
-            overrides:
-                lm_weight: 0.0
-
test_beam_size: !ref - whisper: !name:eval.WhisperASRSpeechEvaluator - source: !ref - sample_rate: !ref - savedir: !ref +eval_asr: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref eval_utmos: !name:eval.UTMOSSpeechEvaluator source: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py new file mode 100644 index 000000000..9fd6da808 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -0,0 +1,357 @@ +import json +import torch +import logging +import re +import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.evaluators + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + if not self.evaluators: + logger.warn( + "No evaluators were defined - this run will produce samples only" + ) + + def on_evaluation_start(self, output_folder="eval"): + """Invoked at the beginning of the evaluation cycle. + + Arguments + --------- + output_folder : str | path-like + The folder to which results will be output + + """ + logger.info("Starting evaluation") + output_folder = Path(output_folder) + self.output_folder = ( + output_folder + if output_folder.is_absolute() + else self.hparams.output_folder / output_folder + ) + self.output_folder.mkdir(parents=True, exist_ok=True) + + self.files = [] + details_keys = list(self.evaluators.keys()) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.read_reports() + self.create_reports() + self.item_ids = [] + + def on_evaluation_end(self): + """Invoked at the beginning of the evaluation cycle. 
The default + implementation is a no-op + """ + logger.info("Ending evaluation") + self.write_summary() + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + self.files.append(file_name) + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_tracker_file_name(self): + """Determines the file name of the tracker file""" + suffix = ( + f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else "" + ) + file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt" + return self.output_folder / file_name + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def append(self, ids, wav, length, text, wav_ref, length_ref): + """Appends the result of a single item + + Arguments + --------- + ids : str + Utterance IDs + wav : torch.Tensor + Synthesized waveforms + length : torch.Tensor + Relative lengths of the synthesized waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def 
write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : the standard deviation + _min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. 
+ + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? + \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index e69de29bb..129cf9337 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -0,0 +1,57 @@ +eval_dataset: valid +eval_suffix: "" +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: openai/whisper-small +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:utils.eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 31ed1cf23..51315b9eb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -20,16 +20,24 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. 
# Data files data_folder: !PLACEHOLDER -cached_data_folder: !PLACEHOLDER -prepare_save_folder: !ref +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref -vocoder_model_name: !ref unithifigan-dasb--discrete -vocoder_model_path: !ref / prepare_archive_path: null prepare_skip_ignore_folders: False +data_mode: lite train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress @@ -37,6 +45,7 @@ progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 + tokens_folder: !PLACEHOLDER tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref @@ -60,6 +69,7 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref +flip_layers: false token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False @@ -100,6 +110,7 @@ max_audio_length: 2000 text_max_length: 500 n_ctx: !ref + infer_max_audio_length: !ref +max_length_ratio: 10.0 debug_infer_max_audio_length: 10 # Label encoder diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index e541f4ae0..9d9c1f278 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -14,21 +14,29 @@ save_folder: !ref /save train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. -token_model_src: "facebook/encodec_24khz" -g2p_src: flexthink/soundchoice-g2p -# Model type -representation_mode: discrete # Data files data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech pretrained_model_save_folder: !ref +ssl_model_type: wavlm +representation_mode: discrete prepare_archive_path: null prepare_skip_ignore_folders: False +data_mode: lite train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress @@ -37,15 +45,13 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
- tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref - +flip_layers: True splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] - - ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -63,26 +69,26 @@ overfit_test_epoch_data_count: 1000 # index pad_index: 0 -bos_index: 0 -bos_width: 1 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 # stages related parameters lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. # Feature parameters sample_rate: 22050 -model_sample_rate: 24000 -max_audio_length: 1000 +model_sample_rate: 16000 +max_audio_length: 2000 +text_max_length: 500 +n_ctx: !ref + infer_max_audio_length: !ref +max_length_ratio: 10.0 debug_infer_max_audio_length: 10 # Label encoder @@ -95,14 +101,6 @@ token_list_file: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref -# Gate offset -gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp - beta: !ref - gamma: !ref - max_weight: !ref - -silence_padding: !ref - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -136,16 +134,14 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer -d_model: 512 -nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" -d_ffn: 2048 -transformer_dropout: 0.2 -target_dropout: 0.2 -activation: !name:torch.nn.GELU -audio_num_tokens: 1024 -audio_emb_size: 1024 +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False text_num_tokens: 39 @@ -155,31 +151,43 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref -audio_tokens_per_step: 2 + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 bandwidth: 1.5 attention_type: regularMHA ############################## models ################################ -model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref - audio_num_tokens: !ref - audio_tokens_per_step: !ref - d_model: !ref - d_ffn: !ref - nhead: !ref - enc_num_layers: !ref - dec_num_layers: !ref - dropout: !ref - target_dropout: !ref - activation: !ref - attention_type: !ref - gate_threshold: !ref - gate_offset: !ref - audio_emb_size: !ref - audio_emb_freeze: !ref - max_audio_length: !ref - infer_max_audio_length: !ref +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio @@ -198,14 +206,10 @@ modules: 
opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:model.Tokotron.TokotronLoss - guided_attention_weight: !ref - guided_attention_sigma: !ref - gate_weight: !ref - gate_beta: !ref - gate_gamma: !ref - gate_max_weight: !ref - silence_padding: !ref +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index ebcc78015..4045e89ca 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -78,6 +78,7 @@ def create_waveform(self, audio, length): audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) + wav = wav.to(self.device) return wav def compute_forward(self, batch, stage): @@ -401,7 +402,7 @@ def _get_inference_opts(self): track_end = track_start + self.hparams.vocab_size mask = ( ((idx >= track_start) & (idx < track_end)) - | (idx == self.hparams.bos_index) + | (idx == self.hparams.eos_index) ).logical_not() return self.hparams.inference_opts( masks={ @@ -698,14 +699,17 @@ def apply_overfit_test(hparams, dataset): """ if hparams["overfit_test"]: if isinstance(dataset, tuple): - dataset_train, _, _ = dataset + dataset_train, dataset_valid, _ = dataset dataset_train = apply_overfit_test(hparams, dataset_train) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys(list(dataset_valid.pipeline.output_mapping.keys())) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) @@ -797,10 +801,8 @@ def undo_padding_tensor(batch, lengths): overrides=overrides, ) - from ljspeech_prepare import prepare_ljspeech # Data preparation, to be run on only one process. - if not hparams["skip_prep"]: from libritts_prepare import prepare_libritts # Data preparation, to be run on only one process. 
diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 245ac0fd9..b85e68345 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -262,6 +262,10 @@ def inference( modality_index = prev_tok.flatten() mask = modality_index_to_mask(modality_index, opts) mask_cache = [] + modality_tokens = torch.tensor( + list(opts.masks.keys()), + device=prefix.device + ) for step in range(maxlen): # (3.2) AR loop @@ -288,9 +292,14 @@ def inference( # (3.3) detect modality swtich mask_cache.append(mask.clone()) - modality_change_mask = torch.logical_and( - prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, + modality_change_mask = torch.isin( + prev_tok[:, 0], + modality_tokens ) + # Note: The ESPNET VALL-E had + # modality_change_mask = torch.logical_and( + # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, + #) if torch.any(modality_change_mask): modality_index = torch.where( modality_change_mask, prev_tok[:, 0], modality_index, From ba6bddb896dd40b45fe954c8d8067c831cfe0cc3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 28 Jan 2025 00:21:24 -0500 Subject: [PATCH 087/270] DASB: VALL-E: Fixes/Updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 3 + .../TTS/valle/hparams/train_encodec.yaml | 8 +- .../LibriTTS/TTS/valle/libritts_prepare.py | 1 + benchmarks/DASB/LibriTTS/TTS/valle/train.py | 130 +++++++++++++++++- 4 files changed, 136 insertions(+), 6 deletions(-) create mode 120000 benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 51315b9eb..36002334a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -271,3 +271,6 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 9d9c1f278..8ef4e455f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -50,7 +50,7 @@ g2p_src: flexthink/soundchoice-g2p tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref -flip_layers: True +flip_layers: False splits: ["train", "valid", "test"] ckpt_interval_minutes: 30 # save checkpoint every N min @@ -86,7 +86,8 @@ sample_rate: 22050 model_sample_rate: 16000 max_audio_length: 2000 text_max_length: 500 -n_ctx: !ref + +spk_prompt_length: 150 +n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 debug_infer_max_audio_length: 10 @@ -227,3 +228,6 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 4045e89ca..14259634c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -22,8 +22,10 @@ from hyperpyyaml import load_hyperpyyaml from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio from speechbrain.dataio.dataio import write_audio +from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right +from functools import partial import re import string @@ -489,6 +491,7 @@ def dataio_prepare(hparams): offsets = offsets.flip(-1) tokens_loader = hparams.get("tokens_loader") + spk_prompt_length = hparams["spk_prompt_length"] @sb.utils.data_pipeline.takes("label") @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") @@ -505,13 +508,29 @@ def tokens_pipeline(label): """Processes the transcriptions to generate proper labels""" return label_encoder.encode_sequence_torch(label) - @sb.utils.data_pipeline.takes("uttid", "tokens") + def spk_prompt(uttid, spk_sample): + # Sample a speaker-matched embedding + selected_uttid = spk_sample[uttid] + audio = tokens_loader.tokens_by_uttid( + selected_uttid, num_codebooks=hparams["audio_tokens_per_step"] + ) + if audio.size(0) > spk_prompt_length: + offset = torch.randint(0, audio.size(0), (1,)).item() + else: + offset = 0 + # Retrieve the embedding value from the dataset + audio_spk_prompt, _ = pad_right_to( + audio[offset:offset + spk_prompt_length], + (spk_prompt_length, audio.size(1)) + ) + return audio_spk_prompt + + @sb.utils.data_pipeline.takes("uttid", "tokens", "spk_prompt") @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") - def prompt_pipeline(id, tokens): + def prompt_pipeline(id, tokens, spk_prompt): audio = tokens_loader.tokens_by_uttid( id, num_codebooks=hparams["audio_tokens_per_step"] ) - if hparams["flip_layers"]: audio = audio.flip(-1) yield audio @@ -521,6 +540,8 @@ def prompt_pipeline(id, tokens): torch.ones(1, num_tracks) * hparams["bos_index"], tokens.unsqueeze(-1).expand(len(tokens), num_tracks), torch.ones(1, num_tracks) * hparams["eot_index"], + spk_prompt + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eop_index"], ] ) yield prefix @@ -542,7 +563,7 @@ def sig_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) return sig - dynamic_items = [text_pipeline, tokens_pipeline, 
prompt_pipeline] + dynamic_items = [text_pipeline, tokens_pipeline] init_sequence_encoder(hparams) use_spk_emb = hparams.get("use_spk_emb", False) @@ -560,6 +581,7 @@ def sig_pipeline(wav): prepared_features.append("spk_emb") output_keys.append("spk_emb") + resample_fn = {} for dataset in data_info: dataset_dynamic_items = list(dynamic_items) dataset_output_keys = list(output_keys) @@ -572,6 +594,27 @@ def sig_pipeline(wav): dynamic_items=dataset_dynamic_items, output_keys=dataset_output_keys, ) + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) + spk_sample = {} + spk_prompt_pipeline = partial( + spk_prompt, + spk_sample=spk_sample, + ) + dynamic_dataset.add_dynamic_item( + func=spk_prompt_pipeline, + takes=["uttid"], + provides=["spk_prompt"], + ) + dynamic_dataset.add_dynamic_item(prompt_pipeline) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers, + ) + resample_fn[dataset](epoch=0) + datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False @@ -597,6 +640,7 @@ def sig_pipeline(wav): raise NotImplementedError( "sorting must be random, ascending or descending" ) + return datasets @@ -613,6 +657,84 @@ def get_offsets(vocab_size, tracks): return torch.arange(tracks) * vocab_size +def group_by_speaker(dataset, hparams): + """Groups utterance IDs in a dataset by speaker, for selection. The selection + is stable based on the seed - calling this method multiple times will always + result in the same order + + Arguments + --------- + dataset : torch.Tensor + the dataset from which to select items + hparams : dict + hyperparameters + + Returns + ------- + spk_idx : dict + a str -> str with a list of utterance IDs + for every speaker + spk_samplers : dict + a reproducible sampler for every speaker + spk_samplers_it : dict + an iterator for each sampler + """ + spk_uttid = {} + spk_samplers = {} + speakers = [] + generator = torch.Generator() + generator.manual_seed(hparams["seed"]) + + # Group by speaker + with dataset.output_keys_as(["spk_id", "uttid"]): + for idx, item in enumerate(dataset): + spk_id = item["spk_id"] + if spk_id not in spk_uttid: + spk_uttid[spk_id] = [] + spk_uttid[spk_id].append(item["uttid"]) + speakers.append(spk_id) + + # Create a reproducible sampler + for spk_id in speakers: + sampler = hparams["spk_sampler"](data_source=spk_uttid[spk_id]) + spk_samplers[spk_id] = sampler + + return spk_uttid, spk_samplers + + +def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch): + """Selects new samples + + Arguments + --------- + spk_idx : dict + Data item indexes grouped by speaker + spk_samplers : dict + A sampler for each speaker + spk_samplers_it : dict + An iterator for each speaker + epoch : int + The epoch number + + Returns + ------- + sample : dict + a dictionary with uttids as keys and matching + indexes as values + """ + if epoch is None: + epoch = 0 + spk_samplers_it = {} + for spk_id, sampler in spk_samplers.items(): + sampler.set_epoch(epoch) + spk_samplers_it[spk_id] = iter(sampler) + with dataset.output_keys_as(["uttid", "spk_id"]): + for item in dataset: + spk_item_idx = next(spk_samplers_it[item["spk_id"]]) + dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx] + sample[item["uttid"]] = dataset_item_idx + + def init_sequence_encoder(hparams): """Initialize a sequence encoder From d8a720c966c1ebbf0f1ffe9e1f680e00fd0f5b9b Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 28 Jan 2025 11:07:20 -0500 Subject: 
[PATCH 088/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 14259634c..43bcb1745 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -830,11 +830,11 @@ def apply_overfit_test(hparams, dataset): result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) - dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) - dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + result = { "train": dataset_train, "valid": dataset_eval, From 11c427bab53b27aa9f596cdd0d0756b77ab49f65 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 28 Jan 2025 15:30:36 -0500 Subject: [PATCH 089/270] DASB: VALL-E: Fixes --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 11 +++++++---- .../LibriTTS/TTS/valle/hparams/train_encodec.yaml | 13 ++++++++----- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 14 ++++++-------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 36002334a..77e1e56a7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -81,6 +81,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 50 batch_size: 16 +valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random @@ -102,10 +103,11 @@ special_num_tokens: 4 lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step +betas: [0.9, 0.95] # Feature parameters -sample_rate: 22050 -model_sample_rate: 16000 +sample_rate: 24000 +model_sample_rate: 24000 max_audio_length: 2000 text_max_length: 500 n_ctx: !ref + @@ -155,7 +157,7 @@ train_dataloader_opts: padding_kwargs: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: @@ -244,8 +246,9 @@ modules: model: !ref tokenizer: !ref -opt_class: !name:torch.optim.Adam +opt_class: !name:torch.optim.AdamW lr: !ref + betas: !ref compute_cost: !name:model.valle.masked_nll_loss diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 8ef4e455f..74b31ecad 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -57,7 +57,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 +valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -80,10 +81,11 @@ special_num_tokens: 5 lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step +betas: [0.9, 0.95] # Feature parameters -sample_rate: 22050 -model_sample_rate: 16000 +sample_rate: 24000 +model_sample_rate: 24000 max_audio_length: 2000 text_max_length: 500 spk_prompt_length: 150 @@ -112,7 
+114,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: @@ -204,8 +206,9 @@ modules: tokenizer: !ref # define two optimizers here for two-stage training -opt_class: !name:torch.optim.Adam +opt_class: !name:torch.optim.AdamW lr: !ref + betas: !ref compute_cost: !name:model.valle.masked_nll_loss diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 43bcb1745..8c8b1ada7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -77,7 +77,6 @@ def create_waveform(self, audio, length): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) wav = wav.to(self.device) @@ -384,10 +383,11 @@ def inference(self, batch): for prefix_item in prefix_items ] inferred_tokens = [ - result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step) + result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step, device=self.device) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) + audio_length = audio_length.to(self.device) audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) return audio, audio_length @@ -418,7 +418,7 @@ def save_samples(self, batch, wav, length, stage): samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): file_name = output_folder / f"pred_{uttid}.wav" - write_audio(file_name, sample, self.hparams.model_sample_rate) + write_audio(file_name, sample.cpu(), self.hparams.model_sample_rate) def save_eval(self, stage): """Saves evaluation results @@ -563,7 +563,7 @@ def sig_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) return sig - dynamic_items = [text_pipeline, tokens_pipeline] + dynamic_items = [text_pipeline, tokens_pipeline, sig_pipeline] init_sequence_encoder(hparams) use_spk_emb = hparams.get("use_spk_emb", False) @@ -586,7 +586,6 @@ def sig_pipeline(wav): dataset_dynamic_items = list(dynamic_items) dataset_output_keys = list(output_keys) if dataset != "train": - dataset_dynamic_items.append(sig_pipeline) dataset_output_keys += ["sig", "label_norm_eval", "prefix"] dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( json_path=data_info[dataset], @@ -633,9 +632,8 @@ def sig_pipeline(wav): hparams["train_dataloader_opts"]["shuffle"] = False elif hparams["sorting"] == "random": - hparams["train_dataloader_opts"]["shuffle"] = True - pass - + if not hparams["overfit_test"]: + hparams["train_dataloader_opts"]["shuffle"] = True else: raise NotImplementedError( "sorting must be random, ascending or descending" From 2d1a46a0384f89559dd25a4c227c3859f190415f Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 29 Jan 2025 13:36:33 -0500 Subject: [PATCH 090/270] DASB: Fix ST extraction --- .../DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml | 2 +- .../DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml index f91d34908..155960c27 
100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -40,7 +40,7 @@ freeze_embedding: False save_embedding: False -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 9a53ed27b..85148db9d 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -41,7 +41,7 @@ freeze_embedding: False save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref From e53d7c69e8f9f099d7006ae5edf5f977132cb942 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 30 Jan 2025 01:30:21 -0500 Subject: [PATCH 091/270] DASB: Add support for using Orion Trial IDs instead of randomness --- benchmarks/DASB/run_experiments.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) mode change 100644 => 100755 benchmarks/DASB/run_experiments.sh diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh old mode 100644 new mode 100755 index e0f848aef..aacbc381e --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -149,8 +149,13 @@ seed="${seed:-$RANDOM}" if [ "$rnd_dir" = True ]; then - rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) - output_folder="$output_folder/$rnd_dirname" + if [[ ! -z "$ORION_TRIAL_ID" ]]; then + # Use the Orion Trial ID to ensure interrupted trials are resumed + output_folder="$output_folder/$ORION_TRIAL_ID" + else + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" + fi fi # Make sure the output_folder is created @@ -201,4 +206,4 @@ done echo 'Final Results (Performance Aggregation)' -python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt \ No newline at end of file +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt From 602f41f0c2f1a76a5fbdeecabb579f77c58967ed Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Thu, 30 Jan 2025 10:35:43 -0500 Subject: [PATCH 092/270] Update run_experiments.sh fix bug for orion resuming --- benchmarks/DASB/run_experiments.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh index e0f848aef..aacbc381e 100644 --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -149,8 +149,13 @@ seed="${seed:-$RANDOM}" if [ "$rnd_dir" = True ]; then - rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) - output_folder="$output_folder/$rnd_dirname" + if [[ ! 
-z "$ORION_TRIAL_ID" ]]; then + # Use the Orion Trial ID to ensure interrupted trials are resumed + output_folder="$output_folder/$ORION_TRIAL_ID" + else + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" + fi fi # Make sure the output_folder is created @@ -201,4 +206,4 @@ done echo 'Final Results (Performance Aggregation)' -python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt \ No newline at end of file +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt From 4f5153e3412229dbdc0b925a4d2758698df13771 Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Thu, 30 Jan 2025 12:13:41 -0500 Subject: [PATCH 093/270] Update run_hparam_optimization.sh fix final run resuming --- benchmarks/DASB/run_hparam_optimization.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 2ad1dddf3..bf7c9b1fa 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -415,6 +415,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing True $additional_flags + --rnd_dir False --testing True $additional_flags -echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file +echo "The test performance with best hparams is available at $output_folder/best" From 1d0aec0f8f3ecb5561b36b5deda6625b1546f19c Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 30 Jan 2025 12:17:04 -0500 Subject: [PATCH 094/270] DASB: Disable random directory name generation for the final test phase --- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 benchmarks/DASB/run_hparam_optimization.sh diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh old mode 100644 new mode 100755 index 2ad1dddf3..468015d08 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -415,6 +415,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing True $additional_flags + --rnd_dir False --testing True $additional_flags echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From e0bb2655956878e84792cf029377e9f78f300914 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 30 Jan 2025 23:05:54 -0500 Subject: [PATCH 095/270] DASB: Fixed the codebook count --- benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml index 22e15ef75..482f3739f 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml +++ 
b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml @@ -36,7 +36,7 @@ dataloader_opts: # Tokenizer parameters model_hub: kyutai/mimi vocab_size: 1024 -num_codebooks: 23 +num_codebooks: 32 sample_rate: 24000 # Feature parameters encoder_dim: 1024 From 5f5105f1d76d9d96e566f2eb4c8f7790ab2b9386 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 30 Jan 2025 23:19:04 -0500 Subject: [PATCH 096/270] DASB: Extraction fixes/updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 7 +-- .../extraction/hparams/wavtokenizer.yaml | 2 +- .../LibriTTS/extraction/hparams/mimi.yaml | 57 ++++++++++++++++++ .../LibriTTS/extraction/hparams/sqcodec.yaml | 57 ++++++++++++++++++ .../extraction/hparams/wavtokenizer.yaml | 59 +++++++++++++++++++ 5 files changed, 177 insertions(+), 5 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index c6ec91445..8f1c22767 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -57,14 +57,13 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS -ssl_model_layers: [1, 3, 7, 12, 18, 23] +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref flip_layers: True -token_model_layers: !ref token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -175,7 +174,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref token_model_kwargs: - SSL_layers: !ref + SSL_layers: !ref ####################### Model parameters ########################### # Transformer diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml index 5fe91bbce..3a0a935ff 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -32,7 +32,7 @@ dataloader_opts: shuffle: True num_workers: !ref -# EnCodec parameters +# WavTokenizer parameters model_hub: novateur/WavTokenizer-medium-music-audio-75token config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml index e69de29bb..9e64347c7 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., 
/path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..cf46b3f5a --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# SQCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks: 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec + + +# SQCodec model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..c7581bbe7 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech 
+train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# WavTokenizer parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks: 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref From d02e8702b936ca40c68ba1784b9536e38eafb90c Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 00:11:38 -0500 Subject: [PATCH 097/270] DASB: Clean-up --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 77e1e56a7..688b7ace3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -38,14 +38,8 @@ train_split: !apply:speechbrain.utils.hparams.choice full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] -frozen_split_path: null -sample_path: null progress_folder: !ref /progress progress_current: !ref /current -progress_meta: !ref /meta.yaml -num_audio_samples: 32 -samples_interval: 5 - tokens_folder: !PLACEHOLDER tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref @@ -66,15 +60,12 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice hubert: speechbrain/hifigan-hubert-k1000-LibriTTS wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS - -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_model_layers: !ref +speech_model_layers: [1, 3, 7, 12, 18, 23] flip_layers: false token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -175,7 +166,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref token_model_kwargs: - SSL_layers: !ref + SSL_layers: !ref ####################### Model parameters ########################### # Transformer From 0f2561d55491bcf02b38a415480f1cf1eab3367e Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 01:05:40 -0500 Subject: [PATCH 098/270] DASB: Tokotron: Config updates --- benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml | 4 ++-- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml index 0117d9afe..378315bcf 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml +++ 
b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -32,7 +32,7 @@ dataloader_opts: shuffle: True num_workers: !ref -# EnCodec parameters +# SQCodec parameters config: config.yaml checkpoint: ckpt_00190000.pth sample_rate: 16000 @@ -40,7 +40,7 @@ save_embedding: False num_codebooks: 4 save_path: /home/ubuntu/sq-codec/SQ-Codec -# wavtokenizer model +# SQCodec model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref checkpoint: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 11b7e5af6..ce3347e54 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -14,8 +14,9 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER data_folder_alignments: null # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared +prepare_save_folder: !ref pretrained_model_save_folder: !ref ssl_model_type: wavlm representation_mode: discrete From c9578e85e9ab88103fd425f07b6ce2dc8d15ef72 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 11:31:18 -0500 Subject: [PATCH 099/270] DASB: Cosmetic changes (pre-commit hooks) --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 22 +--- .../TTS/tokotron/hparams/train_dac.yaml | 2 +- .../tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../hparams/train_speech_tokenizer.yaml | 2 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 2 +- .../tokotron/hparams/train_wavtokenizer.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 29 ++--- .../DASB/LJSpeech/TTS/valle/hparams/eval.yaml | 22 +--- .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 4 +- .../TTS/tokotron/hparams/train_dac.yaml | 2 +- .../tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../hparams/train_speech_tokenizer.yaml | 2 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 4 +- .../TTS/valle/hparams/train_encodec.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 109 +++++++++--------- 19 files changed, 95 insertions(+), 121 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index f805e23f6..9cdf08aab 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -2,11 +2,7 @@ eval_sample_rate: 16000 eval_samples: null eval_interval: 1 eval_asr_type: whisper -eval_asr_source: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech - whisper: openai/whisper-small +eval_asr_source: openai/whisper-small evaluations: utmos,asr tmp_folder: null eval_utmos_source: chaanks/wav2vec2-small @@ -26,18 +22,10 @@ eval_utmos: !name:eval.UTMOSSpeechEvaluator domain_id: !ref judge_id: !ref -eval_asr: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: !name:eval.EncoderDecoderASRSpeechEvaluator - source: !ref - sample_rate: !ref - overrides: - lm_weight: 0.0 - whisper: !name:eval.WhisperASRSpeechEvaluator - source: !ref - sample_rate: !ref - savedir: !ref +eval_asr: 
!name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref evaluators: utmos: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 1ae232aca..f94d25d74 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index a1be07c07..1c0c765f7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -71,7 +71,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 3c7284821..3355ac511 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -51,7 +51,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index eac124447..e80edb2b0 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -54,7 +54,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 5.0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 6b8888153..fb839c897 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -54,7 +54,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index e7af427ad..3b667e2f8 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index 07e63e45b..81bcee2ca 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 05d8805c6..8e6e59197 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -275,7 +275,9 @@ def on_stage_start(self, stage, epoch): self.is_evaluating = True self.audio_token_offsets = self.get_token_offsets() - self.token_model_kwargs = getattr(self.hparams, "token_model_kwargs", {}) + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. @@ -448,7 +450,9 @@ def create_waveform(self, audio, length): with torch.no_grad(): if self.audio_token_offsets is not None: audio = clean_padding(audio + self.audio_token_offsets, length) - wav = self.modules.tokenizer.tokens_to_sig(audio, **self.token_model_kwargs) + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) wav = clean_padding(wav, length) wav = wav.to(self.device) return wav @@ -580,16 +584,12 @@ def audio_ref_pipeline(wav): "num_codebooks": get_selected_layer_indexes(hparams) } else: - tokens_loader_kwargs = { - "num_codebooks": audio_tokens_per_step - } + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") def audio_pipeline(id): - audio = tokens_loader.tokens_by_uttid( - id, **tokens_loader_kwargs - ) + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -699,10 +699,7 @@ def get_selected_layer_indexes(hparams): available_layers = hparams.get("available_speech_model_layers") if not (selected_layers and available_layers): return None - layer_idx = [ - available_layers.index(layer) - for layer in selected_layers - ] + layer_idx = [available_layers.index(layer) for layer in selected_layers] return layer_idx @@ -770,14 +767,18 @@ def apply_overfit_test(hparams, dataset): dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) - dataset_eval.set_output_keys(list(dataset_valid.pipeline.output_mapping.keys())) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) - dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) result = { "train": dataset_train, "valid": dataset_eval, diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml index b80347c82..08587ce23 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml @@ -2,11 +2,7 @@ eval_sample_rate: 16000 eval_samples: null eval_interval: 1 eval_asr_type: whisper -eval_asr_source: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech - whisper: openai/whisper-small +eval_asr_source: openai/whisper-small evaluations: utmos,asr tmp_folder: null eval_utmos_source: chaanks/wav2vec2-small @@ -26,18 +22,10 @@ eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator domain_id: !ref judge_id: !ref -eval_asr: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: !name:utils.eval.EncoderDecoderASRSpeechEvaluator - source: !ref - sample_rate: !ref - overrides: - lm_weight: 0.0 - whisper: !name:utils.eval.WhisperASRSpeechEvaluator - source: !ref - sample_rate: !ref - savedir: !ref +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref evaluators: utmos: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 8f1c22767..f4a003a0d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -69,7 +69,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index 9e9d91dc3..b39b11009 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -7,7 +7,7 @@ eval_interval: 1 eval_subset: null eval_asr_beam_size: 66 eval_asr_type: encoder_decoder -eval_asr_source: openai/whisper-small +eval_asr_source: openai/whisper-small eval_spk_sim_source: microsoft/wavlm-base-sv evaluations: utmos,asr,spk_sim tmp_folder: null @@ -54,4 +54,4 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean dwer: asr_dwer_median - spk_sim: spk_sim_score_mean \ No newline at end of file + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 805384b8d..1a55d1c02 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -55,7 +55,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index ce3347e54..0b128b7a9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -94,7 +94,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 batch_size_guided: 2 extract_features_batch_size: 32 grad_accumulation_factor: 1 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml 
b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 9a5838923..9e5c6826a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -66,7 +66,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 703878092..86ebee501 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -58,7 +58,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 688b7ace3..8ee2a0468 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -61,7 +61,7 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS speech_model_layers: [1, 3, 7, 12, 18, 23] -flip_layers: false +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False @@ -71,7 +71,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 +batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 74b31ecad..f1981bd88 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -213,7 +213,7 @@ opt_class: !name:torch.optim.AdamW compute_cost: !name:model.valle.masked_nll_loss log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True + apply_log: True lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 8c8b1ada7..4f11022f4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -20,8 +20,11 @@ import shutil from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio -from speechbrain.dataio.dataio import write_audio +from speechbrain.dataio.dataio import ( + clean_padding_, + length_to_mask, + write_audio, +) from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right @@ -32,7 +35,7 @@ base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) -from evaluation import SpeechEvaluationMetricStats +from evaluation import SpeechEvaluationMetricStats # noqa: E402 
logger = logging.getLogger(__name__) @@ -57,7 +60,7 @@ def __init__( self.evaluation_metric = SpeechEvaluationMetricStats( self.hparams, self.device ) - + def create_waveform(self, audio, length): """Creates a waveform from a discrete or continuous audio representation @@ -101,14 +104,13 @@ def compute_forward(self, batch, stage): prompt, prompt_length = batch.prompt batch_size, prompt_max_len, num_tracks = prompt.shape nar_track = torch.randint( - 1, num_tracks, (batch_size,), - device=self.device + 1, num_tracks, (batch_size,), device=self.device ) logits_ar, logits_nar = self.modules.model( dec_seq=batch.prompt.data, dec_seq_lengths=batch.prompt.lengths, prefix_len=batch.prefix_length / prompt_max_len, - nar_level_idx=nar_track + nar_level_idx=nar_track, ) return logits_ar, logits_nar, nar_track @@ -144,14 +146,16 @@ def compute_objectives(self, predictions, batch, stage): batch_idx = torch.arange(batch_size, device=prompt.device) targets_nar = prompt[batch_idx, 1:, nar_track] prompt_max_len = prompt.size(1) - length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len) - prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not() + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, - targets=targets_ar, - mask=mask + log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask ) self.loss_metric_ar.append( ids=batch.uttid, @@ -161,9 +165,7 @@ def compute_objectives(self, predictions, batch, stage): reduction="batch", ) loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, - targets=targets_nar, - mask=mask, + log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, ) self.loss_metric_nar.append( ids=batch.uttid, @@ -187,20 +189,17 @@ def on_stage_start(self, stage, epoch): `None` during the test stage. 
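For reference, the masking used in compute_objectives above works as follows: the loss mask multiplies an utterance-length mask by the inverse of a prompt-prefix mask, then drops the first position so it lines up with the shifted next-token targets. A toy reproduction, with invented shapes, looks like this:

    import torch
    from speechbrain.dataio.dataio import length_to_mask

    prompt_max_len = 10
    prompt_length = torch.tensor([1.0, 0.8])    # relative lengths, as in PaddedBatch
    prefix_length = torch.tensor([3, 4])        # absolute prompt-prefix lengths
    length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len)
    prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not()
    mask = (length_mask * prefix_mask)[:, 1:]   # [batch, prompt_max_len - 1]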
""" self.offsets = get_offsets( - self.hparams.vocab_size, - self.hparams.audio_tokens_per_step, + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.hparams.compute_cost, batch_eval=True, ) self.loss_metric_ar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, - batch_eval=True, + metric=self.hparams.compute_cost, batch_eval=True, ) self.loss_metric_nar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, - batch_eval=True, + metric=self.hparams.compute_cost, batch_eval=True, ) # TOOO: Reestablish evaluation @@ -290,10 +289,7 @@ def evaluate_batch(self, batch, stage): wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) self.save_samples( - batch=batch, - wav=wav, - length=audio_length, - stage=stage + batch=batch, wav=wav, length=audio_length, stage=stage ) self.evaluation_metric.append( ids=batch.uttid, @@ -377,13 +373,16 @@ def inference(self, batch): prefix_items = undo_padding_tensor(prefix.int(), prefix_length) inference_results = [ self.modules.model.inference( - prefix=prefix_item.unsqueeze(0), - opts=self._get_inference_opts() - ) + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) for prefix_item in prefix_items ] inferred_tokens = [ - result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step, device=self.device) + result[0][0] + if result[0] + else torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) @@ -392,8 +391,12 @@ def inference(self, batch): return audio, audio_length def _get_inference_opts(self): - idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[None, :] - tracks = torch.arange(self.hparams.audio_tokens_per_step, device=self.device)[:, None] + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] track_start = ( self.hparams.text_num_tokens + self.hparams.special_num_tokens @@ -407,10 +410,7 @@ def _get_inference_opts(self): | (idx == self.hparams.eos_index) ).logical_not() return self.hparams.inference_opts( - masks={ - self.hparams.bos_index: mask - }, - device=self.device, + masks={self.hparams.bos_index: mask}, device=self.device, ) def save_samples(self, batch, wav, length, stage): @@ -438,7 +438,7 @@ def _get_eval_output_folder(self, stage): Path(self.hparams.output_folder) / "eval" / stage.name.lower() ) if epoch is not None: - output_folder = output_folder / str(epoch) + output_folder = output_folder / str(epoch) output_folder.mkdir(exist_ok=True, parents=True) return output_folder @@ -484,12 +484,11 @@ def dataio_prepare(hparams): label_encoder = hparams["label_encoder"] input_feature = INPUT_FEATURE_MAP[hparams["input"]] offsets = get_offsets( - hparams["vocab_size"], - hparams["audio_tokens_per_step"] + hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) if hparams["flip_layers"]: offsets = offsets.flip(-1) - + tokens_loader = hparams.get("tokens_loader") spk_prompt_length = hparams["spk_prompt_length"] @@ -520,13 +519,15 @@ def spk_prompt(uttid, spk_sample): offset = 0 # Retrieve the embedding value from the dataset audio_spk_prompt, _ = pad_right_to( - audio[offset:offset + spk_prompt_length], - (spk_prompt_length, audio.size(1)) + audio[offset : offset + 
spk_prompt_length], + (spk_prompt_length, audio.size(1)), ) return audio_spk_prompt @sb.utils.data_pipeline.takes("uttid", "tokens", "spk_prompt") - @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) def prompt_pipeline(id, tokens, spk_prompt): audio = tokens_loader.tokens_by_uttid( id, num_codebooks=hparams["audio_tokens_per_step"] @@ -575,7 +576,7 @@ def sig_pipeline(wav): "audio", "prompt", "prefix_length", - "length" + "length", ] if use_spk_emb: prepared_features.append("spk_emb") @@ -595,14 +596,9 @@ def sig_pipeline(wav): ) spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) spk_sample = {} - spk_prompt_pipeline = partial( - spk_prompt, - spk_sample=spk_sample, - ) + spk_prompt_pipeline = partial(spk_prompt, spk_sample=spk_sample,) dynamic_dataset.add_dynamic_item( - func=spk_prompt_pipeline, - takes=["uttid"], - provides=["spk_prompt"], + func=spk_prompt_pipeline, takes=["uttid"], provides=["spk_prompt"], ) dynamic_dataset.add_dynamic_item(prompt_pipeline) resample_fn[dataset] = partial( @@ -612,8 +608,7 @@ def sig_pipeline(wav): dataset=dynamic_dataset, spk_samplers=spk_samplers, ) - resample_fn[dataset](epoch=0) - + resample_fn[dataset](epoch=0) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False @@ -824,14 +819,18 @@ def apply_overfit_test(hparams, dataset): dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) - dataset_eval.set_output_keys(list(dataset_valid.pipeline.output_mapping.keys())) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) - dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) result = { "train": dataset_train, @@ -921,7 +920,6 @@ def undo_padding_tensor(batch, lengths): overrides=overrides, ) - # Data preparation, to be run on only one process. from libritts_prepare import prepare_libritts @@ -951,7 +949,6 @@ def undo_padding_tensor(batch, lengths): }, ) - # We can now directly create the datasets for training, valid, and test datasets = dataio_prepare(hparams) From 7270d4ecc2b374c09c3bfc3a5c58f693893c4096 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 15:05:16 -0500 Subject: [PATCH 100/270] DASB: Add the ability to turn off evaluation for debugging purposes. 
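This change wraps the evaluation setup in on_stage_start behind a new eval_enabled hyperparameter, so the UTMOS/ASR evaluation loop can be skipped while debugging training; the default value is added to hparams/eval.yaml in the following commit. The gating reduces to roughly this simplified sketch (names as in the recipe):

    # Simplified view of the gating added below
    if self.hparams.eval_enabled:
        if stage == sb.Stage.VALID and self.is_eval_epoch(epoch):
            self.evaluator.on_evaluate_start(stage, epoch)
            self.is_evaluating = True
        elif stage == sb.Stage.TEST:
            self.evaluator.on_evaluate_start(stage, epoch)
            self.is_evaluating = True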
--- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 8e6e59197..506207f96 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -264,15 +264,16 @@ def on_stage_start(self, stage, epoch): self.use_spk_emb = getattr(self.hparams, "use_spk_emb", False) self.is_evaluating = False - if stage == sb.Stage.VALID: - if self.is_eval_epoch(epoch): + if self.hparams.eval_enabled: + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True - else: - logger.info("No evaluation on epoch %d", epoch) - elif stage == sb.Stage.TEST: - self.evaluator.on_evaluate_start(stage, epoch) - self.is_evaluating = True self.audio_token_offsets = self.get_token_offsets() self.token_model_kwargs = getattr( From 2b22169c285f7f79edf241408e6ad1fbde6a6904 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 16:41:22 -0500 Subject: [PATCH 101/270] DASB: Add the ability to turn off evaluation --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index 9cdf08aab..dcdc6d920 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -1,3 +1,11 @@ +# ############################################################################ +# Evaluation Hyperparameters +# Common to old models, appended to main hyperparameters +# +# Authors: Artem Ploujnikov +# ############################################################################ + +eval_enabled: True eval_sample_rate: 16000 eval_samples: null eval_interval: 1 From 6eaa206f97d6f859a64b08215eb2972d670b5d02 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 2 Feb 2025 23:25:58 -0500 Subject: [PATCH 102/270] DASB: Tokotron: SQCodec update to use ternary coding --- .../TTS/tokotron/hparams/train_sqcodec.yaml | 33 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 5 + benchmarks/DASB/model/Tokotron.py | 332 ++++++++++++++++-- 3 files changed, 330 insertions(+), 40 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 3b667e2f8..21dee91e3 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -140,6 +140,8 @@ sample_dataloader_opts: padding_kwargs: value: !ref +transform_audio: !name:model.Tokotron.tokens_to_ternary + ####################### Model parameters ########################### # Transformer d_model: 512 @@ -154,7 +156,7 @@ audio_num_tokens: 19683 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False -audio_token_offsets: True +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -162,7 +164,9 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref -audio_tokens_per_step: 1 +audio_tokens_per_step: 4 +ternary_num_digits: 9 
+ternary_num_positions: !ref * bandwidth: 1.5 attention_type: regularMHA @@ -187,6 +191,29 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul audio_emb_freeze: !ref max_audio_length: !ref infer_max_audio_length: !ref + audio_emb: !ref + out_proj: !ref + multihead_input: False + inference: !ref + +inference: !new:model.Tokotron.TokotronTransformerAutoregressiveInference + gate_offset: !ref + gate_threshold: !ref + tokens_per_step: !ref + bos_idx: !ref + audio_token_shift: 0 + max_steps: !ref + representation_mode: !ref + transform_audio: !name:model.Tokotron.tokens_to_ternary + feed_audio: !name:model.Tokotron.ternary_logits_to_tokens + +audio_emb: !new:model.Tokotron.TernaryInput + emb_size: !ref + num_positions: !ref + +out_proj: !new:model.Tokotron.TernaryPredictionHead + d_model: !ref + num_positions: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref @@ -209,6 +236,8 @@ compute_cost: !new:model.Tokotron.TokotronLoss gate_gamma: !ref gate_max_weight: !ref silence_padding: !ref + seq_cost: !name:model.Tokotron.ternary_loss + multihead_output: False lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 506207f96..d40ec20f0 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -88,6 +88,7 @@ def compute_forward(self, batch, stage): if self.use_spk_emb: emb = {"spk": batch.spk_emb.data.squeeze(1)} + audio = self.transform_audio(audio) predictions = self.modules.model( input_tokens=tokens, input_length=tokens_length, @@ -210,6 +211,8 @@ def compute_objectives(self, predictions, batch, stage): batch = batch.to(self.device) predictions, features = predictions _, _, audio_tgt, audio_tgt_length = features + + audio_tgt = self.transform_audio(audio_tgt) loss_details = self.hparams.compute_cost( predictions=predictions, audio=audio_tgt, @@ -280,6 +283,8 @@ def on_stage_start(self, stage, epoch): self.hparams, "token_model_kwargs", {} ) + self.transform_audio = getattr(self.hparams, "transform_audio", torch.nn.Identity()) + def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. 
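The ternary coding introduced in this commit relies on SQ-Codec's codebook size of 19683 being exactly 3**9: every token id can be written as 9 ternary digits, and with 4 codebooks per step this presumably yields the 36 positions configured above. A hedged sketch of the expansion (the recipe's own tokens_to_ternary and decimal_to_ternary_matrix helpers may order or shift the digits differently):

    import torch

    def to_ternary(tokens, num_digits=9):
        """Expand integer token ids into per-digit ternary values shifted to {-1, 0, 1}.

        tokens:  [batch, length, codebooks], values in [0, 3 ** num_digits)
        returns: [batch, length, codebooks * num_digits]
        """
        x = tokens.clone()
        digits = []
        for _ in range(num_digits):
            digits.append(x % 3 - 1)  # map {0, 1, 2} -> {-1, 0, 1}
            x = x // 3
        return torch.stack(digits, dim=-1).flatten(start_dim=-2)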
diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 010f3b26b..c795f049b 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -25,10 +25,11 @@ from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear -from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss +from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, bce_loss from speechbrain.dataio.dataio import length_to_mask from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler +from model.sq_codec import decimal_to_ternary_matrix from enum import Enum from collections import namedtuple @@ -157,8 +158,10 @@ def __init__( show_inference_progress=True, audio_token_shift=0, multihead_input=True, + multihead_output=True, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, + out_proj=None, ): super().__init__() self.num_tokens = num_tokens @@ -182,9 +185,11 @@ def __init__( if self.representation_mode == RepresentationMode.DISCRETE else audio_dim ) - self.out_proj = Linear( - input_size=d_model, n_neurons=self.out_dim * tokens_per_step, - ) + if out_proj is None: + out_proj = Linear( + input_size=d_model, n_neurons=self.out_dim * tokens_per_step, + ) + self.out_proj = out_proj self.gate = Linear(input_size=d_model, n_neurons=1) if audio_emb is None: if self.representation_mode == RepresentationMode.DISCRETE: @@ -222,6 +227,7 @@ def __init__( self.multihead_input = multihead_input self.d_model = d_model self.d_model_sqrt = math.sqrt(d_model) + self.multihead_output = multihead_output def decode( self, @@ -371,16 +377,17 @@ def forward( pos_embs_src, ) lin_out = self.out_proj(dec_out) - batch_size, audio_max_len, num_tokens = lin_out.shape - lin_out_heads = lin_out.reshape( - batch_size, - audio_max_len, - self.tokens_per_step, - num_tokens // self.tokens_per_step, - ) + if self.multihead_output: + batch_size, audio_max_len, num_tokens = lin_out.shape + lin_out = lin_out.reshape( + batch_size, + audio_max_len, + self.tokens_per_step, + num_tokens // self.tokens_per_step, + ) gate_out = self.gate(dec_out).squeeze(-1) return TokotronDecoderOutput( - lin_out_heads, + lin_out, gate_out, dec_self_attn, dec_attn, @@ -400,6 +407,68 @@ def init_audio_emb(self, emb): self.audio_emb.initialize(emb) +class TernaryPredictionHead(nn.Module): + """An alternative prediction head that predicts a fixed number of ternary digits + for each position (as used in SQ-Codec) + + Arguments + --------- + d_model : int + The model dimension + num_positions : int + the number of positions + """ + def __init__(self, d_model, num_positions): + super().__init__() + self.num_positions = num_positions + self.d_model = d_model + self.num_positions = num_positions + self.lin_p = Linear( + input_size=d_model, + n_neurons=num_positions * 2 + ) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The decoder output (Batch x Length x d_model) + + Returns + ------- + p : torch.Tensor + A tensor of shape (Batch x Length x num_positions x 2) where + p[:, :, :, 0] -> the probability of the ternary digit being at least 0 + p[:, :, :, 0] -> the probability of the ternary digit being at least 1 + """ + batch_size, max_len, _ = x.shape + p = self.sigmoid(self.lin_p(x)) + p = p.reshape(batch_size, max_len, self.num_positions, 2) + return p 
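# A plausible decoding for the head defined above (hedged): with
# p[..., 0] = P(digit >= 0) and p[..., 1] = P(digit >= 1), counting how many
# thresholds are exceeded recovers a digit in {-1, 0, 1}. The recipe's own
# ternary_logits_to_tokens may implement this differently.
#
#     p = head(dec_out)                    # [batch, length, num_positions, 2]
#     digits = (p >= 0.5).sum(dim=-1) - 1  # ternary digits in {-1, 0, 1}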
+ + +class TernaryInput(nn.Module): + def __init__(self, emb_size, num_positions): + super().__init__() + self.num_positions = num_positions + self.in_proj = Linear( + input_size=num_positions * 3, + n_neurons=emb_size, + ) + + def forward(self, x): + batch_size, max_len = x.shape[:2] + x_onehot = torch.nn.functional.one_hot( + (x + 1).long(), + 3 + ).reshape(batch_size, max_len, self.num_positions * 3) + in_proj = self.in_proj(x_onehot.float()) + return in_proj + + class TokotronTransformerAutoregressiveInference(nn.Module): """A greedy autoregressive inference implementation @@ -439,6 +508,8 @@ def __init__( representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, show_inference_progress=True, + transform_audio=None, + feed_audio=None ): super().__init__() self.decoder = None @@ -451,6 +522,10 @@ def __init__( self.representation_mode = RepresentationMode(representation_mode) self.audio_dim = audio_dim self.show_inference_progress = show_inference_progress + if transform_audio is None: + transform_audio = nn.Identity() + self.transform_audio = transform_audio + self.feed_audio = feed_audio def bind(self, model): """Binds this inference implementation to a model @@ -522,6 +597,7 @@ def forward(self, enc_out, length, emb=None): steps_range = tqdm(steps_range, desc="Inference") for idx in steps_range: # One autoregressive step + audio = self.transform_audio(audio) step_out = self.decoder.forward( enc_out=enc_out, src_length=length, @@ -530,7 +606,9 @@ def forward(self, enc_out, length, emb=None): ) audio_out = step_out.out - if self.representation_mode == RepresentationMode.DISCRETE: + if self.feed_audio: + audio_out = self.feed_audio(audio_out) + elif self.representation_mode == RepresentationMode.DISCRETE: audio_out = audio_out.argmax(-1) # The model outputs predictions without BOS. 
Add the BOS back for the @@ -701,11 +779,13 @@ def __init__( eos_mode=EosMode.GATE, inference=None, audio_token_shift=0, - decoder_mode=DecoderMode.AUTOREGRESSIVE, scale_factor=5.0, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, emb=None, + audio_emb=None, + out_proj=None, + multihead_input=False ): super().__init__() self.in_emb = Embedding( @@ -724,11 +804,6 @@ def __init__( activation=activation, normalize_before=True, ) - self.decoder_mode = DecoderMode(decoder_mode) - audio_emb = None - if self.decoder_mode == DecoderMode.FORWARD: - audio_emb = nn.Identity() - audio_emb_size = d_model self.decoder = TokotronTransformerDecoder( num_tokens=audio_num_tokens + self.audio_token_shift, tokens_per_step=audio_tokens_per_step, @@ -748,9 +823,11 @@ def __init__( gate_threshold=gate_threshold, gate_offset=gate_offset, audio_token_shift=audio_token_shift, - multihead_input=self.decoder_mode == DecoderMode.AUTOREGRESSIVE, + multihead_input=multihead_input, + multihead_output=out_proj is None, representation_mode=representation_mode, audio_dim=audio_dim, + out_proj=out_proj, ) self.bos_idx = bos_idx self.attention_type = attention_type @@ -904,17 +981,11 @@ def forward( src_key_padding_mask=src_key_padding_mask, pos_embs=pos_embs_encoder, ) - if self.decoder_mode == DecoderMode.AUTOREGRESSIVE: - tgt = audio - tgt_length = audio_length - else: - tgt = scale(enc_out, self.scale_factor) - tgt_length = input_length enc_out = self.add_emb(enc_out, emb) dec_out = self.decoder( enc_out=enc_out, - tgt=tgt, - tgt_length=tgt_length, + tgt=audio, + tgt_length=audio_length, src_length=input_length, src_key_padding_mask=src_key_padding_mask, pos_embs_src=pos_embs_encoder, @@ -1218,6 +1289,7 @@ def __init__( representation_mode=RepresentationMode.DISCRETE, audio_clip_min=-10.0, audio_clip_max=10.0, + multihead_output=True, ): super().__init__() self.guided_attention_weight = guided_attention_weight @@ -1246,6 +1318,7 @@ def __init__( self.register_buffer("audio_eos", audio_eos) self.audio_clip_min = audio_clip_min self.audio_clip_max = audio_clip_max + self.multihead_output = multihead_output def forward( self, @@ -1278,9 +1351,12 @@ def forward( out = out.log_softmax(dim=-1) batch_size, out_len, heads, tok_dim = out.shape max_len = out_len - 1 - out_reshaped = ( - out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) - )[:, :max_len] + if self.multihead_output: + out_reshaped = ( + out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) + )[:, :max_len] + else: + out_reshaped = out if self.eos_mode == EosMode.TOKEN: # NOTE: Shift only the tokens, but not EOS padding_lengths = torch.ones(batch_size, device=audio.device) @@ -1294,7 +1370,10 @@ def forward( ) tok_len = audio.size(1) - if self.representation_mode == RepresentationMode.DISCRETE: + if not self.multihead_output: + audio_reshaped = audio + lengths_reshaped = audio_length + elif self.representation_mode == RepresentationMode.DISCRETE: audio_reshaped = audio.transpose(1, 2).reshape( batch_size * heads, max_len ) @@ -1313,18 +1392,21 @@ def forward( ) audio_reshaped = audio_reshaped[:, :max_len] - lengths_reshaped = ( - audio_length.unsqueeze(-1) - .expand(batch_size, heads) - .reshape(batch_size * heads) - ) + if self.multihead_output: + lengths_reshaped = ( + audio_length.unsqueeze(-1) + .expand(batch_size, heads) + .reshape(batch_size * heads) + ) + else: + lengths_reshaped = audio_length seq_loss = self.seq_cost( out_reshaped[:, :tok_len], audio_reshaped, length=lengths_reshaped, reduction=reduction, ) - if 
reduction == "batch": + if reduction == "batch" and self.multihead_output: seq_loss = seq_loss.reshape(batch_size, heads).mean(-1) lengths_abs = audio_length * out_len @@ -2252,3 +2334,177 @@ def use_silence_padding(dataloader_opts, silence_token, token_keys): token_collate_fn, silence_token=silence_token, token_keys=token_keys ), } + + +def ternary_matrix_to_decimal(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. + """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 **torch.arange(D) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, None] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + +def logits_to_ternary(logits): + """Converts a tensor with two logits to a ternary matrix + + Arguments + --------- + logits : torch.Tensor + The logits (Batch x Length x num_positions x 2) + + Returns + ------- + result : torch.Tensor + The corresponding ternary matrix + """ + gte0 = logits[..., 0] >= 0.5 + gte1 = logits[..., 1] >= 0.5 + val_minus_1 = torch.tensor(-1, device=logits.device) + val_zero = torch.tensor(0, device=logits.device) + val_plus_1 = torch.tensor(1, device=logits.device) + return torch.where( + gte0, + torch.where( + gte1, + val_plus_1, + val_zero + ), + val_minus_1 + ) + +def ternary_matrix_to_decimal(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. 
+ """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, None] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + +def ternary_to_decimal(ternary, n_codebook=4): + """Converts ternary digits to their decimal equivalent + + Arguments + --------- + ternary : torch.Tensor + (Batch x Length x num_positions) - ternary digits + n_codebooks : torch.Tensor + The number of coedbooks""" + chunks = ternary.chunk(n_codebook, dim=1) + codec_ls = [] + # TODO: Vectorize + for i, chunk in enumerate(chunks): + chunk = chunk + 1 + tmp_codec = ternary_matrix_to_decimal(chunk) + codec_ls.append(tmp_codec) + codec_ls = torch.stack(codec_ls) + return codec_ls.permute(1, 2, 0) + + +def ternary_logits_to_tokens(logits): + """Converts ternary logits to tokens (as used for SQ-Codec) + + Arguments + --------- + logits : torch.Tensor + The logits + + Returns + ------- + tokens : torch.Tensor + Token IDs + """ + ternary_matrix = logits_to_ternary(logits) + tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2)) + return tokens + + +def tokens_to_ternary(tokens): + """Converts a sequence of tokens to a ternary matrix + + Arguments + --------- + tokens : torch.Tensor + A (Batch x Length x Codebooks) tensor of tokens + + Returns + ------- + result : t""" + batch_size = tokens.size(0) + n_codebook = tokens.size(2) + tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() + ternary_matrix = torch.cat([ + decimal_to_ternary_matrix(item, D=9) - 1 + for item in tokens + ], dim=1) + return ternary_matrix.transpose(1, 2) + + +def ternary_loss(predictions, targets, length=None, reduction="mean"): + tgt_gte0 = targets >= 0. + tgt_gte1 = targets >= 1. + loss_gte0 = bce_loss( + predictions[:, :, :, 0], + tgt_gte0, + length=length, + reduction=reduction, + ) + loss_gte1 = bce_loss( + predictions[:, :, :, 0], + tgt_gte1, + length=length, + reduction=reduction, + ) + loss = loss_gte0 + loss_gte1 + return loss \ No newline at end of file From a99fddb94def647a02171c13cf93702901fcc34d Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 2 Feb 2025 23:54:39 -0500 Subject: [PATCH 103/270] DASB: Device fix --- benchmarks/DASB/model/sq_codec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 0e1ffe3f8..7901675e1 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1300,7 +1300,7 @@ def decimal_to_ternary_matrix(decimals, D): corresponds to a batch, and each column is represented as a ternary number. 
""" B, T = decimals.shape - ternary_matrix = torch.zeros((B, D, T), dtype=torch.long) + ternary_matrix = torch.zeros((B, D, T), dtype=torch.long, device=decimals.device) for pos in range(D): ternary_matrix[:, pos, :] = decimals % 3 # Modulo operation decimals //= 3 # Floor division for next ternary digit From 650cf2e60d76bd289334d6597232270c6f04b510 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 3 Feb 2025 11:49:16 -0500 Subject: [PATCH 104/270] DASB: Tokotron: Add the ability to add an "initialization model" when no checkpoint is available --- .../TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 13 ++++++++++++- benchmarks/DASB/model/Tokotron.py | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 0b128b7a9..233aee30a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -38,7 +38,7 @@ frozen_split_path: null sample_path: null progress_folder: !ref /progress progress_current: !ref /current -progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 590fb10f7..fa65e2d10 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -353,9 +353,20 @@ def on_fit_start(self): if self.checkpointer is not None and not getattr( self, "_ckpt_recovered", False ): - self.checkpointer.recover_if_possible() + checkpoint = self.checkpointer.recover_if_possible() + if not checkpoint: + self.check_init() self._ckpt_recovered = True + def check_init(self): + init_from = getattr(self.hparams, "init_from", None) + if init_from is not None: + init_from_path = Path(init_from) + model_path = init_from_path / "model.ckpt" + with open(model_path, "rb") as model_file: + model_state_dict = torch.load(model_file, map_location=self.device) + self.modules.model.load_state_dict(model_state_dict) + @torch.no_grad() def evaluate_batch(self, batch, stage): """Evaluate one batch, override for different procedure than train. 
diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index c795f049b..92a1cbd49 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -785,7 +785,7 @@ def __init__( emb=None, audio_emb=None, out_proj=None, - multihead_input=False + multihead_input=True ): super().__init__() self.in_emb = Embedding( From b43b5652c6c94075ab2585978695b22807c0de99 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 3 Feb 2025 12:05:52 -0500 Subject: [PATCH 105/270] DASB: A small fix for cases where strides are not compatble (not necessarily a bug - it depends on how the tensor was obtained) --- benchmarks/DASB/utils/tokenizer_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index be73fda74..0ab019b58 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -499,7 +499,7 @@ def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): self.eval() - signal = self.decode(tokens.view(tokens.shape[0], -1), **kwargs) + signal = self.decode(tokens.reshape(tokens.shape[0], -1), **kwargs) return signal.squeeze(1) @torch.no_grad() From 693d499c883bb893e185bed36be8402616ef4669 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 13:36:13 -0500 Subject: [PATCH 106/270] DASB: Extra logging --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index fa65e2d10..2d5ff461a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -361,11 +361,13 @@ def on_fit_start(self): def check_init(self): init_from = getattr(self.hparams, "init_from", None) if init_from is not None: + logger.info("Initializing with pre-trained weights from %s", init_from) init_from_path = Path(init_from) model_path = init_from_path / "model.ckpt" with open(model_path, "rb") as model_file: model_state_dict = torch.load(model_file, map_location=self.device) self.modules.model.load_state_dict(model_state_dict) + logger.info("Successfully initialized with pre-trained weights from %s", init_from) @torch.no_grad() def evaluate_batch(self, batch, stage): From 7b79ffcc42e70e1864e866231b23a0e5941eab71 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 14:58:09 -0500 Subject: [PATCH 107/270] DASB: Fix maximum validation set size --- benchmarks/DASB/LibriTTS/extraction/extract.py | 1 + benchmarks/DASB/LibriTTS/libritts_prepare.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py index 87de6f84b..a3db84984 100644 --- a/benchmarks/DASB/LibriTTS/extraction/extract.py +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -51,6 +51,7 @@ "save_json_test": hparams["test_json"], "sample_rate": hparams["sample_rate"], "skip_prep": hparams["skip_prep"], + "max_valid_size": None }, ) diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py index 6d0ca9f0a..cb26eb085 100644 --- a/benchmarks/DASB/LibriTTS/libritts_prepare.py +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -105,7 +105,7 @@ def prepare_libritts( if valid_split: wav_list = prepare_split(data_folder, valid_split) # 
TODO add better way to speedup evaluation - if len(wav_list) > max_valid_size: + if max_valid_size is not None and len(wav_list) > max_valid_size: wav_list = random.sample(wav_list, max_valid_size) create_json(wav_list, save_json_valid, sample_rate, model_name) if test_split: From 24bebfe7fcda4ea400b6c028503b94b065ed7555 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 17:21:12 -0500 Subject: [PATCH 108/270] DASB: Add the ability to change the saved folder for Encodec --- benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml index d3cd83c3e..b7ae76969 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/encodec save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -47,7 +48,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 7ede118828fa5631b067e6e72e7e69e2b27a052a Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 21:59:16 -0500 Subject: [PATCH 109/270] DASB: Fixes --- .../tokotron/hparams/train_discrete_ssl.yaml | 34 ++----------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 233aee30a..2db7cd944 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -70,8 +70,8 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice hubert: speechbrain/hifigan-hubert-k1000-LibriTTS wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_model_layers: !ref +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref select_layers: null token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec @@ -166,15 +166,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref use_silence_padding: True -# Guides -guides_enabled: False -guides_start_epoch: 40 -guides_spk: False -guides_spk_discrete: True -guides_spk_loss_weight: 0.2 -guides_asr: True -guides_asr_loss_weight: 0.1 - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice @@ -201,14 +192,6 @@ spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_h source: !ref savedir: !ref /ecapa -spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class - source: !ref - savedir: !ref /ecapa- - pymodule_file: custom_interface.py - classname: DiscreteSpkEmb - overrides: - ssl_layer_num_selected: !ref - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -240,19 +223,8 @@ sample_dataloader_opts: value: !ref token_model_kwargs: - SSL_layers: !ref + SSL_layers: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - 
num_workers: !ref - ssl_model: !ref - ssl_model_layers: !ref - token_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - data_folder_alignments: !ref ####################### Model parameters ########################### # Transformer d_model: 512 From 123248d9099acf22faa3c7faedcbb92dca7b3ac3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 23:23:55 -0500 Subject: [PATCH 110/270] DASB: Tokotron: Fixes --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 2d5ff461a..47c3b8939 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -827,7 +827,10 @@ def read_token_list(file_name): result: list a list of tokens """ - if not Path(file_name).exists(): + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): raise ValueError(f"Token file {file_name} not found") with open(file_name) as token_file: return [line.strip("\r\n") for line in token_file if line] From 0b11188b9d32b2f8745c0b983902b1c7561d507c Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 23:34:36 -0500 Subject: [PATCH 111/270] DASB: Tokotron: Fixes --- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 2db7cd944..efcde8c58 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -149,8 +149,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: From 4eaa7cdf6e6d50e09437e245bef02eb88aeb26e9 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 23:52:36 -0500 Subject: [PATCH 112/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/tokotron/train.py | 59 ++++++++----------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 47c3b8939..f11269392 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -163,38 +163,6 @@ def _compute_spk(self, wav, wav_length): ) return spk_emb_pred - def _get_selected_layer_idx(self): - selected_layers = None - if ( - hasattr(self.hparams, "select_layers") - and self.hparams.select_layers - ): - layers = self.hparams.select_layers - model_layers_map = { - layer: idx - for idx, layer in enumerate(self.hparams.token_model_layers) - } - selected_layers = [model_layers_map[layer] for layer in layers] - return selected_layers - - # TODO: Move this elsewhere - def select_layers(self, audio_ssl): - """Applies layer squishing, if enabled - - Arguments - --------- - audio_ssl : torch.Tensor - SSL features - - Returns - ------- - audio_ssl : torch.Tensor - SSL features, squished if enabled - """ - if self.layer_idx: - audio_ssl = audio_ssl[:, :, 
self.layer_idx] - return audio_ssl - def compute_objectives(self, predictions, batch, stage): """Computes the loss given the predicted and targeted outputs. We here do multi-task learning and the loss is a weighted sum of the ctc + seq2seq @@ -258,7 +226,6 @@ def on_stage_start(self, stage, epoch): self.modules.vocoder, "model" ): self.modules.vocoder.model.device = self.device - self.layer_idx = self._get_selected_layer_idx() self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.hparams.compute_cost, batch_eval=True, ) @@ -558,13 +525,17 @@ def tokens_pipeline(label): ) tokens_loader = hparams.get("tokens_loader") + if "speech_model_layers" in hparams: + tokens_loader_kwargs = { + "num_codebooks": get_selected_layer_indexes(hparams) + } + else: + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") def audio_pipeline(id): - audio = tokens_loader.tokens_by_uttid( - id, num_codebooks=audio_tokens_per_step - ) + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -813,6 +784,22 @@ def init_sequence_encoder(hparams): return encoder +def get_selected_layer_indexes(hparams): + """Finds the layers of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + def read_token_list(file_name): """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed one per line From 60e7d9eb21bc73f580f15d8d44c6c46ea4fc94ae Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Feb 2025 00:48:14 -0500 Subject: [PATCH 113/270] DASB: Tokotron: Fixes --- .../DASB/LibriTTS/TTS/tokotron/train.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index f11269392..9d18705e2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -333,7 +333,15 @@ def check_init(self): model_path = init_from_path / "model.ckpt" with open(model_path, "rb") as model_file: model_state_dict = torch.load(model_file, map_location=self.device) - self.modules.model.load_state_dict(model_state_dict) + tgt_state_dict = self.modules.model.state_dict() + ignore_keys = [] + for k, v in model_state_dict.items(): + if k in tgt_state_dict and tgt_state_dict[k].shape != v.shape: + logger.warning("Ignoring shape mismatch for %s", k) + ignore_keys.append(k) + for k in ignore_keys: + del model_state_dict[k] + self.modules.model.load_state_dict(model_state_dict, strict=False) logger.info("Successfully initialized with pre-trained weights from %s", init_from) @torch.no_grad() @@ -499,6 +507,9 @@ def tokens_pipeline(label): audio_tokens_per_step = len(hparams["token_model_layers"]) else: audio_tokens_per_step = hparams["audio_tokens_per_step"] + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes(hparams) if use_silence_padding: if representation_mode == RepresentationMode.DISCRETE: silence_padding = get_silence_token( @@ -514,6 +525,10 @@ def tokens_pipeline(label): ) silence_padding = silence_padding.cpu() + if layer_idx: + silence_padding = silence_padding[layer_idx] + else: + silence_padding = silence_padding[:audio_tokens_per_step] silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) audio_bos_prefix = ( @@ -525,9 +540,9 @@ def tokens_pipeline(label): ) tokens_loader = hparams.get("tokens_loader") - if "speech_model_layers" in hparams: + if layer_idx is not None: tokens_loader_kwargs = { - "num_codebooks": get_selected_layer_indexes(hparams) + "num_codebooks": layer_idx } else: tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} From 54df7ed4912375a393d3d89ba7dc22d8268f40dd Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Feb 2025 10:57:43 -0500 Subject: [PATCH 114/270] DASB: Tokotron LibriTTS: Fixes --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 9d18705e2..f167e2f64 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -89,8 +89,12 @@ def create_waveform(self, audio, length, emb): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) - clean_padding_(wav, length) + with torch.no_grad(): + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + clean_padding_(wav, length) + wav = wav.to(self.device) return wav def compute_forward(self, batch, stage): @@ -279,6 +283,9 @@ def on_stage_start(self, stage, epoch): elif stage == 
sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed From 3aa7de39695cc308cd7471d18f5bb3974021570b Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 00:58:29 -0500 Subject: [PATCH 115/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 9e5c6826a..cbef6a840 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -240,7 +240,7 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 10f820221fd744a11151579e5399e328d5ccdc6e Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 01:05:30 -0500 Subject: [PATCH 116/270] DASB: Fixes --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 3355ac511..c2ffc13bf 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -184,7 +184,7 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 2cd7c6a2a32bf6e1549d075d7f3036c7adb6c274 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 01:08:04 -0500 Subject: [PATCH 117/270] DASB: Fixes --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index c2ffc13bf..d16403558 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -184,7 +184,7 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index cbef6a840..c7ca08adc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -240,7 +240,7 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- tokenizer: 
!new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 7e1bf0f5286db1f7ac6d080d0a3d51383153ebfa Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 09:48:49 -0500 Subject: [PATCH 118/270] DASB: Fixes --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 1 - .../DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml | 2 +- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 3 +-- .../DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 1c0c765f7..b92a76255 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -233,7 +233,6 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref representation_mode: discrete diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 3090e9f79..9c8baf3bf 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -45,7 +45,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 86ebee501..b48bb66fa 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -236,13 +236,12 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref emb: !ref tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref modules: model: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 85148db9d..931e448cd 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -43,7 +43,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref From 2c72caf7a1b998814cdff6f08682d3255b1892b0 Mon Sep 17 00:00:00 2001 From: 
flexthink Date: Thu, 6 Feb 2025 09:54:20 -0500 Subject: [PATCH 119/270] DASB: Fixes --- .../LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 2 -- .../DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index fb839c897..4b2fb6553 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -188,7 +188,7 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref modules: model: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index c7ca08adc..258065779 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -101,7 +101,6 @@ gate_loss_max_weight: 1. # Inference parameters inference_mode: autoregressive eos_mode: gate -decoder_mode: autoregressive scale_factor: 4 # Embedding Injection @@ -233,7 +232,6 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref emb: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 931e448cd..85148db9d 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -43,7 +43,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref From 4b5164420a701ebd093e32c308dc47ec153f8d6a Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 11:07:20 -0500 Subject: [PATCH 120/270] VALL-E: Cosmetic changes, hparams updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 10 +- benchmarks/DASB/model/valle.py | 406 +++++++++++++----- 2 files changed, 309 insertions(+), 107 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 8ee2a0468..3aa7690a1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -87,11 +87,12 @@ pad_index: 0 bos_index: 1 eos_index: 2 eot_index: 3 -special_tokens: ["", "", ""] -special_num_tokens: 4 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr: 0.002 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step betas: [0.9, 0.95] @@ -101,7 +102,8 @@ sample_rate: 24000 
model_sample_rate: 24000 max_audio_length: 2000 text_max_length: 500 -n_ctx: !ref + +spk_prompt_length: 150 +n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 debug_infer_max_audio_length: 10 diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index b85e68345..ab233efff 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -14,7 +14,7 @@ import logging import torch -from typing import Dict, Tuple, Optional +from typing import Tuple, Optional from speechbrain.dataio.dataio import length_to_mask from torch import Tensor @@ -144,6 +144,13 @@ def forward( Lengths of condition part in dec_seq (B,). nar_level_idx : int the index of the non-autoregressive level to train + + Returns + ------- + logits_ar : torch.Tensor + Autoregressive predictions + logits_nar : torch.Tensor + Non-autoregressive predictions """ assert dec_seq.dim() == 3 @@ -202,11 +209,7 @@ def prepare_input(self, dec_seq_emb, prefix_len, level): @torch.no_grad() def inference( - self, - prefix, - opts, - enc_seq=None, - suffix=None, + self, prefix, opts, enc_seq=None, suffix=None, ): """Vall-E Inference. @@ -221,6 +224,13 @@ def inference( suffix : torch.Tensor suffix part of dec_seq (B, T, nq), usually the target sequence for teacher-forcing. + + Returns + ------- + gen_tokens_list : list + Generated tokens + gen_scores_list : list + The scores associated with the generated tokens """ # (1) initialization @@ -263,8 +273,7 @@ def inference( mask = modality_index_to_mask(modality_index, opts) mask_cache = [] modality_tokens = torch.tensor( - list(opts.masks.keys()), - device=prefix.device + list(opts.masks.keys()), device=prefix.device ) for step in range(maxlen): @@ -292,14 +301,11 @@ def inference( # (3.3) detect modality swtich mask_cache.append(mask.clone()) - modality_change_mask = torch.isin( - prev_tok[:, 0], - modality_tokens - ) + modality_change_mask = torch.isin(prev_tok[:, 0], modality_tokens) # Note: The ESPNET VALL-E had # modality_change_mask = torch.logical_and( # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, - #) + # ) if torch.any(modality_change_mask): modality_index = torch.where( modality_change_mask, prev_tok[:, 0], modality_index, @@ -434,14 +440,33 @@ def _initialize(self): class ResidualAttentionBlock(nn.Module): + """A VALL-E residual attention block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of heads + cross_attention : bool + Whether to use cross-attention + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + """ + def __init__( self, - n_state: int, - n_head: int, - cross_attention: bool = False, - causal: bool = False, - qk_norm: bool = False, - dropout: float = 0.0, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, ): super().__init__() @@ -471,12 +496,20 @@ def __init__( self.mlp_dropout = nn.Dropout(p=dropout) def forward( - self, - x: Tensor, - xa: Optional[Tensor] = None, - mask: Optional[Tensor] = None, - kv_cache: Optional[dict] = None, + self, x, xa=None, mask=None, kv_cache=None, ): + """The forward pass implementation + + Arguments + --------- + x : torch.Tensor + the feature tensor + xa : torch.Tensor + The tensor for cross-attention + mask : torch.Tensor + The attention mask to be applied + + """ x = x + self.attn_dropout( self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) ) @@ -491,15 +524,37 @@ def forward( class TransformerDecoder(nn.Module): def __init__( self, - n_ctx: int, - n_state: int, - n_head: int, - n_layer: int, - causal: bool = True, - qk_norm: bool = False, - dropout: float = 0.0, + n_ctx, + n_state, + n_head, + n_layer, + causal=True, + qk_norm=False, + dropout=0.0, layer_class=ResidualAttentionBlock, ): + """A custom transformer decoder implementation for VALL-E + + Arguments + --------- + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + layer_class : type + The layer type to be used + """ super().__init__() self.pos_emb = nn.Embedding(n_ctx, n_state) @@ -523,11 +578,24 @@ def __init__( self.kv_cache = None def forward( - self, - x: Tensor, - mask: torch.Tensor = None, - kv_cache: Optional[dict] = None, + self, x, mask=None, kv_cache=None, ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + the feature tensor + mask : torch.Tensor + The attention mask to be applied + kv_cache : dict + The key/value cache (for inference) + + Returns + ------- + result : torch.Tensor + The decoder output + """ if self.causal and mask is not None: raise ValueError("Causal Transformer dones't allow mask") @@ -541,17 +609,33 @@ def forward( return x def init(self): + """Initializes the key/value cache and the hooks to update it""" self.kv_cache, self.hooks = install_kv_cache_hook(self, self.kv_cache) return self.kv_cache - def reset(self,): + def reset(self): + """Resets the key-value cache""" for hook in self.hooks: hook.remove() self.kv_cache = None class LayerNorm(nn.LayerNorm): - def forward(self, x: Tensor) -> Tensor: + """A layer normalziation wrapper""" + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor to be normalized + + Returns + ------- + result : torch.Tensor + A normalzied tensor + """ return super().forward(x.float()).type(x.dtype) @@ -565,14 +649,35 @@ def forward(self, x: Tensor) -> Tensor: class ResidualAttentionBlockAdaLN(ResidualAttentionBlock): + """"The Vall-E Adaptive Residual Attention Block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of states + n_head : int + The number of attention heads + cross_attention : bool + The number of attention heads + 
causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + def __init__( self, - n_state: int, - n_head: int, - cross_attention: bool = False, - causal: bool = False, - qk_norm: bool = False, - dropout: float = 0.0, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, ): super(ResidualAttentionBlockAdaLN, self).__init__( n_state=n_state, @@ -587,13 +692,23 @@ def __init__( self.mlp_ln = AdaLN(n_state) def forward( - self, - x: Tensor, - level: Tensor, - xa: Optional[Tensor] = None, - mask: Optional[Tensor] = None, - kv_cache: Optional[dict] = None, + self, x, level, xa=None, mask=None, kv_cache=None, ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ x = x + self.attn_dropout( self.attn(self.attn_ln(x, level), mask=mask, kv_cache=kv_cache) ) @@ -610,17 +725,40 @@ def forward( class ValleNARDecoder(TransformerDecoder): def __init__( self, - n_level: int, - n_ctx: int, - n_state: int, - n_head: int, - n_layer: int, - causal: bool = False, - qk_norm: bool = False, - dropout: float = 0.0, + n_level, + n_ctx, + n_state, + n_head, + n_layer, + causal=False, + qk_norm=False, + dropout=0.0, layer_class=ResidualAttentionBlockAdaLN, ): + """The VALL-E non-autoregressive decoder + Arguments + --------- + n_level : int + The number of levels + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of attention heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + layer_class : type + The layer class to use + """ super().__init__( n_ctx=n_ctx, n_state=n_state, @@ -636,12 +774,21 @@ def __init__( self.ln = AdaLN(n_state) def forward( - self, - x: Tensor, - level: Tensor, - mask: Tensor = None, - kv_cache: Optional[dict] = None, + self, x, level, mask=None, kv_cache=None, ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ if self.causal and mask is not None: raise ValueError("mask is not allowed when causal") @@ -658,13 +805,25 @@ def forward( class MultiHeadAttention(nn.Module): + """A Multi-Head Attention implementation + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of attention heads + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + def __init__( - self, - n_state: int, - n_head: int, - causal: bool = False, - qk_norm: bool = False, - dropout: float = 0.0, + self, n_state, n_head, causal=False, qk_norm=False, dropout=0.0, ): super().__init__() assert n_state % n_head == 0 @@ -681,23 +840,22 @@ def __init__( self.q_norm = LayerNorm(n_state // n_head) self.k_norm = LayerNorm(n_state // n_head) - if not hasattr(F, "scaled_dot_product_attention"): - raise ValueError("Install torch 2.0.1+ to support Flash Attention") - - try: - from flash_attn import flash_attn_func - - self.flash_attn_func = flash_attn_func - except ImportError: - self.flash_attn_func = None - def forward( - self, - x: Tensor, - xa: Optional[Tensor] = None, - mask: Optional[Tensor] = None, - kv_cache: Optional[dict] = None, + self, x, xa=None, mask=None, kv_cache=None, ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ q = self.query(x) if kv_cache is None or xa is None or self.key not in kv_cache: @@ -714,9 +872,23 @@ def forward( return self.out(wv) - def qkv_attention( - self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None - ): + def qkv_attention(self, q, k, v, mask=None): + """Computes self-attention + + Arguments + --------- + q : torch.Tensor + The queries tensor + k : torch.Tensor + The keys tensor + v : torch.Tensor + The values tensor + + Returns + ------- + wv : torch.Tensor + The attention output + """ if self.causal and mask is not None: raise ValueError("mask is not allowed when the attention is causal") @@ -732,16 +904,6 @@ def qkv_attention( if self.qk_norm: q = self.q_norm(q) k = self.k_norm(k) - - if self.flash_attn_func is not None and mask is None and self.training: - wv = self.flash_attn_func( - q.transpose(1, 2), - k.transpose(1, 2), - v.transpose(1, 2), - dropout_p=self.dropout, - causal=causal, - ).flatten(start_dim=2) - else: wv = ( F.scaled_dot_product_attention( q, k, v, mask, is_causal=causal, dropout_p=self.dropout @@ -754,6 +916,17 @@ def qkv_attention( class AdaLN(nn.Module): + """Adaptive Layer Normalization, a Layer Norm implementation + that learns an affine transformation based on the level + embedding + + Arguemnts + --------- + n_state : int + The number of states + eps : float + The layer norm epsilon parameter""" + def __init__(self, n_state, eps=1e-5): super().__init__() self.weight = nn.Linear(n_state, n_state, bias=False) @@ -764,7 +937,16 @@ def __init__(self, n_state, eps=1e-5): self.n_state = n_state self.eps = eps - def forward(self, x: Tensor, level_emb: Tensor): + def forward(self, x, level_emb): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor + level_emb : torch.Tensor + The level embedding + """ w = self.weight(level_emb).unsqueeze(1) b = self.bias(level_emb).unsqueeze(1) x = nn.functional.layer_norm(x, (self.n_state,), eps=self.eps) @@ -773,6 +955,22 @@ def forward(self, x: Tensor, level_emb: Tensor): def install_kv_cache_hook(model, cache): + """Sets up the key/value cache hook + + Arguments + --------- + model : torch.nn.Module + The model + cache : dict + The cache content + + Returns + ------- + cache : torch.Tensor + The cache dictionary (new or copied) + hooks : torch.Tensor + The installed hooks + """ cache 
= {**cache} if cache is not None else {} hooks = [] @@ -794,12 +992,7 @@ def install_hooks(layer: torch.nn.Module): def logits_to_tokens( - logits: torch.Tensor, - opts: SpeechLMInferenceOptions, - mask: torch.Tensor, - search_algo: str = None, - allow_eos: bool = True, - nq_level: int = None, + logits, opts, mask, search_algo=None, allow_eos=True, nq_level=None, ): """ Select the generated tokens and their scores based on logits prediction. @@ -818,6 +1011,13 @@ def logits_to_tokens( whether to allow end-of-sentence prediction nq_level : int, optional if not None, only conpute the specified codec level nq. + + Returns + ------- + gen_token_idx : torch.Tensor + The token indexes + gen_token_score : torch.Tensor + The token scores """ assert logits.dim() == 4 From 748cc860b30bb54d49e6ec4a45a130924346c637 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 11:35:54 -0500 Subject: [PATCH 121/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 3aa7690a1..be7e49c81 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -23,7 +23,7 @@ data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER # e.g., path/to/cache data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared -pretrained_model_save_folder: !ref +prepare_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False data_mode: lite From 858b5d47ef35736ea7511cd2aa1fd50aad1cff36 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 11:37:28 -0500 Subject: [PATCH 122/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index be7e49c81..4b1a44a29 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -22,7 +22,6 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. 
data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER # e.g., path/to/cache data_folder_alignments: null # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared prepare_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False From 7a5ea84ca99b33f027459bbc2e2120ddf5e1e456 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 11:38:44 -0500 Subject: [PATCH 123/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 4b1a44a29..5c8f608d2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -23,6 +23,7 @@ data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER # e.g., path/to/cache data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref +pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False data_mode: lite From 30ee0c06f32efbc0d3c3be9290217f00810b1c14 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 14:29:52 -0500 Subject: [PATCH 124/270] DASB: Fix prefix masking for VALL-E --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 3 ++- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 92ea570da..dd619fede 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -152,7 +152,7 @@ def compute_objectives(self, predictions, batch, stage): prompt_length * prompt_max_len, prompt_max_len ) prefix_mask = length_to_mask( - prefix_length, prompt_max_len + prefix_length * prompt_max_len, prompt_max_len ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] @@ -480,6 +480,7 @@ def dataio_prepare(hparams): "valid": hparams["valid_json"], "test": hparams["test_json"], } + label_encoder = hparams["label_encoder"] input_feature = INPUT_FEATURE_MAP[hparams["input"]] offsets = get_offsets( diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 4f11022f4..6b7b7d207 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -150,7 +150,7 @@ def compute_objectives(self, predictions, batch, stage): prompt_length * prompt_max_len, prompt_max_len ) prefix_mask = length_to_mask( - prefix_length, prompt_max_len + prefix_length * prompt_max_len, prompt_max_len ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] From 3d89d2d6bd7194d9f31300ea431856b036310113 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 15:06:00 -0500 Subject: [PATCH 125/270] DASB: Update loss calculation to match ESPNet --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index dd619fede..bfcd76403 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -176,7 +176,7 @@ def compute_objectives(self, predictions, batch, stage): mask=mask, reduction="batch", ) - loss = loss_ar + loss_nar + loss = 0.5 * (loss_ar + loss_nar) return 
loss def on_stage_start(self, stage, epoch): diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 6b7b7d207..d51d53878 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -174,7 +174,7 @@ def compute_objectives(self, predictions, batch, stage): mask=mask, reduction="batch", ) - loss = loss_ar + loss_nar + loss = 0.5 * (loss_ar + loss_nar) return loss def on_stage_start(self, stage, epoch): From 779bf9932adf4921af172dc80baac4be935b4a1e Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 15:20:43 -0500 Subject: [PATCH 126/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index bfcd76403..c21f29c2d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -152,7 +152,7 @@ def compute_objectives(self, predictions, batch, stage): prompt_length * prompt_max_len, prompt_max_len ) prefix_mask = length_to_mask( - prefix_length * prompt_max_len, prompt_max_len + prefix_length, prompt_max_len ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d51d53878..18adcf75f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -150,7 +150,7 @@ def compute_objectives(self, predictions, batch, stage): prompt_length * prompt_max_len, prompt_max_len ) prefix_mask = length_to_mask( - prefix_length * prompt_max_len, prompt_max_len + prefix_length, prompt_max_len ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] From 92c40b69e0ae69d0be0c4323ec25bbaeaf1cc73c Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 15:29:36 -0500 Subject: [PATCH 127/270] VALL-E: Hyperparameter updates --- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 5 +++-- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index f4a003a0d..827ffe2e3 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -64,6 +64,7 @@ token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -88,8 +89,8 @@ special_tokens: ["", "", ""] special_num_tokens: 4 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" -lr_warmup_steps: 10000 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 lr_annealing_mode: step guided_attention_weight: 50.0 guided_attention_sigma: 0.5 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 5c8f608d2..2f53cc210 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -92,7 +92,7 @@ special_tokens: ["", "", "", ""] special_num_tokens: 5 # stages related parameters -lr: 0.002 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" lr_warmup_steps: 10000 lr_annealing_mode: step betas: [0.9, 0.95] diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index f1981bd88..4bbde09be 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -78,8 +78,8 @@ special_tokens: ["", "", "", ""] special_num_tokens: 5 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" -lr_warmup_steps: 10000 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 lr_annealing_mode: step betas: [0.9, 0.95] From 56187971ac86b28621b63df23765449aa63d6d94 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 21:43:04 -0500 Subject: [PATCH 128/270] DASB: Fix the sample rate --- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 2f53cc210..dc6fe1ebc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -93,13 +93,13 @@ special_num_tokens: 5 # stages related parameters lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" -lr_warmup_steps: 10000 +lr_warmup_steps: 70000 lr_annealing_mode: step betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 -model_sample_rate: 24000 +model_sample_rate: 16000 max_audio_length: 2000 text_max_length: 500 spk_prompt_length: 150 From 71cd31618563e629564a7bb15a2bea2b20c29ccb Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 01:07:57 -0500 Subject: [PATCH 129/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 4bbde09be..1a3e332c2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -194,7 +194,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 9e4c550841abe2b58c5c237481684457a487b872 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 12:23:44 -0500 Subject: [PATCH 130/270] DASB: Encodec: Small fix --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 1a3e332c2..8d415323d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -86,7 +86,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 
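Note: sample_rate above is the rate of the prepared LibriTTS audio, while model_sample_rate is the rate the tokenizer itself runs at (16 kHz for the discrete SSL tokenizer, as fixed a few hunks above; 24 kHz for EnCodec, so the two coincide here). The recipes are assumed to resample between the two before token extraction and after decoding; a minimal illustrative sketch of that step, not the recipe's actual code:

    import torch
    import torchaudio

    sample_rate = 24000        # rate of the prepared audio
    model_sample_rate = 16000  # rate expected by the discrete SSL tokenizer
    sig = torch.randn(1, sample_rate)  # one second of dummy audio
    if sample_rate != model_sample_rate:
        sig = torchaudio.functional.resample(sig, sample_rate, model_sample_rate)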
model_sample_rate: 24000 -max_audio_length: 2000 +max_audio_length: 2300 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + From 165eaac5b293f8a16cfa28a3be4328e4cf97bbb5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:06:53 -0500 Subject: [PATCH 131/270] DASB: Add Mimi, fix defaults for VALL-E Encodec --- .../TTS/tokotron/hparams/train_mimi.yaml | 278 ++++++++++++++++++ .../TTS/valle/hparams/train_encodec.yaml | 2 +- benchmarks/DASB/model/Tokotron.py | 11 +- 3 files changed, 284 insertions(+), 7 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..515537417 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,278 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 
+ vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 8d415323d..57aac47c3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -168,7 +168,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -bandwidth: 1.5 +bandwidth: 6 attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 92a1cbd49..d86d52273 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -2142,7 +2142,8 @@ def get_silence_token( sample_length=100000, unsqueeze=False, device=None, - model_kwargs=None, + num_codebooks=None, + ): """Attempts to find out the silence tokens for a given model, if applicable @@ -2157,8 +2158,8 @@ def get_silence_token( Whether to add an extra dimension to the audio (needed for DAC) device : str | torch.Device The device to use - model_kwargs : dict - Additional arguments to pass to the model + num_codebooks : int | list + The number of codebooks or the codebooks to use Returns ------- @@ -2171,8 +2172,6 @@ def get_silence_token( """ if device is None: device = next(model.parameters()).device - if model_kwargs is None: - model_kwargs = {} audio = torch.zeros(1, sample_length, device=device) if unsqueeze: @@ -2180,7 +2179,7 @@ def get_silence_token( length = torch.ones(1, device=device) model_training = model.training model.eval() - tokens = model.sig_to_tokens(audio, length) + tokens = model.sig_to_tokens(audio, length, num_codebooks=num_codebooks) if model_training: model.train() tokens = 
tokens.squeeze(0) From c1b30dbf8af1e573cf02cb360c6dea0f53136942 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:21:19 -0500 Subject: [PATCH 132/270] DASB: mimi fixes --- benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index 515537417..ff173e8b5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -13,8 +13,9 @@ train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files +cached_data_folder: !PLACEHOLDER data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared +prepare_save_folder: !ref pretrained_model_save_folder: !ref representation_mode: discrete vocoder_model_name: vocos From c3b647e0730c36fa4aa5d339f82df5e3ad882a56 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:29:16 -0500 Subject: [PATCH 133/270] DASB: add init_from --- benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index ff173e8b5..3c06d761f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -39,6 +39,7 @@ sample_path: null progress_folder: !ref /progress progress_current: !ref /current progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 From f27ebad0c99ea3857d0cb657632395f321ae981a Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:34:12 -0500 Subject: [PATCH 134/270] DASB: small updates --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index d40ec20f0..dd9d61762 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -567,7 +567,12 @@ def audio_ref_pipeline(wav): and representation_mode == RepresentationMode.DISCRETE ): silence_token = get_silence_token( - hparams[model_key], model_kwargs=hparams.get("token_model_kwargs"), + hparams[model_key], + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in hparams + else audio_tokens_per_step + ) ) if silence_token.dim() == 2: silence_token = silence_token.squeeze(-1) From 98408241422f8f87d1429bbdf3fe6e6116538c3c Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:35:08 -0500 Subject: [PATCH 135/270] DASB: small updates --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index f167e2f64..b160ab3a5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -521,7 +521,11 @@ def tokens_pipeline(label): if representation_mode == RepresentationMode.DISCRETE: silence_padding = get_silence_token( hparams["tokenizer"], - model_kwargs=hparams.get("token_model_kwargs"), + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in 
hparams + else audio_tokens_per_step + ) ) else: silence_padding = get_silence_repr(hparams["ssl_model"],) From b4afc68968fe82aab5d53378b709f360585e6b0f Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 8 Feb 2025 21:57:13 -0500 Subject: [PATCH 136/270] DASB: Add support for alignments --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_encodec.yaml | 1 + benchmarks/DASB/LibriTTS/libritts_prepare.py | 181 +++++++++++++++++- 3 files changed, 176 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index dc6fe1ebc..f8af5fb45 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -21,6 +21,7 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +alignments_folder: null data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref pretrained_model_save_folder: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 57aac47c3..1a125cc81 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -18,6 +18,7 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER +alignments_folder: null prepare_save_folder: !ref data_folder_alignments: null # e.g., /path/to/LibriSpeech pretrained_model_save_folder: !ref diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py index cb26eb085..6ec1a3a96 100644 --- a/benchmarks/DASB/LibriTTS/libritts_prepare.py +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -11,12 +11,14 @@ import torch import torchaudio +import re from tqdm import tqdm from speechbrain.inference.text import GraphemeToPhoneme from speechbrain.utils.data_utils import get_all_files from speechbrain.utils.logger import get_logger from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations +from pathlib import Path logger = get_logger(__name__) LIBRITTS_URL_PREFIX = "https://www.openslr.org/resources/60/" @@ -38,6 +40,7 @@ def prepare_libritts( seed=1234, model_name=None, max_valid_size=500, + alignments_folder=None, skip_prep=False, ): """ @@ -75,6 +78,8 @@ def prepare_libritts( Seed value model_name : str Model name (used to prepare additional model specific data) + alignments_path : None + The path to alignments files skip_prep: Bool If True, skip preparation. 
@@ -101,16 +106,16 @@ def prepare_libritts( # If specific splits are provided, creates data manifest files accordingly if train_split: wav_list = prepare_split(data_folder, train_split) - create_json(wav_list, save_json_train, sample_rate, model_name) + create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name) if valid_split: wav_list = prepare_split(data_folder, valid_split) # TODO add better way to speedup evaluation if max_valid_size is not None and len(wav_list) > max_valid_size: wav_list = random.sample(wav_list, max_valid_size) - create_json(wav_list, save_json_valid, sample_rate, model_name) + create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name) if test_split: wav_list = prepare_split(data_folder, test_split) - create_json(wav_list, save_json_test, sample_rate, model_name) + create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name) if skip(save_json_train, save_json_valid, save_json_test): logger.info("Preparation completed.") @@ -124,12 +129,12 @@ def prepare_libritts( data_split = split_sets(wav_list, split_ratio) # Creating json files create_json( - data_split["train"], save_json_train, sample_rate, model_name + data_split["train"], save_json_train, sample_rate, alignments_folder, model_name ) create_json( - data_split["valid"], save_json_valid, sample_rate, model_name + data_split["valid"], save_json_valid, sample_rate, alignments_folder, model_name ) - create_json(data_split["test"], save_json_test, sample_rate, model_name) + create_json(data_split["test"], save_json_test, sample_rate, alignments_folder, model_name) def prepare_split(data_folder, split_list): @@ -172,7 +177,7 @@ def prepare_split(data_folder, split_list): return wav_list -def create_json(wav_list, json_file, sample_rate, model_name=None): +def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None): """ Creates the json file given a list of wav files. 
Arguments @@ -183,6 +188,10 @@ def create_json(wav_list, json_file, sample_rate, model_name=None): The path of the output json file sample_rate : int The sample rate to be used for the dataset + data_folder : str + The path to LibriTTS + alignments_folder : str + The path to LibriTTS alignments model_name : str Model name (used to prepare additional model specific data) """ @@ -250,6 +259,10 @@ def create_json(wav_list, json_file, sample_rate, model_name=None): "label": normalized_text, "segment": True if "train" in json_file else False, } + if alignments_folder is not None: + alignments_file_name = get_alignment_path(data_folder, alignments_folder, wav_file) + alignments = parse_alignments(alignments_file_name) + json_dict[uttid].update(alignments) # Characters are used for Tacotron2, phonemes may be needed for other models if model_name not in ["Tacotron2", "HiFi-GAN"] and g2p is not None: @@ -264,6 +277,39 @@ def create_json(wav_list, json_file, sample_rate, model_name=None): logger.info(f"{json_file} successfully created!") +def get_alignment_path(data_folder, alignments_folder, file_name): + """Returns the path in the LibriSpeech-Alignments dataset + corresponding to the specified file path in LibriSpeech + + Arguments + --------- + data_folder: str + the path to LibriSpeech + alignments_folder: str + the path to LibriSpeech-Alignments + file_name: str + the file name within LibriSpeech + + Returns + ------- + file_name: str + the alignment file path + """ + file_name = Path(file_name) + data_folder = Path(data_folder) + if file_name.parts[0] == "{data_root}": + file_name_rel = file_name.relative_to("{data_root}") + else: + file_name_rel = file_name.relative_to(data_folder) + data_slice = file_name_rel.parts[0] + + textgrid_folder = file_name_rel.relative_to(Path(data_slice) / "LibriTTS" / data_slice).parent.parent + textgrid_file_name = f"{file_name_rel.stem}.TextGrid" + textgrid_path = Path(alignments_folder) / data_slice / textgrid_folder / textgrid_file_name + + return textgrid_path + + def skip(*filenames): """ Detects if the data preparation has been already done. 
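Note: together with parse_alignments, added in the next hunk, the per-utterance flow wired into create_json above is roughly the following (illustrative sketch; it assumes the praat-textgrids package is installed and the alignments corpus is laid out next to LibriTTS as get_alignment_path expects):

    textgrid_path = get_alignment_path(data_folder, alignments_folder, wav_file)
    details = parse_alignments(textgrid_path)
    # details["wrd"], details["wrd_start"], details["wrd_end"]: word labels and boundaries
    # details["phn"], details["phn_stress"]: phonemes without and with stress marks
    # details["has_alignments"] is False when no TextGrid exists for the utterance
    json_dict[uttid].update(details)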
@@ -329,3 +375,124 @@ def check_folders(*folders): if not os.path.exists(folder): return False return True + +def parse_alignments(file_name): + """Parses a given LibriSpeech-Alignments TextGrid file and + converts the results to the desired format (to be used in JSON + metadata) + + Arguments + --------- + file_name : path-like + the file name of the TextGrid file + + Returns + ------- + details: dict + the metadata details + """ + try: + import textgrids + except ImportError: + logger.error( + "Parsing LibriSpeech-alignments requires the" + "praat-textgrids package" + ) + raise + if not file_name.exists(): + return { + "has_alignments": False, + "phn": [], + "phn_stress": [], + "phn_start": [], + "phn_end": [], + "phn_count": 0, + "wrd": [], + "wrd_start": [], + "wrd_end": [], + "wrd_count": 0, + "unk_count": None + } + + text_grid = textgrids.TextGrid() + text_grid.read(file_name) + word_intervals = [ + {**word, "label": word["label"].upper()} + for word in text_grid.interval_tier_to_array("words") + ] + phn_intervals = text_grid.interval_tier_to_array("phones") + details = {} + details.update(intervals_to_dict(word_intervals, "wrd")) + phn = intervals_to_dict(phn_intervals, "phn") + phn_stress = phn["phn"] + phn_nostress = remove_stress_marks(phn_stress) + phn["phn"] = phn_nostress + phn["phn_stress"] = phn_stress + details.update(phn) + details["unk_count"] = sum(wrd == "" for wrd in details["wrd"]) + details["has_alignments"] = True + + return details + + +INTERVAL_MAP = [("label", ""), ("begin", "_start"), ("end", "_end")] +INTERVAL_EMPTY_LABELS = {"", "sil", "sp", "spn"} + + +def intervals_to_dict(intervals, prefix): + """ + Converts a parsed list of intervals from PRAAT TextGrid + to a learning-friendly array + + Arguments + --------- + intervals: list + A list of raw TextGrid intervals, as returned by + TextGrid.interval_tier_to_array + prefix: str + the prefix to add + + Returns + ------- + result: dict + A dictionary of the form + { + "{prefix}": , + "{prefix}_start": , + "{prefix}_end": , + "{prefix}_count: + } + + """ + # Remove meaningless labels + intervals_clean = [ + interval + for interval in intervals + if interval["label"] not in INTERVAL_EMPTY_LABELS + ] + result = { + f"{prefix}{suffix}": [interval[key] for interval in intervals_clean] + for key, suffix in INTERVAL_MAP + } + # This will map space labels to a single one + result[f"{prefix}_count"] = len(intervals_clean) + return result + + +RE_STRESS_MARK = re.compile(r"\d$") + + +def remove_stress_marks(phn): + """Removes stress marks from a phoneme annotation + + Arguments + --------- + phn: list + a list of phoneme annotations with or without stress marks + + Returns + ------- + result: list + a list of phoneme annotations without stress marks + """ + return [RE_STRESS_MARK.sub("", item) for item in phn] From cbea7f714f670ee8e701b45c5052741d912b3010 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 00:55:06 -0500 Subject: [PATCH 137/270] DASB: Fixed --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 18adcf75f..ec802fbd5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -945,6 +945,7 @@ def undo_padding_tensor(batch, lengths): else None ), "seed": hparams["seed"], + "alignments_folder": hparams.get("alignments_folder"), "model_name": hparams["model"].__class__.__name__, }, ) From 
e48a91f72ea0583d69b96f9c3c2f7200c01824a2 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 20:05:31 -0500 Subject: [PATCH 138/270] VALL-E: Fixes, add encodec --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 14 +------------- .../LibriTTS/TTS/valle/hparams/train_encodec.yaml | 1 - 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 827ffe2e3..2f6f34297 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/discrete_ssl +experiment_name: valle/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] @@ -92,18 +92,6 @@ special_num_tokens: 4 lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" lr_warmup_steps: 70000 lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. - -# Inference parameters -eos_mode: gate -decoder_mode: autoregressive -scale_factor: 4 # Feature parameters sample_rate: 22050 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 1a125cc81..6fc0f4b58 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -22,7 +22,6 @@ alignments_folder: null prepare_save_folder: !ref data_folder_alignments: null # e.g., /path/to/LibriSpeech pretrained_model_save_folder: !ref -ssl_model_type: wavlm representation_mode: discrete prepare_archive_path: null prepare_skip_ignore_folders: False From 45d613079d8ff5bcb0c2ac441591a0047caee1b9 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 21:11:46 -0500 Subject: [PATCH 139/270] DASB: Add encodec --- .../TTS/valle/hparams/train_encodec.yaml | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..b6de2eb8a --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,227 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
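Note on the EnCodec settings further down in this file: audio_tokens_per_step and bandwidth need to stay consistent. For the standard 24 kHz EnCodec model (75 frames per second, 1024-entry codebooks, i.e. 10 bits per code) each codebook costs roughly 0.75 kbps, so 6 kbps corresponds to 8 codebooks and 1.5 kbps to 2. A quick illustrative check:

    frame_rate = 75      # EnCodec 24 kHz frame rate
    bits_per_code = 10   # log2(1024) entries per codebook
    n_codebooks = 8      # audio_tokens_per_step
    bandwidth_kbps = frame_rate * bits_per_code * n_codebooks / 1000  # -> 6.0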
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: 
!ref + + +audio_tokens_per_step: 6 +bandwidth: 1.5 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref From e1635df4f8ea635afaf2f462ccb29e254b390ce5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 21:28:23 -0500 Subject: [PATCH 140/270] DASB: fixes --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index c21f29c2d..c8c0198a6 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -448,7 +448,7 @@ def fit_batch(self, batch): return loss -INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phonemes"} def dataio_prepare(hparams): From 64b73e74375a89e24f93827db50f4397e4461582 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 21:31:24 -0500 Subject: [PATCH 141/270] DASB: Fixes --- benchmarks/DASB/LJSpeech/ljspeech_prepare.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index 06292fd34..08d7297e5 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -53,8 +53,6 @@ def prepare_ljspeech( pitch_max_f0=400, skip_prep=False, use_custom_cleaner=False, - extract_features=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", skip_ignore_folders=False, @@ -404,10 +402,6 @@ def prepare_json( pitch_min_f0, pitch_max_f0, use_custom_cleaner=False, - extract_features=None, - extract_features_context=None, - extract_features_folder=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", device="cpu", @@ -467,7 +461,7 @@ def prepare_json( extract_phonemes = True if extract_phonemes: logger.info( - "Computing phonemes for LJSpeech labels using SpeechBrain G2P. This may take a while." + "Computing phonemes for LJSpeech labels using SpeechBrain f This may take a while." 
) g2p = GraphemeToPhoneme.from_hparams( g2p_src, run_opts={"device": device} From 79ca7a6c3f3db20d10127f25a0ec21b5c24348ba Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 23:09:02 -0500 Subject: [PATCH 142/270] DASB: Vall-E: Multi-GPU inference fix --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 7 ++++++- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index c8c0198a6..b77ad4afa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -373,8 +373,13 @@ def inference(self, batch): prefix, prefix_length = batch.prefix # NOTE: ESPNET VALL-E does not support batched inference prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) inference_results = [ - self.modules.model.inference( + inference( prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() ) for prefix_item in prefix_items diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index ec802fbd5..d75defb0a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -371,8 +371,13 @@ def inference(self, batch): prefix, prefix_length = batch.prefix # NOTE: ESPNET VALL-E does not support batched inference prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) inference_results = [ - self.modules.model.inference( + inference( prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() ) for prefix_item in prefix_items @@ -946,6 +951,7 @@ def undo_padding_tensor(batch, lengths): ), "seed": hparams["seed"], "alignments_folder": hparams.get("alignments_folder"), + "extract_phonemes": hparams["input"] == "phonemes", "model_name": hparams["model"].__class__.__name__, }, ) From c6c6cf64226da780debda96014612efc94ea906a Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 23:31:36 -0500 Subject: [PATCH 143/270] DASB: Fixes --- benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index b6de2eb8a..6ea447070 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -163,8 +163,8 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice text: !ref + phonemes: !ref + -audio_tokens_per_step: 6 -bandwidth: 1.5 +audio_tokens_per_step: 8 +bandwidth: 6 ############################## models ################################ From e25d1469e15fd85bf715bcdbf1c0160a0914c83b Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 23:41:50 -0500 Subject: [PATCH 144/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d75defb0a..87e70b407 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -951,7 +951,6 @@ def 
undo_padding_tensor(batch, lengths): ), "seed": hparams["seed"], "alignments_folder": hparams.get("alignments_folder"), - "extract_phonemes": hparams["input"] == "phonemes", "model_name": hparams["model"].__class__.__name__, }, ) From 45b3d1ba99e18260b2afc2b93b05825e249979ce Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Feb 2025 06:59:59 -0500 Subject: [PATCH 145/270] DASB: CPU/GPU fixes --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index b77ad4afa..9cd5fb3f1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -422,7 +422,7 @@ def save_samples(self, batch, wav, length, stage): samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): file_name = output_folder / f"pred_{uttid}.wav" - write_audio(file_name, sample, self.hparams.model_sample_rate) + write_audio(file_name, sample.detach().cpu(), self.hparams.model_sample_rate) def save_eval(self, stage): """Saves evaluation results From 370ab8e390082e0ef5bf388e6c01dfa3f7686a16 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Feb 2025 17:20:03 -0500 Subject: [PATCH 146/270] DASB: Minor fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 11 ++++++++--- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 14 ++++++++------ benchmarks/DASB/run_experiments.sh | 2 +- benchmarks/DASB/run_hparam_optimization.sh | 4 ++-- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index dd9d61762..86e0efc26 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -914,9 +914,14 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - tts_brain.evaluate( - test_set=datasets["test"], test_loader_kwargs=test_dataloader_opts, - ) + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index b160ab3a5..9164d31e0 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -1015,9 +1015,11 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=use_silence_padding( - hparams["test_dataloader_opts"], silence_padding, audio_keys - ), - ) + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) \ No newline at end of file diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh index aacbc381e..5dcd6b397 100755 --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -186,7 +186,7 
@@ mkdir -p $cached_data_folder # Function to run the training experiment run_experiment() { -python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ +eval python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ $additional_flags } diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 468015d08..3029a3678 100755 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -365,7 +365,7 @@ while [ -n "$opt_flags" ]; do eval $orion_hunt_command # Compress the exp folder (if required) - if [ "$compress_exp" = True ]; then + if [ "$compress_exp" = True ] && [ ! -e "$output_folder_step/exp.tar.gz" ]; then tar -czf "$output_folder_step/exp.tar.gz" "$output_folder_step/exp" if [ -d "$output_folder_step/exp" ]; then rm -rf "$output_folder_step/exp" @@ -417,4 +417,4 @@ scp $best_yaml_file $final_yaml_file --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ --rnd_dir False --testing True $additional_flags -echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file +echo "The test performance with best hparams is available at $output_folder/best" From 256fa35e4ba1ac708efe4b005fce13c1e25db58d Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Feb 2025 22:17:24 -0500 Subject: [PATCH 147/270] DASB: Fixes --- .../TTS/valle/hparams/train_encodec.yaml | 3 +- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 36 +++++++++++-------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 6ea447070..3a9a1347d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -84,7 +84,7 @@ model_sample_rate: 24000 max_audio_length: 1000 text_max_length: 500 n_ctx: !ref + -infer_max_audio_length: !ref +infer_top_k: 20 max_length_ratio: 10.0 debug_infer_max_audio_length: 10 @@ -187,6 +187,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 9cd5fb3f1..c8b35c57d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -78,11 +78,6 @@ def create_waveform(self, audio, length): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - audio = ( - (audio - hparams["audio_token_shift"] - self.offsets) - .clip(min=0.0) - .int() - ) wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav @@ -288,7 +283,7 @@ def evaluate_batch(self, batch, stage): audio_tokens, audio_length = self.inference(batch) if self.hparams.flip_layers: audio_tokens = audio_tokens.flip(2) - wav = self.create_waveform(audio_tokens, audio_length) + wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) self.save_samples( batch=batch, wav=wav, length=audio_length, 
stage=stage @@ -391,7 +386,10 @@ def inference(self, batch): for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) - audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + offsets = self.offsets + if self.hparams.flip_layers: + offsets = offsets.flip(2) + audio = (audio - self.hparams.audio_token_shift - offsets).clip(0) return audio, audio_length def _get_inference_opts(self): @@ -550,7 +548,7 @@ def sig_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) return sig - dynamic_items = [text_pipeline, tokens_pipeline, prompt_pipeline] + dynamic_items = [sig_pipeline, text_pipeline, tokens_pipeline, prompt_pipeline] init_sequence_encoder(hparams) use_spk_emb = hparams.get("use_spk_emb", False) @@ -572,7 +570,6 @@ def sig_pipeline(wav): dataset_dynamic_items = list(dynamic_items) dataset_output_keys = list(output_keys) if dataset != "train": - dataset_dynamic_items.append(sig_pipeline) dataset_output_keys += ["sig", "label_norm_eval", "prefix"] dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( json_path=data_info[dataset], @@ -707,17 +704,23 @@ def apply_overfit_test(hparams, dataset): """ if hparams["overfit_test"]: if isinstance(dataset, tuple): - dataset_train, _, _ = dataset + dataset_train, dataset_valid, _ = dataset dataset_train = apply_overfit_test(hparams, dataset_train) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) result = { "train": dataset_train, "valid": dataset_eval, @@ -831,6 +834,7 @@ def undo_padding_tensor(batch, lengths): datasets = dataio_prepare(hparams) # Apply overfit test settings + datasets["train"].data_ids = ["LJ001-0023"] datasets = apply_overfit_test(hparams, datasets) audio_keys = ["audio_tokens"] @@ -857,7 +861,11 @@ def undo_padding_tensor(batch, lengths): # Load best checkpoint for evaluation if hparams["testing"]: - tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=hparams["test_dataloader_opts"], - ) + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) From 9f27332c1c0f2b25f574a2d2915cbb7aaefa1d45 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Feb 2025 22:35:18 -0500 Subject: [PATCH 148/270] DASB: Review debugging code --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index c8b35c57d..d7eec7079 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -834,7 +834,6 @@ def undo_padding_tensor(batch, lengths): datasets = dataio_prepare(hparams) # Apply overfit test settings - datasets["train"].data_ids = ["LJ001-0023"] datasets = apply_overfit_test(hparams, datasets) audio_keys = ["audio_tokens"] From bad8999b564b8569101eb3bcb30bf000f8b298e6 Mon Sep 17 00:00:00 2001 
From: flexthink Date: Tue, 11 Feb 2025 11:34:49 -0500 Subject: [PATCH 149/270] VALL-E: Update token sequence initialization to account for special tokens --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 10 ++++++---- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index d7eec7079..d823530a9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -37,8 +37,6 @@ logger = logging.getLogger(__name__) -SPECIAL_TOKEN_COUNT = 1 - # Brain class for speech recognition training class VALLEBrain(sb.Brain): @@ -638,10 +636,14 @@ def init_sequence_encoder(hparams): an encoder instance""" encoder = hparams["label_encoder"] token_list_file_name = hparams["token_list_file"] - tokens = read_token_list(token_list_file_name) + tokens = read_token_list(token_list_file_name) encoder.add_unk() + for token in hparams["special_tokens"]: + token_key = token.replace("<", "").replace(">", "") + token_index = hparams[f"{token_key}_index"] + encoder.insert_label(token, token_index) encoder.update_from_iterable(tokens, sequence_input=False) - encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + encoder.expect_len(len(tokens) + hparams["special_num_tokens"]) return encoder diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 87e70b407..7fa415230 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -39,9 +39,6 @@ logger = logging.getLogger(__name__) -SPECIAL_TOKEN_COUNT = 1 - - # Brain class for speech recognition training class VALLEBrain(sb.Brain): """Class that manages the training loop. 
See speechbrain.core.Brain.""" @@ -755,8 +752,13 @@ def init_sequence_encoder(hparams): token_list_file_name = hparams["token_list_file"] tokens = read_token_list(token_list_file_name) encoder.add_unk() + for token in hparams["special_tokens"]: + token_key = token.replace("<", "").replace(">", "") + token_index = hparams[f"{token_key}_index"] + encoder.insert_label(token, token_index) + encoder.update_from_iterable(tokens, sequence_input=False) - encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + encoder.expect_len(len(tokens) + hparams["special_num_tokens"]) return encoder From 39ddfd16619ed6afb1797e420691a3e9dbd73a01 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 11 Feb 2025 16:47:55 -0500 Subject: [PATCH 150/270] DASB: hparam file updates, new hparams for additional tokenizers --- .../TTS/valle/hparams/train_mimi.yaml | 224 ++++++++++++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 229 ++++++++++++++ .../extraction/hparams/speech_tokenizer.yaml | 2 +- .../LibriSpeech/extraction/hparams/mimi.yaml | 2 +- .../extraction/hparams/speech_tokenizer.yaml | 2 +- .../extraction/hparams/wavtokenizer.yaml | 2 +- .../TTS/tokotron/hparams/train_encodec.yaml | 1 + .../tokotron/hparams/train_wavtokenizer.yaml | 280 ++++++++++++++++++ 8 files changed, 738 insertions(+), 4 deletions(-) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..e6d0ad87c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,224 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/mimi +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
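Illustrative sketch of the init_sequence_encoder update above: the special tokens are pinned to the indices configured in the hparams before the regular token list is added, so their IDs line up with the model's reserved slots, and expect_len now counts special_num_tokens instead of the removed SPECIAL_TOKEN_COUNT constant. The token strings below ("<bos>", "<eos>", "<eot>") are assumptions inferred from the *_index keys; the actual strings are not recoverable from this copy of the patch.

from speechbrain.dataio.encoder import TextEncoder

# hypothetical hparams mirroring the YAML layout above
hparams = {
    "special_tokens": ["<bos>", "<eos>", "<eot>"],  # assumed strings
    "bos_index": 1,
    "eos_index": 2,
    "eot_index": 3,
    "special_num_tokens": 4,  # three specials plus the <unk> slot
}
tokens = ["AA", "AE", "AH"]  # stand-in for read_token_list(token_list_file)

encoder = TextEncoder()
encoder.add_unk()
for token in hparams["special_tokens"]:
    token_key = token.replace("<", "").replace(">", "")
    encoder.insert_label(token, hparams[f"{token_key}_index"])
encoder.update_from_iterable(tokens, sequence_input=False)
encoder.expect_len(len(tokens) + hparams["special_num_tokens"])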
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 2048 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + 
text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..b922c7489 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,229 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/wavtokenizer +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
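The model_vocab_size and audio_token_shift expressions in the Mimi configuration above have lost the contents of their !ref placeholders in this copy; read together with the surrounding keys, they most plausibly pack the text tokens, the special tokens, and one vocab_size-sized region per codebook into a single language-model vocabulary. Under that assumption, for text input the shift would be text_num_tokens + special_num_tokens = 39 + 4 = 43, and the total vocabulary would be 39 + (2048 × 8) + 4 = 16427 entries; the wavtokenizer configuration that follows uses the same pattern with vocab_size 1024 and a single codebook.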
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml index 155960c27..9d6ba7130 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -42,7 +42,7 @@ save_embedding: False tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml index f9720b170..7871d6212 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml @@ -47,7 +47,7 @@ save_embedding: False tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref - save_path: !ref + save_path: !ref num_codebooks: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 9c8baf3bf..3090e9f79 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -45,7 +45,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml index 976614a3d..9a8b754eb 100644 --- 
a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -47,7 +47,7 @@ vocab_size: 4096 # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 258065779..4a2a7b033 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -14,6 +14,7 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref representation_mode: discrete diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..3c06d761f --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,280 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +cached_data_folder: !PLACEHOLDER +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 
+ vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref From 5acd1d3528bd850d607804a0e144545ab00f57e3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Feb 2025 11:49:20 -0500 Subject: [PATCH 151/270] VALL-E: Add files for multiple configurations --- .../TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 239 ++++++++++++++++++ .../TTS/valle/hparams/train_discrete_ssl.yaml | 11 +- .../TTS/valle/hparams/train_encodec.yaml | 11 +- .../TTS/valle/hparams/train_mimi.yaml | 234 +++++++++++++++++ .../valle/hparams/train_speech_tokenizer.yaml | 234 +++++++++++++++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 239 ++++++++++++++++++ 8 files changed, 963 insertions(+), 9 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index e80edb2b0..b99ac7980 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -150,7 +150,7 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU audio_num_tokens: 2048 -audio_emb_size: 1024 +audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index 3c06d761f..4f0772f47 100644 --- 
a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -112,7 +112,7 @@ beam_size: 5 # Feature parameters sample_rate: 22050 -model_sample_rate: 16000 +model_sample_rate: 24000 max_audio_length: 5000 infer_max_audio_length: 1000 debug_infer_max_audio_length: 10 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml new file mode 100644 index 000000000..5cb2f4050 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -0,0 +1,239 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/dac + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
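The model_sample_rate fix for the Mimi Tokotron configuration above (16000 → 24000) matches Mimi's native 24 kHz rate, while the configured sample_rate stays at 22050 in the same hunk, implying a resampling step around tokenization and vocoding. A hedged sketch of that step using torchaudio directly; the actual recipes may rely on SpeechBrain's own resampling utilities instead.

import torchaudio


def to_model_rate(sig, orig_freq=22050, new_freq=24000):
    # sig: [time] or [batch, time] waveform at the configured dataset rate
    return torchaudio.functional.resample(sig, orig_freq=orig_freq, new_freq=new_freq)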
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 60 +epoch_size: 10000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 2 + +# Model Settings +model_type: 24khz +model_bitrate: 8kbps + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: 
!name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index f8af5fb45..08d53d66b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -10,7 +10,6 @@ __set_seed: !apply:torch.manual_seed [!ref ] run_name: !PLACEHOLDER # Model Type -ssl_model_type: wavlm representation_mode: discrete output_folder: !ref results/tokotron/// save_folder: !ref /save @@ -45,6 +44,9 @@ tokens_folder: !PLACEHOLDER tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref freeze_token_model: True + +# Model Settings +ssl_model_type: wavlm token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -63,7 +65,8 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS speech_model_layers: [1, 3, 7, 12, 18, 23] flip_layers: False -token_offset: 1 + +# Speaker Embeddings spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] @@ -71,7 +74,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 50 +number_of_epochs: 1200 +epoch_size: 10000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 @@ -147,6 +151,7 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref + looped_nominal_epoch: !ref // collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 6fc0f4b58..3371621d0 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -45,7 +45,6 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
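The looped_nominal_epoch entry added to the train dataloader options above (its !ref expression is stripped in this copy, but it most plausibly reads epoch_size // batch_size) makes SpeechBrain treat a fixed number of batches as one epoch instead of a full pass over the data: with the epoch_size of 10000 and batch_size of 16 introduced here, that is 10000 // 16 = 625 batches, roughly 10000 utterances per nominal epoch, which keeps the checkpointing and LR-scheduling cadence stable when the epoch count or dataset size changes.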
tokens_loader: !new:utils.tokens.TokensLoader @@ -56,7 +55,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 150 +number_of_epochs: 60 +epoch_size: 10000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 @@ -109,6 +109,7 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref + looped_nominal_epoch: !ref // collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref @@ -168,8 +169,10 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 + +# Model Settings +model_hub: facebook/encodec_24khz bandwidth: 6 -attention_type: regularMHA ############################## models ################################ @@ -193,7 +196,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + source: !ref save_path: !ref sample_rate: !ref bandwidth: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..d41dd3b98 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,234 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 60 +epoch_size: 10000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 + +# Model Settings +model_hub: kyutai/mimi + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: 
!new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml new file mode 100644 index 000000000..4928b2dd3 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -0,0 +1,234 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 60 +epoch_size: 10000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 + +# Model Settings +model_hub: fnlp/SpeechTokenizer + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: !ref # Only the 24kHz version supports mono audio + save_path: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + 
+log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..1796c4425 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,239 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 60 +epoch_size: 10000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 + +# Model Settings +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + 
tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref From a78f011ff4a59700d9ecb37c9f11879e30bb1409 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Feb 2025 19:00:33 -0500 Subject: [PATCH 152/270] DASB: Add Lifeteng-style curriculum, some config updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 + .../TTS/valle/hparams/train_encodec.yaml | 2 + .../TTS/valle/hparams/train_mimi.yaml | 2 + .../TTS/valle/hparams/train_wavtokenizer.yaml | 2 + benchmarks/DASB/LJSpeech/TTS/valle/train.py | 147 +++++++++++++----- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 8 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 8 +- .../TTS/valle/hparams/train_encodec.yaml | 8 +- .../TTS/valle/hparams/train_mimi.yaml | 8 +- .../valle/hparams/train_speech_tokenizer.yaml | 8 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 8 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 141 ++++++++++++----- benchmarks/DASB/model/valle.py | 44 ++++-- 13 files changed, 273 insertions(+), 115 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 2f6f34297..3e34a6bce 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -70,6 +70,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 3a9a1347d..0255373ad 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -55,6 +55,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index e6d0ad87c..aeb97d1c3 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -56,6 +56,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index 
b922c7489..1867c0c1c 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -58,6 +58,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index d823530a9..02a76476b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -1,12 +1,10 @@ #!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio +"""Recipe for training VALL-E -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model +Based on ESPNET VALL-E +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e Authors * Artem Ploujnikov 2024 @@ -98,14 +96,19 @@ def compute_forward(self, batch, stage): batch = batch.to(self.device) prompt, prompt_length = batch.prompt batch_size, prompt_max_len, num_tracks = prompt.shape - nar_track = torch.randint( - 1, num_tracks, (batch_size,), device=self.device - ) + if self.train_nar: + nar_track = torch.randint( + 1, num_tracks, (batch_size,), device=self.device + ) + else: + nar_track = None logits_ar, logits_nar = self.modules.model( dec_seq=batch.prompt.data, dec_seq_lengths=batch.prompt.lengths, prefix_len=batch.prefix_length / prompt_max_len, nar_level_idx=nar_track, + predict_ar=self.train_ar, + predict_nar=self.train_nar, ) return logits_ar, logits_nar, nar_track @@ -134,13 +137,8 @@ def compute_objectives(self, predictions, batch, stage): prompt, prompt_length = batch.prompt prefix_length = batch.prefix_length - logits_ar_sm = self.hparams.log_softmax(logits_ar) - logits_nar_sm = self.hparams.log_softmax(logits_nar) - batch_size, max_len, _ = prompt.shape - targets_ar = prompt[:, 1:, 0] + batch_size, prompt_max_len, _ = prompt.shape batch_idx = torch.arange(batch_size, device=prompt.device) - targets_nar = prompt[batch_idx, 1:, nar_track] - prompt_max_len = prompt.size(1) length_mask = length_to_mask( prompt_length * prompt_max_len, prompt_max_len ) @@ -149,29 +147,81 @@ def compute_objectives(self, predictions, batch, stage): ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] - loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask - ) - self.loss_metric_ar.append( - ids=batch.uttid, - log_probabilities=logits_ar_sm, - targets=targets_ar, - mask=mask, - reduction="batch", - ) - loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, - ) - self.loss_metric_nar.append( + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + targets_ar = prompt[:, 1:, 0] + loss_ar = self.hparams.compute_cost( + log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + 
logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( ids=batch.uttid, - log_probabilities=logits_nar_sm, - targets=targets_nar, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, mask=mask, reduction="batch", ) - loss = 0.5 * (loss_ar + loss_nar) + + loss = torch.mean(torch.stack(loss_components)) return loss + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch" + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + log_probabilities=logits_ar, targets=targets_ar, mask=mask, + reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + log_probabilities=logits_nar, targets=targets_nar, mask=mask, + reduction=reduction, + ) + return stats + def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch. @@ -188,16 +238,10 @@ def on_stage_start(self, stage, epoch): )[None, None, :].to(self.device) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( - metric=self.hparams.compute_cost, batch_eval=True, - ) - self.loss_metric_ar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, batch_eval=True, - ) - self.loss_metric_nar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, batch_eval=True, + metric=self.compute_loss_stats, batch_eval=True, ) + self.apply_curriculum() - # TOOO: Reestablish evaluation self.is_evaluating = False if stage == sb.Stage.VALID: if self.is_eval_epoch(epoch): @@ -209,6 +253,22 @@ def on_stage_start(self, stage, epoch): self.evaluation_metric.on_evaluation_start() self.is_evaluating = True + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + if self.hparams.audio_tokens_per_step == 1: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: + self.train_nar = False + elif ( + self.hparams.number_of_epochs_nar is not None + and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + ): + self.train_ar = False + def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed in the specieied epoch @@ -226,7 +286,12 @@ def is_eval_epoch(self, epoch): otherwise""" if epoch is None: epoch = self.hparams.epoch_counter.current - return epoch % self.hparams.eval_interval == 0 + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) def on_fit_start(self): """Gets called at the beginning of ``fit()``, on multiple processes diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 5cb2f4050..c60540da2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -56,12 +56,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 08d53d66b..d3e30a4f4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -74,12 +74,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 1200 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 0.01 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 3371621d0..34c5e6cb2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -55,12 +55,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index d41dd3b98..c585acfba 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -55,12 +55,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 4928b2dd3..fd0e3daaf 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -55,12 +55,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git 
a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 1796c4425..e59ccd34f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -55,12 +55,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 7fa415230..47e0a25e4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1,12 +1,10 @@ #!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio +"""Recipe for training VALL-E -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model +Based on ESPNET VALL-E +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e Authors * Artem Ploujnikov 2024 @@ -108,6 +106,8 @@ def compute_forward(self, batch, stage): dec_seq_lengths=batch.prompt.lengths, prefix_len=batch.prefix_length / prompt_max_len, nar_level_idx=nar_track, + predict_ar=self.train_ar, + predict_nar=self.train_nar, ) return logits_ar, logits_nar, nar_track @@ -136,13 +136,8 @@ def compute_objectives(self, predictions, batch, stage): prompt, prompt_length = batch.prompt prefix_length = batch.prefix_length - logits_ar_sm = self.hparams.log_softmax(logits_ar) - logits_nar_sm = self.hparams.log_softmax(logits_nar) - batch_size, max_len, _ = prompt.shape - targets_ar = prompt[:, 1:, 0] + batch_size, prompt_max_len, _ = prompt.shape batch_idx = torch.arange(batch_size, device=prompt.device) - targets_nar = prompt[batch_idx, 1:, nar_track] - prompt_max_len = prompt.size(1) length_mask = length_to_mask( prompt_length * prompt_max_len, prompt_max_len ) @@ -151,28 +146,80 @@ def compute_objectives(self, predictions, batch, stage): ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] - loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask - ) - self.loss_metric_ar.append( - ids=batch.uttid, - log_probabilities=logits_ar_sm, - targets=targets_ar, - mask=mask, - reduction="batch", - ) - loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, - ) - self.loss_metric_nar.append( + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + targets_ar = prompt[:, 1:, 0] + loss_ar = self.hparams.compute_cost( + log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( ids=batch.uttid, - 
log_probabilities=logits_nar_sm, - targets=targets_nar, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, mask=mask, reduction="batch", ) - loss = 0.5 * (loss_ar + loss_nar) + + loss = torch.mean(torch.stack(loss_components)) return loss + + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch" + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + log_probabilities=logits_ar, targets=targets_ar, mask=mask, + reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + log_probabilities=logits_nar, targets=targets_nar, mask=mask, + reduction=reduction, + ) + return stats def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch. @@ -190,16 +237,10 @@ def on_stage_start(self, stage, epoch): )[None, None, :].to(self.device) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( - metric=self.hparams.compute_cost, batch_eval=True, - ) - self.loss_metric_ar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, batch_eval=True, - ) - self.loss_metric_nar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, batch_eval=True, + metric=self.compute_loss_stats, batch_eval=True, ) + self.apply_curriculum() - # TOOO: Reestablish evaluation self.is_evaluating = False if stage == sb.Stage.VALID: if self.is_eval_epoch(epoch): @@ -211,6 +252,22 @@ def on_stage_start(self, stage, epoch): self.evaluation_metric.on_evaluation_start() self.is_evaluating = True + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + if self.hparams.audio_tokens_per_step == 1: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: + self.train_nar = False + elif ( + self.hparams.number_of_epochs_nar is not None + and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + ): + self.train_ar = False + def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed in the specieied epoch @@ -228,7 +285,12 @@ def is_eval_epoch(self, epoch): otherwise""" if epoch is None: epoch = self.hparams.epoch_counter.current - return epoch % self.hparams.eval_interval == 0 + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) def on_fit_start(self): """Gets called at the beginning of ``fit()``, on multiple processes @@ -379,8 +441,7 @@ def inference(self, batch): ) for prefix_item in prefix_items ] - inferred_tokens = [ - result[0][0] + inferred_tokens = [ result[0][0] if 
result[0] else torch.zeros( 1000, self.hparams.audio_tokens_per_step, device=self.device diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index ab233efff..5805cb061 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -125,6 +125,8 @@ def forward( prefix_len=None, conti_feats=None, nar_level_idx=1, + predict_ar=True, + predict_nar=True, ): """Vall-E forward for training @@ -144,6 +146,10 @@ def forward( Lengths of condition part in dec_seq (B,). nar_level_idx : int the index of the non-autoregressive level to train + predict_ar : bool + Whether to make an autoregressive prediction + predict_nar : bool + Whether to make a non-autoregressive prediction Returns ------- @@ -161,24 +167,30 @@ def forward( ) # Auto-Regressive part - input_ar_emb = self.prepare_input(dec_seq_emb, prefix_len, 1)[ - :, :-1 - ] # [B, T, D] - h_ar = self.ar_decoder(input_ar_emb) + if predict_ar: + input_ar_emb = self.prepare_input(dec_seq_emb, prefix_len, 1)[ + :, :-1 + ] # [B, T, D] + h_ar = self.ar_decoder(input_ar_emb) # Non-Auto-Regressive part - input_nar_emb = self.prepare_input( - dec_seq_emb, prefix_len, nar_level_idx - )[ - :, 1: - ] # [B, T, V] - max_len = dec_seq.size(1) - mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() - mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] - h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) - - logits_ar = self.lm_head(h_ar) - logits_nar = self.lm_head(h_nar) + if predict_nar: + input_nar_emb = self.prepare_input( + dec_seq_emb, prefix_len, nar_level_idx + )[ + :, 1: + ] # [B, T, V] + max_len = dec_seq.size(1) + mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) + + # Logits + logits_ar, logits_nar = None, None + if predict_ar: + logits_ar = self.lm_head(h_ar) + if predict_nar: + logits_nar = self.lm_head(h_nar) return logits_ar, logits_nar From 953540baa3c6f7dba1a401a75c972306e5ea6dd4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 13 Feb 2025 02:03:19 -0500 Subject: [PATCH 153/270] DASB: Add init_from --- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index b48bb66fa..937db0812 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -38,6 +38,7 @@ sample_path: null progress_folder: !ref /progress progress_current: !ref /current progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 From f8b9a6721d1f18a974708e6456cfaf50bd9d1a45 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 13 Feb 2025 02:13:32 -0500 Subject: [PATCH 154/270] DASB: Add init_from --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index c585acfba..c6d1a4dfb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -42,6 +42,7 @@ sample_path: null progress_folder: !ref /progress progress_current: !ref /current progress_meta: !ref 
/meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 From 4f8cc9c96ab284b18f835e405545e13ccf251d43 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 13 Feb 2025 13:45:59 -0500 Subject: [PATCH 155/270] DASB: VALL-E: Implement checkpoint retention based on dWER --- .../tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 1 - .../TTS/valle/hparams/train_discrete_ssl.yaml | 10 +++++-- .../TTS/valle/hparams/train_encodec.yaml | 7 ++++- .../TTS/valle/hparams/train_mimi.yaml | 7 ++++- .../TTS/valle/hparams/train_wavtokenizer.yaml | 7 ++++- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 15 ++++++++++- .../TTS/tokotron/hparams/train_encodec.yaml | 1 + .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 6 +++++ .../TTS/valle/hparams/train_discrete_ssl.yaml | 8 +++++- .../TTS/valle/hparams/train_encodec.yaml | 6 +++++ .../TTS/valle/hparams/train_mimi.yaml | 6 +++++ .../valle/hparams/train_speech_tokenizer.yaml | 6 +++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 6 +++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 26 ++++++++++++++++--- 15 files changed, 102 insertions(+), 12 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index b92a76255..afdac42b7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -12,7 +12,7 @@ run_name: !PLACEHOLDER # Model Type ssl_model_type: wavlm representation_mode: discrete -output_folder: !ref results/tokotron/// +output_folder: !ref results/// save_folder: !ref /save train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 21dee91e3..d101e1d85 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -50,7 +50,6 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 3e34a6bce..1fd2aa3aa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -11,7 +11,7 @@ run_name: !PLACEHOLDER # Model Type ssl_model_type: wavlm -output_folder: !ref results/tokotron/// +output_folder: !ref results/// save_folder: !ref /save train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. 
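For readers following the AR/NAR curriculum introduced earlier in this series (the number_of_epochs_ar / number_of_epochs_nar hparams and the apply_curriculum method), the schedule resolves to three phases: an optional AR-only warm-up, an optional NAR-only phase, and joint training afterwards. The standalone sketch below mirrors the apply_curriculum logic added above; the function and variable names are illustrative only and are not part of the recipe.

def curriculum(epoch, epochs_ar, epochs_nar, audio_tokens_per_step=8):
    # Mirrors apply_curriculum(): decide which parts to train in this epoch
    train_ar, train_nar = True, True
    if audio_tokens_per_step == 1:
        train_nar = False  # a single track is purely autoregressive
    elif epochs_ar is not None and epoch <= epochs_ar:
        train_nar = False  # AR-only warm-up phase
    elif epochs_nar is not None and epoch <= (epochs_ar + epochs_nar):
        train_ar = False   # NAR-only phase
    return train_ar, train_nar

# With epochs_ar=10 and epochs_nar=10:
#   epochs 1-10 -> AR only, epochs 11-20 -> NAR only, epoch 21+ -> both jointly
assert curriculum(5, 10, 10) == (True, False)
assert curriculum(15, 10, 10) == (False, True)
assert curriculum(25, 10, 10) == (True, True)

Note that the NAR-only branch assumes number_of_epochs_ar is also set; leaving both values at null (the default added to the hparams files) keeps joint AR+NAR training for every epoch.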
@@ -59,14 +59,20 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref -flip_layers: True +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min + # Training parameters input: text number_of_epochs: 50 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 0255373ad..fd1fea7cc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -44,12 +44,17 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref -flip_layers: True +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index aeb97d1c3..eb1605f4e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -45,12 +45,17 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref -flip_layers: True +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index 1867c0c1c..dd9d60798 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -47,12 +47,17 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref -flip_layers: True +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 02a76476b..73af26870 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -391,6 +391,8 @@ def on_stage_end(self, stage, stage_loss, epoch): for key, value in self.hparams.eval_summary_log.items() } stage_stats.update(eval_summary_stats) + else: + eval_summary_stats = {} # Perform end-of-iteration things, like annealing, logging, etc. 
if stage == sb.Stage.VALID: @@ -409,8 +411,13 @@ def on_stage_end(self, stage, stage_loss, epoch): ) # Save the current checkpoint and delete previous checkpoints. + ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs ) def inference(self, batch): @@ -931,7 +938,13 @@ def undo_padding_tensor(batch, lengths): if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs ) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 4a2a7b033..e85d37ff8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -39,6 +39,7 @@ sample_path: null progress_folder: !ref /progress progress_current: !ref /current progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index c60540da2..588fd6b55 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -52,6 +52,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index d3e30a4f4..161c911e6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -11,7 +11,7 @@ run_name: !PLACEHOLDER # Model Type representation_mode: discrete -output_folder: !ref results/tokotron/// +output_folder: !ref results/// save_folder: !ref /save train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. 
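The ckpt_key / ckpt_key_kind / test_key / test_key_kind hparams added in this commit are consumed generically: the "kind" ("min" or "max") is turned into the matching keyword-argument name for SpeechBrain's checkpoint retention and for evaluate(), as in the train.py hunks above. A minimal illustration of that mapping (plain Python; the dictionary below is a placeholder standing in for the loaded hparams, not recipe code):

hparams = {
    "ckpt_key": "dwer", "ckpt_key_kind": "min", "ckpt_keep": 2,
    "test_key": "dwer", "test_key_kind": "min",
}

# Validation: keep the ckpt_keep checkpoints with the lowest dWER
ckpt_kwargs = {f"{hparams['ckpt_key_kind']}_keys": [hparams["ckpt_key"]]}
assert ckpt_kwargs == {"min_keys": ["dwer"]}
# passed as checkpointer.save_and_keep_only(meta=..., num_to_keep=hparams["ckpt_keep"], **ckpt_kwargs)

# Test: reload the single best checkpoint under the same criterion
eval_kwargs = {f"{hparams['test_key_kind']}_key": hparams["test_key"]}
assert eval_kwargs == {"min_key": "dwer"}
# passed as brain.evaluate(test_set=..., **eval_kwargs)

The retention key only has an effect if the evaluation step writes it into the checkpoint meta (as done through eval_summary_stats above); a key that never appears in meta gives the min/max ranking nothing to compare.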
@@ -70,6 +70,12 @@ flip_layers: False spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 34c5e6cb2..ad79fb3de 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -51,6 +51,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index c6d1a4dfb..2b3b5d5b3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -52,6 +52,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index fd0e3daaf..eae14615f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -51,6 +51,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index e59ccd34f..4120d37f5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -51,6 +51,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 47e0a25e4..bde1deb8a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -402,14 +402,24 @@ def on_stage_end(self, stage, stage_loss, epoch): # The train_logger writes a summary to stdout and to the logfile. self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr": lr}, + stats_meta={"epoch": epoch, "lr": lr, **eval_summary_stats}, train_stats=self.train_stats, valid_stats=stage_stats, ) + ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } # Save the current checkpoint and delete previous checkpoints. 
self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + meta={"loss": stage_stats["loss"]}, + **ckpt_kwargs + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, ) def inference(self, batch): @@ -1047,8 +1057,18 @@ def undo_padding_tensor(batch, lengths): ) # Load best checkpoint for evaluation - if hparams["testing"]: + + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs ) From 856df20804b0f95956df38faa51adefe28777ebd Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 13 Feb 2025 18:24:31 -0500 Subject: [PATCH 156/270] DASB: ESPNet Encodec support --- .../LJSpeech/extraction/hparams/encodec.yaml | 3 +- .../extraction/hparams/espnet_encodec.yaml | 66 ++++ .../LJSpeech/extraction/hparams/mimi.yaml | 3 +- .../extraction/hparams/speech_tokenizer.yaml | 1 + .../LJSpeech/extraction/hparams/sqcodec.yaml | 3 +- .../extraction/hparams/wavtokenizer.yaml | 3 +- .../hparams/train_espnet_encodec.yaml | 293 ++++++++++++++++++ .../DASB/LibriTTS/extraction/extract.py | 2 - .../extraction/hparams/espnet_encodec.yaml | 66 ++++ benchmarks/DASB/utils/tokenizer_interface.py | 116 +++++++ 10 files changed, 550 insertions(+), 6 deletions(-) create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml index 6de95de73..869d1c503 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/encodec save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -47,7 +48,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..c03ffa936 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,66 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref 
/save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml index 482f3739f..c534bef0f 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/mimi save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -44,7 +45,7 @@ save_embedding: False tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref - save_path: !ref + save_path: !ref num_codebooks: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml index 9d6ba7130..d036e05a3 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/speech_tokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml index 378315bcf..28c7c9be9 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/sqcodec save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -42,7 +43,7 @@ save_path: /home/ubuntu/sq-codec/SQ-Codec # SQCodec model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml index 3a0a935ff..a23c29e59 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml +++ 
b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/wavtokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -44,7 +45,7 @@ vocab_size: 4096 # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref freeze: True diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..e85d37ff8 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml @@ -0,0 +1,293 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 
+bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + +modules: + model: !ref + compute_cost: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py index a3db84984..00799eeb4 100644 --- a/benchmarks/DASB/LibriTTS/extraction/extract.py +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -17,8 +17,6 @@ base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) sys.path.append(base_dir) -print(base_dir) - logger = logging.getLogger(__name__) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..a6630188f --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,66 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json 
+valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index 0ab019b58..1ba9bc21a 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -10,7 +10,9 @@ import sys import os import torch +import re from abc import ABC, abstractmethod +from pathlib import Path from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( DiscreteSSL, @@ -19,6 +21,14 @@ from speechbrain.lobes.models.discrete.speechtokenizer import SpeechTokenizer from speechbrain.lobes.models.discrete.wavtokenizer import WavTokenizer from speechbrain.lobes.models.huggingface_transformers.mimi import Mimi +from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.fetching import fetch +from torch import nn +import logging +import shlex +import yaml + +logger = logging.getLogger(__name__) base_dir = os.path.abspath( os.path.join(os.path.dirname(__file__), "..") @@ -513,3 +523,109 @@ def get_pretrained_embeddings( raise ValueError( "SQCodec does not have any trainable quantizer or embedding since it uses scalar quantization." 
) + + +DEFAULT_ESPNET_REPO = "https://github.com/espnet/espnet" + + +class ESPNetEncodecInterface(BaseTokenizer, nn.Module): + """An interface for pretrained ESPNet Encodec implementations""" + + def __init__( + self, + source, + model_ckpt, + model_config, + save_path, + sample_rate=24000, + n_codebook=32, + espnet_repo=DEFAULT_ESPNET_REPO, + espnet_commit=None, + ): + super().__init__() + self.source = source + self.model_ckpt = model_ckpt + self.model_config = model_config + self.save_path = Path(save_path) + self.sample_rate = sample_rate + self.n_codebook = n_codebook + self.espnet_repo = espnet_repo + self.espnet_commit = espnet_commit + self._load() + + def _load(self): + self._load_espnet() + ckpt_file_name = fetch( + filename=self.model_ckpt, + source=self.source, + savedir=str(self.save_path), + save_filename=str(Path(self.model_ckpt).name) + ) + config_file_name = fetch( + filename=self.model_config, + source=self.source, + savedir=str(self.save_path), + save_filename="config.yaml" + ) + with open(config_file_name) as config_file: + config = yaml.safe_load(config_file) + from espnet2.gan_codec.encodec.encodec import Encodec as ESPNetEncodec + self.encodec = ESPNetEncodec(**config["codec_conf"]) + device = next(iter(self.encodec.parameters())).device + state_dict = torch.load(ckpt_file_name, map_location=device) + state_dict = { + re.sub("^codec.", "", key): value + for key, value in state_dict.items() + } + self.encodec.load_state_dict(state_dict) + + def _load_espnet(self): + try: + import espnet2 + except ModuleNotFoundError: + self._download_espnet() + + def _download_espnet(self): + logger.info("espnet is not installed, installing") + espnet_path = self.save_path / "espnet" + if not espnet_path.exists(): + logger.info("Cloining %s into %s", self.espnet_repo, espnet_path) + cmd = shlex.join(["git", "clone", self.espnet_repo, str(espnet_path)]) + run_shell(cmd) + else: + logger.info("%s already exists", espnet_path) + if self.espnet_commit: + logger.info("Checking out %s", self.espnet_commit) + cmd = shlex.join(["git", "-C", str(espnet_path), "checkout", self.espnet_commit]) + run_shell(cmd) + logger.info("Installing") + cmd = shlex.join(["pip", "install", "-e", str(espnet_path)]) + run_shell(cmd) + logger.info("Installation completed") + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.encodec.eval() + if signal.dim() < 3: + signal = signal.unsqueeze(1) + tokens = self.encodec.encode(signal) + return tokens.permute(1, 2, 0)[:, :, :self.n_codebook] + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.encodec.eval() + tokens = tokens.permute(2, 0, 1) + signal = self.encodec.decode(tokens, **kwargs) + return signal.squeeze(1) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + """ + This method is not implemented for ESPNet Encodec, as it uses scalar quantization + and does not have any trainable quantizer or embedding. + """ + raise ValueError( + "ESPNet Encodec does not have any trainable quantizer or embedding since it uses scalar quantization." 
+ ) \ No newline at end of file From be174df994a2cede4dea7230ce37b6a9228d60d6 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Feb 2025 00:20:32 -0500 Subject: [PATCH 157/270] DASB: Inference mode, remove an unused evaluator --- benchmarks/DASB/utils/eval.py | 111 ++-------------------------------- 1 file changed, 6 insertions(+), 105 deletions(-) diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 9d5e8642f..76f2a6c2f 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -7,7 +7,6 @@ """ from speechbrain.inference.interfaces import Pretrained -from speechbrain.inference.ASR import EncoderDecoderASR from speechbrain.lobes.models.huggingface_transformers import Whisper from speechbrain.lobes.models.huggingface_transformers.wav2vec2 import Wav2Vec2 from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset @@ -351,105 +350,6 @@ def _replace_blanks(self, preds): return [" " if item == "" else item for item in preds] -class EncoderDecoderASRSpeechEvaluator(ASRSpeechEvaluator): - """A speech evaluator implementation based on ASR. - Computes the Word Error Rate (WER), Character Error Rate (CER) - and a few other metrics - - Arguments - --------- - sample_rate : int - The audio sample rate this evaluator expects - """ - - def __init__(self, source, sample_rate=None, *args, **kwargs): - super().__init__(sample_rate=sample_rate) - self.asr = EncoderDecoderASR.from_hparams(source, *args, **kwargs) - self.device = next(self.asr.mods.parameters()).device - - def evaluate_samples(self, wavs, length, text, sample_rate): - wavs = self.resample(wavs, sample_rate) - if text is None: - raise ValueError("This evaluator requires ground-truth text") - predicted_words, scores, log_probs = self.transcribe_batch_with_details( - wavs, length - ) - ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = init_asr_metrics() - wer_metric.append(ids, predicted_words, text) - cer_metric.append(ids, predicted_words, text) - wer = torch.tensor( - [score["WER"] for score in wer_metric.scores], device=wavs.device - ) - cer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=wavs.device - ) - prob_mean = log_probs.exp().mean(dim=-1) - return { - "wer": wer, - "cer": cer, - "beam_score": scores, - "prob_mean": prob_mean, - "pred": predicted_words, - "target": text, - } - - def transcribe_batch_with_details(self, wavs, wav_lens): - """Transcribes the input audio into a sequence of words - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. 
- - Arguments - --------- - predicted_words : list - The raw ASR predictions, fully decoded - best_scores : list - The best scores (from beam search) - best_log_probs : list - The best predicted log-probabilities (from beam search) - - - Returns - ------- - predicted_words : list - The predictions - - best_scores : torch.Tensor - The best scores (from beam search) - - best_log_probs : torch.Tensor - The best log-probabilities - - """ - with torch.no_grad(): - wav_lens = wav_lens.to(self.device) - encoder_out = self.asr.encode_batch(wavs, wav_lens) - ( - hyps, - best_lens, - best_scores, - best_log_probs, - ) = self.asr.mods.decoder(encoder_out, wav_lens) - predicted_words = [ - self.asr.tokenizer.decode_ids(token_seq) for token_seq in hyps - ] - return predicted_words, best_scores, best_log_probs - - def to(self, device): - """Transfers this module to the spcieifed device - - Arguments - --------- - device : str | torch.Device - the target device - """ - self.asr = self.asr.to(device) - return self - - class WhisperASRSpeechEvaluator(ASRSpeechEvaluator): """A speech evaluator implementation based on Whisper ASR @@ -995,11 +895,12 @@ def evaluate( length_cat_abs.int() ).long() # 0 for masked tokens # Forward - embs = self.model( - input_values=audio, - attention_mask=attention_mask, - output_attentions=False, - ).embeddings + with torch.inference_mode(): + embs = self.model( + input_values=audio, + attention_mask=attention_mask, + output_attentions=False, + ).embeddings hyp_embs, ref_embs = embs.split([len(wavs), len(wavs_ref)]) scores = torch.nn.functional.cosine_similarity( hyp_embs, ref_embs, dim=-1 From 750f3a4ff6c8b161f39a777ba492811f3356ee84 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Feb 2025 01:15:13 -0500 Subject: [PATCH 158/270] DASB: Add customization for the validation batch size --- benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml | 3 ++- .../DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 3 ++- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 3 ++- .../LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml | 3 ++- benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml | 3 ++- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 3 ++- .../DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml | 3 ++- 7 files changed, 14 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 1a55d1c02..01c818370 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -56,6 +56,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -143,7 +144,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index efcde8c58..61dabef41 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -95,6 +95,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref 
batch_size_guided: 2 extract_features_batch_size: 32 grad_accumulation_factor: 1 @@ -202,7 +203,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index e85d37ff8..e45794171 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -69,6 +69,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -159,7 +160,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml index e85d37ff8..e45794171 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml @@ -69,6 +69,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -159,7 +160,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index 4f0772f47..156e05b02 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -68,6 +68,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -153,7 +154,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 937db0812..ffb68f2a5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -60,6 +60,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -146,7 +147,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml index 3c06d761f..f4f745716 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ 
b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -68,6 +68,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -153,7 +154,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: From 55fc383319d819c1ca137dd222f7a3c322b0d8ec Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Feb 2025 09:25:51 -0500 Subject: [PATCH 159/270] DASB: VALL-E: Add ESPNET Encodec --- .../valle/hparams/train_espnet_encodec.yaml | 238 +++++++++++++++++ .../valle/hparams/train_espnet_encodec.yaml | 251 ++++++++++++++++++ 2 files changed, 489 insertions(+) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..ad486d493 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,238 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/espnet-encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
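The valid_batch_size keys introduced above default to the training batch size through a HyperPyYAML !ref, so existing configs keep their behaviour while the validation dataloader can now be shrunk independently when evaluation is more memory-hungry than training. A small sketch of how such a reference resolves and how it can be overridden; the key names follow the hparams files above, the reference target (batch_size) is the presumed one, and hyperpyyaml is the package SpeechBrain already depends on:

    from hyperpyyaml import load_hyperpyyaml

    yaml_string = """
    batch_size: 16
    valid_batch_size: !ref <batch_size>  # defaults to the training batch size
    valid_dataloader_opts:
        batch_size: !ref <valid_batch_size>
    """

    hparams = load_hyperpyyaml(yaml_string)
    print(hparams["valid_dataloader_opts"]["batch_size"])  # 16

    # Roughly what a `--valid_batch_size 4` command-line override does:
    hparams = load_hyperpyyaml(yaml_string, overrides={"valid_batch_size": 4})
    print(hparams["valid_dataloader_opts"]["batch_size"])  # 4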
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_top_k: 20 +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: 
!ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..112b526c4 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,251 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
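Several keys in these VALL-E hparams files (token_list_file, input_num_tokens, model_vocab_size, audio_token_shift) are switched on the single input: text|phonemes setting through !apply:speechbrain.utils.hparams.choice, which effectively picks an entry from a mapping at load time. A short sketch of that selection; only the value=/choices= arguments and the token-list values are taken from the YAML above, the rest is illustrative:

    from speechbrain.utils.hparams import choice

    input_kind = "phonemes"  # what the YAML `input` key would hold

    token_list_file = choice(
        value=input_kind,
        choices={"text": "char_en.txt", "phonemes": "arpabet.txt"},
    )
    input_num_tokens = choice(value=input_kind, choices={"text": 39, "phonemes": 52})

    print(token_list_file, input_num_tokens)  # arpabet.txt 52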
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: 
!ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 + +# Model Settings +model_hub: facebook/encodec_24khz +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref From 07302546e5fcbd05d41dc3a88bc615829f2438b6 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Feb 2025 15:48:05 -0500 Subject: [PATCH 160/270] DASB: Add the ability to skip resampling --- .../DASB/LibriTTS/extraction/extract.py | 3 ++- .../DASB/LibriTTS/extraction/hparams/dac.yaml | 1 + .../extraction/hparams/discrete_ssl.yaml | 2 ++ .../LibriTTS/extraction/hparams/encodec.yaml | 1 + .../extraction/hparams/espnet_encodec.yaml | 1 + .../LibriTTS/extraction/hparams/mimi.yaml | 1 + .../extraction/hparams/speech_tokenizer.yaml | 1 + .../LibriTTS/extraction/hparams/sqcodec.yaml | 1 + .../extraction/hparams/wavtokenizer.yaml | 1 + benchmarks/DASB/LibriTTS/libritts_prepare.py | 22 ++++++++++++------- 10 files changed, 25 insertions(+), 9 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py index 00799eeb4..328fbe868 100644 --- a/benchmarks/DASB/LibriTTS/extraction/extract.py +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -49,7 +49,8 @@ "save_json_test": hparams["test_json"], "sample_rate": hparams["sample_rate"], "skip_prep": hparams["skip_prep"], - "max_valid_size": None + "max_valid_size": None, + "skip_resample": hparams["skip_resample"], }, ) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml index 76870e279..836503717 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml +++ 
b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml @@ -48,6 +48,7 @@ sample_rate: 24000 # Feature parameters encoder_dim: 1024 save_embedding: False +skip_resample: False tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml index 2b57a7edf..6ae14c87c 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml @@ -52,6 +52,8 @@ freeze_ssl: True freeze_feature_extractor: True vocab_size: 1000 save_embedding: False +skip_resample: False + ### Config for Tokenizer # Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml index b7ae76969..188b38a6d 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -44,6 +44,7 @@ num_codebooks: 32 vocab_size: 1024 sample_rate: 24000 save_embedding: False +skip_resample: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml index a6630188f..a0542b189 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml @@ -48,6 +48,7 @@ num_codebooks: 32 vocab_size: 1024 sample_rate: 24000 save_embedding: False +skip_resample: False tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface source: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml index 9e64347c7..acddcd93b 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml @@ -42,6 +42,7 @@ sample_rate: 24000 encoder_dim: 1024 freeze_embedding: False save_embedding: False +skip_resample: False tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 85148db9d..2b96a749b 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -39,6 +39,7 @@ sample_rate: 16000 encoder_dim: 1024 freeze_embedding: False save_embedding: False +skip_resample: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml index cf46b3f5a..68dc9df49 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml @@ -40,6 +40,7 @@ sample_rate: 16000 save_embedding: False num_codebooks: 4 save_path: /home/ubuntu/sq-codec/SQ-Codec +skip_resample: False # SQCodec model diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml 
b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml index c7581bbe7..56c13508c 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml @@ -41,6 +41,7 @@ sample_rate: 24000 save_embedding: False num_codebooks: 1 vocab_size: 4096 +skip_resample: False # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py index 6ec1a3a96..52594eaf9 100644 --- a/benchmarks/DASB/LibriTTS/libritts_prepare.py +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -42,6 +42,7 @@ def prepare_libritts( max_valid_size=500, alignments_folder=None, skip_prep=False, + skip_resample=False, ): """ Prepares the json files for the LibriTTS dataset. @@ -82,6 +83,8 @@ def prepare_libritts( The path to alignments files skip_prep: Bool If True, skip preparation. + skip_resample: bool + If True, audio will not be resampled Returns ------- @@ -106,16 +109,16 @@ def prepare_libritts( # If specific splits are provided, creates data manifest files accordingly if train_split: wav_list = prepare_split(data_folder, train_split) - create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name) + create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name, skip_resample) if valid_split: wav_list = prepare_split(data_folder, valid_split) # TODO add better way to speedup evaluation if max_valid_size is not None and len(wav_list) > max_valid_size: wav_list = random.sample(wav_list, max_valid_size) - create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name) + create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name, skip_resample) if test_split: wav_list = prepare_split(data_folder, test_split) - create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name) + create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name, skip_resample) if skip(save_json_train, save_json_valid, save_json_test): logger.info("Preparation completed.") @@ -129,12 +132,12 @@ def prepare_libritts( data_split = split_sets(wav_list, split_ratio) # Creating json files create_json( - data_split["train"], save_json_train, sample_rate, alignments_folder, model_name + data_split["train"], save_json_train, sample_rate, alignments_folder, model_name, skip_resample ) create_json( - data_split["valid"], save_json_valid, sample_rate, alignments_folder, model_name + data_split["valid"], save_json_valid, sample_rate, alignments_folder, model_name, skip_resample ) - create_json(data_split["test"], save_json_test, sample_rate, alignments_folder, model_name) + create_json(data_split["test"], save_json_test, sample_rate, alignments_folder, model_name, skip_resample) def prepare_split(data_folder, split_list): @@ -177,7 +180,7 @@ def prepare_split(data_folder, split_list): return wav_list -def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None): +def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None, skip_resample=False): """ Creates the json file given a list of wav files. 
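The new skip_resample flag is threaded from the extraction hparams down to create_json so that corpora already stored at the target rate are not rewritten, avoiding the large temporary storage the resampled copies would otherwise need. The guard it controls (it appears further down in this patch) boils down to the following; a self-contained sketch with a synthetic signal in place of a LibriTTS file:

    import torch
    import torchaudio

    sample_rate = 24000    # target rate from the extraction hparams
    skip_resample = False  # the new flag

    signal = torch.randn(1, 22050)  # pretend this was loaded from disk
    sig_sr = 22050                  # native rate of the loaded file

    if sig_sr != sample_rate and not skip_resample:
        signal = torchaudio.functional.resample(signal, sig_sr, sample_rate)

    print(signal.shape)  # torch.Size([1, 24000]); with skip_resample=True the file is kept as-is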
Arguments @@ -194,6 +197,9 @@ def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder The path to LibriTTS alignments model_name : str Model name (used to prepare additional model specific data) + skip_resample : int + Skips resampling - useful when large temporary storage + is absent. """ # Downloads and initializes the G2P model to compute the phonemes if data is being prepared for Tacotron2 experiments @@ -240,7 +246,7 @@ def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder continue # Resamples the audio file if required - if sig_sr != sample_rate: + if sig_sr != sample_rate and not skip_resample: resampled_signal = torchaudio.functional.resample( signal, sig_sr, sample_rate ) From 41afc01a027f7f33061b46cb9c8f86df90d2d98e Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Feb 2025 00:20:32 -0500 Subject: [PATCH 161/270] DASB: Add the switch for LM head training --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 ++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 73af26870..9ab344ff6 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -258,6 +258,7 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True + self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -268,6 +269,7 @@ def apply_curriculum(self): and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): self.train_ar = False + self.modules.model.lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index bde1deb8a..395113edb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -257,6 +257,7 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True + self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -266,7 +267,10 @@ def apply_curriculum(self): self.hparams.number_of_epochs_nar is not None and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): + # NOTE: Avoid the AR head being "taken by surprise" self.train_ar = False + self.modules.model.lm_head.requires_grad_(False) + def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed From f529e621b211189a4b3aee47db0b2ad2e5e342a0 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Feb 2025 11:36:18 -0500 Subject: [PATCH 162/270] DASB: Undo the gradient change - it did not help --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 -- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 9ab344ff6..73af26870 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -258,7 +258,6 @@ def 
apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True - self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -269,7 +268,6 @@ def apply_curriculum(self): and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): self.train_ar = False - self.modules.model.lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 395113edb..e786d3495 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -257,7 +257,6 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True - self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -269,7 +268,6 @@ def apply_curriculum(self): ): # NOTE: Avoid the AR head being "taken by surprise" self.train_ar = False - self.modules.model.lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): From 554e52ac3e19c01af7392ee522691dc86d6c9f88 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Feb 2025 19:43:55 -0500 Subject: [PATCH 163/270] DASB: VALL-E: Add the ability to disable fixed batches, add the ability to limit the validation set to be run on every epoch --- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 8 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 8 +- .../TTS/valle/hparams/train_encodec.yaml | 8 +- .../valle/hparams/train_espnet_encodec.yaml | 8 +- .../TTS/valle/hparams/train_mimi.yaml | 8 +- .../valle/hparams/train_speech_tokenizer.yaml | 8 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 8 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 127 ++++++++++++++++++ 8 files changed, 176 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 588fd6b55..86d3c7ce3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -66,7 +66,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -118,7 +120,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 161c911e6..a62288062 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -84,7 +84,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref 
grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -159,7 +161,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index ad79fb3de..0db811710 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -65,7 +65,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -117,7 +119,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 112b526c4..af6e9cbef 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -65,7 +65,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -117,7 +119,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 2b3b5d5b3..cee2de622 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -66,7 +66,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -118,7 +120,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index eae14615f..41c7cad68 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -65,7 +65,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 
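Two training-loop knobs are being added in this patch: epoch_fixed decides whether the train loader runs nominal epochs of a fixed number of batches (epoch_size // batch_size, looping over the data) or ordinary epochs over the whole split, and valid_inter_data_count caps how many validation utterances are scored while training is still in progress. The boolean hparams.choice above simply maps True/False to a looped_nominal_epoch value or null; written out in Python (key names mirror the YAML, and SpeechBrain's make_dataloader is what consumes looped_nominal_epoch):

    hparams = {"epoch_fixed": True, "epoch_size": 50000, "batch_size": 16}

    # Equivalent of the !apply:speechbrain.utils.hparams.choice block above
    looped_nominal_epoch = (
        hparams["epoch_size"] // hparams["batch_size"] if hparams["epoch_fixed"] else None
    )

    train_dataloader_opts = {
        "batch_size": hparams["batch_size"],
        "shuffle": True,
        "looped_nominal_epoch": looped_nominal_epoch,  # None -> ordinary epochs
    }
    print(train_dataloader_opts["looped_nominal_epoch"])  # 3125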
valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -117,7 +119,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 4120d37f5..0ccaf4727 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -65,7 +65,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -117,7 +119,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index e786d3495..e17e93e74 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -26,6 +26,7 @@ from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset from functools import partial import re import string @@ -522,6 +523,104 @@ def fit_batch(self, batch): if self.hparams.lr_annealing_mode == "step": self.hparams.lr_annealing(self.optimizer) return loss + + def fit( + self, + epoch_counter, + train_set, + valid_set=None, + progressbar=None, + train_loader_kwargs={}, + valid_loader_kwargs={}, + ): + """Iterate epochs and datasets to improve objective. + + Relies on the existence of multiple functions that can (or should) be + overridden. The following methods are used and expected to have a + certain behavior: + + * ``fit_batch()`` + * ``evaluate_batch()`` + * ``update_average()`` + + If the initialization was done with distributed_count > 0 and the + distributed_backend is ddp, this will generally handle multiprocess + logic, like splitting the training data into subsets for each device and + only saving a checkpoint on the main process. + + Arguments + --------- + epoch_counter : iterable + Each call should return an integer indicating the epoch count. + train_set : Dataset, DataLoader + A set of data to use for training. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + valid_set : Dataset, DataLoader + A set of data to use for validation. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + progressbar : bool + Whether to display the progress of each epoch in a progressbar. + train_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the train_loader + (if train_set is a Dataset, not DataLoader). + E.G. batch_size, num_workers. + DataLoader kwargs are all valid. 
+ valid_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the valid_loader + (if valid_set is a Dataset, not DataLoader). + E.g., batch_size, num_workers. + DataLoader kwargs are all valid. + + Returns + ------- + None + """ + if self.test_only: + logger.info( + "Test only mode, skipping training and validation stages." + ) + return + + self.on_fit_start() + train_set = self.make_dataloader( + train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs + ) + epoch = self.hparams.epoch_counter.current + if epoch < self.hparams.number_of_epochs: + valid_set = sample_dataset( + dataset=valid_set, + count=self.hparams.valid_inter_data_count, + seed=self.hparams.seed + ) + + valid_set = self.make_dataloader( + valid_set, + stage=sb.Stage.VALID, + ckpt_prefix=None, + **valid_loader_kwargs, + ) + + if progressbar is None: + progressbar = not self.noprogressbar + + # Only show progressbar if requested and main_process + enable = progressbar and sb.utils.distributed.if_main_process() + + # Iterate epochs + for epoch in epoch_counter: + self._fit_train(train_set=train_set, epoch=epoch, enable=enable) + self._fit_valid(valid_set=valid_set, epoch=epoch, enable=enable) + + # Debug mode only runs a few epochs + if ( + self.debug + and epoch == self.debug_epochs + or self._optimizer_step_limit_exceeded + ): + break + INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} @@ -712,6 +811,34 @@ def sig_pipeline(wav): return datasets +def sample_dataset(dataset, count, seed): + """Selects a sample of the specified dataset in a + stable manner, returning the same sample on each call + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset + count : int + The number of items to select + seed : int + The seed to be used + """ + if len(dataset) < count: + return dataset + generator = torch.Generator() + generator.manual_seed(seed) + indexes = torch.randperm(len(dataset)).tolist()[:count] + data_ids = [ + dataset.data_ids[idx] + for idx in indexes + ] + return FilteredSortedDynamicItemDataset( + dataset, + data_ids, + ) + + def get_offsets(vocab_size, tracks): """Adds offsets to each track to treat the tokens as distinct From e2d74404b2ff8424d5bef4a0fcded61fe9fa3b5a Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Feb 2025 23:11:48 -0500 Subject: [PATCH 164/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index e17e93e74..db4c8e5b0 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -383,6 +383,7 @@ def on_stage_end(self, stage, stage_loss, epoch): if stage == sb.Stage.TRAIN: self.train_stats = stage_stats + eval_summary_stats = {} # End evaluation and report stats if stage != sb.Stage.TRAIN and self.is_evaluating: self.evaluation_metric.on_evaluation_end() From d7fc323d8d45d6c0726735d79dffbc2aec7785ed Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 16 Feb 2025 02:27:00 -0500 Subject: [PATCH 165/270] DASB: Update wav2vec2 --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index afdac42b7..0a18b2f60 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -47,7 +47,7 @@ token_model_src: !apply:speechbrain.utils.hparams.choice choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self + wav2vec2: facebook/wav2vec2-large g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 1fd2aa3aa..85fc660a9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -45,7 +45,7 @@ token_model_src: !apply:speechbrain.utils.hparams.choice choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self + wav2vec2: facebook/wav2vec2-large g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 61dabef41..4efa9f75c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -60,7 +60,7 @@ token_model_src: !apply:speechbrain.utils.hparams.choice choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self + wav2vec2: facebook/wav2vec2-large g2p_src: flexthink/soundchoice-g2p kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index a62288062..2800cfd56 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -52,7 +52,7 @@ token_model_src: !apply:speechbrain.utils.hparams.choice choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self + wav2vec2: facebook/wav2vec2-large g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint From 8a9e8739e7264a287b390dd75bf8ef9f47f7851d Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 16 Feb 2025 23:05:04 -0500 Subject: [PATCH 166/270] DASB: Add back LM head freezing (with a toggle) --- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 1 + .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 + .../TTS/valle/hparams/train_encodec.yaml | 1 + .../valle/hparams/train_espnet_encodec.yaml | 3 +- .../TTS/valle/hparams/train_mimi.yaml | 2 + .../valle/hparams/train_speech_tokenizer.yaml | 2 + .../TTS/valle/hparams/train_wavtokenizer.yaml | 1 + benchmarks/DASB/LibriTTS/TTS/valle/train.py | 42 +++++++++++++++---- 8 files changed, 45 insertions(+), 9 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 86d3c7ce3..9dd038b11 100644 --- 
a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -150,6 +150,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref +freeze_lm_head: False ####################### Model parameters ########################### # Transformer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 2800cfd56..9397ac6be 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -227,6 +227,8 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice audio_tokens_per_step: 6 +freeze_lm_head: False + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 0db811710..56783935f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -149,6 +149,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref +freeze_lm_head: False ####################### Model parameters ########################### # Transformer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index af6e9cbef..0f1f51672 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -185,13 +185,14 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice audio_tokens_per_step: 8 # Model Settings -model_hub: facebook/encodec_24khz espnet_repo: https://github.com/espnet/espnet espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef model_hub: espnet/libritts_encodec_24k model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth model_config: exp/codec_encodec_ss4_24k/config.yaml +freeze_lm_head: True + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index cee2de622..8a095f441 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -188,6 +188,8 @@ audio_tokens_per_step: 8 # Model Settings model_hub: kyutai/mimi +freeze_lm_head: False + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 41c7cad68..bc88e091f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -187,6 +187,8 @@ audio_tokens_per_step: 8 # Model Settings model_hub: fnlp/SpeechTokenizer +freeze_lm_head: False + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 0ccaf4727..5a628e536 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -189,6 +189,7 @@ model_hub: novateur/WavTokenizer-medium-music-audio-75token config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +freeze_lm_head: False ############################## models ################################ diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index db4c8e5b0..d08942cd8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -258,18 +258,19 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True + self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: - self.train_nar = False + self.train_nar = False elif ( self.hparams.number_of_epochs_nar is not None and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): - # NOTE: Avoid the AR head being "taken by surprise" self.train_ar = False - + if self.hparams.freeze_lm_head: + self.modules.model.lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed @@ -455,11 +456,8 @@ def inference(self, batch): ) for prefix_item in prefix_items ] - inferred_tokens = [ result[0][0] - if result[0] - else torch.zeros( - 1000, self.hparams.audio_tokens_per_step, device=self.device - ) + inferred_tokens = [ + self._pad_inferred_sample(result) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) @@ -467,6 +465,34 @@ def inference(self, batch): audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) return audio, audio_length + def _pad_inferred_sample(self, result): + """Applies length padding to an inference result + + Arguments + --------- + result : list + The VALL-E Inference output + + Returns + ------- + sample : torch.Tensor + A sample, padded if needed + """ + if result[0]: + sample = result[0][0] + else: + sample = torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) + min_length = getattr(self.hparams, "infer_min_length", 10) + sample_length, tracks = sample.shape + if sample_length < min_length: + sample = pad_right_to( + (min_length, tracks), + sample + ) + return sample + def _get_inference_opts(self): idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ None, : From a1f5e94faa5de53d2f9386aaefa9def865bd4ae4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 00:26:07 -0500 Subject: [PATCH 167/270] DASB: Fix for data parallel --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d08942cd8..12c291be1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -258,7 +258,12 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = 
self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True - self.modules.model.lm_head.requires_grad_(True) + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -270,7 +275,7 @@ def apply_curriculum(self): ): self.train_ar = False if self.hparams.freeze_lm_head: - self.modules.model.lm_head.requires_grad_(False) + lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed From e75214612259abeab8fe4891af36c9a9909d7542 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 00:55:26 -0500 Subject: [PATCH 168/270] DASB: Fix padding --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 12c291be1..0654b23e5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -493,9 +493,9 @@ def _pad_inferred_sample(self, result): sample_length, tracks = sample.shape if sample_length < min_length: sample = pad_right_to( + sample, (min_length, tracks), - sample - ) + )[0] return sample def _get_inference_opts(self): From c6d58831be2d1e2ff5d1ce1cfbe79059c703a19d Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 11:51:27 -0500 Subject: [PATCH 169/270] DASB: VALL-E: Fix a crash --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 0654b23e5..1fc2f1e68 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -815,6 +815,10 @@ def sig_pipeline(wav): spk_samplers=spk_samplers, ) resample_fn[dataset](epoch=0) + if hparams["input"] == "phonemes": + dynamic_dataset = dynamic_dataset.filtered_sorted( + key_test={"has_alignments": lambda value: value} + ) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False From 99588e396d6380d0f38a6ad1f19cd492af6073bd Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 14:55:31 -0500 Subject: [PATCH 170/270] DASB: VALL-E: Add LM head freezing --- .../LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 2 ++ .../DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 2 ++ .../LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml | 2 ++ .../DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml | 3 +++ .../LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml | 2 ++ benchmarks/DASB/LJSpeech/TTS/valle/train.py | 8 ++++++++ 6 files changed, 19 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 85fc660a9..f7df72d39 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -206,6 +206,8 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice audio_tokens_per_step: 6 +freeze_lm_head: False + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git 
a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index fd1fea7cc..cfe293610 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -139,6 +139,8 @@ sample_dataloader_opts: token_model_kwargs: SSL_layers: !ref +freeze_lm_head: False + ####################### Model parameters ########################### # Transformer d_model: 1024 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index ad486d493..5583680d5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -177,6 +177,8 @@ model_hub: espnet/libritts_encodec_24k model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth model_config: exp/codec_encodec_ss4_24k/config.yaml +freeze_lm_head: True + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index eb1605f4e..05c506a85 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -175,6 +175,9 @@ audio_tokens_per_step: 8 bandwidth: 6 +freeze_lm_head: False + + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index dd9d60798..bcdb7e2eb 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -142,6 +142,8 @@ sample_dataloader_opts: token_model_kwargs: SSL_layers: !ref +freeze_lm_head: False + ####################### Model parameters ########################### # Transformer d_model: 1024 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 73af26870..7e46f77c5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -258,6 +258,12 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -268,6 +274,8 @@ def apply_curriculum(self): and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): self.train_ar = False + if self.hparams.freeze_lm_head: + lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed From dad02cb95eb8309b685990a0efb689538d555c14 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 22:08:03 -0500 Subject: [PATCH 171/270] DASB: Vall-E: Fix data-parallel --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 16 +++++++++++----- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 15 ++++++++++----- 2 
files changed, 21 insertions(+), 10 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 7e46f77c5..b25a60d4f 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -70,12 +70,18 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device - if hasattr(self.modules.tokenizer, "codec_vocoder"): - self.modules.tokenizer.codec_vocoder.to(self.device) - self.modules.tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) + tokenizer = ( + self.modules.tokenizer.module + if hasattr(self.modules.tokenizer, "module") + else self.modules.tokenizer + ) + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) + wav = wav.to(self.device) return wav def compute_forward(self, batch, stage): diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 1fc2f1e68..da6cd7083 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -72,11 +72,16 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device - if hasattr(self.modules.tokenizer, "codec_vocoder"): - self.modules.tokenizer.codec_vocoder.to(self.device) - self.modules.tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) + tokenizer = ( + self.modules.tokenizer.module + if hasattr(self.modules.tokenizer, "module") + else self.modules.tokenizer + ) + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) wav = wav.to(self.device) return wav From 63e9972c98eb28982fcf2097c1bc00eb6222e331 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 18 Feb 2025 01:30:52 -0500 Subject: [PATCH 172/270] DASB: VALL-E: Update hyperparameters --- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 6 +++--- .../DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 4 ++-- .../LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml | 6 +++--- benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml | 6 +++--- .../DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml | 6 +++--- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 6 +++--- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 6 +++--- .../DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 6 +++--- .../LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 6 +++--- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 6 +++--- .../DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml | 6 +++--- 11 files changed, 32 insertions(+), 32 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index f7df72d39..45b4f6e5e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -175,12 +175,12 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 1024 
+d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1000 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index cfe293610..fbcb5db5a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -147,8 +147,8 @@ d_model: 1024 share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index 5583680d5..0249b0c64 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -141,12 +141,12 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index 05c506a85..6063d692c 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -142,12 +142,12 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 2048 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index bcdb7e2eb..fd21dc4bd 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -146,12 +146,12 @@ freeze_lm_head: False ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 text_num_tokens: 39 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 9dd038b11..c811e1c7f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -154,12 +154,12 @@ freeze_lm_head: False ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 9397ac6be..bd50151af 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -192,12 +192,12 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1000 audio_emb_size: 1024 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 56783935f..54c357b52 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -153,12 +153,12 @@ freeze_lm_head: False ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 0f1f51672..120208cd8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -152,12 +152,12 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 8a095f441..8383cf0f7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -153,12 +153,12 @@ sample_dataloader_opts: ####################### Model parameters 
########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 5a628e536..012f61e86 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -152,12 +152,12 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False From bacc9f93c77b0909493c91612cf75a5a281d501c Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 18 Feb 2025 12:03:52 -0500 Subject: [PATCH 173/270] DASB: VALL-E: Add data scaling support --- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 1 + .../LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml | 1 + benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml | 1 + .../DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml | 1 + benchmarks/DASB/LJSpeech/TTS/valle/train.py | 6 ++++++ 6 files changed, 11 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 45b4f6e5e..140b85a84 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -87,6 +87,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index fbcb5db5a..2c22f57a4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -71,6 +71,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index 0249b0c64..74654e590 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -71,6 +71,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index 6063d692c..b528660f5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -72,6 +72,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index fd21dc4bd..c2ff765f4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -74,6 +74,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index b25a60d4f..30bfbe3b1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -686,6 +686,12 @@ def sig_pipeline(wav): raise NotImplementedError( "sorting must be random, ascending or descending" ) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count + ) return datasets From b1e270a5e81fde18342f933de51a05405fd27aed Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 18 Feb 2025 14:44:44 -0500 Subject: [PATCH 174/270] DASB: Tokotron: Add scaling + selection based on dWER (for comparison) --- .../TTS/tokotron/hparams/train_dac.yaml | 8 +++-- .../tokotron/hparams/train_discrete_ssl.yaml | 6 ++++ .../TTS/tokotron/hparams/train_encodec.yaml | 8 +++-- .../TTS/tokotron/hparams/train_mimi.yaml | 8 +++-- .../hparams/train_speech_tokenizer.yaml | 8 +++-- .../TTS/tokotron/hparams/train_sqcodec.yaml | 8 +++-- .../tokotron/hparams/train_wavtokenizer.yaml | 10 +++--- .../DASB/LJSpeech/TTS/tokotron/train.py | 36 +++++++++++++++---- 8 files changed, 72 insertions(+), 20 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index f94d25d74..d49afdf29 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -39,6 +39,7 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 + tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
tokens_loader: !new:utils.tokens.TokensLoader @@ -49,8 +50,11 @@ token_model_kwargs: splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 0a18b2f60..af723f6c9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -66,6 +66,11 @@ spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -80,6 +85,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index d16403558..1c54128b7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -44,8 +44,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -60,6 +63,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index b99ac7980..b38a07434 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -47,8 +47,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -64,6 +67,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 4b2fb6553..0cb2012ed 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -47,8 +47,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -63,6 +66,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index d101e1d85..4ea1ba387 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -49,7 +49,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -64,7 +68,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index 81bcee2ca..d3bf9c770 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -48,9 +48,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -65,7 +67,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 86e0efc26..3e6b356d4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -306,6 +306,7 @@ def on_stage_end(self, stage, stage_loss, epoch): self.train_stats = stage_stats # End evaluation and report stats + eval_summary_stats = {} if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): self.evaluator.on_evaluate_end() eval_summary_stats = self.get_summary_stats() @@ -329,9 +330,14 @@ def on_stage_end(self, stage, stage_loss, epoch): valid_stats=stage_stats, ) - # Save the current checkpoint and delete previous checkpoints. + # Save the current checkpoint and delete previous checkpoints. 
+ ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs ) def get_summary_stats(self): @@ -667,6 +673,12 @@ def audio_pipeline(id): raise NotImplementedError( "sorting must be random, ascending or descending" ) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count + ) return datasets, silence_padding @@ -918,10 +930,22 @@ def apply_overfit_test(hparams, dataset): if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: - tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=hparams["test_dataloader_opts"], - ) + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + if test_key: + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) + # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") From ef35a2f17d4fe9e8b153ce2171e9475f29f018c8 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 18 Feb 2025 23:08:22 -0500 Subject: [PATCH 175/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index da6cd7083..c2aa32f30 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -81,7 +81,7 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = tokenizer.tokens_to_sig(audio) + wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) wav = wav.to(self.device) return wav @@ -427,7 +427,8 @@ def on_stage_end(self, stage, stage_loss, epoch): } # Save the current checkpoint and delete previous checkpoints. 
self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], **ckpt_kwargs ) elif stage == sb.Stage.TEST: From a6073f50c538ccfdd878de3cdc76e716f796771f Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 19 Feb 2025 02:07:09 -0500 Subject: [PATCH 176/270] DASB: Add support for test set filtering --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 9164d31e0..df31c4c69 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -1019,7 +1019,13 @@ def apply_overfit_test(hparams, dataset): if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], - ) \ No newline at end of file + **eval_kwargs + ) From 1be28c74349de19960a0d1ae1951dd6329fb7b58 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 19 Feb 2025 03:20:36 -0500 Subject: [PATCH 177/270] DASB: Add support for test set filtering --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 35 ++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index c2aa32f30..03038f020 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1097,6 +1097,37 @@ def apply_overfit_test(hparams, dataset): return result +def select_eval_subset(dataset, hparams, key="eval_subset"): + """Selects a subset of the dataset provided, if specified. + The selection is controlled by a hyperparameter named + eval_subset, which is expected to list the IDs of the + data items on which evaluation will take place, one per line + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset + hparams : dict + A hyperparameters file + + Returns + ------- + subset : dataset + The dataset, filtered down if applicable + """ + eval_subset_path = hparams.get(key) + if eval_subset_path is not None: + eval_subset_path = Path(eval_subset_path) + if not eval_subset_path.exists(): + raise ValueError(f"eval_subset {eval_subset_path} does not exist") + with open(eval_subset_path) as eval_subset_file: + eval_subset_ids = [line.strip() for line in eval_subset_file] + subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) + else: + subset = dataset + return subset + + def undo_padding_tensor(batch, lengths): """Produces Python lists given a batch of sentences with their corresponding relative lengths. @@ -1238,8 +1269,10 @@ def undo_padding_tensor(batch, lengths): eval_kwargs = { f"{test_key_kind}_key": test_key } + eval_dataset = datasets["test"] + eval_dataset = select_eval_subset(eval_dataset, hparams) tts_brain.evaluate( - test_set=datasets["test"], + test_set=eval_dataset, test_loader_kwargs=hparams["test_dataloader_opts"], **eval_kwargs ) From 4e5f4ebfae89bd8230b73f1949217a90b93dfffd Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 19 Feb 2025 03:37:40 -0500 Subject: [PATCH 178/270] DASB: Add filtering (useful when some samples aren't present, e.g. 
when using alignments or G2P) --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 03038f020..81435e4a6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1122,6 +1122,10 @@ def select_eval_subset(dataset, hparams, key="eval_subset"): raise ValueError(f"eval_subset {eval_subset_path} does not exist") with open(eval_subset_path) as eval_subset_file: eval_subset_ids = [line.strip() for line in eval_subset_file] + existing_ids = dataset.data_ids + eval_subset_ids = [uttid for uttid in eval_subset_ids if uttid in existing_ids] + if not eval_subset_ids: + raise ValueError(f"{eval_subset_path}: no items found in the dataset") subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) else: subset = dataset From 8dadf968e753073c4ed4a864a29e70f0cb8db1d2 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 19 Feb 2025 18:46:38 -0500 Subject: [PATCH 179/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 81435e4a6..308c5452d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -433,7 +433,7 @@ def on_stage_end(self, stage, stage_loss, epoch): ) elif stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr": lr}, + stats_meta={"epoch": epoch}, train_stats=self.train_stats, valid_stats=stage_stats, ) From 5272a73c2eadb916bec71644312370e80210c410 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 20 Feb 2025 04:13:13 -0500 Subject: [PATCH 180/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 308c5452d..3965e4283 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -81,7 +81,7 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) + wav = tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) wav = wav.to(self.device) return wav From b0df9ac99c8d1b6b4471b817473294479a03f0f7 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 21 Feb 2025 11:47:50 -0500 Subject: [PATCH 181/270] DASB: VALL-E: Fixes for WavTokenizer (AR-only) --- .../TTS/valle/hparams/train_wavtokenizer.yaml | 2 +- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 4 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 2 +- benchmarks/DASB/model/valle.py | 99 ++++++++++--------- 4 files changed, 56 insertions(+), 51 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index c2ff765f4..af0222d90 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -154,7 +154,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 -vocab_size: 1024 +vocab_size: 4096 
text_num_tokens: 39 phn_num_tokens: 52 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 30bfbe3b1..a07c6c53d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -19,7 +19,7 @@ from pathlib import Path from hyperpyyaml import load_hyperpyyaml from speechbrain.dataio.dataio import ( - clean_padding_, + clean_padding, length_to_mask, write_audio, ) @@ -80,7 +80,7 @@ def create_waveform(self, audio, length): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device wav = tokenizer.tokens_to_sig(audio) - clean_padding_(wav, length) + wav = clean_padding(wav, length) wav = wav.to(self.device) return wav diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 012f61e86..17cbc987d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -159,7 +159,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 -vocab_size: 1024 +vocab_size: 4096 audio_emb_freeze: False audio_emb_pretrained: False text_num_tokens: 39 diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 5805cb061..3abcf057f 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -102,16 +102,17 @@ def __init__( qk_norm=qk_norm, dropout=dropout, ) - - self.nar_decoder = ValleNARDecoder( - n_level=nq - 1, - n_ctx=n_ctx, - n_state=att_unit, - n_head=head, - n_layer=nar_layer, - qk_norm=qk_norm, - dropout=dropout, - ) + if nq > 1: + # NOTE: An NAR encoder is not needed if there is only one track + self.nar_decoder = ValleNARDecoder( + n_level=nq - 1, + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=nar_layer, + qk_norm=qk_norm, + dropout=dropout, + ) self.nq = nq self.n_ctx = n_ctx @@ -301,7 +302,7 @@ def inference( nq_level=0, ) # [B, 1, 1] -> [B, 1] - gen_tok, gen_score = gen_tok.squeeze(2), gen_tok.squeeze(2) + gen_tok, gen_score = gen_tok.squeeze(2), gen_score.squeeze(2) generated["token"].append(gen_tok) generated["score"].append(gen_score) @@ -397,42 +398,46 @@ def inference( vocab_mask = torch.cat(mask_cache, dim=1) # (4.2) NAR loop - for step in range(1, opts.nq): - h_nar = self.nar_decoder( - prev_emb, ones * step - 1, mask=mask - ) # [B, T, D] - logits = self.lm_head(h_nar) - gen_tok, gen_score = logits_to_tokens( - logits.unsqueeze(2), - opts, - vocab_mask, - search_algo="greedy_search", - allow_eos=False, - nq_level=step, - ) - gen_tok, gen_score = ( - gen_tok.squeeze(2), - gen_score.squeeze(2), - ) # [B, T] - - generated["token"].append(gen_tok[:, prefix.size(1) :]) - generated["score"].append(gen_score[:, prefix.size(1) :]) - - if opts.search_algo == "teacher_force": - prev_tok = suffix[:, :, step] - else: - prev_tok = generated["token"][-1] - prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] - prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb - - # (5) combine AR and NAR results - gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] - gen_scores_nar = torch.stack(generated["score"], dim=2) - - gen_tokens = torch.cat( - [gen_tokens_ar, gen_tokens_nar], dim=2 - ) # [B, T, nq] - gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + if self.nq > 1: + for step 
in range(1, opts.nq): + h_nar = self.nar_decoder( + prev_emb, ones * step - 1, mask=mask + ) # [B, T, D] + logits = self.lm_head(h_nar) + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + vocab_mask, + search_algo="greedy_search", + allow_eos=False, + nq_level=step, + ) + gen_tok, gen_score = ( + gen_tok.squeeze(2), + gen_score.squeeze(2), + ) # [B, T] + + generated["token"].append(gen_tok[:, prefix.size(1) :]) + generated["score"].append(gen_score[:, prefix.size(1) :]) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, step] + else: + prev_tok = generated["token"][-1] + prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] + prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb + + # (5) combine AR and NAR results + gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] + gen_scores_nar = torch.stack(generated["score"], dim=2) + + gen_tokens = torch.cat( + [gen_tokens_ar, gen_tokens_nar], dim=2 + ) # [B, T, nq] + gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + else: + gen_tokens = gen_tokens_ar + gen_scores = gen_scores_ar gen_tokens_list, gen_scores_list = [], [] for b in range(len(valid_idx)): From cf24b23ab022d800a9807cd8b474dd0eb3ab6ca2 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 16:31:02 -0500 Subject: [PATCH 182/270] DASB: VALL-E: Update/add test stage logging --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 5 +++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 5 ++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index a07c6c53d..eeb3a9d6b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -433,6 +433,11 @@ def on_stage_end(self, stage, stage_loss, epoch): num_to_keep=hparams["ckpt_keep"], **ckpt_kwargs ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) def inference(self, batch): """Runs TTS inference diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 3965e4283..a28eabb66 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -433,9 +433,8 @@ def on_stage_end(self, stage, stage_loss, epoch): ) elif stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch}, - train_stats=self.train_stats, - valid_stats=stage_stats, + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, ) def inference(self, batch): From b6224d6a0f641b12af58ba719b5367953655f859 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 17:27:59 -0500 Subject: [PATCH 183/270] DASB: Fix extraction for clusters with no internet connection on compute nodes --- benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml | 3 ++- .../DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml | 3 ++- benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml | 3 ++- benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml index acddcd93b..dc026cc55 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml @@ -9,6 
+9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/speech_tokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -46,7 +47,7 @@ skip_resample: False tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref - save_path: !ref + save_path: !ref num_codebooks: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 2b96a749b..8d3a9aa27 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/speech_tokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -44,7 +45,7 @@ skip_resample: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml index 68dc9df49..3d9792bbb 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/sqcodec save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -45,7 +46,7 @@ skip_resample: False # SQCodec model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml index 56c13508c..bfd802740 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/speech_tokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -46,7 +47,7 @@ skip_resample: False # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref freeze: True From d0900e0d24f9820f82a4999ef7c1946b4509142f Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 18:01:30 -0500 Subject: [PATCH 184/270] DASB: VALL-E: Add layer selection, hpopt updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 3 +- .../valle/hparams/train_speech_tokenizer.yaml | 6 ++-- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 28 ++++++++++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index bd50151af..5920c4be3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -63,7 +63,8 @@ vocoder_repo_id: 
!apply:speechbrain.utils.hparams.choice hubert: speechbrain/hifigan-hubert-k1000-LibriTTS wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS -speech_model_layers: [1, 3, 7, 12, 18, 23] +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref flip_layers: False # Speaker Embeddings diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index bc88e091f..5d0908c0e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -152,12 +152,12 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index a28eabb66..6127644c2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -703,6 +703,16 @@ def dataio_prepare(hparams): tokens_loader = hparams.get("tokens_loader") spk_prompt_length = hparams["spk_prompt_length"] + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes(hparams) + + if layer_idx is not None: + num_codebooks = layer_idx + else: + num_codebooks = hparams["audio_tokens_per_step"] + + @sb.utils.data_pipeline.takes("label") @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") def text_pipeline(label): @@ -722,7 +732,7 @@ def spk_prompt(uttid, spk_sample): # Sample a speaker-matched embedding selected_uttid = spk_sample[uttid] audio = tokens_loader.tokens_by_uttid( - selected_uttid, num_codebooks=hparams["audio_tokens_per_step"] + selected_uttid, num_codebooks=num_codebooks ) if audio.size(0) > spk_prompt_length: offset = torch.randint(0, audio.size(0), (1,)).item() @@ -1003,6 +1013,22 @@ def init_sequence_encoder(hparams): return encoder +def get_selected_layer_indexes(hparams): + """Finds the layers of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + def read_token_list(file_name): """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed one per line From c5a3f3af8e1e521a9d26a44f8040c47337d9beee Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 18:57:45 -0500 Subject: [PATCH 185/270] DASB: Add support for eval_run flags --- benchmarks/DASB/run_hparam_optimization.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 3029a3678..9be6a3c64 100755 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -207,7 +207,17 @@ while [[ $# -gt 0 ]]; do ;; -*|--*) - additional_flags+="$1 $2 " # store additional flags + name=$1 + value=$2 + if [[ "$name" =~ ^--eval_run_ ]]; then + name=$(echo $name | sed s/^--eval_run_/--/) + eval_run_additional_flags+="$name $value " + else + if [[ ! "$eval_run_additional_flags" =~ "$name " ]]; then + eval_run_additional_flags+="$name $value " + fi + additional_flags+="$name $value " # store additional flags + fi shift # past argument ;; @@ -415,6 +425,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir False --testing True $additional_flags + --rnd_dir False --testing True $eval_run_additional_flags echo "The test performance with best hparams is available at $output_folder/best" From 3ddbc57177afc10d1d963c0c61f55b03aa5ac8f3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 22:59:13 -0500 Subject: [PATCH 186/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 30 ++++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 6127644c2..5e8949501 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -242,6 +242,14 @@ def on_stage_start(self, stage, epoch): self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) + if hasattr(hparams, "speech_model_layers"): + self.layer_idx = get_selected_layer_indexes( + hparams.available_speech_model_layers, + hparams.speech_model_layers + ) + else: + self.layer_idx = None + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.compute_loss_stats, batch_eval=True, ) @@ -705,7 +713,10 @@ def dataio_prepare(hparams): layer_idx = None if "speech_model_layers" in hparams: - layer_idx = get_selected_layer_indexes(hparams) + layer_idx = get_selected_layer_indexes( + hparams["available_speech_model_layers"], + hparams["speech_model_layers"], + ) if layer_idx is not None: num_codebooks = layer_idx @@ -751,7 +762,7 @@ def spk_prompt(uttid, spk_sample): ) def prompt_pipeline(id, tokens, spk_prompt): audio = tokens_loader.tokens_by_uttid( - id, num_codebooks=hparams["audio_tokens_per_step"] + id, num_codebooks=num_codebooks ) if hparams["flip_layers"]: audio = audio.flip(-1) @@ -1013,16 +1024,21 @@ def init_sequence_encoder(hparams): return encoder -def get_selected_layer_indexes(hparams): +def get_selected_layer_indexes(available_layers, selected_layers): """Finds the layers of selected layers Arguments --------- - hparams : dict - Hyperparameters + available_layers : list + The available layers + selected_layers : list + The selected layers + + 
Returns + ------- + layer_idx : list + The layer indexes """ - selected_layers = hparams.get("speech_model_layers") - available_layers = hparams.get("available_speech_model_layers") if not (selected_layers and available_layers): return None layer_idx = [available_layers.index(layer) for layer in selected_layers] From e1bfb7e87823e8ec32d834713db22d7bbf919d8c Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 23:05:18 -0500 Subject: [PATCH 187/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 5e8949501..1f36093d9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -81,7 +81,9 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = tokenizer.tokens_to_sig(audio) + wav = tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) clean_padding_(wav, length) wav = wav.to(self.device) return wav @@ -265,6 +267,9 @@ def on_stage_start(self, stage, epoch): elif stage == sb.Stage.TEST: self.evaluation_metric.on_evaluation_start() self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) def apply_curriculum(self): """Applies curriculum settings, if specified, training only the autoregressive part - or From 851bd7d0e2e743007960e38ab7ed735b0622704a Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 01:03:05 -0500 Subject: [PATCH 188/270] DASB: VALL-E: Update max length --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 4 ++-- .../LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index c811e1c7f..f9d07b443 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -97,7 +97,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 5920c4be3..9d9e65b85 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -116,7 +116,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 16000 -max_audio_length: 2000 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 54c357b52..a4a19ae6b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 120208cd8..714cf91b5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 8383cf0f7..c1e3f1e3a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -97,7 +97,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + @@ -160,7 +160,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 -vocab_size: 1024 +vocab_size: 2048 audio_emb_freeze: False audio_emb_pretrained: False text_num_tokens: 39 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 5d0908c0e..24c494a98 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 17cbc987d..e98056db3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + From 7463474f4ceb00028992bb22b96183e38c9e446d Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 01:53:11 -0500 Subject: [PATCH 189/270] DASB: Fix WavTokenizer --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 1f36093d9..92df5c1a1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -106,9 +106,11 @@ def compute_forward(self, batch, stage): batch = batch.to(self.device) prompt, prompt_length = batch.prompt batch_size, prompt_max_len, num_tracks = prompt.shape - nar_track = torch.randint( - 1, num_tracks, (batch_size,), device=self.device - ) + nar_track = None + if self.train_nar: + 
nar_track = torch.randint( + 1, num_tracks, (batch_size,), device=self.device + ) logits_ar, logits_nar = self.modules.model( dec_seq=batch.prompt.data, dec_seq_lengths=batch.prompt.lengths, From 05f80142f3b734154642830da8874581ff7f9c4e Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 03:01:59 -0500 Subject: [PATCH 190/270] DASB: VALL-E: Add speaker prompt resampling --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 92df5c1a1..6524ee6a5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -272,6 +272,8 @@ def on_stage_start(self, stage, epoch): self.token_model_kwargs = getattr( self.hparams, "token_model_kwargs", {} ) + dataset = stage.name.lower() + self.resample_fn[dataset](epoch=epoch or 0) def apply_curriculum(self): """Applies curriculum settings, if specified, training only the autoregressive part - or @@ -877,7 +879,7 @@ def sig_pipeline(wav): "sorting must be random, ascending or descending" ) - return datasets + return datasets, resample_fn def sample_dataset(dataset, count, seed): @@ -1283,7 +1285,7 @@ def undo_padding_tensor(batch, lengths): ) # We can now directly create the datasets for training, valid, and test - datasets = dataio_prepare(hparams) + datasets, resample_fn = dataio_prepare(hparams) # Apply overfit test settings datasets = apply_overfit_test(hparams, datasets) @@ -1298,6 +1300,8 @@ def undo_padding_tensor(batch, lengths): checkpointer=hparams["checkpointer"], ) + tts_brain.resample_fn = resample_fn + # The `fit()` method iterates the training loop, calling the methods # necessary to update the parameters of the model. Since all objects # with changing state are managed by the Checkpointer, training can be From f94c61b0c9e5275c7b36f9acc7cba3a9e8cb0b0b Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 11:34:50 -0500 Subject: [PATCH 191/270] DASB: VALL-E: Add SQCodec --- .../TTS/valle/hparams/train_sqcodec.yaml | 234 ++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..337754bf5 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,234 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/sqcodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 2048 
+text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 4 +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref From 398304e2f6f5daa1e85e9ed8fb99399d135d3a65 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 16:35:54 -0500 Subject: [PATCH 192/270] DASB: Tokotron: Update SQ-Codec ternary coding --- benchmarks/DASB/model/Tokotron.py | 309 ++++++------------------------ 1 file changed, 58 insertions(+), 251 deletions(-) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index d86d52273..122b80584 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -25,7 +25,7 @@ from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear -from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, bce_loss +from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, nll_loss from speechbrain.dataio.dataio import length_to_mask from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler @@ -418,16 +418,21 @@ class TernaryPredictionHead(nn.Module): num_positions : int the number of positions """ - def __init__(self, d_model, num_positions): + def __init__(self, d_model, num_positions, d_hidden=512): super().__init__() self.num_positions = num_positions self.d_model = d_model self.num_positions = num_positions - self.lin_p = Linear( + self.lin_hidden = Linear( input_size=d_model, - n_neurons=num_positions * 2 + n_neurons=d_hidden, + ) + self.act = nn.LeakyReLU() + self.lin_p = Linear( + input_size=d_hidden, + n_neurons=num_positions * 3, + bias=False ) - self.sigmoid = nn.Sigmoid() def forward(self, x): """Computes the forward pass @@ -440,13 +445,18 @@ def forward(self, x): Returns ------- p : torch.Tensor - A tensor of shape (Batch x Length x num_positions x 2) where - p[:, :, :, 0] -> the probability of the ternary 
digit being at least 0 - p[:, :, :, 0] -> the probability of the ternary digit being at least 1 + A tensor of shape (Batch x Length x num_positions x ternary digit) + The values are logits (unnormalized probabilities) + + p[:, :, :, 0] corresponds to -1 + p[:, :, :, 1] corresponds to 0 + p[:, :, :, 2] corresponds to 1 """ batch_size, max_len, _ = x.shape - p = self.sigmoid(self.lin_p(x)) - p = p.reshape(batch_size, max_len, self.num_positions, 2) + x = self.lin_hidden(x) + x = self.act(x) + x = self.lin_p(x) + p = x.reshape(batch_size, max_len, self.num_positions, 3) return p @@ -1960,183 +1970,6 @@ def all_weights(self): return torch.stack([emb.weight for emb in self.emb]) -class DACFeatureExtractor(nn.Module): - """An adapter for feature extraction - - Arguments - --------- - dac : DAC - a DAC model - """ - - def __init__(self, dac, n_quantizers): - super().__init__() - self.dac = dac - self.dac.eval() - self.n_quantizers = n_quantizers - - def encode(self, inputs, length): - """Encodes a raw audio sample using DAC - - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : torch.Tensor - A tensor of relative lengths - - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers - - """ - if inputs.dim() < 3: - inputs = inputs.unsqueeze(1) - emb, codes, _, _, _ = self.dac.encode( - inputs, n_quantizers=self.n_quantizers - ) - emb.transpose_(1, 2) - codes.transpose_(1, 2) - max_len = emb.size(1) - mask = length_to_mask( - length * max_len, max_len, device=inputs.device - ).unsqueeze(-1) - return codes * mask, emb * mask - - def forward(self, inputs, length): - """Encodes a raw audio sample using DAC - - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : torch.Tensor - A tensor of relative lengths - - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers - - """ - return self.encode(inputs, length) - - def embeddings(self, tokens): - """Converts token indexes to vector embeddings - - Arguments - --------- - tokens : torch.Tensor - a (Batch x Length x Heads) tensor of token indexes - - Returns - ------- - emb : torch.Tensor - a (Batch x Length x Heads x Embedding) tensor - of raw vector embeddings from the model's - quantizer codebooks - """ - emb, _, _ = self.dac.quantizer.from_codes(tokens.transpose(1, 2).int()) - return emb.transpose(1, 2) - - -class SpeechTokenizerFeatureExtractor(nn.Module): - """This lobe enables the integration of HuggingFace and SpeechBrain - pretrained SpeechTokenizer. - - Please, install speechtokenizer: - pip install speechtokenizer - - Source paper: https://arxiv.org/abs/2308.16692 - - - The model can be used as a fixed Discrete feature extractor or can be finetuned. It - will download automatically the model from HuggingFace or use a local path. 
- - Arguments - --------- - speech_tokenizer : speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - The speech tokenizer interface - codebooks : int, optional - The number of codebooks to use - if omitted, - """ - - def __init__(self, speech_tokenizer, codebooks=None): - super().__init__() - self.speech_tokenizer = speech_tokenizer - self.codebooks = codebooks - - def forward(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. - - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. - - Returns - ------- - tokens : torch.Tensor - A tensor of audio tokens - Shape: (N_q x Batch x Time) by default - (Batch x Time x N_q) if shape == compat - - """ - return self.encode(wav, wav_lens) - - def encode(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. - - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. - - Returns - ------- - tokens : torch.Tensor - A (Batch x Seq, N_q) tensor of audio tokens - - """ - # Extract discrete codes from SpeechTokenizer - codes = self.speech_tokenizer.encode( - wav.unsqueeze(1), wav_lens - ) # codes: (n_q, B, T) - if self.codebooks is not None: - codes = codes[: self.codebooks] - codes = codes.permute(1, 2, 0) - return codes - - def decode(self, codes): - """Takes an input waveform and return its corresponding wav2vec encoding. - - Arguments - --------- - tokens : torch.Tensor - A (N_q, Batch x Seq) tensor of audio tokens - - Returns - ------- - wav : torch.Tensor (signal) - A batch of reconstructed audio signals. - """ - codes = codes.permute(2, 0, 1) - return self.speech_tokenizer.decode(codes) - - def get_silence_token( model, sample_length=100000, @@ -2335,67 +2168,22 @@ def use_silence_padding(dataloader_opts, silence_token, token_keys): } -def ternary_matrix_to_decimal(matrix): - """ - Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. - - Arguments - --------- - matrix : numpy.ndarray - A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number - of ternary digits, and N is the number of ternary numbers in each batch. - - Returns - ------- - numpy.ndarray - A 2D numpy array of shape (B, N), where each value represents the decimal - equivalent of the corresponding ternary number in the input matrix. 
- """ - ( - B, - D, - N, - ) = ( - matrix.shape - ) # B is the batch size, D is the number of digits, N is the number of ternary numbers - powers_of_three = 3 **torch.arange(D) # [3^0, 3^1, ..., 3^(D-1)] - - # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] - powers_of_three = powers_of_three[:, None] # Shape [D, 1] - - # Compute dot product using broadcasting: matrix * powers_of_three along D axis - decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis - - return decimals - - def logits_to_ternary(logits): """Converts a tensor with two logits to a ternary matrix Arguments --------- logits : torch.Tensor - The logits (Batch x Length x num_positions x 2) + The logits (Batch x Length x num_positions x 3) Returns ------- result : torch.Tensor The corresponding ternary matrix """ - gte0 = logits[..., 0] >= 0.5 - gte1 = logits[..., 1] >= 0.5 - val_minus_1 = torch.tensor(-1, device=logits.device) - val_zero = torch.tensor(0, device=logits.device) - val_plus_1 = torch.tensor(1, device=logits.device) - return torch.where( - gte0, - torch.where( - gte1, - val_plus_1, - val_zero - ), - val_minus_1 - ) + ternary = logits.argmax(-1) - 1 + return ternary + def ternary_matrix_to_decimal(matrix): """ @@ -2433,7 +2221,7 @@ def ternary_matrix_to_decimal(matrix): def ternary_to_decimal(ternary, n_codebook=4): """Converts ternary digits to their decimal equivalent - + Arguments --------- ternary : torch.Tensor @@ -2479,7 +2267,9 @@ def tokens_to_ternary(tokens): Returns ------- - result : t""" + result : torch.Tensor + A (Batch x Length x Ternary Positions) tensor + with values of (-1, 0, 1)""" batch_size = tokens.size(0) n_codebook = tokens.size(2) tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() @@ -2491,19 +2281,36 @@ def tokens_to_ternary(tokens): def ternary_loss(predictions, targets, length=None, reduction="mean"): - tgt_gte0 = targets >= 0. - tgt_gte1 = targets >= 1. 
- loss_gte0 = bce_loss( - predictions[:, :, :, 0], - tgt_gte0, - length=length, - reduction=reduction, + batch_size, max_len, positions = targets.shape + predictions_reshaped = ( + predictions + .permute(2, 0, 1, 3) + .reshape(batch_size * positions, max_len, 3) ) - loss_gte1 = bce_loss( - predictions[:, :, :, 0], - tgt_gte1, - length=length, - reduction=reduction, + targets_cat = targets + 1 + targets_cat_reshaped = ( + targets_cat + .permute(2, 0, 1) + .reshape(batch_size * positions, max_len) + ) + length_reshaped = ( + length.unsqueeze(-1) + .expand(batch_size, positions) + .permute(1, 0) + .reshape(batch_size * positions) ) - loss = loss_gte0 + loss_gte1 + loss = nll_loss( + log_probabilities=predictions_reshaped, + targets=targets_cat_reshaped, + length=length_reshaped, + reduction=reduction + ) + if reduction == "batch": + loss = ( + loss + .reshape(positions, batch_size) + .permute(1, 0) + .mean(1) + ) + return loss \ No newline at end of file From c90037c388477239dfbf13032aad1d96f622383b Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 26 Feb 2025 13:26:47 -0500 Subject: [PATCH 193/270] DASB: Add the ability to disable test runs --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 34 ++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 6524ee6a5..5cbca2493 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1315,20 +1315,20 @@ def undo_padding_tensor(batch, lengths): ) # Load best checkpoint for evaluation - - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" - if test_summary_file.exists(): - logging.info("Test run already completed: %s", test_summary_file) - else: - test_key_kind = hparams["test_key_kind"] - test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } - eval_dataset = datasets["test"] - eval_dataset = select_eval_subset(eval_dataset, hparams) - tts_brain.evaluate( - test_set=eval_dataset, - test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs - ) + if hparams["testing"]: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + eval_dataset = datasets["test"] + eval_dataset = select_eval_subset(eval_dataset, hparams) + tts_brain.evaluate( + test_set=eval_dataset, + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) From 131eea389f90900b77bdee272567a860244016f8 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 27 Feb 2025 11:35:49 -0500 Subject: [PATCH 194/270] DASB: Tokotron: Update ternary loss aggregation --- benchmarks/DASB/model/Tokotron.py | 43 ++++++++++--------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 122b80584..764c4e34e 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -2282,35 +2282,20 @@ def tokens_to_ternary(tokens): def ternary_loss(predictions, targets, length=None, reduction="mean"): batch_size, max_len, positions = targets.shape - predictions_reshaped = ( - predictions - .permute(2, 0, 1, 3) - .reshape(batch_size * positions, max_len, 3) - ) 
targets_cat = targets + 1 - targets_cat_reshaped = ( - targets_cat - .permute(2, 0, 1) - .reshape(batch_size * positions, max_len) - ) - length_reshaped = ( - length.unsqueeze(-1) - .expand(batch_size, positions) - .permute(1, 0) - .reshape(batch_size * positions) - ) - loss = nll_loss( - log_probabilities=predictions_reshaped, - targets=targets_cat_reshaped, - length=length_reshaped, - reduction=reduction + predictions_loss = predictions.permute(0, 3, 1, 2) + loss = nn.functional.nll_loss( + predictions_loss, + targets_cat, + reduction="none" ) - if reduction == "batch": - loss = ( - loss - .reshape(positions, batch_size) - .permute(1, 0) - .mean(1) - ) - + mask = length_to_mask( + length * max_len, + max_len + ).unsqueeze(-1) + loss = loss * mask + if reduction == "mean": + loss = loss.sum(2).mean(1).mean(0) / 3.0 + elif reduction == "batch": + loss = loss.sum(2).mean(1) / 3.0 return loss \ No newline at end of file From 7c5e82f4605a23413203baa56ea001b1e8eee3a3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 27 Feb 2025 13:25:36 -0500 Subject: [PATCH 195/270] DASB: Fix an issue with contiguous tensors --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 2 +- benchmarks/DASB/model/Tokotron.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 3e6b356d4..4c5d12b94 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -229,7 +229,7 @@ def compute_objectives(self, predictions, batch, stage): input_length=batch.tokens.lengths, reduction="batch", ) - return loss_details.loss + return loss_details.loss.contiguous() def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch. diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 764c4e34e..7d250e4cc 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -2283,7 +2283,7 @@ def tokens_to_ternary(tokens): def ternary_loss(predictions, targets, length=None, reduction="mean"): batch_size, max_len, positions = targets.shape targets_cat = targets + 1 - predictions_loss = predictions.permute(0, 3, 1, 2) + predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() loss = nn.functional.nll_loss( predictions_loss, targets_cat, @@ -2298,4 +2298,4 @@ def ternary_loss(predictions, targets, length=None, reduction="mean"): loss = loss.sum(2).mean(1).mean(0) / 3.0 elif reduction == "batch": loss = loss.sum(2).mean(1) / 3.0 - return loss \ No newline at end of file + return loss From 7046db00b85d6ab643e7ec6f1ca5b57a2de9d8c7 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 28 Feb 2025 13:03:46 -0500 Subject: [PATCH 196/270] DASB: Tokotron: SQ-Codec Add the ability to bypass additional ternary projections --- .../TTS/tokotron/hparams/train_sqcodec.yaml | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 4ea1ba387..6b9782eeb 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -17,7 +17,7 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. 
config: config.yaml checkpoint: ckpt_00190000.pth -sq_codec_save_path: !ref /sq-codec +sq_codec_save_path: !ref /sq-codec g2p_src: flexthink/soundchoice-g2p # Model type @@ -110,6 +110,8 @@ gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp gamma: !ref max_weight: !ref +ternary_input_mode: embedding + silence_padding: !ref # Token model (pretrained) @@ -156,7 +158,11 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU audio_num_tokens: 19683 -audio_emb_size: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + embedding: 36 + projection: 1024 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False @@ -210,9 +216,13 @@ inference: !new:model.Tokotron.TokotronTransformerAutoregressiveInference transform_audio: !name:model.Tokotron.tokens_to_ternary feed_audio: !name:model.Tokotron.ternary_logits_to_tokens -audio_emb: !new:model.Tokotron.TernaryInput - emb_size: !ref - num_positions: !ref +audio_emb: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + projection: !new:model.Tokotron.TernaryInput + emb_size: !ref + num_positions: !ref + embedding: !new:torch.nn.Identity out_proj: !new:model.Tokotron.TernaryPredictionHead d_model: !ref From ebe181172ca065c12769fb6b2da19345e2607a9e Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 28 Feb 2025 23:38:58 -0500 Subject: [PATCH 197/270] DASB: Tokotron: Fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 4c5d12b94..ec1845d36 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -934,6 +934,7 @@ def apply_overfit_test(hparams, dataset): if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: + eval_kwargs = {} test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] if test_key: From dae8bcb82ef7be7cc838f1cf82823bd486d23646 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 1 Mar 2025 00:02:38 -0500 Subject: [PATCH 198/270] DASB: Fixes: SQ-Codec refactoring (decouple from Tokotron, simplify) --- .../TTS/tokotron/hparams/train_sqcodec.yaml | 26 +-- benchmarks/DASB/model/Tokotron.py | 210 +----------------- benchmarks/DASB/model/custom_model.py | 55 +++++ benchmarks/DASB/model/sq_codec.py | 147 ++++++++++++ 4 files changed, 211 insertions(+), 227 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 6b9782eeb..f0ab3d9c1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -110,8 +110,6 @@ gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp gamma: !ref max_weight: !ref -ternary_input_mode: embedding - silence_padding: !ref # Token model (pretrained) @@ -145,7 +143,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref -transform_audio: !name:model.Tokotron.tokens_to_ternary +transform_audio: !name:model.sq_codec.tokens_to_ternary ####################### Model parameters ########################### # Transformer @@ -158,11 +156,7 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU audio_num_tokens: 19683 -audio_emb_size: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - embedding: 36 - projection: 1024 
+audio_emb_size: 36 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False @@ -213,18 +207,12 @@ inference: !new:model.Tokotron.TokotronTransformerAutoregressiveInference audio_token_shift: 0 max_steps: !ref representation_mode: !ref - transform_audio: !name:model.Tokotron.tokens_to_ternary - feed_audio: !name:model.Tokotron.ternary_logits_to_tokens + transform_audio: !name:model.sq_codec.tokens_to_ternary + feed_audio: !name:model.sq_codec.ternary_logits_to_tokens -audio_emb: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - projection: !new:model.Tokotron.TernaryInput - emb_size: !ref - num_positions: !ref - embedding: !new:torch.nn.Identity +audio_emb: !new:torch.nn.Identity -out_proj: !new:model.Tokotron.TernaryPredictionHead +out_proj: !new:model.custom_model.TernaryPredictionHead d_model: !ref num_positions: !ref @@ -249,7 +237,7 @@ compute_cost: !new:model.Tokotron.TokotronLoss gate_gamma: !ref gate_max_weight: !ref silence_padding: !ref - seq_cost: !name:model.Tokotron.ternary_loss + seq_cost: !name:model.sq_codec.ternary_loss multihead_output: False lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 7d250e4cc..6a2de5859 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -29,7 +29,6 @@ from speechbrain.dataio.dataio import length_to_mask from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler -from model.sq_codec import decimal_to_ternary_matrix from enum import Enum from collections import namedtuple @@ -407,78 +406,6 @@ def init_audio_emb(self, emb): self.audio_emb.initialize(emb) -class TernaryPredictionHead(nn.Module): - """An alternative prediction head that predicts a fixed number of ternary digits - for each position (as used in SQ-Codec) - - Arguments - --------- - d_model : int - The model dimension - num_positions : int - the number of positions - """ - def __init__(self, d_model, num_positions, d_hidden=512): - super().__init__() - self.num_positions = num_positions - self.d_model = d_model - self.num_positions = num_positions - self.lin_hidden = Linear( - input_size=d_model, - n_neurons=d_hidden, - ) - self.act = nn.LeakyReLU() - self.lin_p = Linear( - input_size=d_hidden, - n_neurons=num_positions * 3, - bias=False - ) - - def forward(self, x): - """Computes the forward pass - - Arguments - --------- - x : torch.Tensor - The decoder output (Batch x Length x d_model) - - Returns - ------- - p : torch.Tensor - A tensor of shape (Batch x Length x num_positions x ternary digit) - The values are logits (unnormalized probabilities) - - p[:, :, :, 0] corresponds to -1 - p[:, :, :, 1] corresponds to 0 - p[:, :, :, 2] corresponds to 1 - """ - batch_size, max_len, _ = x.shape - x = self.lin_hidden(x) - x = self.act(x) - x = self.lin_p(x) - p = x.reshape(batch_size, max_len, self.num_positions, 3) - return p - - -class TernaryInput(nn.Module): - def __init__(self, emb_size, num_positions): - super().__init__() - self.num_positions = num_positions - self.in_proj = Linear( - input_size=num_positions * 3, - n_neurons=emb_size, - ) - - def forward(self, x): - batch_size, max_len = x.shape[:2] - x_onehot = torch.nn.functional.one_hot( - (x + 1).long(), - 3 - ).reshape(batch_size, max_len, self.num_positions * 3) - in_proj = self.in_proj(x_onehot.float()) - return in_proj - - class TokotronTransformerAutoregressiveInference(nn.Module): """A greedy 
autoregressive inference implementation @@ -2076,7 +2003,7 @@ def feature_pad_to(tensor, length, padding=None): def batch_feature_pad(tensors, padding=None): - """Similar to batch_pad_right but pads with the specified padding, whcih + """Similar to batch_pad_right but pads with the specified padding, which can be a vector or a tensor Arguments @@ -2165,137 +2092,4 @@ def use_silence_padding(dataloader_opts, silence_token, token_keys): "collate_fn": partial( token_collate_fn, silence_token=silence_token, token_keys=token_keys ), - } - - -def logits_to_ternary(logits): - """Converts a tensor with two logits to a ternary matrix - - Arguments - --------- - logits : torch.Tensor - The logits (Batch x Length x num_positions x 3) - - Returns - ------- - result : torch.Tensor - The corresponding ternary matrix - """ - ternary = logits.argmax(-1) - 1 - return ternary - - -def ternary_matrix_to_decimal(matrix): - """ - Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. - - Arguments - --------- - matrix : numpy.ndarray - A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number - of ternary digits, and N is the number of ternary numbers in each batch. - - Returns - ------- - numpy.ndarray - A 2D numpy array of shape (B, N), where each value represents the decimal - equivalent of the corresponding ternary number in the input matrix. - """ - ( - B, - D, - N, - ) = ( - matrix.shape - ) # B is the batch size, D is the number of digits, N is the number of ternary numbers - powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] - - # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] - powers_of_three = powers_of_three[:, None] # Shape [D, 1] - - # Compute dot product using broadcasting: matrix * powers_of_three along D axis - decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis - - return decimals - - -def ternary_to_decimal(ternary, n_codebook=4): - """Converts ternary digits to their decimal equivalent - - Arguments - --------- - ternary : torch.Tensor - (Batch x Length x num_positions) - ternary digits - n_codebooks : torch.Tensor - The number of coedbooks""" - chunks = ternary.chunk(n_codebook, dim=1) - codec_ls = [] - # TODO: Vectorize - for i, chunk in enumerate(chunks): - chunk = chunk + 1 - tmp_codec = ternary_matrix_to_decimal(chunk) - codec_ls.append(tmp_codec) - codec_ls = torch.stack(codec_ls) - return codec_ls.permute(1, 2, 0) - - -def ternary_logits_to_tokens(logits): - """Converts ternary logits to tokens (as used for SQ-Codec) - - Arguments - --------- - logits : torch.Tensor - The logits - - Returns - ------- - tokens : torch.Tensor - Token IDs - """ - ternary_matrix = logits_to_ternary(logits) - tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2)) - return tokens - - -def tokens_to_ternary(tokens): - """Converts a sequence of tokens to a ternary matrix - - Arguments - --------- - tokens : torch.Tensor - A (Batch x Length x Codebooks) tensor of tokens - - Returns - ------- - result : torch.Tensor - A (Batch x Length x Ternary Positions) tensor - with values of (-1, 0, 1)""" - batch_size = tokens.size(0) - n_codebook = tokens.size(2) - tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() - ternary_matrix = torch.cat([ - decimal_to_ternary_matrix(item, D=9) - 1 - for item in tokens - ], dim=1) - return ternary_matrix.transpose(1, 2) - - -def ternary_loss(predictions, targets, length=None, reduction="mean"): - batch_size, max_len, positions = 
targets.shape - targets_cat = targets + 1 - predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() - loss = nn.functional.nll_loss( - predictions_loss, - targets_cat, - reduction="none" - ) - mask = length_to_mask( - length * max_len, - max_len - ).unsqueeze(-1) - loss = loss * mask - if reduction == "mean": - loss = loss.sum(2).mean(1).mean(0) / 3.0 - elif reduction == "batch": - loss = loss.sum(2).mean(1) / 3.0 - return loss + } \ No newline at end of file diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 972d35c66..e5a9db761 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,4 +1,5 @@ import torch +from speechbrain.nnet.linear import Linear class AttentionMLP(torch.nn.Module): @@ -109,3 +110,57 @@ def forward(self, in_tokens): if self.proj_layer is not None: in_embs = self.proj_layer(in_embs) return in_embs + + +class TernaryPredictionHead(torch.nn.Module): + """An alternative prediction head that predicts a fixed number of ternary digits + for each position (as used in SQ-Codec) + + Arguments + --------- + d_model : int + The model dimension + num_positions : int + the number of positions + """ + def __init__(self, d_model, num_positions, d_hidden=512): + super().__init__() + self.num_positions = num_positions + self.d_model = d_model + self.num_positions = num_positions + self.lin_hidden = Linear( + input_size=d_model, + n_neurons=d_hidden, + ) + self.act = torch.nn.LeakyReLU() + self.lin_p = Linear( + input_size=d_hidden, + n_neurons=num_positions * 3, + bias=False + ) + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The decoder output (Batch x Length x d_model) + + Returns + ------- + p : torch.Tensor + A tensor of shape (Batch x Length x num_positions x ternary digit) + The values are logits (unnormalized probabilities) + + p[:, :, :, 0] corresponds to -1 + p[:, :, :, 1] corresponds to 0 + p[:, :, :, 2] corresponds to 1 + """ + batch_size, max_len, _ = x.shape + x = self.lin_hidden(x) + x = self.act(x) + x = self.lin_p(x) + p = x.reshape(batch_size, max_len, self.num_positions, 3) + return p + diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 7901675e1..d0b850056 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -21,6 +21,8 @@ from torch.autograd import Function from torch.nn.utils import remove_weight_norm, weight_norm +from speechbrain.dataio.dataio import length_to_mask + class SQCodec(nn.Module): """ @@ -1342,6 +1344,41 @@ def ternary_matrix_to_decimal(matrix): return decimals +def ternary_matrix_to_decimal_torch(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. 
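+
+    Example
+    -------
+    A small worked illustration (illustrative values only), with B=1 batch,
+    D=2 ternary digits and N=2 numbers per batch:
+
+    >>> matrix = torch.tensor([[[1, 0], [2, 1]]])
+    >>> ternary_matrix_to_decimal_torch(matrix)
+    tensor([[7, 3]])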
+ """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, None] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + + def get_padding(kernel_size, dilation=1): """ Computes the padding size for a given kernel size and dilation. @@ -1359,3 +1396,113 @@ def get_padding(kernel_size, dilation=1): Calculated padding size. """ return int((kernel_size * dilation - dilation) / 2) + + +def ternary_to_decimal(ternary, n_codebook=4): + """Converts ternary digits to their decimal equivalent + + Arguments + --------- + ternary : torch.Tensor + (Batch x Length x num_positions) - ternary digits + n_codebooks : torch.Tensor + The number of codebooks + + Returns + ------- + result: torch.Tensor + the result (Batch x Length x codebooks) + """ + chunks = ternary.chunk(n_codebook, dim=1) + codec_ls = [] + # TODO: Vectorize + for i, chunk in enumerate(chunks): + chunk = chunk + 1 + tmp_codec = ternary_matrix_to_decimal_torch(chunk) + codec_ls.append(tmp_codec) + codec_ls = torch.stack(codec_ls) + return codec_ls.permute(1, 2, 0) + + +def ternary_logits_to_tokens(logits): + """Converts ternary logits to tokens (as used for SQ-Codec) + + Arguments + --------- + logits : torch.Tensor + The logits + + Returns + ------- + tokens : torch.Tensor + Token IDs + """ + ternary_matrix = logits_to_ternary(logits) + tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2)) + return tokens + + +def tokens_to_ternary(tokens): + """Converts a sequence of tokens to a ternary matrix + + Arguments + --------- + tokens : torch.Tensor + A (Batch x Length x Codebooks) tensor of tokens + + Returns + ------- + result : torch.Tensor + A (Batch x Length x Ternary Positions) tensor + with values of (-1, 0, 1)""" + has_batch = tokens.dim() > 2 + if not has_batch: + tokens = tokens.unsqueeze(0) + batch_size = tokens.size(0) + n_codebook = tokens.size(2) + tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() + ternary_matrix = torch.cat([ + decimal_to_ternary_matrix(item, D=9) - 1 + for item in tokens + ], dim=1) + ternary_matrix = ternary_matrix.transpose(1, 2) + if not has_batch: + ternary_matrix = ternary_matrix[0] + return ternary_matrix + + +def logits_to_ternary(logits): + """Converts a tensor with two logits to a ternary matrix + + Arguments + --------- + logits : torch.Tensor + The logits (Batch x Length x num_positions x 3) + + Returns + ------- + result : torch.Tensor + The corresponding ternary matrix + """ + ternary = logits.argmax(-1) - 1 + return ternary + +def ternary_loss(predictions, targets, length=None, reduction="mean"): + batch_size, max_len, positions = targets.shape + targets_cat = targets + 1 + predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() + loss = nn.functional.nll_loss( + predictions_loss, + targets_cat, + reduction="none" + ) + mask = length_to_mask( + length * max_len, + max_len + ).unsqueeze(-1) + loss = loss * mask + if reduction == "mean": + loss = loss.sum(2).mean(1).mean(0) / 3.0 + elif reduction == "batch": + loss = loss.sum(2).mean(1) / 3.0 + return loss From 9b09d2049f5ee292574bdbf1b38347ea1da862f4 Mon Sep 17 00:00:00 2001 From: flexthink Date: 
Tue, 4 Mar 2025 11:09:50 -0500 Subject: [PATCH 199/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 5cbca2493..695937f36 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -19,7 +19,7 @@ from pathlib import Path from hyperpyyaml import load_hyperpyyaml from speechbrain.dataio.dataio import ( - clean_padding_, + clean_padding, length_to_mask, write_audio, ) @@ -84,7 +84,7 @@ def create_waveform(self, audio, length): wav = tokenizer.tokens_to_sig( audio, **self.token_model_kwargs ) - clean_padding_(wav, length) + wav = clean_padding(wav, length) wav = wav.to(self.device) return wav From 4c4663db840be8e2b84b3e56ca1e216d06ec01db Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 15:13:17 -0500 Subject: [PATCH 200/270] DASB: Update VALL-E for SQCodec --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 12 +++--- benchmarks/DASB/model/sq_codec.py | 44 ++++++++++++++++----- benchmarks/DASB/model/valle.py | 15 +++++-- 3 files changed, 52 insertions(+), 19 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index eeb3a9d6b..771d4b14a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -159,7 +159,7 @@ def compute_objectives(self, predictions, batch, stage): logits_ar_sm = self.hparams.log_softmax(logits_ar) targets_ar = prompt[:, 1:, 0] loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask + logits_ar_sm, targets=targets_ar, mask=mask ) loss_components.append(loss_ar) else: @@ -168,7 +168,7 @@ def compute_objectives(self, predictions, batch, stage): logits_nar_sm = self.hparams.log_softmax(logits_nar) targets_nar = prompt[batch_idx, 1:, nar_track] loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, + logits_nar_sm, targets=targets_nar, mask=mask, ) loss_components.append(loss_nar) else: @@ -218,12 +218,12 @@ def compute_loss_stats( stats = {} if self.train_ar: stats["loss_ar"] = self.hparams.compute_cost( - log_probabilities=logits_ar, targets=targets_ar, mask=mask, + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, ) if self.train_nar: stats["loss_nar"] = self.hparams.compute_cost( - log_probabilities=logits_nar, targets=targets_nar, mask=mask, + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, ) return stats @@ -258,6 +258,7 @@ def on_stage_start(self, stage, epoch): elif stage == sb.Stage.TEST: self.evaluation_metric.on_evaluation_start() self.is_evaluating = True + self.transform_audio = getattr(self.hparams, "transform_audio", None) def apply_curriculum(self): """Applies curriculum settings, if specified, training only the autoregressive part - or @@ -572,7 +573,7 @@ def dataio_prepare(hparams): "valid": hparams["valid_json"], "test": hparams["test_json"], } - + label_encoder = hparams["label_encoder"] input_feature = INPUT_FEATURE_MAP[hparams["input"]] offsets = get_offsets( @@ -606,7 +607,6 @@ def prompt_pipeline(id, tokens): audio = tokens_loader.tokens_by_uttid( id, num_codebooks=hparams["audio_tokens_per_step"] ) - if hparams["flip_layers"]: audio = audio.flip(-1) yield audio diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index d0b850056..ce8764af7 
100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1283,6 +1283,23 @@ def forward(self, x): return x +class TernaryEmbedding(nn.Module): + """A module wrapper for tokens-to-ternary conversion + + Arguments + --------- + tokens : torch.Tensor + the tokens""" + def forward(self, tokens): + if tokens.dim() < 3: + tokens = tokens.unsqueeze(-1) + batch_size, max_len, tracks = tokens.shape + emb = tokens_to_ternary(tokens).float() + positions = emb.size(-1) + emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) + return emb + + def decimal_to_ternary_matrix(decimals, D): """ Convert a tensor of decimal numbers to a D*T ternary matrix for each batch. @@ -1378,7 +1395,6 @@ def ternary_matrix_to_decimal_torch(matrix): return decimals - def get_padding(kernel_size, dilation=1): """ Computes the padding size for a given kernel size and dilation. @@ -1444,12 +1460,12 @@ def ternary_logits_to_tokens(logits): def tokens_to_ternary(tokens): """Converts a sequence of tokens to a ternary matrix - + Arguments --------- tokens : torch.Tensor A (Batch x Length x Codebooks) tensor of tokens - + Returns ------- result : torch.Tensor @@ -1487,7 +1503,12 @@ def logits_to_ternary(logits): ternary = logits.argmax(-1) - 1 return ternary -def ternary_loss(predictions, targets, length=None, reduction="mean"): + +def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", reduction="mean"): + if targets.dim() < 3: + targets = targets.unsqueeze(-1) + if targets_type == "tokens": + targets = tokens_to_ternary(targets.unsqueeze(-1)) batch_size, max_len, positions = targets.shape targets_cat = targets + 1 predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() @@ -1496,13 +1517,16 @@ def ternary_loss(predictions, targets, length=None, reduction="mean"): targets_cat, reduction="none" ) - mask = length_to_mask( - length * max_len, - max_len - ).unsqueeze(-1) - loss = loss * mask + mask = None + if length is not None: + mask = length_to_mask( + length * max_len, + max_len + ).unsqueeze(-1) + if mask is not None: + loss = loss * mask if reduction == "mean": loss = loss.sum(2).mean(1).mean(0) / 3.0 elif reduction == "batch": loss = loss.sum(2).mean(1) / 3.0 - return loss + return loss \ No newline at end of file diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 3abcf057f..4cc155c2d 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -71,6 +71,10 @@ class ValleLM(nn.Module): Number of layers in NAR Transformer. n_ctx : int maximum context length of AR & NAR Transformer. 
+ lm_head : torch.nn.Module, optional + an alternative LM head implementation head, an alternative + to the default Linear, useful for non-trivial codecs, + such as SQ-Codec """ def __init__( @@ -86,11 +90,16 @@ def __init__( ar_layer=4, nar_layer=4, n_ctx=3000, + emb=None, + lm_head=None, ): super().__init__() - - self.emb = torch.nn.Embedding(vocab_size, att_unit) - self.lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) + if emb is None: + emb = torch.nn.Embedding(vocab_size, att_unit) + self.emb = emb + if lm_head is None: + lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) + self.lm_head = lm_head if share_emb: self.lm_head.weight = self.emb.weight From 6af2d83c28d3de453ac36e8011b50fac700baebe Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 15:14:25 -0500 Subject: [PATCH 201/270] DASB: Fixes / clean-up --- benchmarks/DASB/LJSpeech/ljspeech_prepare.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index 08d7297e5..416c63010 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -197,7 +197,6 @@ def prepare_ljspeech( model_name, data_split["train"], save_json_train, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -217,7 +216,6 @@ def prepare_ljspeech( model_name, data_split["valid"], save_json_valid, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -237,7 +235,6 @@ def prepare_ljspeech( model_name, data_split["test"], save_json_test, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -391,7 +388,6 @@ def prepare_json( model_name, seg_lst, json_file, - data_folder, wavs_folder, csv_reader, phoneme_alignments_folder, @@ -437,14 +433,8 @@ def prepare_json( Max f0 for pitch computation use_custom_cleaner : bool If True, uses custom cleaner defined for this recipe - extract_features : list, optional - If specified, feature extraction will be performed - extract_features_context : types.SimpleNamespace, optional - Context for feature extraction (pretrained models, etc) - extract_features_folder : path-like, optional - The folder where extracted features will be saved - extract_features_opts : dict, optional - Options for feature extraction + extract_phonemes : bool + Whether to extract phonemes g2p_src : str The name of the HuggingFace Hub to use for the Grapheme-to-Phoneme model or the path to it From 8c6a886876445a716cdf4eab61fef3ea69bbd83f Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 16:38:59 -0500 Subject: [PATCH 202/270] DASB: SQ-Codec: Make the special loss optional --- .../TTS/valle/hparams/train_sqcodec.yaml | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 337754bf5..d1d584bcc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -175,6 +175,8 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 4 +ternary_num_digits: 9 +pred_mode: ternary freeze_lm_head: False @@ -191,6 +193,8 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length dropout: !ref share_emb: !ref qk_norm: !ref + emb: !ref + lm_head: !ref inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref 
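For context on the ternary branch this patch makes optional: as configured in this file, the `emb` override always feeds VALL-E ternary digits rather than a learned token embedding (a `TernaryEmbedding` followed by a linear projection to `d_model`), while `pred_mode` selects the output side. `pred_mode: ternary` uses a `TernaryPredictionHead` (three logits per ternary digit) trained with `model.sq_codec.ternary_loss`; `pred_mode: tokens` falls back to the model's default linear head and `model.valle.masked_nll_loss`. The mapping between SQ-Codec token ids and ternary digits is a plain base-3 decomposition with digits shifted to {-1, 0, 1}; a minimal sketch of that mapping follows (the helper names are illustrative only, the actual implementation lives in `model/sq_codec.py`):

def token_to_ternary_digits(token_id, n_digits=9):
    # Base-3 digits shifted from {0, 1, 2} to {-1, 0, 1},
    # least-significant digit first (the 3**i weighting used by
    # ternary_matrix_to_decimal in model/sq_codec.py).
    digits = []
    for _ in range(n_digits):
        digits.append(token_id % 3 - 1)
        token_id //= 3
    return digits


def ternary_digits_to_token(digits):
    # Inverse mapping: shift back to {0, 1, 2} and re-weight by 3**i.
    return sum((d + 1) * 3 ** i for i, d in enumerate(digits))


# Spot-check the round trip over the 3**9 = 19683-code space used by SQ-Codec.
for token in (0, 1, 12345, 19682):
    assert ternary_digits_to_token(token_to_ternary_digits(token)) == token
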
@@ -199,6 +203,20 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions maxlenratio: !ref nq: !ref +lm_head: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + num_positions: !ref + tokens: null + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref checkpoint: !ref @@ -211,7 +229,12 @@ modules: opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !name:model.valle.masked_nll_loss +compute_cost: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !name:model.sq_codec.ternary_loss + targets_type: tokens + tokens: !name:model.valle.masked_nll_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True From 583f42a847c7ae1d474e721a292d4c110806c039 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 19:37:57 -0500 Subject: [PATCH 203/270] DASB: SQ Codec: Fixes --- benchmarks/DASB/model/sq_codec.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index ce8764af7..d8d52924f 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1284,19 +1284,25 @@ def forward(self, x): class TernaryEmbedding(nn.Module): - """A module wrapper for tokens-to-ternary conversion - - Arguments - --------- - tokens : torch.Tensor - the tokens""" + """A module wrapper for tokens-to-ternary conversion""" def forward(self, tokens): + """Computes the forward pass + + Arguments + --------- + tokens : torch.Tensor + the tokens + """ + squeeze = False if tokens.dim() < 3: + squeeze = True tokens = tokens.unsqueeze(-1) batch_size, max_len, tracks = tokens.shape emb = tokens_to_ternary(tokens).float() positions = emb.size(-1) emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) + if squeeze: + emb = emb.squeeze(-2) return emb From 7a011eb5bcdb2b30d302ef74a0fc99bd377a1992 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 22:00:07 -0500 Subject: [PATCH 204/270] DASB: SQCodec: Fixes --- .../DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 2 +- benchmarks/DASB/model/valle.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index d1d584bcc..cf99298a4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -152,7 +152,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 -vocab_size: 2048 +vocab_size: 19683 text_num_tokens: 39 phn_num_tokens: 52 diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 4cc155c2d..2ce0d5806 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -391,7 +391,10 @@ def inference( prev_tok = suffix[:, :, 0] else: prev_tok = gen_tokens_ar[:, :, 0] - start_emb = self.emb.weight[opts.start].tile( + start_token = torch.tensor( + [opts.start], device=prefix.device + )[None, None, :] + start_emb = self.emb(start_token).squeeze().tile( len(valid_idx), 1, 1 ) # [B, 1, D] prev_emb = 
torch.cat( From 24a401440e7b0735c2b064f8f0617c836f71f063 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Mar 2025 01:23:27 -0500 Subject: [PATCH 205/270] DASB: VALL-E: SQ-Codec updates --- .../TTS/valle/hparams/train_sqcodec.yaml | 3 +- .../DASB/LibriTTS/TTS/tokotron/train.py | 2 +- .../TTS/valle/hparams/train_sqcodec.yaml | 277 ++++++++++++++++++ benchmarks/DASB/model/sq_codec.py | 19 +- 4 files changed, 295 insertions(+), 6 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index cf99298a4..9d5596d41 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -175,7 +175,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 4 -ternary_num_digits: 9 +ternary_num_digits: 10 pred_mode: ternary freeze_lm_head: False @@ -213,6 +213,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref linear: !new:speechbrain.nnet.linear.Linear input_size: !ref n_neurons: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index df31c4c69..7d99c5c7d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -811,7 +811,7 @@ def init_sequence_encoder(hparams): def get_selected_layer_indexes(hparams): - """Finds the layers of selected layers + """Finds the indexes of selected layers Arguments --------- diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..377e0d7a3 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,277 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 
12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 19683 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +ternary_num_digits: 10 +pred_mode: ternary + +# Model Settings +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + lm_head: !ref + emb: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +lm_head: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + num_positions: !ref + tokens: null + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !name:model.sq_codec.ternary_loss + targets_type: tokens + tokens: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index d8d52924f..6e0daed80 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1284,7 +1284,16 @@ def forward(self, x): class TernaryEmbedding(nn.Module): - """A module wrapper for tokens-to-ternary conversion""" + """A module wrapper for tokens-to-ternary conversion + + Arguments + --------- + num_digits : int + The number of ternary digits""" + def __init__(self, num_digits): + super().__init__() + self.num_digits = num_digits + def forward(self, tokens): """Computes the forward pass @@ -1298,7 +1307,7 @@ def forward(self, tokens): squeeze = True tokens = tokens.unsqueeze(-1) batch_size, max_len, tracks = tokens.shape - emb = tokens_to_ternary(tokens).float() + emb = tokens_to_ternary(tokens, D=self.num_digits).float() positions = 
emb.size(-1) emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) if squeeze: @@ -1464,13 +1473,15 @@ def ternary_logits_to_tokens(logits): return tokens -def tokens_to_ternary(tokens): +def tokens_to_ternary(tokens, D=9): """Converts a sequence of tokens to a ternary matrix Arguments --------- tokens : torch.Tensor A (Batch x Length x Codebooks) tensor of tokens + D : int + The number of ternary digits Returns ------- @@ -1484,7 +1495,7 @@ def tokens_to_ternary(tokens): n_codebook = tokens.size(2) tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() ternary_matrix = torch.cat([ - decimal_to_ternary_matrix(item, D=9) - 1 + decimal_to_ternary_matrix(item, D=D) - 1 for item in tokens ], dim=1) ternary_matrix = ternary_matrix.transpose(1, 2) From 7e5d15d5f532667b056704433977675b01676ff5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Mar 2025 11:30:35 -0500 Subject: [PATCH 206/270] DASB: SQCodec: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 377e0d7a3..2a3daf66a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -182,7 +182,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice text: !ref + phonemes: !ref + -audio_tokens_per_step: 8 +audio_tokens_per_step: 4 ternary_num_digits: 10 pred_mode: ternary From 0f14a23ce399e5944a9c09fa3bcf3f898980a1da Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Mar 2025 15:08:29 -0500 Subject: [PATCH 207/270] DASB: SQ-Codec: Fully implement ternary mode --- .../TTS/valle/hparams/train_sqcodec.yaml | 13 +++++++- .../TTS/valle/hparams/train_sqcodec.yaml | 12 ++++++- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 ++--- benchmarks/DASB/model/custom_model.py | 31 +++++++++++++++++++ benchmarks/DASB/model/sq_codec.py | 4 +-- benchmarks/DASB/model/valle.py | 14 ++++++--- 6 files changed, 70 insertions(+), 12 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 9d5596d41..0919133ad 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -175,7 +175,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 4 -ternary_num_digits: 10 +ternary_num_digits: 11 pred_mode: ternary freeze_lm_head: False @@ -195,6 +195,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length qk_norm: !ref emb: !ref lm_head: !ref + logits_to_probs: !ref inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref @@ -211,6 +212,15 @@ lm_head: !apply:speechbrain.utils.hparams.choice num_positions: !ref tokens: null +logits_to_probs: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + tokens: !new:torch.nn.Identity + + emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref @@ -235,6 +245,7 @@ compute_cost: !apply:speechbrain.utils.hparams.choice choices: ternary: !name:model.sq_codec.ternary_loss targets_type: tokens + num_positions: !ref tokens: !name:model.valle.masked_nll_loss log_softmax: 
!new:speechbrain.nnet.activations.Softmax diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 2a3daf66a..4b015c203 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -183,7 +183,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 4 -ternary_num_digits: 10 +ternary_num_digits: 11 pred_mode: ternary # Model Settings @@ -208,6 +208,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length qk_norm: !ref lm_head: !ref emb: !ref + logits_to_probs: !ref inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref @@ -224,6 +225,14 @@ lm_head: !apply:speechbrain.utils.hparams.choice num_positions: !ref tokens: null +logits_to_probs: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + tokens: !new:torch.nn.Identity + emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref @@ -251,6 +260,7 @@ compute_cost: !apply:speechbrain.utils.hparams.choice choices: ternary: !name:model.sq_codec.ternary_loss targets_type: tokens + num_positions: !ref tokens: !name:model.valle.masked_nll_loss log_softmax: !new:speechbrain.nnet.activations.Softmax diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 695937f36..2df9405ca 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -162,7 +162,7 @@ def compute_objectives(self, predictions, batch, stage): logits_ar_sm = self.hparams.log_softmax(logits_ar) targets_ar = prompt[:, 1:, 0] loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask + logits_ar_sm, targets=targets_ar, mask=mask ) loss_components.append(loss_ar) else: @@ -171,7 +171,7 @@ def compute_objectives(self, predictions, batch, stage): logits_nar_sm = self.hparams.log_softmax(logits_nar) targets_nar = prompt[batch_idx, 1:, nar_track] loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, + logits_nar_sm, targets=targets_nar, mask=mask, ) loss_components.append(loss_nar) else: @@ -221,12 +221,12 @@ def compute_loss_stats( stats = {} if self.train_ar: stats["loss_ar"] = self.hparams.compute_cost( - log_probabilities=logits_ar, targets=targets_ar, mask=mask, + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, ) if self.train_nar: stats["loss_nar"] = self.hparams.compute_cost( - log_probabilities=logits_nar, targets=targets_nar, mask=mask, + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, ) return stats diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index e5a9db761..b389a9473 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,5 +1,6 @@ import torch from speechbrain.nnet.linear import Linear +from model.sq_codec import tokens_to_ternary class AttentionMLP(torch.nn.Module): @@ -163,4 +164,34 @@ def forward(self, x): x = self.lin_p(x) p = x.reshape(batch_size, max_len, self.num_positions, 3) return p + +class TernaryLogitTokenizer(torch.nn.Module): + """Converts ternary logits to probabilities + + Arguments + --------- + num_positions : int + The number of 
ternary digits/positions + num_tokens : int + The number of tokens + """ + def __init__(self, num_positions, num_tokens=None): + super().__init__() + self.num_positions = num_positions + if num_tokens is None: + num_tokens = 3 ** num_positions + self.num_tokens = num_tokens + self.register_buffer("vocab", torch.arange(num_tokens)) + self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) + self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) + + def forward(self, logits): + logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) + token_logits_raw = torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + logits_unsq, + 1 - logits_unsq + ).prod(-1).prod(-1) + token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) + return (token_logits_raw / token_logits_raw_sum).squeeze(2) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 6e0daed80..e8119a9e9 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1521,11 +1521,11 @@ def logits_to_ternary(logits): return ternary -def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", reduction="mean"): +def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", num_positions=9, reduction="mean"): if targets.dim() < 3: targets = targets.unsqueeze(-1) if targets_type == "tokens": - targets = tokens_to_ternary(targets.unsqueeze(-1)) + targets = tokens_to_ternary(targets.unsqueeze(-1), D=num_positions) batch_size, max_len, positions = targets.shape targets_cat = targets + 1 predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 2ce0d5806..8baf8e562 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -22,8 +22,7 @@ from torch.nn import functional as F from dataclasses import dataclass -from speechbrain.nnet.losses import reduce_loss -from speechbrain.nnet.losses import truncate +from speechbrain.nnet.losses import reduce_loss, truncate @dataclass @@ -75,6 +74,9 @@ class ValleLM(nn.Module): an alternative LM head implementation head, an alternative to the default Linear, useful for non-trivial codecs, such as SQ-Codec + logits_to_probs : callable, optional + A module or a function that converts logits to token probabilities to + support top-K sampling """ def __init__( @@ -92,6 +94,7 @@ def __init__( n_ctx=3000, emb=None, lm_head=None, + logits_to_probs=None, ): super().__init__() if emb is None: @@ -100,6 +103,9 @@ def __init__( if lm_head is None: lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) self.lm_head = lm_head + if logits_to_probs is None: + logits_to_probs = nn.Identity() + self.logits_to_probs = logits_to_probs if share_emb: self.lm_head.weight = self.emb.weight @@ -302,7 +308,7 @@ def inference( # (3.2) AR loop prev_emb = self.emb(prev_tok) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) - logits = self.lm_head(h_ar) # [B, 1, V] + logits = self.logits_to_probs(self.lm_head(h_ar)) # [B, 1, V] gen_tok, gen_score = logits_to_tokens( logits.unsqueeze(2), opts, @@ -415,7 +421,7 @@ def inference( h_nar = self.nar_decoder( prev_emb, ones * step - 1, mask=mask ) # [B, T, D] - logits = self.lm_head(h_nar) + logits = self.logits_to_probs(self.lm_head(h_nar)) gen_tok, gen_score = logits_to_tokens( logits.unsqueeze(2), opts, From 10f8fdb8126c95194eb0aa0147ba338113eb3247 Mon Sep 17 
00:00:00 2001 From: flexthink Date: Thu, 6 Mar 2025 16:14:54 -0500 Subject: [PATCH 208/270] DASB: Fix SpeechTokenizer --- .../LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 24c494a98..24be494a5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/speech_tokenizer +experiment_name: valle/speech_tokenizer # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -95,7 +95,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 -model_sample_rate: 24000 +model_sample_rate: 16000 max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 From c00962e36451a3f5898abf14140594aac374cf86 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Mar 2025 13:10:04 -0500 Subject: [PATCH 209/270] Fixes for SQCodec: Make offsets optional, align the shift with ternary --- .../LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 15 ++++++--------- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 15 +++++++++++++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 0919133ad..f61b2e56e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -48,6 +48,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False @@ -165,17 +166,13 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice model_vocab_size: !apply:speechbrain.utils.hparams.choice value: !ref choices: - text: !ref + ( * ) + - phonemes: !ref + ( * ) + - -audio_token_shift: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref + - phonemes: !ref + + text: !ref * 2 + phonemes: !ref * 2 +audio_token_shift: !ref 3**( - 1) + audio_tokens_per_step: 4 -ternary_num_digits: 11 +ternary_num_digits: 10 pred_mode: ternary freeze_lm_head: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 771d4b14a..d26b27cca 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -242,6 +242,8 @@ def on_stage_start(self, stage, epoch): self.offsets = get_offsets( self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.compute_loss_stats, batch_eval=True, @@ -489,9 +491,10 @@ def _get_inference_opts(self): tracks = torch.arange( self.hparams.audio_tokens_per_step, device=self.device )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.text_num_tokens - + self.hparams.special_num_tokens + self.hparams.audio_token_shift + tracks * 
self.hparams.vocab_size ) if self.hparams.flip_layers: @@ -501,6 +504,12 @@ def _get_inference_opts(self): ((idx >= track_start) & (idx < track_end)) | (idx == self.hparams.bos_index) ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True return self.hparams.inference_opts( masks={self.hparams.bos_index: mask}, device=self.device, ) @@ -579,6 +588,8 @@ def dataio_prepare(hparams): offsets = get_offsets( hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) if hparams["flip_layers"]: offsets = offsets.flip(-1) From 50ef659417b60e3c8e01012b9579056e17779807 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Mar 2025 15:53:48 -0500 Subject: [PATCH 210/270] DASB: SQ-Codec: Add chunking to avoid OOM --- benchmarks/DASB/model/custom_model.py | 28 +++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index b389a9473..84007b7cd 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,3 +1,4 @@ +import math import torch from speechbrain.nnet.linear import Linear from model.sq_codec import tokens_to_ternary @@ -175,23 +176,34 @@ class TernaryLogitTokenizer(torch.nn.Module): The number of ternary digits/positions num_tokens : int The number of tokens + chunk_size : int + The size of the chunk (to prevent OOM) """ - def __init__(self, num_positions, num_tokens=None): + def __init__(self, num_positions, num_tokens=None, chunk_size=10): super().__init__() self.num_positions = num_positions if num_tokens is None: num_tokens = 3 ** num_positions self.num_tokens = num_tokens + self.chunk_size = chunk_size self.register_buffer("vocab", torch.arange(num_tokens)) self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) def forward(self, logits): logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) - token_logits_raw = torch.where( - self.vocab_ternary[:, None, None, :, :, None] == self.idx, - logits_unsq, - 1 - logits_unsq - ).prod(-1).prod(-1) - token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) - return (token_logits_raw / token_logits_raw_sum).squeeze(2) + chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) + token_logits_chunks = [] + for chunk in chunks: + token_logits_raw = torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + chunk, + 1 - chunk + ).prod(-1).prod(-1) + token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) + token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) + token_logits = torch.cat( + token_logits_chunks, + dim=1 + ) + return token_logits From 08b14ff0da4df7730119193329bb3ad3c67e2178 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 8 Mar 2025 00:48:34 -0500 Subject: [PATCH 211/270] DASB: SQ-Codec: Update LibriTTS --- .../LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 8 ++------ .../LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 15 +++------------ 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index f61b2e56e..21530b199 100644 --- 
a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -163,13 +163,9 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref -model_vocab_size: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref * 2 - phonemes: !ref * 2 +model_vocab_size: !ref * 2 -audio_token_shift: !ref 3**( - 1) +audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 4b015c203..fca222b27 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -170,20 +170,11 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref -model_vocab_size: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref + ( * ) + - phonemes: !ref + ( * ) + - -audio_token_shift: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref + - phonemes: !ref + +model_vocab_size: !ref * 2 +audio_token_shift: 19683 audio_tokens_per_step: 4 -ternary_num_digits: 11 +ternary_num_digits: 10 pred_mode: ternary # Model Settings From d1ce08a2e627d39c90bd43b12db25f519a449c4c Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 8 Mar 2025 14:12:34 -0500 Subject: [PATCH 212/270] DASB: Add a mulltitrack ternary language model head (a separate projection learned for each layer, independently) --- .../TTS/valle/hparams/train_sqcodec.yaml | 4 +- benchmarks/DASB/model/custom_model.py | 70 ++++++++++++++++++- benchmarks/DASB/model/valle.py | 25 +++++-- 3 files changed, 92 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 21530b199..10a1403fc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -98,6 +98,7 @@ text_max_length: 500 n_ctx: !ref + infer_max_audio_length: !ref max_length_ratio: 10.0 +top_k: 1 debug_infer_max_audio_length: 10 # Label encoder @@ -196,11 +197,12 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + top_k: !ref lm_head: !apply:speechbrain.utils.hparams.choice value: !ref choices: - ternary: !new:model.custom_model.TernaryPredictionHead + ternary: !new:model.custom_model.MultitrackTernaryPredictionHead d_model: !ref num_positions: !ref tokens: null diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 84007b7cd..11a5a9dac 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -141,7 +141,7 @@ def __init__(self, d_model, num_positions, d_hidden=512): bias=False ) - def forward(self, x): + def forward(self, x, track=None): """Computes the forward pass Arguments @@ -149,6 +149,9 @@ def forward(self, x): x : torch.Tensor The decoder output (Batch x Length x d_model) + track : int + The track index (if applicable) + Returns ------- p : torch.Tensor @@ -165,7 +168,70 @@ def forward(self, x): x = self.lin_p(x) p = x.reshape(batch_size, max_len, self.num_positions, 3) return p - + + +class MultitrackTernaryPredictionHead(torch.nn.Module): + """An alternative prediction head that predicts a fixed number of ternary digits + for each position (as used in 
SQ-Codec) + + Arguments + --------- + d_model : int + The model dimension + num_positions : int + the number of positions + """ + def __init__(self, d_model, num_positions, d_hidden=512, num_tracks=1): + super().__init__() + self.num_positions = num_positions + self.d_model = d_model + self.num_positions = num_positions + self.lin_hidden = torch.nn.ModuleList( + [ + Linear( + input_size=d_model, + n_neurons=d_hidden, + ) + ] * num_tracks + ) + self.act = torch.nn.LeakyReLU() + self.lin_p = torch.nn.ModuleList( + [ + Linear( + input_size=d_hidden, + n_neurons=num_positions * 3, + ) + ] * num_tracks + ) + + def forward(self, x, track=0): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The decoder output (Batch x Length x d_model) + + track : int + The track index (if applicable) + + Returns + ------- + p : torch.Tensor + A tensor of shape (Batch x Length x num_positions x ternary digit) + The values are logits (unnormalized probabilities) + + p[:, :, :, 0] corresponds to -1 + p[:, :, :, 1] corresponds to 0 + p[:, :, :, 2] corresponds to 1 + """ + batch_size, max_len, _ = x.shape + x = self.lin_hidden[track](x) + x = self.act(x) + x = self.lin_p[track](x) + p = x.reshape(batch_size, max_len, self.num_positions, 3) + return p + class TernaryLogitTokenizer(torch.nn.Module): """Converts ternary logits to probabilities diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 8baf8e562..828b33898 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -94,6 +94,7 @@ def __init__( n_ctx=3000, emb=None, lm_head=None, + lm_head_multitrack=False, logits_to_probs=None, ): super().__init__() @@ -103,6 +104,7 @@ def __init__( if lm_head is None: lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) self.lm_head = lm_head + self.lm_head_multitrack = lm_head_multitrack if logits_to_probs is None: logits_to_probs = nn.Identity() self.logits_to_probs = logits_to_probs @@ -204,9 +206,9 @@ def forward( # Logits logits_ar, logits_nar = None, None if predict_ar: - logits_ar = self.lm_head(h_ar) + logits_ar = self.apply_lm_head(h_ar, 0) if predict_nar: - logits_nar = self.lm_head(h_nar) + logits_nar = self.apply_lm_head(h_nar, nar_level_idx + 1) return logits_ar, logits_nar @@ -308,7 +310,7 @@ def inference( # (3.2) AR loop prev_emb = self.emb(prev_tok) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) - logits = self.logits_to_probs(self.lm_head(h_ar)) # [B, 1, V] + logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] gen_tok, gen_score = logits_to_tokens( logits.unsqueeze(2), opts, @@ -421,7 +423,9 @@ def inference( h_nar = self.nar_decoder( prev_emb, ones * step - 1, mask=mask ) # [B, T, D] - logits = self.logits_to_probs(self.lm_head(h_nar)) + + logits = self.apply_lm_head(h_nar, step) + logits = self.logits_to_probs(logits) gen_tok, gen_score = logits_to_tokens( logits.unsqueeze(2), opts, @@ -463,6 +467,19 @@ def inference( gen_scores_list.append(gen_scores[b][: finish_idx[b]]) return gen_tokens_list, gen_scores_list + + def apply_lm_head(self, x, track): + """Applies the language model head + + Arguments + --------- + """ + + if self.lm_head_multitrack: + result = self.lm_head(x, track) + else: + result = self.lm_head(x) + return result def _initialize(self): for m in self.modules(): From 15f096c69c6f023c3b9c5dbdf1885f23c41280ae Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 8 Mar 2025 15:21:02 -0500 Subject: [PATCH 213/270] DASB: Vall-E: Multitrack fixes --- 
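One detail worth flagging in the multitrack head above: because Python list multiplication repeats object references, ModuleList([Linear(...)] * num_tracks) registers the same Linear instance for every track, so the per-track projections share one set of weights rather than being learned independently as the commit message describes; that is worth keeping in mind when comparing this head against the single-track one. A small sketch of the difference (dimensions are illustrative, not taken from the recipe):

    import torch
    from speechbrain.nnet.linear import Linear

    d_model, d_hidden, num_tracks = 1024, 512, 4

    shared = torch.nn.ModuleList(
        [Linear(input_size=d_model, n_neurons=d_hidden)] * num_tracks
    )
    assert shared[0] is shared[1]  # one module, one set of weights

    per_track = torch.nn.ModuleList(
        [Linear(input_size=d_model, n_neurons=d_hidden) for _ in range(num_tracks)]
    )
    assert per_track[0] is not per_track[1]  # independent parameters per track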
.../DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 1 + .../DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 5 ++++- benchmarks/DASB/model/valle.py | 5 +++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 10a1403fc..9e887e4c2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -205,6 +205,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.MultitrackTernaryPredictionHead d_model: !ref num_positions: !ref + num_tracks: !ref tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index fca222b27..928aaa094 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -102,6 +102,7 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +top_k: 1 debug_infer_max_audio_length: 10 # Label encoder @@ -207,13 +208,15 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + top_k: !ref lm_head: !apply:speechbrain.utils.hparams.choice value: !ref choices: - ternary: !new:model.custom_model.TernaryPredictionHead + ternary: !new:model.custom_model.MultitrackTernaryPredictionHead d_model: !ref num_positions: !ref + num_tracks: !ref tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 828b33898..523bade2b 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -14,6 +14,7 @@ import logging import torch +import inspect from typing import Tuple, Optional from speechbrain.dataio.dataio import length_to_mask @@ -94,7 +95,6 @@ def __init__( n_ctx=3000, emb=None, lm_head=None, - lm_head_multitrack=False, logits_to_probs=None, ): super().__init__() @@ -104,7 +104,8 @@ def __init__( if lm_head is None: lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) self.lm_head = lm_head - self.lm_head_multitrack = lm_head_multitrack + spec = inspect.getfullargspec(lm_head.forward) + self.lm_head_multitrack = "track" in spec.args if logits_to_probs is None: logits_to_probs = nn.Identity() self.logits_to_probs = logits_to_probs From 981fe9366166d77148cfa966cca703aa3c50a700 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Mar 2025 09:32:12 -0400 Subject: [PATCH 214/270] DASB: SQ-Codec: Fixes --- benchmarks/DASB/model/custom_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 11a5a9dac..64d84c522 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -226,6 +226,8 @@ def forward(self, x, track=0): p[:, :, :, 2] corresponds to 1 """ batch_size, max_len, _ = x.shape + if torch.is_tensor(track): + track = track.int().item() x = self.lin_hidden[track](x) x = self.act(x) x = self.lin_p[track](x) From de4aaaa690e756f9333706b378d11354d2c9e89e Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 15:51:40 -0400 Subject: [PATCH 215/270] DASB: SQ-Codec: Remove the multi-track ternary head (it did not help) --- .../TTS/valle/hparams/train_sqcodec.yaml | 3 
+- .../TTS/valle/hparams/train_sqcodec.yaml | 3 +- benchmarks/DASB/model/custom_model.py | 107 ------------------ 3 files changed, 2 insertions(+), 111 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 9e887e4c2..6e4e7d4f6 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -202,10 +202,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions lm_head: !apply:speechbrain.utils.hparams.choice value: !ref choices: - ternary: !new:model.custom_model.MultitrackTernaryPredictionHead + ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref num_positions: !ref - num_tracks: !ref tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 928aaa094..942f85cd5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -213,10 +213,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions lm_head: !apply:speechbrain.utils.hparams.choice value: !ref choices: - ternary: !new:model.custom_model.MultitrackTernaryPredictionHead + ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref num_positions: !ref - num_tracks: !ref tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 64d84c522..f02598b57 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -168,110 +168,3 @@ def forward(self, x, track=None): x = self.lin_p(x) p = x.reshape(batch_size, max_len, self.num_positions, 3) return p - - -class MultitrackTernaryPredictionHead(torch.nn.Module): - """An alternative prediction head that predicts a fixed number of ternary digits - for each position (as used in SQ-Codec) - - Arguments - --------- - d_model : int - The model dimension - num_positions : int - the number of positions - """ - def __init__(self, d_model, num_positions, d_hidden=512, num_tracks=1): - super().__init__() - self.num_positions = num_positions - self.d_model = d_model - self.num_positions = num_positions - self.lin_hidden = torch.nn.ModuleList( - [ - Linear( - input_size=d_model, - n_neurons=d_hidden, - ) - ] * num_tracks - ) - self.act = torch.nn.LeakyReLU() - self.lin_p = torch.nn.ModuleList( - [ - Linear( - input_size=d_hidden, - n_neurons=num_positions * 3, - ) - ] * num_tracks - ) - - def forward(self, x, track=0): - """Computes the forward pass - - Arguments - --------- - x : torch.Tensor - The decoder output (Batch x Length x d_model) - - track : int - The track index (if applicable) - - Returns - ------- - p : torch.Tensor - A tensor of shape (Batch x Length x num_positions x ternary digit) - The values are logits (unnormalized probabilities) - - p[:, :, :, 0] corresponds to -1 - p[:, :, :, 1] corresponds to 0 - p[:, :, :, 2] corresponds to 1 - """ - batch_size, max_len, _ = x.shape - if torch.is_tensor(track): - track = track.int().item() - x = self.lin_hidden[track](x) - x = self.act(x) - x = self.lin_p[track](x) - p = x.reshape(batch_size, max_len, self.num_positions, 3) - return p - - -class TernaryLogitTokenizer(torch.nn.Module): - """Converts ternary logits to probabilities - - Arguments - 
--------- - num_positions : int - The number of ternary digits/positions - num_tokens : int - The number of tokens - chunk_size : int - The size of the chunk (to prevent OOM) - """ - def __init__(self, num_positions, num_tokens=None, chunk_size=10): - super().__init__() - self.num_positions = num_positions - if num_tokens is None: - num_tokens = 3 ** num_positions - self.num_tokens = num_tokens - self.chunk_size = chunk_size - self.register_buffer("vocab", torch.arange(num_tokens)) - self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) - self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) - - def forward(self, logits): - logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) - chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) - token_logits_chunks = [] - for chunk in chunks: - token_logits_raw = torch.where( - self.vocab_ternary[:, None, None, :, :, None] == self.idx, - chunk, - 1 - chunk - ).prod(-1).prod(-1) - token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) - token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) - token_logits = torch.cat( - token_logits_chunks, - dim=1 - ) - return token_logits From e8af8994008aa39dc0e251eee4d7cadff2e60cc5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 16:06:48 -0400 Subject: [PATCH 216/270] DASB: VALL-E Fix ternary loss masking --- benchmarks/DASB/model/sq_codec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index e8119a9e9..29a483456 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1534,12 +1534,12 @@ def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ter targets_cat, reduction="none" ) - mask = None if length is not None: mask = length_to_mask( length * max_len, max_len - ).unsqueeze(-1) + ) + mask = mask.unsqueeze(-1) if mask is not None: loss = loss * mask if reduction == "mean": From 851eb845526231990b3bfb737bbd0742661e7613 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 16:15:46 -0400 Subject: [PATCH 217/270] DASB: SQCodec: Fixes --- benchmarks/DASB/model/custom_model.py | 42 +++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index f02598b57..c88d7b536 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -168,3 +168,45 @@ def forward(self, x, track=None): x = self.lin_p(x) p = x.reshape(batch_size, max_len, self.num_positions, 3) return p + + +class TernaryLogitTokenizer(torch.nn.Module): + """Converts ternary logits to probabilities + + Arguments + --------- + num_positions : int + The number of ternary digits/positions + num_tokens : int + The number of tokens + chunk_size : int + The size of the chunk (to prevent OOM) + """ + def __init__(self, num_positions, num_tokens=None, chunk_size=10): + super().__init__() + self.num_positions = num_positions + if num_tokens is None: + num_tokens = 3 ** num_positions + self.num_tokens = num_tokens + self.chunk_size = chunk_size + self.register_buffer("vocab", torch.arange(num_tokens)) + self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) + self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) + + def forward(self, logits): + 
logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) + chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) + token_logits_chunks = [] + for chunk in chunks: + token_logits_raw = torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + chunk, + 1 - chunk + ).prod(-1).prod(-1) + token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) + token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) + token_logits = torch.cat( + token_logits_chunks, + dim=1 + ) + return token_logits From 38ed4324e75c190e58d421e30de40d6486d4b48c Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 19:52:13 -0400 Subject: [PATCH 218/270] DASB: Add the ability to filter priors --- benchmarks/DASB/run_hparam_optimization.sh | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 9be6a3c64..058bbcb18 100755 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -63,6 +63,7 @@ orion_db_type="PickledDB" exp_max_trials=50 store_all=True compress_exp=True +hparam_filter="" # Function to print argument descriptions and exit print_argument_descriptions() { @@ -202,6 +203,12 @@ while [[ $# -gt 0 ]]; do shift ;; + --hparam_filter) + hparam_filter="$2" + shift + shift + ;; + --help) print_argument_descriptions ;; @@ -281,6 +288,11 @@ echo "-------------------------------------" get_flag() { local file_path="$1" local pattern="$2" + local filter="$3" + + if [[ -z "$filter" ]]; then + filter=".*" + fi # Check if the file exists if [ ! -f "$file_path" ]; then @@ -289,7 +301,7 @@ get_flag() { fi # Use grep to find all lines containing the pattern and then extract the flags using sed - grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | tr -d '\n' + grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | grep $filter | tr -d '\n' } @@ -333,7 +345,9 @@ function extract_best_params() { step_id=1 hparams_step=$hparams pattern="@orion_step1:" -opt_flags=$(get_flag "$hparams_step" "$pattern") +opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") +echo ">>> OPT FLAGS: $opt_flags" +exit # Check if the string is empty and exit with an error if it is if [ -z "$opt_flags" ]; then @@ -409,7 +423,7 @@ while [ -n "$opt_flags" ]; do pattern="@orion_step$step_id:" # update optimization flags pattern - opt_flags=$(get_flag "$hparams_step" "$pattern") + opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") done echo From d5aea40c93fc2eb2080bcdce9b494bfec65754a1 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 20:48:54 -0400 Subject: [PATCH 219/270] DASB: Removed debugging code --- benchmarks/DASB/run_hparam_optimization.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 058bbcb18..c0b06b09a 100755 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -346,8 +346,6 @@ step_id=1 hparams_step=$hparams pattern="@orion_step1:" opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") -echo ">>> OPT FLAGS: $opt_flags" -exit # Check if the string is empty and exit with an error if it is if [ -z "$opt_flags" ]; then From 6cef5492752fe8069832bf4d2910872ec8452008 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 11 Mar 2025 01:41:02 -0400 Subject: [PATCH 220/270] DASB: VALL-E: SQ-Codec fixes 
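The TernaryLogitTokenizer completed above exists so that inference can run top-k sampling over whole tokens even though the ternary head only emits independent per-digit logits: for each candidate token it multiplies the probabilities of that token's digits and renormalizes over the candidate set, chunking the sequence dimension so the intermediate table does not exhaust memory. A tiny unvectorized sketch of the same idea (shapes and digit ordering simplified, not the module's exact indexing):

    from itertools import product

    def token_distribution(digit_probs):
        # digit_probs: D rows, each a probability triple over {-1, 0, 1}
        probs = []
        for digits in product(range(3), repeat=len(digit_probs)):
            p = 1.0
            for pos, d in enumerate(digits):
                p *= digit_probs[pos][d]
            probs.append(p)
        total = sum(probs)
        return [p / total for p in probs]

    # Two digits -> 3**2 = 9 candidate tokens
    dist = token_distribution([[0.7, 0.2, 0.1], [0.1, 0.1, 0.8]])
    assert abs(sum(dist) - 1.0) < 1e-6

With the real 10-digit vocabulary there are tens of thousands of candidates per step, which is what the chunk_size argument is there to bound.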
--- .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_encodec.yaml | 1 + .../TTS/valle/hparams/train_espnet_encodec.yaml | 1 + .../LJSpeech/TTS/valle/hparams/train_mimi.yaml | 1 + .../TTS/valle/hparams/train_wavtokenizer.yaml | 1 + .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 3 ++- .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_encodec.yaml | 3 ++- .../TTS/valle/hparams/train_espnet_encodec.yaml | 3 ++- .../LibriTTS/TTS/valle/hparams/train_mimi.yaml | 3 ++- .../valle/hparams/train_speech_tokenizer.yaml | 1 + .../TTS/valle/hparams/train_sqcodec.yaml | 8 +++++++- .../TTS/valle/hparams/train_wavtokenizer.yaml | 3 ++- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 17 ++++++++++++++--- 14 files changed, 38 insertions(+), 9 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 140b85a84..715a2d199 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -60,6 +60,7 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 2c22f57a4..747e6626e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -45,6 +45,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index 74654e590..c3874b6a7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -45,6 +45,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index b528660f5..b5747d763 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -46,6 +46,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index af0222d90..110839413 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -48,6 +48,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: 
[1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index f9d07b443..3052dc76b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/dac +experiment_name: valle/dac # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -51,6 +51,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 9d9e65b85..4c61228a2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -66,6 +66,7 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True # Speaker Embeddings spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index a4a19ae6b..6596858b2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/encodec +experiment_name: valle/encodec # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 714cf91b5..a23789f15 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/encodec +experiment_name: valle/encodec # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index c1e3f1e3a..7b61a18a7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml 
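The use_token_offsets flag being threaded through these recipes controls whether each codebook track is shifted into its own slice of the shared vocabulary before embedding and masking; the SQ-Codec recipes leave it False because a single ternary embedding is reused for all tracks. A rough sketch of the convention, with a hypothetical stand-in for the get_offsets helper called in train.py (the helper itself is not shown in this patch):

    import torch

    def get_offsets(vocab_size, num_tracks):
        # hypothetical equivalent of the recipes' get_offsets helper
        return torch.arange(num_tracks) * vocab_size

    offsets = get_offsets(vocab_size=1024, num_tracks=2)   # tensor([   0, 1024])
    tokens = torch.tensor([[[3, 7]]])                      # [batch, time, track]
    with_offsets = tokens + offsets[None, None, :]         # use_token_offsets: True
    without = tokens + torch.zeros_like(offsets)           # use_token_offsets: False, as in train.py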
+++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/mimi +experiment_name: valle/mimi # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -51,6 +51,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 24be494a5..3b9bd8214 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 942f85cd5..4ac5e039a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: False splits: ["train", "valid", "test"] ckpt_key: dwer @@ -176,6 +177,7 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 +ternary_emb_hidden_size: 512 pred_mode: ternary # Model Settings @@ -229,8 +231,12 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref - linear: !new:speechbrain.nnet.linear.Linear + hidden: !new:speechbrain.nnet.linear.Linear input_size: !ref + n_neurons: !ref + act: !new:torch.nn.LeakyReLU + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref n_neurons: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index e98056db3..4e4d13c27 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/wavtokenizer +experiment_name: valle/wavtokenizer # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 2df9405ca..2d3091654 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -245,6 
+245,8 @@ def on_stage_start(self, stage, epoch): self.offsets = get_offsets( self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) if hasattr(hparams, "speech_model_layers"): self.layer_idx = get_selected_layer_indexes( @@ -527,9 +529,10 @@ def _get_inference_opts(self): tracks = torch.arange( self.hparams.audio_tokens_per_step, device=self.device )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.text_num_tokens - + self.hparams.special_num_tokens + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size ) if self.hparams.flip_layers: @@ -537,8 +540,14 @@ def _get_inference_opts(self): track_end = track_start + self.hparams.vocab_size mask = ( ((idx >= track_start) & (idx < track_end)) - | (idx == self.hparams.eos_index) + | (idx == self.hparams.bos_index) ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True return self.hparams.inference_opts( masks={self.hparams.bos_index: mask}, device=self.device, ) @@ -714,6 +723,8 @@ def dataio_prepare(hparams): offsets = get_offsets( hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) if hparams["flip_layers"]: offsets = offsets.flip(-1) From 9fe48e4c268bf706142e1eacb33e4652ed0188d8 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 11 Mar 2025 14:09:19 -0400 Subject: [PATCH 221/270] DASB: SQ-Codec: Fix the sample rate --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 4ac5e039a..a69b3db3d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 -model_sample_rate: 24000 +model_sample_rate: 16000 max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 From 263f8b501fdcf81a35052a8e28119fa2f9f9a0ae Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 11 Mar 2025 15:41:17 -0400 Subject: [PATCH 222/270] VALL-E: SQ-Codec: Add target dropout (optional, disabled by default) --- .../TTS/valle/hparams/train_sqcodec.yaml | 2 + benchmarks/DASB/model/valle.py | 56 +++++++++++-------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 6e4e7d4f6..579c860aa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -154,6 +154,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 +target_dropout: 0.5 vocab_size: 19683 text_num_tokens: 39 phn_num_tokens: 52 @@ -185,6 +186,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length nar_layer: !ref n_ctx: !ref dropout: !ref + target_dropout: !ref share_emb: !ref qk_norm: !ref emb: !ref diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 
523bade2b..fbaa45b30 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -61,6 +61,9 @@ class ValleLM(nn.Module): If true, apply LayerNorm to q and k in attention. dropout : float dropout rate for attention layers. + target_dropout : float + a separate dropout applied to targets only (may be + useful to mitigate autoregressive prediction instability) att_unit: int Dimension of Transformer attention. head : int @@ -88,6 +91,7 @@ def __init__( share_emb=True, qk_norm=False, dropout=0.0, + target_dropout=0.0, att_unit=256, head=2, ar_layer=4, @@ -119,6 +123,7 @@ def __init__( n_layer=ar_layer, qk_norm=qk_norm, dropout=dropout, + target_dropout=target_dropout ) if nq > 1: # NOTE: An NAR encoder is not needed if there is only one track @@ -575,6 +580,30 @@ def forward( class TransformerDecoder(nn.Module): + """A custom transformer decoder implementation for VALL-E + + Arguments + --------- + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + target_dropout : float + The target dropout probability + layer_class : type + The layer type to be used + """ def __init__( self, n_ctx, @@ -584,30 +613,10 @@ def __init__( causal=True, qk_norm=False, dropout=0.0, + target_dropout=0.0, layer_class=ResidualAttentionBlock, ): - """A custom transformer decoder implementation for VALL-E - Arguments - --------- - n_ctx : int - The context length - n_state : int - The number of states - n_head : int - The number of heads - n_layer : int - The number of layers - causal : bool - Whether to operate in causal mode (i.e.
avoid attending - to future steps) - qk_norm : bool - Whether to normalize queries and keys - dropout : float - The dropout probability - layer_class : type - The layer type to be used - """ super().__init__() self.pos_emb = nn.Embedding(n_ctx, n_state) @@ -626,6 +635,7 @@ def __init__( ] ) self.ln = LayerNorm(n_state) + self.target_dropout = nn.Dropout(target_dropout) self.causal = causal self.kv_cache = None @@ -654,9 +664,11 @@ def forward( offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + tgt = self.target_dropout(x) for block in self.blocks: - x = block(x, mask=mask, kv_cache=kv_cache) + x = block(x, tgt, mask=mask, kv_cache=kv_cache) + tgt = x x = self.ln(x) return x From fb2d573a2597c34d7d2d91990cc463ee88e25112 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Mar 2025 01:15:00 -0400 Subject: [PATCH 223/270] DASB: SQ-Codec updates --- .../LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 14 +++++++------- benchmarks/DASB/model/custom_model.py | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index a69b3db3d..337a69f15 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -71,7 +71,7 @@ batch_size: 16 valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 1.0 +max_grad_norm: 0.01 sorting: random num_workers: 4 skip_prep: False @@ -155,12 +155,14 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 128 share_emb: False qk_norm: True nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 +target_dropout: 0.2 vocab_size: 19683 audio_emb_freeze: False audio_emb_pretrained: False @@ -177,7 +179,6 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 -ternary_emb_hidden_size: 512 pred_mode: ternary # Model Settings @@ -198,6 +199,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length nar_layer: !ref n_ctx: !ref dropout: !ref + target_dropout: !ref share_emb: !ref qk_norm: !ref lm_head: !ref @@ -217,7 +219,9 @@ lm_head: !apply:speechbrain.utils.hparams.choice choices: ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref + d_hidden: !ref num_positions: !ref + norm: False tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice @@ -231,12 +235,8 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref - hidden: !new:speechbrain.nnet.linear.Linear - input_size: !ref - n_neurons: !ref - act: !new:torch.nn.LeakyReLU linear: !new:speechbrain.nnet.linear.Linear - input_size: !ref + input_size: !ref n_neurons: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index c88d7b536..fec745d96 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -125,11 +125,12 @@ class TernaryPredictionHead(torch.nn.Module): num_positions : int 
the number of positions """ - def __init__(self, d_model, num_positions, d_hidden=512): + def __init__(self, d_model, num_positions, d_hidden=512, norm=False): super().__init__() self.num_positions = num_positions self.d_model = d_model self.num_positions = num_positions + self.norm = torch.nn.LayerNorm(d_model) if norm else torch.nn.Identity() self.lin_hidden = Linear( input_size=d_model, n_neurons=d_hidden, @@ -163,6 +164,7 @@ def forward(self, x, track=None): p[:, :, :, 2] corresponds to 1 """ batch_size, max_len, _ = x.shape + x = self.norm(x) x = self.lin_hidden(x) x = self.act(x) x = self.lin_p(x) From 51438b9d45cfd48b374d994485a240506ef70db1 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Mar 2025 14:25:21 -0400 Subject: [PATCH 224/270] DASB: SQ-Codec: Add argmax mode --- benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 2 ++ benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 579c860aa..0993a052e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -171,6 +171,7 @@ audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 +ternary_tokenizer_mode: argmax pred_mode: ternary freeze_lm_head: False @@ -215,6 +216,7 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryLogitTokenizer num_tokens: !ref num_positions: !ref + mode: !ref tokens: !new:torch.nn.Identity diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 337a69f15..1d596c3fa 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -179,6 +179,7 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 +ternary_tokenizer_mode: argmax pred_mode: ternary # Model Settings @@ -230,6 +231,7 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryLogitTokenizer num_tokens: !ref num_positions: !ref + mode: !ref tokens: !new:torch.nn.Identity emb: !new:speechbrain.nnet.containers.Sequential From acbcfcfa2c6e7e6f6b4a956b92b5eea5f7081105 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Mar 2025 14:25:49 -0400 Subject: [PATCH 225/270] DASB: SQ-Codec: Add argmax mode --- benchmarks/DASB/model/custom_model.py | 17 +++++++++++++++-- benchmarks/DASB/model/sq_codec.py | 4 ++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index fec745d96..23b138688 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,7 +1,7 @@ import math import torch from speechbrain.nnet.linear import Linear -from model.sq_codec import tokens_to_ternary +from model.sq_codec import tokens_to_ternary, ternary_logits_to_tokens class AttentionMLP(torch.nn.Module): @@ -183,19 +183,27 @@ class TernaryLogitTokenizer(torch.nn.Module): The number of tokens chunk_size : int The size of the chunk (to prevent OOM) + mode : str + "probability" : treats the outputs as a probability distribution + "argmax" : "hard" mode, only the top probability is used. 
Cannot be used with + top_k sampling with k > 1 + """ - def __init__(self, num_positions, num_tokens=None, chunk_size=10): + def __init__(self, num_positions, num_tokens=None, chunk_size=10, mode="probability"): super().__init__() self.num_positions = num_positions if num_tokens is None: num_tokens = 3 ** num_positions self.num_tokens = num_tokens self.chunk_size = chunk_size + self.mode = mode self.register_buffer("vocab", torch.arange(num_tokens)) self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) def forward(self, logits): + if self.mode == "argmax": + return self._probs_argmax(logits) logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) token_logits_chunks = [] @@ -212,3 +220,8 @@ def forward(self, logits): dim=1 ) return token_logits + + def _probs_argmax(self, logits): + logit_tokens = ternary_logits_to_tokens(logits, n_codebook=1) + probs = (logit_tokens == self.vocab[None, None, :]).float() + return probs diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 29a483456..d213308e7 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1455,7 +1455,7 @@ def ternary_to_decimal(ternary, n_codebook=4): return codec_ls.permute(1, 2, 0) -def ternary_logits_to_tokens(logits): +def ternary_logits_to_tokens(logits, n_codebook=4): """Converts ternary logits to tokens (as used for SQ-Codec) Arguments @@ -1469,7 +1469,7 @@ def ternary_logits_to_tokens(logits): Token IDs """ ternary_matrix = logits_to_ternary(logits) - tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2)) + tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2), n_codebook=n_codebook) return tokens From b38c1ccac91fd578c54d2b99cb71a077af495af0 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Mar 2025 14:31:16 -0400 Subject: [PATCH 226/270] DASB: Fixes --- benchmarks/DASB/model/sq_codec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index d213308e7..307daaeeb 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -126,7 +126,7 @@ def build_codec_model(self, config): exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) device = next(iter(scalar_codec.parameters())).device - parameter_dict = torch.load(self.ckpt_path, map_location=device) + parameter_dict = torch.load(self.ckpt_path, map_location=device, weights_only=False) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec @@ -1543,7 +1543,7 @@ def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ter if mask is not None: loss = loss * mask if reduction == "mean": - loss = loss.sum(2).mean(1).mean(0) / 3.0 + loss = loss.sum(2).sum(1).sum(0) / mask.sum() elif reduction == "batch": - loss = loss.sum(2).mean(1) / 3.0 + loss = loss.sum(2).sum(1) / mask.sum(-1).sum(-1) return loss \ No newline at end of file From 44e93fd2daf55f5be14d687c18e921722526de6d Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Mar 2025 14:02:04 -0400 Subject: [PATCH 227/270] SQCodec: Fixes --- benchmarks/DASB/model/custom_model.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py 
b/benchmarks/DASB/model/custom_model.py index 23b138688..31ce24d42 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -125,11 +125,10 @@ class TernaryPredictionHead(torch.nn.Module): num_positions : int the number of positions """ - def __init__(self, d_model, num_positions, d_hidden=512, norm=False): + def __init__(self, d_model, num_positions, d_hidden=512, norm=True): super().__init__() self.num_positions = num_positions self.d_model = d_model - self.num_positions = num_positions self.norm = torch.nn.LayerNorm(d_model) if norm else torch.nn.Identity() self.lin_hidden = Linear( input_size=d_model, @@ -165,10 +164,16 @@ def forward(self, x, track=None): """ batch_size, max_len, _ = x.shape x = self.norm(x) + if self.use_emb: + positions = torch.arange( + self.num_positions, + device=x.device + )[None, None, :] + x = x[:, :, None, :] + self.emb(positions) x = self.lin_hidden(x) x = self.act(x) - x = self.lin_p(x) - p = x.reshape(batch_size, max_len, self.num_positions, 3) + p = self.lin_p(x) + p = p.reshape(batch_size, max_len, self.num_positions, 3) return p @@ -211,8 +216,8 @@ def forward(self, logits): token_logits_raw = torch.where( self.vocab_ternary[:, None, None, :, :, None] == self.idx, chunk, - 1 - chunk - ).prod(-1).prod(-1) + 1. + ).prod(-1).log().sum(-1).exp() token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) token_logits = torch.cat( From f875cd9c04691736eafe827f7ee19e3202a75dee Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Mar 2025 14:14:28 -0400 Subject: [PATCH 228/270] DASB: SQCodec: Fixes --- benchmarks/DASB/model/custom_model.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 31ce24d42..3bf24f4d3 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -164,12 +164,6 @@ def forward(self, x, track=None): """ batch_size, max_len, _ = x.shape x = self.norm(x) - if self.use_emb: - positions = torch.arange( - self.num_positions, - device=x.device - )[None, None, :] - x = x[:, :, None, :] + self.emb(positions) x = self.lin_hidden(x) x = self.act(x) p = self.lin_p(x) From 69b346b687e377cceaba81d22d6453acb3c0985a Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Mar 2025 20:04:51 -0400 Subject: [PATCH 229/270] DASB: SQCodec: Update to predict everything autoregressively --- .../TTS/valle/hparams/train_sqcodec.yaml | 14 +++--- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 5 ++- benchmarks/DASB/model/custom_model.py | 22 +++++---- benchmarks/DASB/model/sq_codec.py | 10 +++-- benchmarks/DASB/model/valle.py | 45 +++++++++++-------- 5 files changed, 55 insertions(+), 41 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 0993a052e..7d3cb278e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -98,7 +98,7 @@ text_max_length: 500 n_ctx: !ref + infer_max_audio_length: !ref max_length_ratio: 10.0 -top_k: 1 +top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -170,8 +170,8 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 +flatten: true ternary_num_digits: 10 -ternary_tokenizer_mode: argmax pred_mode: ternary freeze_lm_head: False @@ -180,7 +180,7 @@ freeze_lm_head: False 
model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length vocab_size: !ref - nq: !ref + nq: 1 att_unit: !ref head: !ref ar_layer: !ref @@ -199,7 +199,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions eos: !ref minlenratio: 1.0 maxlenratio: !ref - nq: !ref + nq: 1 top_k: !ref lm_head: !apply:speechbrain.utils.hparams.choice @@ -207,7 +207,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice choices: ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref - num_positions: !ref + num_positions: !ref * tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice @@ -216,15 +216,15 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryLogitTokenizer num_tokens: !ref num_positions: !ref - mode: !ref tokens: !new:torch.nn.Identity emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref + flat: True linear: !new:speechbrain.nnet.linear.Linear - input_size: !ref + input_size: !ref * n_neurons: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index d26b27cca..97e319eb9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -157,7 +157,10 @@ def compute_objectives(self, predictions, batch, stage): if self.train_ar: logits_ar_sm = self.hparams.log_softmax(logits_ar) - targets_ar = prompt[:, 1:, 0] + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] loss_ar = self.hparams.compute_cost( logits_ar_sm, targets=targets_ar, mask=mask ) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 3bf24f4d3..cdfcd5ced 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -188,29 +188,32 @@ class TernaryLogitTokenizer(torch.nn.Module): top_k sampling with k > 1 """ - def __init__(self, num_positions, num_tokens=None, chunk_size=10, mode="probability"): + def __init__(self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10): super().__init__() self.num_positions = num_positions if num_tokens is None: num_tokens = 3 ** num_positions self.num_tokens = num_tokens + self.num_tracks = num_tracks self.chunk_size = chunk_size - self.mode = mode self.register_buffer("vocab", torch.arange(num_tokens)) self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) def forward(self, logits): - if self.mode == "argmax": - return self._probs_argmax(logits) - logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) - chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) + batch_size, max_len, num_positions, _ = logits.shape + logits = logits.softmax(-1) + logits = logits.reshape(batch_size, max_len, self.num_tracks, 1, num_positions // self.num_tracks, 3) + chunks = logits.chunk( + dim=1, + chunks=math.ceil(logits.size(1) / self.chunk_size) + ) token_logits_chunks = [] for chunk in chunks: token_logits_raw = torch.where( self.vocab_ternary[:, None, None, :, :, None] == self.idx, chunk, - 1. 
+ torch.ones_like(chunk) ).prod(-1).log().sum(-1).exp() token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) @@ -219,8 +222,3 @@ def forward(self, logits): dim=1 ) return token_logits - - def _probs_argmax(self, logits): - logit_tokens = ternary_logits_to_tokens(logits, n_codebook=1) - probs = (logit_tokens == self.vocab[None, None, :]).float() - return probs diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 307daaeeb..2c52ee8ac 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1290,9 +1290,10 @@ class TernaryEmbedding(nn.Module): --------- num_digits : int The number of ternary digits""" - def __init__(self, num_digits): + def __init__(self, num_digits, emb_size=512, flat=False): super().__init__() self.num_digits = num_digits + self.flat = flat def forward(self, tokens): """Computes the forward pass @@ -1309,7 +1310,10 @@ def forward(self, tokens): batch_size, max_len, tracks = tokens.shape emb = tokens_to_ternary(tokens, D=self.num_digits).float() positions = emb.size(-1) - emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) + if self.flat: + emb = emb.unsqueeze(-2) + else: + emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) if squeeze: emb = emb.squeeze(-2) return emb @@ -1546,4 +1550,4 @@ def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ter loss = loss.sum(2).sum(1).sum(0) / mask.sum() elif reduction == "batch": loss = loss.sum(2).sum(1) / mask.sum(-1).sum(-1) - return loss \ No newline at end of file + return loss diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index fbaa45b30..619e555ca 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -307,6 +307,9 @@ def inference( ) modality_index = prev_tok.flatten() mask = modality_index_to_mask(modality_index, opts) + tracks = prefix.size(-1) + if opts.nq == 1 and tracks > 1: + prev_tok = prev_tok.unsqueeze(-1).expand(1, 1, tracks) mask_cache = [] modality_tokens = torch.tensor( list(opts.masks.keys()), device=prefix.device @@ -314,11 +317,13 @@ def inference( for step in range(maxlen): # (3.2) AR loop - prev_emb = self.emb(prev_tok) # [B, 1, D] + prev_emb = self.emb(prev_tok).squeeze(2) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] + if logits.dim() < 4: + logits = logits.unsqueeze(-2) gen_tok, gen_score = logits_to_tokens( - logits.unsqueeze(2), + logits, opts, mask, allow_eos=step >= minlen, @@ -408,23 +413,24 @@ def inference( start_token = torch.tensor( [opts.start], device=prefix.device )[None, None, :] - start_emb = self.emb(start_token).squeeze().tile( - len(valid_idx), 1, 1 - ) # [B, 1, D] - prev_emb = torch.cat( - [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 - ) # [B, T, D] - - ones = torch.ones_like(valid_idx) - mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() - mask = mask.unsqueeze(1).unsqueeze(1) - generated = {"token": [], "score": []} - - mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache - vocab_mask = torch.cat(mask_cache, dim=1) # (4.2) NAR loop if self.nq > 1: + start_emb = self.emb(start_token).squeeze().tile( + len(valid_idx), 1, 1 + ) # [B, 1, D] + prev_emb = torch.cat( + [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 + ) # [B, T, D] + + ones = torch.ones_like(valid_idx) + mask = 
length_to_mask(prefix.size(1) + finish_idx + 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) + generated = {"token": [], "score": []} + + mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache + vocab_mask = torch.cat(mask_cache, dim=1) + for step in range(1, opts.nq): h_nar = self.nar_decoder( prev_emb, ones * step - 1, mask=mask @@ -469,8 +475,11 @@ def inference( gen_tokens_list, gen_scores_list = [], [] for b in range(len(valid_idx)): - gen_tokens_list.append(gen_tokens[b][: finish_idx[b]]) - gen_scores_list.append(gen_scores[b][: finish_idx[b]]) + item_finish_idx = finish_idx[b] + if len(item_finish_idx) > 1: + item_finish_idx = item_finish_idx[0] + gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) + gen_scores_list.append(gen_scores[b][:item_finish_idx]) return gen_tokens_list, gen_scores_list From f51b3a8f0d345794a55b165e232e0a7f46581425 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Mar 2025 01:15:41 -0400 Subject: [PATCH 230/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 97e319eb9..872a190c5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -276,7 +276,7 @@ def apply_curriculum(self): else self.modules.model.lm_head ) lm_head.requires_grad_(True) - if self.hparams.audio_tokens_per_step == 1: + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: # NOTE: If there is only one track it's autoregressive self.train_nar = False elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: From 1f05e76c95ce33dd2f5ac0bc42bc5175e6a24717 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Mar 2025 23:05:12 -0400 Subject: [PATCH 231/270] DASB: SQCodec: Fixes, add LibriTTS --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml | 1 + .../TTS/tokotron/hparams/train_speech_tokenizer.yaml | 1 + .../LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 1 + .../LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml | 1 + .../DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml | 1 + .../DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 1 + .../LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 1 + .../LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 1 + .../TTS/valle/hparams/train_speech_tokenizer.yaml | 1 + .../DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 2 +- .../LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml | 1 + benchmarks/DASB/LibriTTS/TTS/valle/train.py | 9 ++++++--- benchmarks/DASB/model/valle.py | 6 +++++- 15 files changed, 24 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index b38a07434..505460dfa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -166,6 +166,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 +flatten: false attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 0cb2012ed..0ff172529 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -165,6 +165,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 +flatten: false bandwidth: 1.5 attention_type: regularMHA diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 715a2d199..b2a5f37dc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -207,6 +207,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 6 +flatten: false freeze_lm_head: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 747e6626e..cae286efd 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -175,6 +175,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false bandwidth: 6 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index c3874b6a7..5aae5e0db 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -173,6 +173,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false espnet_repo: https://github.com/espnet/espnet espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef model_hub: espnet/libritts_encodec_24k diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index b5747d763..edae05d51 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -174,6 +174,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false bandwidth: 6 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 3052dc76b..b7579f092 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -186,6 +186,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 2 +flatten: false # Model Settings model_type: 24khz diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 4c61228a2..7e4b5e0be 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -228,6 +228,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 6 +flatten: false freeze_lm_head: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 6596858b2..c35aaa4f9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -185,6 +185,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false # Model Settings model_hub: facebook/encodec_24khz diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index a23789f15..efd408469 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -184,6 +184,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false # Model Settings espnet_repo: https://github.com/espnet/espnet diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 3b9bd8214..b6f699cf9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -184,6 +184,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false # Model Settings model_hub: fnlp/SpeechTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 1d596c3fa..5ea73a123 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -178,6 +178,7 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 +flatten: false ternary_num_digits: 10 ternary_tokenizer_mode: argmax pred_mode: ternary @@ -222,7 +223,6 @@ lm_head: !apply:speechbrain.utils.hparams.choice d_model: !ref d_hidden: !ref num_positions: !ref - norm: False tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 4e4d13c27..b63fe0d24 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -184,6 +184,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 1 +flatten: false # Model Settings model_hub: novateur/WavTokenizer-medium-music-audio-75token diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 2d3091654..07539443e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -160,7 +160,10 @@ def compute_objectives(self, predictions, batch, stage): if self.train_ar: logits_ar_sm = self.hparams.log_softmax(logits_ar) - targets_ar = prompt[:, 1:, 0] + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] loss_ar = self.hparams.compute_cost( logits_ar_sm, targets=targets_ar, mask=mask ) @@ -288,11 +291,11 @@ def apply_curriculum(self): else self.modules.model.lm_head ) lm_head.requires_grad_(True) - if self.hparams.audio_tokens_per_step == 1: + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: 
# NOTE: If there is only one track it's autoregressive self.train_nar = False elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: - self.train_nar = False + self.train_nar = False elif ( self.hparams.number_of_epochs_nar is not None and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 619e555ca..2223c0991 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -350,7 +350,11 @@ def inference( if torch.any(modality_change_mask): modality_index = torch.where( modality_change_mask, prev_tok[:, 0], modality_index, - ) + ).flatten().squeeze() + if modality_index.dim() == 0: + modality_index = modality_index.unsqueeze(0) + if modality_index.size(0) > 1: + modality_index = modality_index[0:1] mask = modality_index_to_mask(modality_index, opts) logging.warning( f"Step {step}: change modality index {modality_index}" From 9011781577f3c11c184dabaaa6c90ade0a7d8915 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Mar 2025 23:06:04 -0400 Subject: [PATCH 232/270] DASB: SQCodec updates --- .../LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 2 ++ .../LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 13 ++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 7d3cb278e..fb1ca4d33 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -148,6 +148,7 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 512 share_emb: False qk_norm: True nhead: 16 @@ -207,6 +208,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice choices: ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref + d_hidden: !ref num_positions: !ref * tokens: null diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 5ea73a123..ec95ebaf6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -178,9 +178,8 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 -flatten: false +flatten: true ternary_num_digits: 10 -ternary_tokenizer_mode: argmax pred_mode: ternary # Model Settings @@ -194,7 +193,7 @@ freeze_lm_head: False model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length vocab_size: !ref - nq: !ref + nq: 1 att_unit: !ref head: !ref ar_layer: !ref @@ -213,7 +212,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions eos: !ref minlenratio: 1.0 maxlenratio: !ref - nq: !ref + nq: 1 top_k: !ref lm_head: !apply:speechbrain.utils.hparams.choice @@ -222,7 +221,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref d_hidden: !ref - num_positions: !ref + num_positions: !ref * tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice @@ -231,14 +230,14 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryLogitTokenizer num_tokens: !ref num_positions: !ref - mode: !ref tokens: !new:torch.nn.Identity emb: 
!new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref + flat: True linear: !new:speechbrain.nnet.linear.Linear - input_size: !ref + input_size: !ref * n_neurons: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer From 9a7565212560fed07611e6fa9fa43f6048eebe86 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 16 Mar 2025 19:41:39 -0400 Subject: [PATCH 233/270] DASB: VALL-E fixes --- benchmarks/DASB/model/valle.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 2223c0991..110e4ca3d 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -308,7 +308,8 @@ def inference( modality_index = prev_tok.flatten() mask = modality_index_to_mask(modality_index, opts) tracks = prefix.size(-1) - if opts.nq == 1 and tracks > 1: + is_flattened = opts.nq == 1 and tracks > 1 + if is_flattened: prev_tok = prev_tok.unsqueeze(-1).expand(1, 1, tracks) mask_cache = [] modality_tokens = torch.tensor( @@ -342,7 +343,10 @@ def inference( # (3.3) detect modality swtich mask_cache.append(mask.clone()) - modality_change_mask = torch.isin(prev_tok[:, 0], modality_tokens) + mod_tok = prev_tok[:, 0] + if is_flattened: + mod_tok = mod_tok[:, 0] + modality_change_mask = torch.isin(mod_tok, modality_tokens) # Note: The ESPNET VALL-E had # modality_change_mask = torch.logical_and( # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, @@ -484,7 +488,9 @@ def inference( item_finish_idx = item_finish_idx[0] gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) gen_scores_list.append(gen_scores[b][:item_finish_idx]) - + if is_flattened: + gen_tokens_list = [item.squeeze(-2) for item in gen_tokens_list] + gen_scores_list = [item.squeeze(-2) for item in gen_scores_list] return gen_tokens_list, gen_scores_list def apply_lm_head(self, x, track): From add349ad8639af90f704d811009c744d5498c8c5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Mar 2025 11:55:31 -0400 Subject: [PATCH 234/270] DASB: Fixes --- benchmarks/DASB/model/valle.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 110e4ca3d..61fd5bb08 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -310,7 +310,7 @@ def inference( tracks = prefix.size(-1) is_flattened = opts.nq == 1 and tracks > 1 if is_flattened: - prev_tok = prev_tok.unsqueeze(-1).expand(1, 1, tracks) + prev_tok = prev_tok.expand(1, tracks) mask_cache = [] modality_tokens = torch.tensor( list(opts.masks.keys()), device=prefix.device @@ -318,6 +318,8 @@ def inference( for step in range(maxlen): # (3.2) AR loop + if is_flattened: + prev_tok = prev_tok.unsqueeze(1) prev_emb = self.emb(prev_tok).squeeze(2) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] @@ -331,7 +333,7 @@ def inference( nq_level=0, ) # [B, 1, 1] -> [B, 1] - gen_tok, gen_score = gen_tok.squeeze(2), gen_score.squeeze(2) + gen_tok, gen_score = gen_tok.squeeze(1), gen_score.squeeze(1) generated["token"].append(gen_tok) generated["score"].append(gen_score) @@ -343,10 +345,7 @@ def inference( # (3.3) detect modality swtich mask_cache.append(mask.clone()) - mod_tok = prev_tok[:, 0] - if is_flattened: - mod_tok = mod_tok[:, 0] - modality_change_mask = torch.isin(mod_tok, modality_tokens) + modality_change_mask = torch.isin(prev_tok[:, 0], 
modality_tokens) # Note: The ESPNET VALL-E had # modality_change_mask = torch.logical_and( # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, @@ -484,13 +483,8 @@ def inference( gen_tokens_list, gen_scores_list = [], [] for b in range(len(valid_idx)): item_finish_idx = finish_idx[b] - if len(item_finish_idx) > 1: - item_finish_idx = item_finish_idx[0] gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) gen_scores_list.append(gen_scores[b][:item_finish_idx]) - if is_flattened: - gen_tokens_list = [item.squeeze(-2) for item in gen_tokens_list] - gen_scores_list = [item.squeeze(-2) for item in gen_scores_list] return gen_tokens_list, gen_scores_list def apply_lm_head(self, x, track): From 331bad099ffd989a2d98b6e1a19653333e92505f Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Mar 2025 21:38:40 -0400 Subject: [PATCH 235/270] DASB: Train dataset data loader fix --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 07539443e..d0208518b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -23,11 +23,13 @@ length_to_mask, write_audio, ) +from speechbrain.dataio.dataloader import LoopedLoader from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset from functools import partial +from torch.utils.data import DataLoader import re import string @@ -648,11 +650,14 @@ def fit( "Test only mode, skipping training and validation stages." ) return - + if not ( + isinstance(train_set, DataLoader) + or isinstance(train_set, LoopedLoader) + ): + train_set = self.make_dataloader( + train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs + ) self.on_fit_start() - train_set = self.make_dataloader( - train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs - ) epoch = self.hparams.epoch_counter.current if epoch < self.hparams.number_of_epochs: valid_set = sample_dataset( @@ -892,7 +897,6 @@ def sig_pipeline(wav): raise NotImplementedError( "sorting must be random, ascending or descending" ) - return datasets, resample_fn From 17ebf5d4ffef409b5fd5a719d6bbd27e84358f84 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 28 Mar 2025 02:29:51 -0400 Subject: [PATCH 236/270] DASB: Add a fallback for hparams files --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d0208518b..94dd6c746 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1251,6 +1251,8 @@ def undo_padding_tensor(batch, lengths): # Load evaluation hyperparameters eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" if eval_hparams_file.exists(): logger.info( "Using evaluation hyperparameters from %s", eval_hparams_file From 47744ab075cd76831503542f29a8de9aaad9edff Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 28 Mar 2025 02:46:43 -0400 Subject: [PATCH 237/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 7b61a18a7..9359a2b24 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -185,6 +185,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false # Model Settings model_hub: kyutai/mimi From fa87f1dae5752d3fdcbbf8235d6303edf50f4722 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 16 Apr 2025 16:21:48 -0400 Subject: [PATCH 238/270] DASB: Fix the summary.json check --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 7d99c5c7d..71ca5b37b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -1015,7 +1015,7 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + test_summary_file = Path(hparams["output_folder"]).glob("eval/test/*/summary.json") if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: From bb08a3ae1c9394506e06c71cf88d890988d037aa Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 16 Apr 2025 16:30:21 -0400 Subject: [PATCH 239/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 71ca5b37b..323134e90 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -1015,8 +1015,8 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]).glob("eval/test/*/summary.json") - if test_summary_file.exists(): + test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + if test_summary_file is not None: logging.info("Test run already completed: %s", test_summary_file) else: test_key_kind = hparams["test_key_kind"] From 33daea82a17dc57a465d3673d3d42550324a20e0 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 20 Apr 2025 08:55:51 -0400 Subject: [PATCH 240/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 94dd6c746..d0abffc47 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1336,8 +1336,8 @@ def undo_padding_tensor(batch, lengths): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" - if test_summary_file.exists(): + test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + if test_summary_file is not None: logging.info("Test run already completed: %s", test_summary_file) else: test_key_kind = hparams["test_key_kind"] From d27a9ef32ea3f44ea0a7960acae1fc073ca36b03 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 21 Apr 2025 15:23:00 -0400 Subject: [PATCH 
241/270] DASB: Add memory fraction (to share a large GPU) --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 17 +++++++++++++++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 872a190c5..c932fc872 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -13,6 +13,7 @@ import logging import speechbrain as sb +import os import torch import sys import shutil @@ -727,6 +728,19 @@ def get_offsets(vocab_size, tracks): return torch.arange(tracks) * vocab_size +def apply_mem_fraction(): + """Applies the memory fraction, based on environment variables, useful for cases where + multiple experiments share a large GPU""" + if not torch.cuda.is_available(): + return + mem_fraction = os.environ.get("SB_CUDA_MEM_FRACTION") + if mem_fraction: + fraction, device = mem_fraction.split(":") + fraction, device = float(fraction), int(device) + logger.info("Using %f of GPU %f", fraction, device) + torch.cuda.set_per_process_memory_fraction(fraction, device) + + def init_sequence_encoder(hparams): """Initialize a sequence encoder @@ -895,6 +909,9 @@ def undo_padding_tensor(batch, lengths): # Initialize ddp (useful only for multi-GPU DDP training) sb.utils.distributed.ddp_init_group(run_opts) + # Applies the memory fraction for a shared GPU + apply_mem_fraction() + # Load hyperparameters file with command-line overrides with open(hparams_file) as fin: yaml = fin.read() diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d0abffc47..738ac2a3f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -31,6 +31,7 @@ from functools import partial from torch.utils.data import DataLoader import re +import os import string base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) @@ -941,6 +942,19 @@ def get_offsets(vocab_size, tracks): return torch.arange(tracks) * vocab_size +def apply_mem_fraction(): + """Applies the memory fraction, based on environment variables, useful for cases where + multiple experiments share a large GPU""" + if not torch.cuda.is_available(): + return + mem_fraction = os.environ.get("SB_CUDA_MEM_FRACTION") + if mem_fraction: + fraction, device = mem_fraction.split(":") + fraction, device = float(fraction), int(device) + logger.info("Using %f of GPU %f", fraction, device) + torch.cuda.set_per_process_memory_fraction(fraction, device) + + def group_by_speaker(dataset, hparams): """Groups utterance IDs in a dataset by speaker, for selection. 
The selection is stable based on the seed - calling this method multiple times will always @@ -1245,6 +1259,9 @@ def undo_padding_tensor(batch, lengths): # Initialize ddp (useful only for multi-GPU DDP training) sb.utils.distributed.ddp_init_group(run_opts) + # Applies the memory fraction for a shared GPU + apply_mem_fraction() + # Load hyperparameters file with command-line overrides with open(hparams_file) as fin: yaml = fin.read() From 4ae6e86dff07b87325b54e6a191678b205fbf214 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 21 Apr 2025 18:23:15 -0400 Subject: [PATCH 242/270] DASB: Fix kmeans path conflicts --- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index b2a5f37dc..bba258f8d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -233,7 +233,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref + save_path: !ref / ssl_model: !ref vocoder_repo_id: !ref kmeans_dataset: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 7e4b5e0be..e7e4657aa 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -254,7 +254,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref + save_path: !ref / ssl_model: !ref vocoder_repo_id: !ref kmeans_dataset: !ref From 27a460875fe948b67c109c870acf3bcc402ba851 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 26 Apr 2025 22:55:01 -0400 Subject: [PATCH 243/270] DASB: Mimi fix --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 9359a2b24..40d3f03f6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -209,7 +209,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref eos: !ref - minlenratio: 1.0 + minlenratio: 0.0 maxlenratio: !ref nq: !ref From 6de5acbd32962bd97b9e069128cd9c077b2cae38 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 13 May 2025 20:23:11 -0400 Subject: [PATCH 244/270] DASB: WER/CER fix --- benchmarks/DASB/utils/eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 76f2a6c2f..da5d71ddb 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -483,8 +483,10 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): predicted_words = [self.normalize(text) for text in predicted_words] ids = range(1, len(wavs) + 1) wer_metric, cer_metric = init_asr_metrics() - wer_metric.append(ids, predicted_words, text) - cer_metric.append(ids, predicted_words, text) + 
predicted_words_split = [item.split(" ") for item in predicted_words] + text_split = [item.split(" ") for item in text] + wer_metric.append(ids, predicted_words_split, text_split) + cer_metric.append(ids, predicted_words_split, text_split) wer = torch.tensor( [score["WER"] for score in wer_metric.scores], device=wavs.device ) From 7210b3c7ccbf39a1dd1cef1be2f7edde46c557a6 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 14 May 2025 09:21:09 -0400 Subject: [PATCH 245/270] WER/CER fixes --- benchmarks/DASB/utils/eval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index da5d71ddb..0bd3cac30 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -330,6 +330,8 @@ def compute_diff_rate(self, details, device): wer_metric, cer_metric = init_asr_metrics() pred = self._replace_blanks(details["pred"]) pred_ref = self._replace_blanks(details["pred_ref"]) + pred = [item.split(" ") for item in pred] + pred_ref = [item.split(" ") for item in pred_ref] wer_metric.append(ids, pred, pred_ref) cer_metric.append(ids, pred, pred_ref) dwer = torch.tensor( From 4c5dba53d52c61cf893b3bb2ae48dc41817cc6f4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 16 May 2025 11:23:57 -0400 Subject: [PATCH 246/270] DASB: VALL-E: Added an option to do preparation only without training --- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 1 + .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_encodec.yaml | 1 + .../valle/hparams/train_espnet_encodec.yaml | 1 + .../TTS/valle/hparams/train_mimi.yaml | 1 + .../valle/hparams/train_speech_tokenizer.yaml | 1 + .../TTS/valle/hparams/train_sqcodec.yaml | 1 + .../TTS/valle/hparams/train_wavtokenizer.yaml | 1 + benchmarks/DASB/LibriTTS/TTS/valle/train.py | 93 ++++++++++--------- 9 files changed, 55 insertions(+), 46 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index b7579f092..c3cc4a750 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -76,6 +76,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index e7e4657aa..4ef2d230c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -95,6 +95,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index c35aaa4f9..df8513cb8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -75,6 +75,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 
efd408469..3587b3b23 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -75,6 +75,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 40d3f03f6..16348d9e7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -76,6 +76,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index b6f699cf9..ac8172585 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -75,6 +75,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index ec95ebaf6..c49c6d88c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -75,6 +75,7 @@ max_grad_norm: 0.01 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index b63fe0d24..22b8c19f3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -75,6 +75,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 738ac2a3f..7014131cc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1321,51 +1321,52 @@ def undo_padding_tensor(batch, lengths): }, ) - # We can now directly create the datasets for training, valid, and test - datasets, resample_fn = dataio_prepare(hparams) - - # Apply overfit test settings - datasets = apply_overfit_test(hparams, datasets) - audio_keys = ["audio_tokens"] - - # Trainer initialization - tts_brain = VALLEBrain( - modules=hparams["modules"], - opt_class=hparams["opt_class"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) + if not hparams.get("prep_only"): + # We can now directly create the datasets for training, valid, and test + datasets, resample_fn = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + 
opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) - tts_brain.resample_fn = resample_fn - - # The `fit()` method iterates the training loop, calling the methods - # necessary to update the parameters of the model. Since all objects - # with changing state are managed by the Checkpointer, training can be - # stopped at any point, and will be resumed on next call. - tts_brain.fit( - tts_brain.hparams.epoch_counter, - datasets["train"], - datasets["valid"], - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) - # Load best checkpoint for evaluation - if hparams["testing"]: - test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) - if test_summary_file is not None: - logging.info("Test run already completed: %s", test_summary_file) - else: - test_key_kind = hparams["test_key_kind"] - test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } - eval_dataset = datasets["test"] - eval_dataset = select_eval_subset(eval_dataset, hparams) - tts_brain.evaluate( - test_set=eval_dataset, - test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs - ) + # Load best checkpoint for evaluation + if hparams["testing"]: + test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + if test_summary_file is not None: + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + eval_dataset = datasets["test"] + eval_dataset = select_eval_subset(eval_dataset, hparams) + tts_brain.evaluate( + test_set=eval_dataset, + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) From 7fec49f8bda36ef35412271e8e8f6c582996dded Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 17 May 2025 00:08:13 -0400 Subject: [PATCH 247/270] DASB: VALL-E: Add a duration filter --- .../DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 5 +++++ .../TTS/valle/hparams/train_discrete_ssl.yaml | 5 +++++ .../LibriTTS/TTS/valle/hparams/train_encodec.yaml | 5 +++++ .../TTS/valle/hparams/train_espnet_encodec.yaml | 6 ++++++ .../DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 5 +++++ .../TTS/valle/hparams/train_speech_tokenizer.yaml | 5 +++++ .../LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 5 +++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 5 +++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 13 +++++++++++++ 9 files changed, 54 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index c3cc4a750..cc52722b1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -54,6 +54,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", 
"test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 4ef2d230c..7bf2be75f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -73,6 +73,11 @@ spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index df8513cb8..0114bdcb1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -53,6 +53,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 3587b3b23..f33998f26 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -53,6 +53,12 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 16348d9e7..daf37c2a7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -54,6 +54,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index ac8172585..407562365 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -53,6 +53,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index c49c6d88c..66fb3535a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -53,6 +53,11 @@ flip_layers: False use_token_offsets: False splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 22b8c19f3..7780a3fc3 100644 --- 
a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -53,6 +53,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 7014131cc..26758e076 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -874,6 +874,19 @@ def sig_pipeline(wav): dynamic_dataset = dynamic_dataset.filtered_sorted( key_test={"has_alignments": lambda value: value} ) + duration_min = hparams.get("duration_min") + duration_max = hparams.get("duration_max") + if duration_min or duration_max: + key_min_value = None + key_max_value = None + if duration_min: + key_min_value = {"duration": duration_min} + if duration_max: + key_max_value = {"duration": duration_max} + dynamic_dataset = dynamic_dataset.filtered_sorted( + key_min_value=key_min_value, + key_max_value=key_max_value, + ) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False From 3d67a55d711b4ac0ec7addc73be33b6c2bc148f5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 17 May 2025 16:58:02 -0400 Subject: [PATCH 248/270] DASB: A fix for broken annotations --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 26758e076..9acb98831 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -887,6 +887,14 @@ def sig_pipeline(wav): key_min_value=key_min_value, key_max_value=key_max_value, ) + dynamic_dataset = dynamic_dataset.filtered_sorted( + key_test={ + "wrd": lambda wrd: not any( + "{" in item + for item in wrd + ) + } + ) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False From bd47e648d17a5ebd3ae0699048e055777715d584 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 20 May 2025 18:48:42 -0400 Subject: [PATCH 249/270] DASB: Minor fix for backward compatibility --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index ec1845d36..5b9082da5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -935,8 +935,8 @@ def apply_overfit_test(hparams, dataset): logging.info("Test run already completed: %s", test_summary_file) else: eval_kwargs = {} - test_key_kind = hparams["test_key_kind"] - test_key = hparams["test_key"] + test_key_kind = hparams.get("test_key_kind", "min") + test_key = hparams.get("test_key") if test_key: eval_kwargs = { f"{test_key_kind}_key": test_key From 42ecf13826e57f3f405ca5fc4292667361470710 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 22 May 2025 15:10:00 -0400 Subject: [PATCH 250/270] DASB: Add inference grid search and micro dWER --- .../DASB/LJSpeech/TTS/valle/evaluation.py | 2 + .../DASB/LibriTTS/TTS/valle/evaluation.py | 5 + .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 24 +- .../valle/hparams/train_espnet_encodec.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/inference_fit.py | 348 ++++++++++++++++++ benchmarks/DASB/utils/eval.py | 70 +++- 6 files changed, 
441 insertions(+), 10 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py index 6c2dd1c8d..d5aaa649d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -75,6 +75,8 @@ def on_evaluation_start(self, output_folder="eval"): self.read_reports() self.create_reports() self.item_ids = [] + for evaluator_key in self.enabled_evaluators: + self.evaluators[evaluator_key].on_evaluation_start() def on_evaluation_end(self): """Invoked at the beginning of the evaluation cycle. The default diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py index 9fd6da808..ebb619757 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -241,6 +241,11 @@ def summarize(self, field=None): items=self.details[evaluator_key], key=metric_key, ).items() } + for evaluator_key in self.enabled_evaluators: + result.update({ + f"{evaluator_key}_{stat_key}": value + for stat_key, value in + self.evaluators[evaluator_key].global_metrics().items()}) if field is not None: result = result[field] return result diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index 129cf9337..c58c1d49b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -8,6 +8,7 @@ eval_subset: null eval_asr_beam_size: 66 eval_asr_type: encoder_decoder eval_asr_source: openai/whisper-small +eval_asr_metric_mode: micro eval_spk_sim_source: microsoft/wavlm-base-sv evaluations: utmos,asr,spk_sim tmp_folder: null @@ -19,11 +20,17 @@ eval_utmos_domain_id: null eval_utmos_judge_id: null eval_perf: False +# Inference Fit +inference_fit_top_k: [20, 30] +inference_fit_sampling_temperature: [0.7, 0.8, 1.0, 1.2] +inference_fit_key_metric: dwer +inference_fit_key_metric_kind: min eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator source: !ref sample_rate: !ref savedir: !ref + metric_mode: !ref eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator source: !ref @@ -51,7 +58,22 @@ eval_summary: spk_sim: descriptive: ["score"] +dwer_metric_key: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + macro: asr_dwer_median + micro: asr_dwer_micro + eval_summary_log: utmos: utmos_utmos_mean - dwer: asr_dwer_median + dwer: !ref + spk_sim: spk_sim_score_mean + +inference_fit_space: + top_k: !ref + sampling_temperature: !ref + +inference_fit_metrics: + utmos: utmos_utmos_mean + dwer: !ref spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index f33998f26..bc485eaa9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -264,4 +264,4 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref + seed: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py new file mode 100644 index 000000000..e2f02e95a --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py 
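Note on the new file below: inference_fit.py runs a small grid search over the values listed in inference_fit_space (top_k x sampling_temperature from eval.yaml), scores each combination on the evaluation subset, and keeps the setting that optimizes inference_fit_key_metric. As a rough standalone sketch of the enumeration it performs (illustration only, not part of the diff; the parameter names and values are assumed from eval.yaml):

    # Illustration: cartesian enumeration of an inference-fit search space,
    # equivalent in spirit to enumerate_space() defined in inference_fit.py below.
    from itertools import product

    space = {"top_k": [20, 30], "sampling_temperature": [0.7, 0.8, 1.0, 1.2]}

    def enumerate_space(space):
        keys = list(space.keys())
        return [dict(zip(keys, vals)) for vals in product(*space.values())]

    for params in enumerate_space(space):
        print(params)  # e.g. {'top_k': 20, 'sampling_temperature': 0.7}
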
@@ -0,0 +1,348 @@ +"""Inference fit grid search for VALL-E + +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e + +Authors + * Artem Ploujnikov 2024 +""" + +import speechbrain as sb +import sys +import csv +import torch +import operator +import yaml + +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path +from torch import nn +from tqdm.auto import tqdm +from types import SimpleNamespace +from speechbrain.dataio.dataio import clean_padding +from speechbrain.utils.logger import get_logger +from speechbrain.utils.data_utils import batch_pad_right, pad_right_to + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats # noqa: E402 +from train import undo_padding_tensor, get_offsets # noqa: E402 + +logger = get_logger(__name__) + +class InferenceFit: + """An inference fit wrapper""" + def __init__(self, hparams, run_opts): + device = run_opts.get("device", "cpu") + self.hparams = SimpleNamespace(**hparams) + self.modules = nn.ModuleDict(self.hparams.modules).to(device) + self.device = device + self.space = self.hparams.inference_fit_space + self.result = None + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + self.offsets = get_offsets( + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) + self.output_folder_rel = "eval/inference_fit" + self.output_folder = Path(self.hparams.output_folder) / self.output_folder_rel + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + + def fit(self, dataset): + """Performs infernece fitting + + Arguments + --------- + dataset: DynamicItemDataset + a dataset + + Returns + ------- + result: dict + the fit result + """ + self.result = [] + self.recover() + logger.info("Parameter Space: %s", format_space(self.space)) + evaluations = self.enumerate_param_space() + for idx, params in enumerate(tqdm(evaluations, desc="Parameter space")): + eval_result = self.evaluate(dataset, params) + self.result.append({"idx": idx, **params, **eval_result}) + self.best = self.find_best() + return self.result, self.best + + def find_best(self): + best = self.result[0] + op = ( + operator.lt + if self.hparams.inference_fit_key_metric_kind == "min" + else operator.gt + ) + for item in self.result[1:]: + value = item[self.hparams.inference_fit_key_metric] + if op(value, best[self.hparams.inference_fit_key_metric]): + best = item + return best + + def enumerate_param_space(self): + return enumerate_space(self.space) + + def evaluate(self, dataset, params): + dataloader = sb.dataio.dataloader.make_dataloader(dataset) + params_str = format_params(params) + logger.info("Starting evaluation of %s", params_str) + folder_name = params_to_folder_name(params) + self.evaluation_metric.on_evaluation_start(f"{self.output_folder_rel}/{folder_name}") + for batch in tqdm(dataloader, desc="Evaluation run", total=len(dataset)): + self.evaluate_batch(batch, params) + logger.info("Finished evaluation of %s", params_str) + self.evaluation_metric.on_evaluation_end() + summary = self.evaluation_metric.summarize() + metrics = { + key: summary.get(value, 0.0) + for key, value in self.hparams.inference_fit_metrics.items() + } + return metrics + + def evaluate_batch(self, batch, params): + audio_tokens, audio_length = self.inference(batch, params) + wav = 
self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + + def write_report(self): + if self.result is None: + logger.warning("Nothing to report") + return + + report_file_name = self.output_folder / "results.csv" + report_file_name.parent.mkdir(parents=True, exist_ok=True) + with open(report_file_name, "w") as report_file: + columns = next(iter(self.result)).keys() + writer = csv.DictWriter(report_file, columns) + writer.writeheader() + for result in self.result: + writer.writerow(result) + best_file_name = self.output_folder / "best.yaml" + with open(best_file_name, "w") as best_file: + yaml.dump(self.best, best_file) + + def inference(self, batch, params): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = self.modules.model.inference + inference_results = [ + inference( + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts(params) + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + self._pad_inferred_sample(result) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio_length = audio_length.to(self.device) + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + # TODO: Duplicated in train, consider refactoring + def _pad_inferred_sample(self, result): + """Applies length padding to an inference result + + Arguments + --------- + result : list + The VALL-E Inference output + + Returns + ------- + sample : torch.Tensor + A sample, padded if needed + """ + if result[0]: + sample = result[0][0] + else: + sample = torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) + min_length = getattr(self.hparams, "infer_min_length", 10) + sample_length, tracks = sample.shape + if sample_length < min_length: + sample = pad_right_to( + sample, + (min_length, tracks), + )[0] + return sample + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + tokenizer = self.modules.tokenizer + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav + + def _get_inference_opts(self, params): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) + track_start = ( + self.hparams.audio_token_shift + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = 
track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True + return self.hparams.inference_opts( + masks={self.hparams.bos_index: mask}, + **params, + device=self.device, + ) + + def recover(self): + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + kwargs = { + f"{test_key_kind}_key": test_key + } + logger.info("Revovering a checkpoint") + ckpt = self.hparams.checkpointer.recover_if_possible(**kwargs) + if not ckpt: + logger.error("Checkpoint not found - cannot evaluate") + raise ValueError("No checkpoint available") + logger.info("Checkpoint recovered: %s", ckpt) + + +def enumerate_space(space, entry=None, points=None): + if points is None: + points = [] + if not space: + points.append(entry) + return points + if entry is None: + entry = {} + key, values = next(iter(space.items())) + rest = dict(space) + del rest[key] + for value in values: + enumerate_space(rest, {**entry, key: value}, points) + return points + + +def format_space(space): + return ", ".join( + f"{parameter}: {values}" + for parameter, values in space.items() + ) + + +def format_params(params): + return ", ".join( + f"{key}={value}" + for key, value in params.items() + ) + + +def params_to_folder_name(params): + params_str = "-".join( + f"{key}-{value}" + for key, value in params.items() + ) + return f"eval-{params_str}" + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml_content = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml_content = "\n".join([yaml_content, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml_content, overrides, overrides_must_match=True) + from train import dataio_prepare + datasets, _ = dataio_prepare(hparams) + dataset = datasets["valid"] + + inference_fit = InferenceFit(hparams, run_opts) + inference_fit.fit(dataset) + inference_fit.write_report() diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 0bd3cac30..6f4d4f808 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -204,6 +204,17 @@ def resample(self, audio, sample_rate=None): ) return audio + def on_evaluation_start(self): + """Invoked when evaluation starts""" + pass + + def on_evaluation_end(self): + """Invoked when evaluation ends""" + pass + + def global_metrics(self): + return {} + def _unbatchify(value): """Removes the batch dimension from the tensor. 
If a single @@ -238,7 +249,24 @@ def __call__(self, wavs, length): class ASRSpeechEvaluator(SpeechEvaluator): - """A superclass for ASR speech evaluators""" + """A superclass for ASR speech evaluators + + Arguments + --------- + sample_rate : int + The sample rate used by the underlying ASR system + metric_mode : str + macro = metrics are evaluated per utterance and aggregated + micro = metrics are evaluated globally + """ + + def __init__(self, sample_rate=16000, metric_mode="macro"): + super().__init__(sample_rate=sample_rate) + self.metric_mode = metric_mode + self.metrics = {} + + def on_evaluation_start(self): + self.metrics = {} def evaluate( self, @@ -327,21 +355,32 @@ def compute_diff_rate(self, details, device): """ ids = range(1, len(details["pred"]) + 1) - wer_metric, cer_metric = init_asr_metrics() + wer_metric, cer_metric = self.get_asr_metrics("diff") pred = self._replace_blanks(details["pred"]) pred_ref = self._replace_blanks(details["pred_ref"]) pred = [item.split(" ") for item in pred] pred_ref = [item.split(" ") for item in pred_ref] wer_metric.append(ids, pred, pred_ref) cer_metric.append(ids, pred, pred_ref) + count = len(ids) dwer = torch.tensor( - [score["WER"] for score in wer_metric.scores], device=device + [score["WER"] for score in wer_metric.scores[-count:]], device=device ) dcer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=device + [score["WER"] for score in cer_metric.scores[-count:]], device=device ) return {"dwer": dwer, "dcer": dcer} + def get_asr_metrics(self, kind="regular"): + if self.metric_mode == "micro": + if kind not in self.metrics: + metrics = init_asr_metrics() + self.metrics[kind] = metrics + metrics = self.metrics[kind] + else: + metrics = init_asr_metrics() + return metrics + def _replace_blanks(self, preds): """Replaces blanks with single spaces, preventing an exception in the case of an unintelligible sample @@ -351,6 +390,19 @@ def _replace_blanks(self, preds): """ return [" " if item == "" else item for item in preds] + def global_metrics(self): + global_metrics = {} + if self.metric_mode == "micro": + wer_metric, cer_metric = self.get_asr_metrics("diff") + if wer_metric.scores: + global_metrics["wer_micro"] = wer_metric.summarize("WER") + global_metrics["cer_micro"] = cer_metric.summarize("WER") + dwer_metric, dcer_metric = self.get_asr_metrics("diff") + if dwer_metric.scores: + global_metrics["dwer_micro"] = dwer_metric.summarize("WER") + global_metrics["dcer_micro"] = dcer_metric.summarize("WER") + return global_metrics + class WhisperASRSpeechEvaluator(ASRSpeechEvaluator): """A speech evaluator implementation based on Whisper ASR @@ -383,12 +435,13 @@ def __init__( source, savedir=None, sample_rate=22050, + metric_mode="macro", min_decode_ratio=0.0, max_decode_ratio=1.0, run_opts=None, unbatch=True, ): - super().__init__(sample_rate=sample_rate) + super().__init__(sample_rate=sample_rate, metric_mode=metric_mode) if run_opts is None: run_opts = {} if savedir is None: @@ -484,16 +537,17 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): ) predicted_words = [self.normalize(text) for text in predicted_words] ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = init_asr_metrics() + wer_metric, cer_metric = self.get_asr_metrics() predicted_words_split = [item.split(" ") for item in predicted_words] text_split = [item.split(" ") for item in text] wer_metric.append(ids, predicted_words_split, text_split) cer_metric.append(ids, predicted_words_split, text_split) + count = len(ids) wer = torch.tensor( 
- [score["WER"] for score in wer_metric.scores], device=wavs.device + [score["WER"] for score in wer_metric.scores[-count:]], device=wavs.device ) cer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=wavs.device + [score["WER"] for score in cer_metric.scores[-count:]], device=wavs.device ) return { "wer": wer, From d6150c310c3e51a3bc1c5e1d615b3c42e1d8009b Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 23 May 2025 20:07:46 -0400 Subject: [PATCH 251/270] DASB: Add ASR-based selection + minor updates --- .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 3 +- .../valle/hparams/train_espnet_encodec.yaml | 22 +- .../DASB/LibriTTS/TTS/valle/inference_fit.py | 3 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 48 ++++- benchmarks/DASB/model/valle.py | 189 +++++++++++++++++- benchmarks/DASB/utils/data.py | 33 +++ 6 files changed, 282 insertions(+), 16 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index c58c1d49b..8fc302473 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -1,5 +1,6 @@ -eval_dataset: valid +eval_dataset: test eval_suffix: "" +eval_folder: null eval_sample_rate: 16000 eval_spk_sim_sample_rate: 16000 eval_samples: null diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index bc485eaa9..59c31bb99 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -110,6 +110,7 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 debug_infer_max_audio_length: 10 # Label encoder @@ -157,6 +158,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + ####################### Model parameters ########################### # Transformer @@ -222,6 +229,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface source: !ref @@ -264,4 +272,16 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref \ No newline at end of file + seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index e2f02e95a..79fcea5d6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -339,9 +339,10 @@ def params_to_folder_name(params): eval_hparams_file, ) hparams = load_hyperpyyaml(yaml_content, overrides, overrides_must_match=True) - from train import dataio_prepare + from train import dataio_prepare, select_eval_subset # noqa datasets, _ = dataio_prepare(hparams) dataset = datasets["valid"] + dataset = 
select_eval_subset(dataset, hparams) inference_fit = InferenceFit(hparams, run_opts) inference_fit.fit(dataset) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 9acb98831..1030c1b9f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -38,6 +38,7 @@ sys.path.append(base_dir) from evaluation import SpeechEvaluationMetricStats # noqa: E402 +from model.valle import DefaultSampleSelector logger = logging.getLogger(__name__) @@ -273,7 +274,7 @@ def on_stage_start(self, stage, epoch): self.evaluation_metric.on_evaluation_start() self.is_evaluating = True else: - logger.info("No evaluation on epoch %d", epoch) + logger.info("No evaluation on epoch %d", epoch) elif stage == sb.Stage.TEST: self.evaluation_metric.on_evaluation_start() self.is_evaluating = True @@ -282,6 +283,22 @@ def on_stage_start(self, stage, epoch): ) dataset = stage.name.lower() self.resample_fn[dataset](epoch=epoch or 0) + self.init_sample_selector(stage) + + def init_sample_selector(self, stage): + """Initializes the sample selector""" + if stage == sb.Stage.TRAIN: + self.sample_selector = None + else: + sample_selector = getattr( + self.hparams, "sample_selector", None + ) + if not sample_selector: + sample_selector = DefaultSampleSelector + self.sample_selector = sample_selector( + token_shift=self.hparams.audio_token_shift, + offsets=self.offsets + ) def apply_curriculum(self): """Applies curriculum settings, if specified, training only the autoregressive part - or @@ -484,13 +501,23 @@ def inference(self, batch): self.modules.model.module.inference if hasattr(self.modules.model, "module") else self.modules.model.inference - ) + ) + logger.info("Running inference") inference_results = [ inference( prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() ) for prefix_item in prefix_items ] + logger.info("Running selection") + inference_results = [ + self.sample_selector.select( + tokens, + scores, + label + ) + for (tokens, scores), label in zip(inference_results, batch.label_norm_eval) + ] inferred_tokens = [ self._pad_inferred_sample(result) for result in inference_results @@ -513,8 +540,8 @@ def _pad_inferred_sample(self, result): sample : torch.Tensor A sample, padded if needed """ - if result[0]: - sample = result[0][0] + if result is not None: + sample = result else: sample = torch.zeros( 1000, self.hparams.audio_tokens_per_step, device=self.device @@ -579,8 +606,15 @@ def save_eval(self, stage): def _get_eval_output_folder(self, stage): epoch = self.hparams.epoch_counter.current + eval_folder_name = None + if stage == sb.Stage.TEST and self.hparams.eval_folder: + eval_folder_name = self.hparams.eval_folder + if not eval_folder_name: + eval_folder_name = stage.name.lower() + if self.hparams.eval_suffix: + eval_folder_name += self.hparams.eval_suffix output_folder = ( - Path(self.hparams.output_folder) / "eval" / stage.name.lower() + Path(self.hparams.output_folder) / "eval" / eval_folder_name ) if epoch is not None: output_folder = output_folder / str(epoch) @@ -1384,7 +1418,9 @@ def undo_padding_tensor(batch, lengths): eval_kwargs = { f"{test_key_kind}_key": test_key } - eval_dataset = datasets["test"] + eval_dataset_key = hparams["eval_dataset"] + logger.info("Performing final evaluation on the %s dataset", eval_dataset_key) + eval_dataset = datasets[eval_dataset_key] eval_dataset = select_eval_subset(eval_dataset, hparams) tts_brain.evaluate( test_set=eval_dataset, diff --git 
a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 61fd5bb08..665a5a570 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -12,11 +12,16 @@ # Implementation of Vall-E: https://arxiv.org/abs/2301.02111 +from io import StringIO import logging +import re +import string import torch import inspect +import torchaudio from typing import Tuple, Optional from speechbrain.dataio.dataio import length_to_mask +from speechbrain.utils.metric_stats import ErrorRateStats from torch import Tensor from torch import nn @@ -24,6 +29,13 @@ from dataclasses import dataclass from speechbrain.nnet.losses import reduce_loss, truncate +from speechbrain.lobes.models.huggingface_transformers import Whisper +from speechbrain.decoders.seq2seq import S2SWhisperGreedySearcher +from speechbrain.utils.data_utils import batch_pad_right +from speechbrain.utils.logger import get_logger +from utils.data import undo_padding_tensor + +logger = get_logger(__name__) @dataclass @@ -353,11 +365,13 @@ def inference( if torch.any(modality_change_mask): modality_index = torch.where( modality_change_mask, prev_tok[:, 0], modality_index, - ).flatten().squeeze() - if modality_index.dim() == 0: - modality_index = modality_index.unsqueeze(0) - if modality_index.size(0) > 1: - modality_index = modality_index[0:1] + ) + if is_flattened: + modality_index = modality_index.flatten().squeeze() + if modality_index.dim() == 0: + modality_index = modality_index.unsqueeze(0) + if modality_index.size(0) > 1: + modality_index = modality_index[0:1] mask = modality_index_to_mask(modality_index, opts) logging.warning( f"Step {step}: change modality index {modality_index}" @@ -486,10 +500,10 @@ def inference( gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) gen_scores_list.append(gen_scores[b][:item_finish_idx]) return gen_tokens_list, gen_scores_list - + def apply_lm_head(self, x, track): """Applies the language model head - + Arguments --------- """ @@ -1228,3 +1242,164 @@ def masked_nll_loss( loss *= mask loss = reduce_loss(loss, mask, reduction, 0.0, log_probabilities, targets) return loss + + +class SampleSelector: + """A base class for sample selectors""" + + def select(self, tokens, scores, label): + """Performs selection + + Arguments + --------- + tokens : list + The generated tokens + + scores : list + The scores + + label : str + The label for the sample + """ + raise NotImplementedError() + + +class DefaultSampleSelector(SampleSelector): + def __init__(self, **kwargs): + pass + + def select(self, tokens, scores, text): + return tokens[0] + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +class WhisperASRSampleSelector(SampleSelector): + """A selector implemented using Whisper + + Arguments + --------- + tokenizer: BaseTokenizer + A tokenizer interface + source : str + The source for the Whisper model + savedir : str + The path where the Whisper model will be saved + model : Whisper + Alternatively, a pre-initialized Whisper model instance + sample_rate : int + The sample rate of the underlying Whisper model + tokenizer_sample_rate : int + The sample rate of the tokenizer provided + min_decode_ratio : float + The minimum decode ratio for ASR + max_decode_ratio : float + The maximum decode ratio for ASR + language : str + The ASR language + debug : bool + Whether debug mode is enabled. 
This will trigger + more verbose logging, including a WER report + """ + def __init__( + self, + tokenizer, + source=None, + savedir=None, + model=None, + sample_rate=16000, + tokenizer_sample_rate=16000, + min_decode_ratio=0.0, + max_decode_ratio=1.0, + language="english", + token_shift=0, + offsets=None, + debug=False + ): + self.tokenizer = tokenizer + self.sample_rate = sample_rate + self.tokenizer_sample_rate = tokenizer_sample_rate + if model is not None: + self.model = model + else: + self.model = Whisper( + source, savedir, sample_rate, freeze=True, freeze_encoder=True, + ) + self.model.tokenizer.set_prefix_tokens(language, "transcribe", False) + self.searcher = S2SWhisperGreedySearcher( + self.model, + min_decode_ratio=min_decode_ratio, + max_decode_ratio=max_decode_ratio, + ) + self.token_shift = token_shift + self.offsets = offsets + self.debug = debug + + def select(self, tokens, scores, text): + tokens, length = batch_pad_right(tokens) + tokens_shift = tokens - self.token_shift + if self.offsets is not None: + tokens_shift = tokens_shift - self.offsets + tokens_shift = tokens_shift.clip(0) + wav = self.tokenizer.tokens_to_sig(tokens_shift) + if self.sample_rate != self.tokenizer_sample_rate: + wav = torchaudio.functional.resample( + wav, + orig_freq=self.tokenizer_sample_rate, + new_freq=self.sample_rate + ) + wav = undo_padding_tensor(wav, length) + metric = ErrorRateStats() + text = text.split(" ") + ids = range(len(wav)) + preds = [self.predict(wav_item).split(" ") for wav_item in wav] + metric.append(ids, preds, [text] * len(wav)) + sample_scores = [score["WER"] for score in metric.scores] + idx = torch.argmin(torch.tensor(sample_scores)).item() + logger.info( + "Ground truth text: %s, sample scores: %s, best: #%d", + text, + sample_scores, + idx + ) + if self.debug: + sio = StringIO() + metric.write_stats(sio) + logger.info("%s", sio.getvalue()) + return tokens[idx] + + def predict(self, wav): + if wav.dim() < 2: + wav = wav.unsqueeze(0) + wav = self.model.pad_or_trim(wav) + mels = self.model.log_mel_spectrogram(wav) + enc_out = self.model.forward_encoder(mels) + pred, _, _, _ = self.searcher(enc_out.detach(), torch.tensor(1., device=wav.device)) + pred = self.model.tokenizer.batch_decode( + pred, skip_special_tokens=True + )[0] + pred = self.normalize(pred) + return pred + + def normalize(self, text): + """Performs text normalization (uppercase, remove whitespace, + remove punctuation) + + Arguments + --------- + text : str + Unnormalized text + + Returns + ------- + text : str + Normalized text + """ + text = text.upper() + text = text.strip() + text = RE_PUNCTUATION.sub("", text) + return text + diff --git a/benchmarks/DASB/utils/data.py b/benchmarks/DASB/utils/data.py index 6c68358f5..3ad31419a 100644 --- a/benchmarks/DASB/utils/data.py +++ b/benchmarks/DASB/utils/data.py @@ -89,3 +89,36 @@ def _undo_padding(batch, lengths): def as_dict(batch): """Converts a batch to a dictionary""" return {key: getattr(batch, key) for key in batch._PaddedBatch__keys} + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. 
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list From 5804bbae7ae65db1e0be2e15b5a28300f586e427 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 23 May 2025 23:55:56 -0400 Subject: [PATCH 252/270] DASB: Fix the max validation set size --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 1030c1b9f..5aae00e3a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1373,6 +1373,7 @@ def undo_padding_tensor(batch, lengths): "seed": hparams["seed"], "alignments_folder": hparams.get("alignments_folder"), "model_name": hparams["model"].__class__.__name__, + "max_valid_size": hparams.get("max_valid_size", 10000) }, ) From dd65c62459f247c9d20f6d8820caac306266d69c Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 24 May 2025 16:24:48 -0400 Subject: [PATCH 253/270] DASB: Evaluations and fit fixes --- benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py | 4 ++++ benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 4 ++-- benchmarks/DASB/utils/eval.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py index ebb619757..8ee32cb9d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -73,6 +73,8 @@ def on_evaluation_start(self, output_folder="eval"): self.read_reports() self.create_reports() self.item_ids = [] + for evaluator_key in self.enabled_evaluators: + self.evaluators[evaluator_key].on_evaluation_start() def on_evaluation_end(self): """Invoked at the beginning of the evaluation cycle. 
The default @@ -80,6 +82,8 @@ def on_evaluation_end(self): """ logger.info("Ending evaluation") self.write_summary() + for evaluator_key in self.enabled_evaluators: + self.evaluators[evaluator_key].on_evaluation_end() def create_reports(self): """Creates report files and report writers""" diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index 8fc302473..1e41dd473 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -22,8 +22,8 @@ eval_utmos_judge_id: null eval_perf: False # Inference Fit -inference_fit_top_k: [20, 30] -inference_fit_sampling_temperature: [0.7, 0.8, 1.0, 1.2] +inference_fit_top_k: [10, 20, 30] +inference_fit_sampling_temperature: [0.7, 0.8, 1.0, 1.2, 1.3] inference_fit_key_metric: dwer inference_fit_key_metric_kind: min diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 6f4d4f808..9e99bcce0 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -393,7 +393,7 @@ def _replace_blanks(self, preds): def global_metrics(self): global_metrics = {} if self.metric_mode == "micro": - wer_metric, cer_metric = self.get_asr_metrics("diff") + wer_metric, cer_metric = self.get_asr_metrics("regular") if wer_metric.scores: global_metrics["wer_micro"] = wer_metric.summarize("WER") global_metrics["cer_micro"] = cer_metric.summarize("WER") From 427be64c4089550031be523fef33907fe29d5740 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 25 May 2025 09:21:31 -0400 Subject: [PATCH 254/270] DASB: Add sampling temperature --- .../DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 59c31bb99..876235ffc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -111,6 +111,7 @@ n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -230,6 +231,8 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions maxlenratio: !ref nq: !ref nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface source: !ref From 1a9aed4b3a03ab9227341aa78bee9fa5681bd121 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 26 May 2025 19:25:31 -0400 Subject: [PATCH 255/270] DASB: Fix the WER calculation bug --- benchmarks/DASB/utils/eval.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 9e99bcce0..1694355ec 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -321,6 +321,7 @@ def evaluate( length=length_ref, text=text, sample_rate=sample_rate_ref, + metric_key="ref", ) details.update( {f"{key}_ref": value for key, value in details_ref.items()} @@ -459,7 +460,7 @@ def __init__( self.unbatch = unbatch self.to(device) - def evaluate_samples(self, wavs, length, text, sample_rate): + def evaluate_samples(self, wavs, length, text, sample_rate, metric_key="regular"): """Evaluates a batch of samples Arguments @@ -472,6 +473,8 @@ def evaluate_samples(self, wavs, length, text, sample_rate): Text labels corresponding to the waveforms 
sample_rate : int The sample rate of the waveforms + metric_key : str + The key for metrics Returns ------- @@ -487,24 +490,25 @@ def evaluate_samples(self, wavs, length, text, sample_rate): torch.ones(1, device=wavs.device), text[idx : idx + 1], sample_rate, + metric_key, ) for idx in range(batch_size) ] result = { + "pred": [result["pred"][0] for result in results], + "target": text, "wer": torch.stack( [result["wer"] for result in results] ).squeeze(-1), "cer": torch.stack( [result["cer"] for result in results] ).squeeze(-1), - "pred": [result["pred"][0] for result in results], - "target": text, } return result else: return self._evaluate_samples(wavs, length, text, sample_rate) - def _evaluate_samples(self, wavs, length, text, sample_rate): + def _evaluate_samples(self, wavs, length, text, sample_rate, metric_key): """Evaluates a batch of samples. This function is meant to be used internally. evaluate_samples will call it multiple times if unbatch is enabled. @@ -519,6 +523,8 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): Text labels corresponding to the waveforms sample_rate : int The sample rate of the waveforms + metric_key : bool + Whether to compute the metrics Returns ------- @@ -537,7 +543,7 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): ) predicted_words = [self.normalize(text) for text in predicted_words] ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = self.get_asr_metrics() + wer_metric, cer_metric = self.get_asr_metrics(metric_key) predicted_words_split = [item.split(" ") for item in predicted_words] text_split = [item.split(" ") for item in text] wer_metric.append(ids, predicted_words_split, text_split) @@ -549,12 +555,13 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): cer = torch.tensor( [score["WER"] for score in cer_metric.scores[-count:]], device=wavs.device ) - return { + result = { "wer": wer, "cer": cer, "pred": predicted_words, "target": text, } + return result def normalize(self, text): """Performs text normalization (uppercase, remove whitespace, From 8e95fd1be00e3126d97323003d8dd9b1553c9864 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 27 May 2025 14:40:49 -0400 Subject: [PATCH 256/270] DASB: VALL-E: Add sample selection to other tokenizers --- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 23 ++++++++++++++++++ .../TTS/valle/hparams/train_discrete_ssl.yaml | 24 +++++++++++++++++++ .../TTS/valle/hparams/train_encodec.yaml | 22 +++++++++++++++++ .../TTS/valle/hparams/train_mimi.yaml | 23 ++++++++++++++++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 23 ++++++++++++++++++ 5 files changed, 115 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index cc52722b1..58007632c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -110,6 +110,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -157,6 +159,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + freeze_lm_head: False ####################### Model parameters ########################### @@ -218,6 +226,9 @@ inference_opts: 
!name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref @@ -260,3 +271,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 7bf2be75f..c43c01244 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -129,6 +129,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -198,6 +200,13 @@ sample_dataloader_opts: token_model_kwargs: SSL_layers: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + + ####################### Model parameters ########################### # Transformer d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" @@ -258,6 +267,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer save_path: !ref / @@ -301,3 +313,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 0114bdcb1..2407a9e0e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -109,6 +109,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -156,6 +158,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + freeze_lm_head: False ####################### Model parameters ########################### @@ -217,6 +225,8 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: !ref @@ -260,3 +270,15 @@ train_logger: 
!new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index daf37c2a7..b242f353e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -110,6 +110,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -157,6 +159,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + ####################### Model parameters ########################### # Transformer @@ -218,6 +226,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 0.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref @@ -257,3 +268,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 7780a3fc3..53df3ebeb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -109,6 +109,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -156,6 +158,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + ####################### Model parameters ########################### # Transformer @@ -219,6 +227,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref @@ -260,3 +271,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + 
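# Illustrative usage note: `!apply:speechbrain.utils.hparams.choice` resolves `value`
# against the `choices` mapping at load time, so ASR-guided candidate selection can be
# enabled by overriding just a couple of keys (values shown are examples), e.g.
#   sample_selection: asr
#   inference_nbest: 5
# while the defaults above (`default` selector, nbest of 1) keep the previous
# single-sample behaviour.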
tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file From 6f60f5ae73d8f79b0910693db1f262968bbc8e04 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 27 May 2025 15:04:34 -0400 Subject: [PATCH 257/270] DASB: VALL-E: Add sample selection --- .../valle/hparams/train_speech_tokenizer.yaml | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 407562365..7f6a4bef6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -109,6 +109,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -156,6 +158,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + ####################### Model parameters ########################### # Transformer @@ -217,6 +225,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: !ref # Only the 24kHz version supports mono audio @@ -256,3 +267,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file From 1c7e41fb272141c3f2cdc3d66032019fdf7e0075 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 27 May 2025 23:05:50 -0400 Subject: [PATCH 258/270] DASB: VALL-E: Device fixes --- benchmarks/DASB/model/valle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 665a5a570..e0d8c08ba 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -1327,7 +1327,8 @@ def __init__( else: self.model = Whisper( source, savedir, sample_rate, freeze=True, freeze_encoder=True, - ) + ).to("cuda") + self.model.device = "cuda" self.model.tokenizer.set_prefix_tokens(language, "transcribe", False) self.searcher = S2SWhisperGreedySearcher( self.model, From c7d8866404960b4f36972fb581d296520baa7256 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 27 May 2025 23:24:18 -0400 Subject: [PATCH 259/270] DASB: Device fixes --- benchmarks/DASB/model/valle.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index e0d8c08ba..1f23e7cd0 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -1317,18 +1317,20 @@ def __init__( language="english", token_shift=0, offsets=None, - debug=False + debug=False, + device="cuda" ): self.tokenizer = tokenizer self.sample_rate = sample_rate self.tokenizer_sample_rate = tokenizer_sample_rate + # TODO: Pass the device if model is not None: 
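        # The selector's role, sketched: decode each candidate token sequence to a
        # waveform, transcribe it with the frozen Whisper model, and keep the candidate
        # whose transcript is closest to the target text. A minimal illustration of that
        # criterion (plain word error rate + argmin; names and shapes are illustrative,
        # the real class works on batched tensors and SpeechBrain metric objects):
        #
        #     def word_error_rate(hyp, ref):
        #         h, r = hyp.split(), ref.split()
        #         d = [[i + j if i * j == 0 else 0 for j in range(len(h) + 1)]
        #              for i in range(len(r) + 1)]
        #         for i in range(1, len(r) + 1):
        #             for j in range(1, len(h) + 1):
        #                 sub = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
        #                 d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, sub)
        #         return d[-1][-1] / max(len(r), 1)
        #
        #     def pick_best(transcripts, target_text):
        #         errors = [word_error_rate(t, target_text) for t in transcripts]
        #         return min(range(len(errors)), key=errors.__getitem__)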
self.model = model else: self.model = Whisper( source, savedir, sample_rate, freeze=True, freeze_encoder=True, - ).to("cuda") - self.model.device = "cuda" + ).to(device) + self.model.device = device self.model.tokenizer.set_prefix_tokens(language, "transcribe", False) self.searcher = S2SWhisperGreedySearcher( self.model, @@ -1338,6 +1340,11 @@ def __init__( self.token_shift = token_shift self.offsets = offsets self.debug = debug + tokenizer.device = device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(device) + tokenizer.codec_vocoder.device = device + def select(self, tokens, scores, text): tokens, length = batch_pad_right(tokens) From d3e94a01d9f9d13a18a29e7462fbd52358063083 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 2 Jun 2025 22:53:29 -0400 Subject: [PATCH 260/270] DASB: Inference Fit: Device Fix --- benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index 79fcea5d6..e90fd62d7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -111,6 +111,7 @@ def evaluate(self, dataset, params): return metrics def evaluate_batch(self, batch, params): + batch = batch.to(self.device) audio_tokens, audio_length = self.inference(batch, params) wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) From fc1b0cec64f852a10dccda65a8aebe7fc79fbf2d Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 4 Jun 2025 23:09:02 -0400 Subject: [PATCH 261/270] DASB: add resume logic --- .../DASB/LibriTTS/TTS/valle/inference_fit.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index e90fd62d7..cc9bef811 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -7,6 +7,7 @@ * Artem Ploujnikov 2024 """ +import json import speechbrain as sb import sys import csv @@ -72,11 +73,32 @@ def fit(self, dataset): logger.info("Parameter Space: %s", format_space(self.space)) evaluations = self.enumerate_param_space() for idx, params in enumerate(tqdm(evaluations, desc="Parameter space")): - eval_result = self.evaluate(dataset, params) + if self.is_completed(params): + eval_result = self.get_result(params) + else: + eval_result = self.evaluate(dataset, params) self.result.append({"idx": idx, **params, **eval_result}) self.best = self.find_best() return self.result, self.best - + + def is_completed(self, params): + folder_name = params_to_folder_name(params) + path = self.output_folder / folder_name / "summary.json" + return path.exists() + + def get_result(self, params): + params_str = format_params(params) + logger.info("Retrieving params for completed run %s", params_str) + folder_name = params_to_folder_name(params) + path = self.output_folder / folder_name / "summary.json" + with open(path) as summary_file: + summary = json.load(summary_file) + result = { + key: summary.get(value, 0.0) + for key, value in self.hparams.inference_fit_metrics.items() + } + return result + def find_best(self): best = self.result[0] op = ( From 0de5ad2b03a156ea2ccb174ed93d01239b415a5c Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 4 Jun 2025 23:34:04 -0400 Subject: [PATCH 262/270] DASB: Add top_k customization --- 
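Note: `inference_top_k` restricts sampling at each decoding step to the k highest-scoring
tokens; together with `inference_sampling_temperature` it controls how adventurous the AR
decoder is. Roughly (a self-contained sketch with illustrative names and shapes, not the
exact code in model/valle.py):

    import torch

    def sample_top_k(logits, top_k=20, temperature=1.0):
        # Keep the top_k largest logits, mask the rest out
        values, _ = logits.topk(top_k, dim=-1)
        cutoff = values[..., -1, None]
        filtered = logits.masked_fill(logits < cutoff, float("-inf"))
        # Temperature-scaled softmax, then draw one token per sequence
        probs = torch.softmax(filtered / temperature, dim=-1)
        return torch.multinomial(probs, num_samples=1).squeeze(-1)

Smaller top_k and lower temperature make generation more conservative; larger values add
diversity at the cost of more errors, which the nbest sample selector can then filter.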
benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 3 +++ .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 ++ benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 ++ .../DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 2 ++ benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 2 ++ .../LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml | 2 ++ .../DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml | 3 ++- 7 files changed, 15 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 58007632c..3c59482b8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -112,6 +112,8 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -228,6 +230,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.DACTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index c43c01244..d8ea29110 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -131,6 +131,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -269,6 +270,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 2407a9e0e..b7b19bb0d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -111,6 +111,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -227,6 +228,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 876235ffc..04f0dceff 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -112,6 +112,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -232,6 +233,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 
b242f353e..91249b59b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -112,6 +112,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -228,6 +229,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.MimiTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 7f6a4bef6..e306f9802 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -111,6 +111,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -227,6 +228,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 53df3ebeb..9b1733257 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -111,6 +111,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -229,7 +230,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref - + top_k: !ref tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref From b59d0e4b3b2e8c2ec30be4753d29d1c89eaa0070 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 5 Jun 2025 11:01:20 -0400 Subject: [PATCH 263/270] DASB: Remove a duplicate setting --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 3c59482b8..db6104f1c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -113,7 +113,6 @@ max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 inference_top_k: 20 -inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder From 62be9d2e1ebde3dc510a2e539302461f94b0df3e Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 10 Jun 2025 21:56:10 -0400 Subject: [PATCH 264/270] DASB: Add a generator saver/loader for better reproducibility when interrupted --- .../TTS/tokotron/hparams/train_dac.yaml | 3 + .../tokotron/hparams/train_discrete_ssl.yaml | 3 + .../TTS/tokotron/hparams/train_encodec.yaml | 3 + .../hparams/train_espnet_encodec.yaml | 3 + .../hparams/train_fairseq_hubert.yaml | 314 ++++++++++++++++++ .../TTS/tokotron/hparams/train_mimi.yaml | 3 + .../hparams/train_speech_tokenizer.yaml | 3 + .../tokotron/hparams/train_wavtokenizer.yaml | 3 + .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 3 + .../TTS/valle/hparams/train_discrete_ssl.yaml | 3 + 
.../TTS/valle/hparams/train_encodec.yaml | 3 + .../valle/hparams/train_espnet_encodec.yaml | 3 + .../TTS/valle/hparams/train_mimi.yaml | 3 + benchmarks/DASB/model/custom_model.py | 65 ++++ 14 files changed, 415 insertions(+) create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 01c818370..3fa047b31 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -275,12 +275,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 4efa9f75c..1fe2ebca9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -324,12 +324,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index e45794171..3820c8407 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -277,12 +277,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml index e45794171..3820c8407 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml @@ -277,12 +277,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml new file mode 100644 index 000000000..2b18c0657 --- /dev/null +++ 
b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml @@ -0,0 +1,314 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +g2p_src: flexthink/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +model_path: !ref /fairseq-hubert +feature_extractor_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher.pt +kmeans_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher_L11_km500.bin +vocoder_dense_model_name: "mhubert-base-25hz" +vocoder_quantizer_model_name: "kmeans" +vocoder_vocab_size: 500 + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +select_layers: null +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Inference parameters +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 2000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +layerwise_renorm: True +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +vocoder: !apply:textless.vocoders.hifigan.vocoder.CodeHiFiGANVocoder.by_name + dense_model_name: !ref #"mhubert-base-25hz" + quantizer_model_name: !ref # "kmeans", + vocab_size: !ref #500 + +tokenizer: !new:utils.tokenizer_interface.FairseqHuBERTTokenizer + feat_extractor_path: !ref + km_path: !ref + layer: !ref + vocoder: !ref + +modules: + model: !ref + 
tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index 156e05b02..8d74e195c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -264,12 +264,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ffb68f2a5..1dfc9a1d7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -273,12 +273,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml index f4f745716..78975b1a0 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -264,12 +264,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index db6104f1c..e437d7007 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -258,12 +258,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index d8ea29110..82ab4d736 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -300,12 +300,15 @@ lr_annealing: !new:model.Tokotron.TargetedNoamScheduler n_warmup_steps: !ref param_group: 0 +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index b7b19bb0d..23d6ff2b5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -257,12 +257,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 04f0dceff..e9b28ac7c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -263,12 +263,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 91249b59b..7c1f269ba 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -255,12 +255,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index cdfcd5ced..0a619e43b 100644 --- a/benchmarks/DASB/model/custom_model.py 
+++ b/benchmarks/DASB/model/custom_model.py @@ -1,7 +1,14 @@ import math +import re +import speechbrain as sb import torch + from speechbrain.nnet.linear import Linear from model.sq_codec import tokens_to_ternary, ternary_logits_to_tokens +from speechbrain.utils.logger import get_logger + + +logger = get_logger(__name__) class AttentionMLP(torch.nn.Module): @@ -222,3 +229,61 @@ def forward(self, logits): dim=1 ) return token_logits + + +@sb.utils.checkpoints.register_checkpoint_hooks +class SaveableGenerator: + """A wrapper that can be used to store the state of + the random number generator in a checkpoint. It helps + with reproducibility in long-running experiments. + + Currently, this only supports CPU and Cuda devices + natively. If you need training on other architectures, + consider implementing a custom generator. + + Running it on an unsupported device not using the Torch + generator interface will simply fail to restore the + state but will not cause an error. + + Arguments + --------- + generators : list, optional + A list of generator objects. If not provided, + """ + + def __init__(self, generators=None): + if generators is None: + generators = { + "default": torch.default_generator + } + if torch.cuda.is_available(): + for idx, generator in torch.cuda.default_generators: + generators[f"cuda:{idx}"] = generator + self.generators = generators + + @sb.utils.checkpoints.mark_as_saver + def _save(self, path): + save_dict = { + key: generator.get_state() + for key, generator in self.generators.items() + } + torch.save(save_dict, path) + + @sb.utils.checkpoints.mark_as_loader + def _recover(self, path, end_of_epoch): + del end_of_epoch + save_dict = torch.load(path) + for key, state in save_dict.items(): + if key == "default": + torch.default_generator.set_state(state) + continue + match = re.match(r"cuda:(\d+)", key) + if match: + if not torch.cuda.is_available(): + logger.warn("Unable to restore RNG for %s, CUDA unavailable", key) + continue + idx = match.group(1) + if idx > torch.cuda.device_count() - 1: + logger.warn("Unable to restore RNG for %s, device not found", key) + continue + torch.cuda.default_generators[idx].set_state(state) From 13f13458ff2cf3d906cf2007cb46e54080580b83 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 12 Jun 2025 15:59:07 -0400 Subject: [PATCH 265/270] DASB: Fixed the saveable generator wrapper to account for CUDA deprecations --- benchmarks/DASB/model/custom_model.py | 43 ++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 0a619e43b..19f1fa3ab 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -257,8 +257,9 @@ def __init__(self, generators=None): "default": torch.default_generator } if torch.cuda.is_available(): - for idx, generator in torch.cuda.default_generators: - generators[f"cuda:{idx}"] = generator + for idx in range(torch.cuda.device_count()): + generators[f"cuda:{idx}"] = _CudaDefaultGeneratorWrapper(idx) + self.generators = generators @sb.utils.checkpoints.mark_as_saver @@ -282,8 +283,42 @@ def _recover(self, path, end_of_epoch): if not torch.cuda.is_available(): logger.warn("Unable to restore RNG for %s, CUDA unavailable", key) continue - idx = match.group(1) + idx = int(match.group(1)) if idx > torch.cuda.device_count() - 1: logger.warn("Unable to restore RNG for %s, device not found", key) continue - torch.cuda.default_generators[idx].set_state(state) + 
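    # The wrapper boils down to saving and restoring torch's RNG state per device. A
    # minimal standalone equivalent using only public torch APIs (an assumed
    # simplification; the class here additionally plugs into the SpeechBrain
    # checkpointer hooks):
    #
    #     import torch
    #
    #     def save_rng(path):
    #         state = {"cpu": torch.get_rng_state()}
    #         for idx in range(torch.cuda.device_count()):
    #             state[f"cuda:{idx}"] = torch.cuda.get_rng_state(idx)
    #         torch.save(state, path)
    #
    #     def restore_rng(path):
    #         state = torch.load(path)
    #         torch.set_rng_state(state["cpu"])
    #         for idx in range(torch.cuda.device_count()):
    #             key = f"cuda:{idx}"
    #             if key in state:
    #                 torch.cuda.set_rng_state(state[key], idx)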
self.generators[key].set_state(state) + + +class _CudaDefaultGeneratorWrapper: + """A generator wrapper for default generators - because torch no longer + exposes default_generators + + This class should not be used outside of SaveableGenerator + + Arguments + --------- + device : int|str + The device index or identifier""" + def __init__(self, device): + self.device = device + + def get_state(self): + """Returns the generator state + + Returns + ------- + result : torch.Tensor + The generator state + """ + return torch.cuda.get_rng_state(self.device) + + def set_state(self, new_state): + """"Sets the generator state + + Arguments + --------- + new_state : dict + The new state + """ + torch.cuda.set_rng_state(new_state, self.device) From cf90559334ea71c6b32dc64956354748d153a497 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 15 Jun 2025 23:31:28 -0400 Subject: [PATCH 266/270] DASB: Fix an issue with Discrete SSL + generators --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_speech_tokenizer.yaml | 3 +++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 3 +++ benchmarks/DASB/model/valle.py | 12 +++++++++--- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 82ab4d736..1950f2886 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -329,4 +329,5 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref + token_model_kwargs: !ref debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index e306f9802..b201aaba4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -254,12 +254,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 9b1733257..ae2ce2d95 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -257,12 +257,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 1f23e7cd0..1e098463a 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -1303,6 +1303,9 @@ class WhisperASRSampleSelector(SampleSelector): debug : bool Whether debug mode is enabled. 
This will trigger more verbose logging, including a WER report + token_model_kwargs : dict + Additional arguments for the tokenizer + decoding function """ def __init__( self, @@ -1318,7 +1321,8 @@ def __init__( token_shift=0, offsets=None, debug=False, - device="cuda" + token_model_kwargs=None, + device="cuda", ): self.tokenizer = tokenizer self.sample_rate = sample_rate @@ -1340,19 +1344,21 @@ def __init__( self.token_shift = token_shift self.offsets = offsets self.debug = debug + if token_model_kwargs is None: + token_model_kwargs = {} + self.token_model_kwargs = token_model_kwargs tokenizer.device = device if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(device) tokenizer.codec_vocoder.device = device - def select(self, tokens, scores, text): tokens, length = batch_pad_right(tokens) tokens_shift = tokens - self.token_shift if self.offsets is not None: tokens_shift = tokens_shift - self.offsets tokens_shift = tokens_shift.clip(0) - wav = self.tokenizer.tokens_to_sig(tokens_shift) + wav = self.tokenizer.tokens_to_sig(tokens_shift, **self.token_model_kwargs) if self.sample_rate != self.tokenizer_sample_rate: wav = torchaudio.functional.resample( wav, From b9488e4f051de424024470dff3b8770deb0b99ee Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 7 Jul 2025 16:10:00 -0400 Subject: [PATCH 267/270] DASB: Cosmetic changes --- .../TTS/tokotron/hparams/train_mimi.yaml | 4 +- .../hparams/train_speech_tokenizer.yaml | 2 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 68 +++-- .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/valle/hparams/train_encodec.yaml | 2 +- .../valle/hparams/train_espnet_encodec.yaml | 16 +- .../TTS/valle/hparams/train_mimi.yaml | 2 +- .../TTS/valle/hparams/train_sqcodec.yaml | 4 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 2 +- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 50 ++-- .../hparams/train_fairseq_hubert.yaml | 16 +- .../DASB/LibriTTS/TTS/tokotron/train.py | 72 +++-- .../DASB/LibriTTS/TTS/valle/evaluation.py | 12 +- .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 4 +- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 6 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 4 +- .../TTS/valle/hparams/train_encodec.yaml | 4 +- .../valle/hparams/train_espnet_encodec.yaml | 19 +- .../TTS/valle/hparams/train_mimi.yaml | 4 +- .../valle/hparams/train_speech_tokenizer.yaml | 4 +- .../TTS/valle/hparams/train_sqcodec.yaml | 9 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 4 +- .../DASB/LibriTTS/TTS/valle/inference_fit.py | 60 ++-- .../LibriTTS/TTS/valle/tokenizer_prepare.py | 108 +++++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 215 ++++++++------ benchmarks/DASB/LibriTTS/libritts_prepare.py | 81 ++++- benchmarks/DASB/model/Tokotron.py | 21 +- benchmarks/DASB/model/custom_model.py | 81 +++-- benchmarks/DASB/model/sq_codec.py | 49 +-- benchmarks/DASB/model/valle.py | 278 +++++++++++------- benchmarks/DASB/utils/eval.py | 20 +- benchmarks/DASB/utils/tokenizer_interface.py | 21 +- 33 files changed, 811 insertions(+), 435 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 505460dfa..3842caa8f 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -154,7 +154,7 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: 
!name:torch.nn.GELU audio_num_tokens: 2048 -audio_emb_size: 1024 +audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False @@ -166,7 +166,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 -flatten: false +flatten: False attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 0ff172529..cb420591f 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -165,7 +165,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 -flatten: false +flatten: False bandwidth: 1.5 attention_type: regularMHA diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index f0ab3d9c1..6e87dedfe 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -156,7 +156,7 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU audio_num_tokens: 19683 -audio_emb_size: 36 +audio_emb_size: 36 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 5b9082da5..229d645fe 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -283,7 +283,9 @@ def on_stage_start(self, stage, epoch): self.hparams, "token_model_kwargs", {} ) - self.transform_audio = getattr(self.hparams, "transform_audio", torch.nn.Identity()) + self.transform_audio = getattr( + self.hparams, "transform_audio", torch.nn.Identity() + ) def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. @@ -330,14 +332,14 @@ def on_stage_end(self, stage, stage_loss, epoch): valid_stats=stage_stats, ) - # Save the current checkpoint and delete previous checkpoints. + # Save the current checkpoint and delete previous checkpoints. ckpt_kwargs = { f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], } self.checkpointer.save_and_keep_only( meta={"loss": stage_stats["loss"], **eval_summary_stats}, num_to_keep=hparams["ckpt_keep"], - **ckpt_kwargs + **ckpt_kwargs, ) def get_summary_stats(self): @@ -578,7 +580,7 @@ def audio_ref_pipeline(wav): hparams["speech_model_layers"] if "speech_model_layers" in hparams else audio_tokens_per_step - ) + ), ) if silence_token.dim() == 2: silence_token = silence_token.squeeze(-1) @@ -652,6 +654,22 @@ def audio_pipeline(id): datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + sort_datasets(datasets, hparams) + apply_data_scale(datasets, hparams) + + return datasets, silence_padding + + +def sort_datasets(datasets, hparams): + """Sorts datasets according to hyperparameters + + Arguments + --------- + datasets : dict + a key -> value dictionary of datasets (the keys are "train", "valid" and "test") + hparams : dict + a dictionary of hyperparameters + """ # Sorting training data with ascending order makes the code much # faster because we minimize zero-padding. 
In most of the cases, this # does not harm the performance. @@ -666,13 +684,25 @@ def audio_pipeline(id): hparams["train_dataloader_opts"]["shuffle"] = False elif hparams["sorting"] == "random": - hparams["train_dataloader_opts"]["shuffle"] = True - pass - + if not hparams["overfit_test"]: + hparams["train_dataloader_opts"]["shuffle"] = True else: raise NotImplementedError( "sorting must be random, ascending or descending" ) + + +def apply_data_scale(datasets, hparams): + """Selects a fractional dataset if the corresponding parameter is specified, + using random sampling + + Arguments + --------- + datasets : dict + a dictionary of datasets + hparams : dict + parsed hyperparameters + """ data_scale = hparams.get("data_scale") if data_scale: scaled_data_count = int(len(datasets["train"]) * data_scale) @@ -680,8 +710,6 @@ def audio_pipeline(id): select_n=scaled_data_count ) - return datasets, silence_padding - def init_sequence_encoder(hparams): """Initialize a sequence encoder @@ -926,27 +954,33 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + test_summary_file = ( + Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + ) if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + test_summary_file = ( + Path(hparams["output_folder"]) + / "eval" + / "test" + / "summary.json" + ) if test_summary_file.exists(): - logging.info("Test run already completed: %s", test_summary_file) + logging.info( + "Test run already completed: %s", test_summary_file + ) else: eval_kwargs = {} test_key_kind = hparams.get("test_key_kind", "min") test_key = hparams.get("test_key") if test_key: - eval_kwargs = { - f"{test_key_kind}_key": test_key - } + eval_kwargs = {f"{test_key_kind}_key": test_key} tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs + **eval_kwargs, ) - # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index bba258f8d..541fc2917 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -207,7 +207,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 6 -flatten: false +flatten: False freeze_lm_head: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index cae286efd..bfc8c58e4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -175,7 +175,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False bandwidth: 6 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index 5aae5e0db..8a1e65e2c 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -173,7 
+173,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False espnet_repo: https://github.com/espnet/espnet espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef model_hub: espnet/libritts_encodec_24k @@ -205,13 +205,13 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions top_k: !ref tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface - source: !ref - model_config: !ref - n_codebook: !ref - save_path: !ref - sample_rate: !ref - model_ckpt: !ref - espnet_commit: !ref + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref modules: model: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index edae05d51..21d95dbf9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -174,7 +174,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False bandwidth: 6 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index fb1ca4d33..d36a0cff0 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -169,9 +169,9 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice model_vocab_size: !ref * 2 audio_token_shift: 19683 - + audio_tokens_per_step: 4 -flatten: true +flatten: True ternary_num_digits: 10 pred_mode: ternary freeze_lm_head: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index 110839413..730eb08a5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -177,7 +177,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice text: !ref + phonemes: !ref + -audio_tokens_per_step: 1 +audio_tokens_per_step: 1 bandwidth: 6 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index c932fc872..986eb9a7c 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -198,7 +198,7 @@ def compute_loss_stats( logits_nar, targets_nar, mask, - reduction="batch" + reduction="batch", ): """Computes an autoregressive/non-autoregressive loss breakdown, to be used for metrics/stats @@ -213,7 +213,7 @@ def compute_loss_stats( The non-autoregressive predictions targets_nar : torch.Tensor The targets for non-autoregressive prediction - + Returns ------- stats: dict @@ -222,13 +222,11 @@ def compute_loss_stats( stats = {} if self.train_ar: stats["loss_ar"] = self.hparams.compute_cost( - logits_ar, targets=targets_ar, mask=mask, - reduction=reduction, + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, ) if self.train_nar: stats["loss_nar"] = self.hparams.compute_cost( - logits_nar, targets=targets_nar, mask=mask, - reduction=reduction, + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, ) return stats @@ -280,11 +278,13 @@ def apply_curriculum(self): if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: # NOTE: If there is only one track it's autoregressive self.train_nar = 
False - elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: - self.train_nar = False elif ( - self.hparams.number_of_epochs_nar is not None - and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ): + self.train_nar = False + elif self.hparams.number_of_epochs_nar is not None and epoch <= ( + self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar ): self.train_ar = False if self.hparams.freeze_lm_head: @@ -367,7 +367,7 @@ def evaluate_batch(self, batch, stage): audio_tokens, audio_length = self.inference(batch) if self.hparams.flip_layers: audio_tokens = audio_tokens.flip(2) - wav = self.create_waveform(audio_tokens, audio_length) + wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) self.save_samples( batch=batch, wav=wav, length=audio_length, stage=stage @@ -438,7 +438,7 @@ def on_stage_end(self, stage, stage_loss, epoch): self.checkpointer.save_and_keep_only( meta={"loss": stage_stats["loss"], **eval_summary_stats}, num_to_keep=hparams["ckpt_keep"], - **ckpt_kwargs + **ckpt_kwargs, ) elif stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( @@ -498,8 +498,7 @@ def _get_inference_opts(self): if not self.hparams.use_token_offsets: tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.audio_token_shift - + tracks * self.hparams.vocab_size + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size ) if self.hparams.flip_layers: track_start = track_start.flip(0) @@ -523,7 +522,9 @@ def save_samples(self, batch, wav, length, stage): samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): file_name = output_folder / f"pred_{uttid}.wav" - write_audio(file_name, sample.detach().cpu(), self.hparams.model_sample_rate) + write_audio( + file_name, sample.detach().cpu(), self.hparams.model_sample_rate + ) def save_eval(self, stage): """Saves evaluation results @@ -652,7 +653,12 @@ def sig_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) return sig - dynamic_items = [sig_pipeline, text_pipeline, tokens_pipeline, prompt_pipeline] + dynamic_items = [ + sig_pipeline, + text_pipeline, + tokens_pipeline, + prompt_pipeline, + ] init_sequence_encoder(hparams) use_spk_emb = hparams.get("use_spk_emb", False) @@ -761,7 +767,7 @@ def init_sequence_encoder(hparams): an encoder instance""" encoder = hparams["label_encoder"] token_list_file_name = hparams["token_list_file"] - tokens = read_token_list(token_list_file_name) + tokens = read_token_list(token_list_file_name) encoder.add_unk() for token in hparams["special_tokens"]: token_key = token.replace("<", "").replace(">", "") @@ -990,17 +996,17 @@ def undo_padding_tensor(batch, lengths): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + test_summary_file = ( + Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + ) if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } + eval_kwargs = {f"{test_key_kind}_key": test_key} tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs + **eval_kwargs, ) diff --git 
a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml index 2b18c0657..d30420925 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml @@ -254,17 +254,17 @@ model: !new:Tokotron.TokotronTransformerModel scale_factor: !ref representation_mode: !ref emb: !ref - + vocoder: !apply:textless.vocoders.hifigan.vocoder.CodeHiFiGANVocoder.by_name - dense_model_name: !ref #"mhubert-base-25hz" - quantizer_model_name: !ref # "kmeans", - vocab_size: !ref #500 + dense_model_name: !ref #"mhubert-base-25hz" + quantizer_model_name: !ref # "kmeans", + vocab_size: !ref #500 tokenizer: !new:utils.tokenizer_interface.FairseqHuBERTTokenizer - feat_extractor_path: !ref - km_path: !ref - layer: !ref - vocoder: !ref + feat_extractor_path: !ref + km_path: !ref + layer: !ref + vocoder: !ref modules: model: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 323134e90..7dc4c4ab2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -335,21 +335,33 @@ def on_fit_start(self): def check_init(self): init_from = getattr(self.hparams, "init_from", None) if init_from is not None: - logger.info("Initializing with pre-trained weights from %s", init_from) + logger.info( + "Initializing with pre-trained weights from %s", init_from + ) init_from_path = Path(init_from) model_path = init_from_path / "model.ckpt" with open(model_path, "rb") as model_file: - model_state_dict = torch.load(model_file, map_location=self.device) + model_state_dict = torch.load( + model_file, map_location=self.device + ) tgt_state_dict = self.modules.model.state_dict() ignore_keys = [] for k, v in model_state_dict.items(): - if k in tgt_state_dict and tgt_state_dict[k].shape != v.shape: + if ( + k in tgt_state_dict + and tgt_state_dict[k].shape != v.shape + ): logger.warning("Ignoring shape mismatch for %s", k) ignore_keys.append(k) for k in ignore_keys: del model_state_dict[k] - self.modules.model.load_state_dict(model_state_dict, strict=False) - logger.info("Successfully initialized with pre-trained weights from %s", init_from) + self.modules.model.load_state_dict( + model_state_dict, strict=False + ) + logger.info( + "Successfully initialized with pre-trained weights from %s", + init_from, + ) @torch.no_grad() def evaluate_batch(self, batch, stage): @@ -525,7 +537,7 @@ def tokens_pipeline(label): hparams["speech_model_layers"] if "speech_model_layers" in hparams else audio_tokens_per_step - ) + ), ) else: silence_padding = get_silence_repr(hparams["ssl_model"],) @@ -552,11 +564,9 @@ def tokens_pipeline(label): tokens_loader = hparams.get("tokens_loader") if layer_idx is not None: - tokens_loader_kwargs = { - "num_codebooks": layer_idx - } + tokens_loader_kwargs = {"num_codebooks": layer_idx} else: - tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") @@ -633,6 +643,28 @@ def spk_emb_random_match(uttid, dataset, spk_sample): ) resample_fn[dataset](epoch=0) + sort_datasets(datasets, hparams) + # Exclude samples without phonemes + if hparams["input"] == "phonemes": + for key in datasets: + datasets[key] = datasets[key].filtered_sorted( + key_test={"phn": 
lambda value: value} + ) + datasets["sample"] = select_sample(hparams, datasets) + return datasets, silence_padding, resample_fn + + +def sort_datasets(datasets, hparams): + """Sorts datasets according to hyperparameters + + Arguments + --------- + datasets : dict + a key -> value dictionary of datasets (the keys are "train", "valid" and "test") + hparams : dict + a dictionary of hyperparameters + """ + # Sorting training data with ascending order makes the code much # faster because we minimize zero-padding. In most of the cases, this # does not harm the performance. @@ -655,15 +687,6 @@ def spk_emb_random_match(uttid, dataset, spk_sample): "sorting must be random, ascending or descending" ) - # Exclude samples without phonemes - if hparams["input"] == "phonemes": - for key in datasets: - datasets[key] = datasets[key].filtered_sorted( - key_test={"phn": lambda value: value} - ) - datasets["sample"] = select_sample(hparams, datasets) - return datasets, silence_padding, resample_fn - def select_sample(hparams, datasets): """Selects a sample of files for sample generation, freezing the sample if @@ -1015,17 +1038,18 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + test_summary_file = next( + Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), + None, + ) if test_summary_file is not None: logging.info("Test run already completed: %s", test_summary_file) else: test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } + eval_kwargs = {f"{test_key_kind}_key": test_key} tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs + **eval_kwargs, ) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py index 8ee32cb9d..58fdd5abb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -246,10 +246,14 @@ def summarize(self, field=None): ).items() } for evaluator_key in self.enabled_evaluators: - result.update({ - f"{evaluator_key}_{stat_key}": value - for stat_key, value in - self.evaluators[evaluator_key].global_metrics().items()}) + result.update( + { + f"{evaluator_key}_{stat_key}": value + for stat_key, value in self.evaluators[evaluator_key] + .global_metrics() + .items() + } + ) if field is not None: result = result[field] return result diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index 1e41dd473..f4e975175 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -71,8 +71,8 @@ eval_summary_log: spk_sim: spk_sim_score_mean inference_fit_space: - top_k: !ref - sampling_temperature: !ref + top_k: !ref + sampling_temperature: !ref inference_fit_metrics: utmos: utmos_utmos_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index e437d7007..85020bdff 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -201,7 +201,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 2 -flatten: false +flatten: False # Model Settings model_type: 
24khz @@ -229,7 +229,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref - top_k: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.DACTokenizer @@ -287,4 +287,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 1950f2886..24316c3d2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -244,7 +244,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 6 -flatten: false +flatten: False freeze_lm_head: False @@ -330,4 +330,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice tokenizer_sample_rate: !ref savedir: !ref token_model_kwargs: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 23d6ff2b5..e78119670 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -200,7 +200,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False # Model Settings model_hub: facebook/encodec_24khz @@ -286,4 +286,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index e9b28ac7c..31b425824 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -58,7 +58,6 @@ duration_min: null duration_max: null - ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 @@ -200,7 +199,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False # Model Settings espnet_repo: https://github.com/espnet/espnet @@ -237,13 +236,13 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface - source: !ref - model_config: !ref - n_codebook: !ref - save_path: !ref - sample_rate: !ref - model_ckpt: !ref - espnet_commit: !ref + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref modules: model: !ref @@ -292,4 +291,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 7c1f269ba..a0d19ce8c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -200,7 +200,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False 
# Model Settings model_hub: kyutai/mimi @@ -284,4 +284,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index b201aaba4..52dec45c3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -199,7 +199,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False # Model Settings model_hub: fnlp/SpeechTokenizer @@ -283,4 +283,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 66fb3535a..93b8bcd09 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -184,7 +184,7 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 -flatten: true +flatten: True ternary_num_digits: 10 pred_mode: ternary @@ -211,7 +211,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length qk_norm: !ref lm_head: !ref emb: !ref - logits_to_probs: !ref + logits_to_probs: !ref inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref @@ -240,7 +240,7 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding - num_digits: !ref + num_digits: !ref flat: True linear: !new:speechbrain.nnet.linear.Linear input_size: !ref * @@ -276,12 +276,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index ae2ce2d95..38831c660 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -199,7 +199,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 1 -flatten: false +flatten: False # Model Settings model_hub: novateur/WavTokenizer-medium-music-audio-75token @@ -286,4 +286,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index cc9bef811..f81252b3a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -32,8 +32,10 @@ logger = get_logger(__name__) + class InferenceFit: """An inference fit wrapper""" + def __init__(self, hparams, 
run_opts): device = run_opts.get("device", "cpu") self.hparams = SimpleNamespace(**hparams) @@ -50,7 +52,9 @@ def __init__(self, hparams, run_opts): if not self.hparams.use_token_offsets: self.offsets = torch.zeros_like(self.offsets) self.output_folder_rel = "eval/inference_fit" - self.output_folder = Path(self.hparams.output_folder) / self.output_folder_rel + self.output_folder = ( + Path(self.hparams.output_folder) / self.output_folder_rel + ) self.token_model_kwargs = getattr( self.hparams, "token_model_kwargs", {} ) @@ -120,8 +124,12 @@ def evaluate(self, dataset, params): params_str = format_params(params) logger.info("Starting evaluation of %s", params_str) folder_name = params_to_folder_name(params) - self.evaluation_metric.on_evaluation_start(f"{self.output_folder_rel}/{folder_name}") - for batch in tqdm(dataloader, desc="Evaluation run", total=len(dataset)): + self.evaluation_metric.on_evaluation_start( + f"{self.output_folder_rel}/{folder_name}" + ) + for batch in tqdm( + dataloader, desc="Evaluation run", total=len(dataset) + ): self.evaluate_batch(batch, params) logger.info("Finished evaluation of %s", params_str) self.evaluation_metric.on_evaluation_end() @@ -184,13 +192,13 @@ def inference(self, batch, params): inference = self.modules.model.inference inference_results = [ inference( - prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts(params) + prefix=prefix_item.unsqueeze(0), + opts=self._get_inference_opts(params), ) for prefix_item in prefix_items ] inferred_tokens = [ - self._pad_inferred_sample(result) - for result in inference_results + self._pad_inferred_sample(result) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) audio_length = audio_length.to(self.device) @@ -220,10 +228,7 @@ def _pad_inferred_sample(self, result): min_length = getattr(self.hparams, "infer_min_length", 10) sample_length, tracks = sample.shape if sample_length < min_length: - sample = pad_right_to( - sample, - (min_length, tracks), - )[0] + sample = pad_right_to(sample, (min_length, tracks),)[0] return sample def create_waveform(self, audio, length): @@ -246,9 +251,7 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = tokenizer.tokens_to_sig( - audio, **self.token_model_kwargs - ) + wav = tokenizer.tokens_to_sig(audio, **self.token_model_kwargs) wav = clean_padding(wav, length) wav = wav.to(self.device) return wav @@ -263,8 +266,7 @@ def _get_inference_opts(self, params): if not self.hparams.use_token_offsets: tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.audio_token_shift - + tracks * self.hparams.vocab_size + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size ) if self.hparams.flip_layers: track_start = track_start.flip(0) @@ -280,17 +282,13 @@ def _get_inference_opts(self, params): ).expand_as(mask) ] = True return self.hparams.inference_opts( - masks={self.hparams.bos_index: mask}, - **params, - device=self.device, + masks={self.hparams.bos_index: mask}, **params, device=self.device, ) def recover(self): test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] - kwargs = { - f"{test_key_kind}_key": test_key - } + kwargs = {f"{test_key_kind}_key": test_key} logger.info("Revovering a checkpoint") ckpt = self.hparams.checkpointer.recover_if_possible(**kwargs) if not ckpt: @@ -317,23 +315,16 @@ def enumerate_space(space, entry=None, points=None): def format_space(space): 
return ", ".join( - f"{parameter}: {values}" - for parameter, values in space.items() + f"{parameter}: {values}" for parameter, values in space.items() ) def format_params(params): - return ", ".join( - f"{key}={value}" - for key, value in params.items() - ) + return ", ".join(f"{key}={value}" for key, value in params.items()) def params_to_folder_name(params): - params_str = "-".join( - f"{key}-{value}" - for key, value in params.items() - ) + params_str = "-".join(f"{key}-{value}" for key, value in params.items()) return f"eval-{params_str}" @@ -361,8 +352,11 @@ def params_to_folder_name(params): "%s not found - not using evaluation hyperparameters", eval_hparams_file, ) - hparams = load_hyperpyyaml(yaml_content, overrides, overrides_must_match=True) - from train import dataio_prepare, select_eval_subset # noqa + hparams = load_hyperpyyaml( + yaml_content, overrides, overrides_must_match=True + ) + from train import dataio_prepare, select_eval_subset # noqa + datasets, _ = dataio_prepare(hparams) dataset = datasets["valid"] dataset = select_eval_subset(dataset, hparams) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py new file mode 100644 index 000000000..3fe83556b --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py @@ -0,0 +1,108 @@ +"""A script to prepare annotations for tokenizers + +""" + +import json +import os +import re +import speechbrain as sb + +from pathlib import Path +from speechbrain.lobes.models.g2p.dataio import build_token_char_map +from speechbrain.utils.logger import get_logger + + +logger = get_logger(__name__) +MULTI_SPACE = re.compile(r"\s{2,}") + + +def phn2txt(phn, phoneme_map): + """Encodes phonemes using a character map for use with SentencePiece + + Arguments + --------- + phn: list + a list of original phonemes (ARPABET) + phoneme_map: dict + the phoneme-to-character map + + Returns + ------- + value: str + the mapped string representation + """ + value = "".join(phoneme_map[phoneme] for phoneme in phn).strip() + value = MULTI_SPACE.sub(" ", value) + return value + + +def prepare_annotation(src, destination_file_name, phonemes): + """Prepares the annotation file + + Arguments + --------- + src: datasets.arrow_dataset.Dataset + the source dataset + destination_file_name: str + the path to the annotation file to be created + phonemes: list + the list of phonemes + """ + phoneme_map = build_token_char_map(phonemes) + annotation = { + key: { + "label": item["label"], + "phonemes": phn2txt(item["phn"], phoneme_map), + } + for key, item in src.items() + } + with open(destination_file_name, "w", encoding="utf-8") as dst_file: + json.dump(annotation, dst_file, indent=2) + + +DATA_SPLITS = ["train", "valid", "test"] + + +def prepare_tokenizer(splits, save_folder, input, phonemes): + """Prepares annotations for the tokenizer + + Arguments + --------- + datasets: list + the list of dataset splits + save_folder: str + the path to the folder where annotations will be saved + input : str + identifies what type of input will be used (text or phonemes) + phonemes: list + the list of phonemes + """ + save_folder = Path(save_folder) + if input == "text": + for key in splits: + src_file_name = save_folder / f"{key}.json" + destination_file_name = ( + save_folder / f"tokenizer_annotation_{key}.json" + ) + destination_file_name.symlink_to(src_file_name) + else: + for key in splits: + destination_file_name = ( + save_folder / f"tokenizer_annotation_{key}.json" + ) + if 
destination_file_name.exists(): + logger.info( + "Annotation file '%s' already exists", destination_file_name + ) + else: + logger.info( + "Creating tokenizer annotation '%s'", destination_file_name, + ) + data_file_name = save_folder / f"{key}.json" + with open(data_file_name) as data_file: + data = json.load(data_file) + prepare_annotation( + src=data, + destination_file_name=destination_file_name, + phonemes=phonemes, + ) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 5aae00e3a..13efdaf26 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -38,10 +38,11 @@ sys.path.append(base_dir) from evaluation import SpeechEvaluationMetricStats # noqa: E402 -from model.valle import DefaultSampleSelector +from model.valle import DefaultSampleSelector # noqa: E402 logger = logging.getLogger(__name__) + # Brain class for speech recognition training class VALLEBrain(sb.Brain): """Class that manages the training loop. See speechbrain.core.Brain.""" @@ -85,9 +86,7 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = tokenizer.tokens_to_sig( - audio, **self.token_model_kwargs - ) + wav = tokenizer.tokens_to_sig(audio, **self.token_model_kwargs) wav = clean_padding(wav, length) wav = wav.to(self.device) return wav @@ -196,7 +195,7 @@ def compute_objectives(self, predictions, batch, stage): loss = torch.mean(torch.stack(loss_components)) return loss - + def compute_loss_stats( self, logits_ar, @@ -204,11 +203,11 @@ def compute_loss_stats( logits_nar, targets_nar, mask, - reduction="batch" + reduction="batch", ): """Computes an autoregressive/non-autoregressive loss breakdown, to be used for metrics/stats - + Arguments --------- logits_ar : torch.Tensor @@ -219,7 +218,7 @@ def compute_loss_stats( The non-autoregressive predictions targets_nar : torch.Tensor The targets for non-autoregressive prediction - + Returns ------- stats: dict @@ -228,13 +227,11 @@ def compute_loss_stats( stats = {} if self.train_ar: stats["loss_ar"] = self.hparams.compute_cost( - logits_ar, targets=targets_ar, mask=mask, - reduction=reduction, + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, ) if self.train_nar: stats["loss_nar"] = self.hparams.compute_cost( - logits_nar, targets=targets_nar, mask=mask, - reduction=reduction, + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, ) return stats @@ -258,7 +255,7 @@ def on_stage_start(self, stage, epoch): if hasattr(hparams, "speech_model_layers"): self.layer_idx = get_selected_layer_indexes( hparams.available_speech_model_layers, - hparams.speech_model_layers + hparams.speech_model_layers, ) else: self.layer_idx = None @@ -274,7 +271,7 @@ def on_stage_start(self, stage, epoch): self.evaluation_metric.on_evaluation_start() self.is_evaluating = True else: - logger.info("No evaluation on epoch %d", epoch) + logger.info("No evaluation on epoch %d", epoch) elif stage == sb.Stage.TEST: self.evaluation_metric.on_evaluation_start() self.is_evaluating = True @@ -290,14 +287,11 @@ def init_sample_selector(self, stage): if stage == sb.Stage.TRAIN: self.sample_selector = None else: - sample_selector = getattr( - self.hparams, "sample_selector", None - ) + sample_selector = getattr(self.hparams, "sample_selector", None) if not sample_selector: sample_selector = DefaultSampleSelector self.sample_selector = sample_selector( - 
token_shift=self.hparams.audio_token_shift, - offsets=self.offsets + token_shift=self.hparams.audio_token_shift, offsets=self.offsets ) def apply_curriculum(self): @@ -314,11 +308,13 @@ def apply_curriculum(self): if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: # NOTE: If there is only one track it's autoregressive self.train_nar = False - elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: - self.train_nar = False elif ( - self.hparams.number_of_epochs_nar is not None - and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ): + self.train_nar = False + elif self.hparams.number_of_epochs_nar is not None and epoch <= ( + self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar ): self.train_ar = False if self.hparams.freeze_lm_head: @@ -471,7 +467,7 @@ def on_stage_end(self, stage, stage_loss, epoch): self.checkpointer.save_and_keep_only( meta={"loss": stage_stats["loss"], **eval_summary_stats}, num_to_keep=hparams["ckpt_keep"], - **ckpt_kwargs + **ckpt_kwargs, ) elif stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( @@ -511,16 +507,13 @@ def inference(self, batch): ] logger.info("Running selection") inference_results = [ - self.sample_selector.select( - tokens, - scores, - label + self.sample_selector.select(tokens, scores, label) + for (tokens, scores), label in zip( + inference_results, batch.label_norm_eval ) - for (tokens, scores), label in zip(inference_results, batch.label_norm_eval) ] inferred_tokens = [ - self._pad_inferred_sample(result) - for result in inference_results + self._pad_inferred_sample(result) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) audio_length = audio_length.to(self.device) @@ -549,10 +542,7 @@ def _pad_inferred_sample(self, result): min_length = getattr(self.hparams, "infer_min_length", 10) sample_length, tracks = sample.shape if sample_length < min_length: - sample = pad_right_to( - sample, - (min_length, tracks), - )[0] + sample = pad_right_to(sample, (min_length, tracks),)[0] return sample def _get_inference_opts(self): @@ -565,8 +555,7 @@ def _get_inference_opts(self): if not self.hparams.use_token_offsets: tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.audio_token_shift - + tracks * self.hparams.vocab_size + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size ) if self.hparams.flip_layers: track_start = track_start.flip(0) @@ -626,7 +615,7 @@ def fit_batch(self, batch): if self.hparams.lr_annealing_mode == "step": self.hparams.lr_annealing(self.optimizer) return loss - + def fit( self, epoch_counter, @@ -688,7 +677,7 @@ def fit( if not ( isinstance(train_set, DataLoader) or isinstance(train_set, LoopedLoader) - ): + ): train_set = self.make_dataloader( train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs ) @@ -698,7 +687,7 @@ def fit( valid_set = sample_dataset( dataset=valid_set, count=self.hparams.valid_inter_data_count, - seed=self.hparams.seed + seed=self.hparams.seed, ) valid_set = self.make_dataloader( @@ -728,7 +717,6 @@ def fit( break - INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} @@ -767,7 +755,7 @@ def dataio_prepare(hparams): hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) if not hparams["use_token_offsets"]: - offsets = torch.zeros_like(offsets) + offsets = torch.zeros_like(offsets) if hparams["flip_layers"]: offsets 
= offsets.flip(-1) @@ -786,7 +774,6 @@ def dataio_prepare(hparams): else: num_codebooks = hparams["audio_tokens_per_step"] - @sb.utils.data_pipeline.takes("label") @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") def text_pipeline(label): @@ -824,9 +811,7 @@ def spk_prompt(uttid, spk_sample): "audio", "prefix", "prompt", "prefix_length", "length" ) def prompt_pipeline(id, tokens, spk_prompt): - audio = tokens_loader.tokens_by_uttid( - id, num_codebooks=num_codebooks - ) + audio = tokens_loader.tokens_by_uttid(id, num_codebooks=num_codebooks) if hparams["flip_layers"]: audio = audio.flip(-1) yield audio @@ -904,35 +889,80 @@ def sig_pipeline(wav): spk_samplers=spk_samplers, ) resample_fn[dataset](epoch=0) - if hparams["input"] == "phonemes": - dynamic_dataset = dynamic_dataset.filtered_sorted( - key_test={"has_alignments": lambda value: value} - ) - duration_min = hparams.get("duration_min") - duration_max = hparams.get("duration_max") - if duration_min or duration_max: - key_min_value = None - key_max_value = None - if duration_min: - key_min_value = {"duration": duration_min} - if duration_max: - key_max_value = {"duration": duration_max} - dynamic_dataset = dynamic_dataset.filtered_sorted( - key_min_value=key_min_value, - key_max_value=key_max_value, - ) - dynamic_dataset = dynamic_dataset.filtered_sorted( - key_test={ - "wrd": lambda wrd: not any( - "{" in item - for item in wrd - ) - } - ) + dataset = filter_alignments(dataset, hparams) + dataset = filter_duration(dataset, hparams) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + sort_datasets(datasets, hparams) + + return datasets, resample_fn + + +def filter_duration(dataset, hparams): + """Filters the dataset by sample duration + + Arguments + --------- + dataset: speechbrain.dataio.dataset.DynamicItemDataset + A dataset + hparams: dict + Hyperparameters + + Returns + ------- + result : speechbrain.dataio.dataset.DynamicItemDataset + A filtered dataset + """ + duration_min = hparams.get("duration_min") + duration_max = hparams.get("duration_max") + if duration_min or duration_max: + key_min_value = None + key_max_value = None + if duration_min: + key_min_value = {"duration": duration_min} + if duration_max: + key_max_value = {"duration": duration_max} + dataset = dataset.filtered_sorted( + key_min_value=key_min_value, key_max_value=key_max_value, + ) + return dataset + + +def filter_alignments(dataset, hparams): + """Filters the dataset by the presence of alignments if + phonemes are selected as a source + + Arguments + --------- + dataset: speechbrain.dataio.dataset.DynamicItemDataset + A dataset + hparams: dict + Hyperparameters + + Returns + ------- + result : speechbrain.dataio.dataset.DynamicItemDataset + A filtered dataset + """ + if hparams["input"] == "phonemes": + dataset = dataset.filtered_sorted( + key_test={"has_alignments": lambda value: value} + ) + return dataset + + +def sort_datasets(datasets, hparams): + """Sorts datasets according to hyperparameters + + Arguments + --------- + datasets : dict + a key -> value dictionary of datasets (the keys are "train", "valid" and "test") + hparams : dict + a dictionary of hyperparameters + """ # Sorting training data with ascending order makes the code much # faster because we minimize zero-padding. In most of the cases, this # does not harm the performance. 
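The `sort_datasets` helper extracted above keeps most of its body outside this hunk; the sketch below shows the dispatch it presumably wraps, based on the "random, ascending or descending" options named in the surrounding code. It assumes SpeechBrain's `DynamicItemDataset.filtered_sorted` API and the usual `sorting` / `train_dataloader_opts` hyperparameter keys, so treat it as an illustration rather than the exact patch code:

    def sort_datasets(datasets, hparams):
        """Sorts the training split according to hparams["sorting"]."""
        sorting = hparams["sorting"]
        if sorting == "ascending":
            # Ascending duration minimizes zero-padding inside each batch
            datasets["train"] = datasets["train"].filtered_sorted(sort_key="duration")
            hparams["train_dataloader_opts"]["shuffle"] = False
        elif sorting == "descending":
            datasets["train"] = datasets["train"].filtered_sorted(
                sort_key="duration", reverse=True
            )
            hparams["train_dataloader_opts"]["shuffle"] = False
        elif sorting == "random":
            pass  # keep the original order and let the dataloader shuffle
        else:
            raise NotImplementedError(
                "sorting must be random, ascending or descending"
            )
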
@@ -953,7 +983,6 @@ def sig_pipeline(wav): raise NotImplementedError( "sorting must be random, ascending or descending" ) - return datasets, resample_fn def sample_dataset(dataset, count, seed): @@ -974,14 +1003,8 @@ def sample_dataset(dataset, count, seed): generator = torch.Generator() generator.manual_seed(seed) indexes = torch.randperm(len(dataset)).tolist()[:count] - data_ids = [ - dataset.data_ids[idx] - for idx in indexes - ] - return FilteredSortedDynamicItemDataset( - dataset, - data_ids, - ) + data_ids = [dataset.data_ids[idx] for idx in indexes] + return FilteredSortedDynamicItemDataset(dataset, data_ids,) def get_offsets(vocab_size, tracks): @@ -1132,7 +1155,7 @@ def get_selected_layer_indexes(available_layers, selected_layers): Returns ------- - layer_idx : list + layer_idx : list The layer indexes """ if not (selected_layers and available_layers): @@ -1260,9 +1283,13 @@ def select_eval_subset(dataset, hparams, key="eval_subset"): with open(eval_subset_path) as eval_subset_file: eval_subset_ids = [line.strip() for line in eval_subset_file] existing_ids = dataset.data_ids - eval_subset_ids = [uttid for uttid in eval_subset_ids if uttid in existing_ids] + eval_subset_ids = [ + uttid for uttid in eval_subset_ids if uttid in existing_ids + ] if not eval_subset_ids: - raise ValueError("{eval_subset_path}: no items found in the dataset") + raise ValueError( + "{eval_subset_path}: no items found in the dataset" + ) subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) else: subset = dataset @@ -1373,7 +1400,7 @@ def undo_padding_tensor(batch, lengths): "seed": hparams["seed"], "alignments_folder": hparams.get("alignments_folder"), "model_name": hparams["model"].__class__.__name__, - "max_valid_size": hparams.get("max_valid_size", 10000) + "max_valid_size": hparams.get("max_valid_size", 10000), }, ) @@ -1410,21 +1437,27 @@ def undo_padding_tensor(batch, lengths): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + test_summary_file = next( + Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), + None, + ) if test_summary_file is not None: - logging.info("Test run already completed: %s", test_summary_file) + logging.info( + "Test run already completed: %s", test_summary_file + ) else: test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } + eval_kwargs = {f"{test_key_kind}_key": test_key} eval_dataset_key = hparams["eval_dataset"] - logger.info("Performing final evaluation on the %s dataset", eval_dataset_key) + logger.info( + "Performing final evaluation on the %s dataset", + eval_dataset_key, + ) eval_dataset = datasets[eval_dataset_key] eval_dataset = select_eval_subset(eval_dataset, hparams) tts_brain.evaluate( test_set=eval_dataset, test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs + **eval_kwargs, ) diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py index 52594eaf9..dda10826d 100644 --- a/benchmarks/DASB/LibriTTS/libritts_prepare.py +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -109,16 +109,40 @@ def prepare_libritts( # If specific splits are provided, creates data manifest files accordingly if train_split: wav_list = prepare_split(data_folder, train_split) - create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + create_json( + wav_list, + 
save_json_train, + sample_rate, + data_folder, + alignments_folder, + model_name, + skip_resample, + ) if valid_split: wav_list = prepare_split(data_folder, valid_split) # TODO add better way to speedup evaluation if max_valid_size is not None and len(wav_list) > max_valid_size: wav_list = random.sample(wav_list, max_valid_size) - create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + create_json( + wav_list, + save_json_valid, + sample_rate, + data_folder, + alignments_folder, + model_name, + skip_resample, + ) if test_split: wav_list = prepare_split(data_folder, test_split) - create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + create_json( + wav_list, + save_json_test, + sample_rate, + data_folder, + alignments_folder, + model_name, + skip_resample, + ) if skip(save_json_train, save_json_valid, save_json_test): logger.info("Preparation completed.") @@ -132,12 +156,29 @@ def prepare_libritts( data_split = split_sets(wav_list, split_ratio) # Creating json files create_json( - data_split["train"], save_json_train, sample_rate, alignments_folder, model_name, skip_resample + data_split["train"], + save_json_train, + sample_rate, + alignments_folder, + model_name, + skip_resample, + ) + create_json( + data_split["valid"], + save_json_valid, + sample_rate, + alignments_folder, + model_name, + skip_resample, ) create_json( - data_split["valid"], save_json_valid, sample_rate, alignments_folder, model_name, skip_resample + data_split["test"], + save_json_test, + sample_rate, + alignments_folder, + model_name, + skip_resample, ) - create_json(data_split["test"], save_json_test, sample_rate, alignments_folder, model_name, skip_resample) def prepare_split(data_folder, split_list): @@ -180,7 +221,15 @@ def prepare_split(data_folder, split_list): return wav_list -def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None, skip_resample=False): +def create_json( + wav_list, + json_file, + sample_rate, + data_folder, + alignments_folder=None, + model_name=None, + skip_resample=False, +): """ Creates the json file given a list of wav files. 
Arguments @@ -266,7 +315,9 @@ def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder "segment": True if "train" in json_file else False, } if alignments_folder is not None: - alignments_file_name = get_alignment_path(data_folder, alignments_folder, wav_file) + alignments_file_name = get_alignment_path( + data_folder, alignments_folder, wav_file + ) alignments = parse_alignments(alignments_file_name) json_dict[uttid].update(alignments) @@ -309,9 +360,16 @@ def get_alignment_path(data_folder, alignments_folder, file_name): file_name_rel = file_name.relative_to(data_folder) data_slice = file_name_rel.parts[0] - textgrid_folder = file_name_rel.relative_to(Path(data_slice) / "LibriTTS" / data_slice).parent.parent + textgrid_folder = file_name_rel.relative_to( + Path(data_slice) / "LibriTTS" / data_slice + ).parent.parent textgrid_file_name = f"{file_name_rel.stem}.TextGrid" - textgrid_path = Path(alignments_folder) / data_slice / textgrid_folder / textgrid_file_name + textgrid_path = ( + Path(alignments_folder) + / data_slice + / textgrid_folder + / textgrid_file_name + ) return textgrid_path @@ -382,6 +440,7 @@ def check_folders(*folders): return False return True + def parse_alignments(file_name): """Parses a given LibriSpeech-Alignments TextGrid file and converts the results to the desired format (to be used in JSON @@ -417,7 +476,7 @@ def parse_alignments(file_name): "wrd_start": [], "wrd_end": [], "wrd_count": 0, - "unk_count": None + "unk_count": None, } text_grid = textgrids.TextGrid() diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 6a2de5859..bb414b0d6 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -25,7 +25,11 @@ from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear -from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, nll_loss +from speechbrain.nnet.losses import ( + kldiv_loss, + mse_loss, + compute_masked_loss, +) from speechbrain.dataio.dataio import length_to_mask from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler @@ -446,7 +450,7 @@ def __init__( audio_dim=1024, show_inference_progress=True, transform_audio=None, - feed_audio=None + feed_audio=None, ): super().__init__() self.decoder = None @@ -722,7 +726,7 @@ def __init__( emb=None, audio_emb=None, out_proj=None, - multihead_input=True + multihead_input=True, ): super().__init__() self.in_emb = Embedding( @@ -1290,7 +1294,9 @@ def forward( max_len = out_len - 1 if self.multihead_output: out_reshaped = ( - out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) + out.transpose(1, 2).reshape( + batch_size * heads, out_len, tok_dim + ) )[:, :max_len] else: out_reshaped = out @@ -1329,14 +1335,14 @@ def forward( ) audio_reshaped = audio_reshaped[:, :max_len] - if self.multihead_output: + if self.multihead_output: lengths_reshaped = ( audio_length.unsqueeze(-1) .expand(batch_size, heads) .reshape(batch_size * heads) ) else: - lengths_reshaped = audio_length + lengths_reshaped = audio_length seq_loss = self.seq_cost( out_reshaped[:, :tok_len], audio_reshaped, @@ -1903,7 +1909,6 @@ def get_silence_token( unsqueeze=False, device=None, num_codebooks=None, - ): """Attempts to find out the silence tokens for a given model, if applicable @@ -2092,4 +2097,4 @@ def use_silence_padding(dataloader_opts, silence_token, token_keys): "collate_fn": 
partial( token_collate_fn, silence_token=silence_token, token_keys=token_keys ), - } \ No newline at end of file + } diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 19f1fa3ab..31110cb58 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -4,7 +4,7 @@ import torch from speechbrain.nnet.linear import Linear -from model.sq_codec import tokens_to_ternary, ternary_logits_to_tokens +from model.sq_codec import tokens_to_ternary from speechbrain.utils.logger import get_logger @@ -132,20 +132,16 @@ class TernaryPredictionHead(torch.nn.Module): num_positions : int the number of positions """ + def __init__(self, d_model, num_positions, d_hidden=512, norm=True): super().__init__() self.num_positions = num_positions self.d_model = d_model self.norm = torch.nn.LayerNorm(d_model) if norm else torch.nn.Identity() - self.lin_hidden = Linear( - input_size=d_model, - n_neurons=d_hidden, - ) + self.lin_hidden = Linear(input_size=d_model, n_neurons=d_hidden,) self.act = torch.nn.LeakyReLU() self.lin_p = Linear( - input_size=d_hidden, - n_neurons=num_positions * 3, - bias=False + input_size=d_hidden, n_neurons=num_positions * 3, bias=False ) def forward(self, x, track=None): @@ -193,9 +189,12 @@ class TernaryLogitTokenizer(torch.nn.Module): "probability" : treats the outputs as a probability distribution "argmax" : "hard" mode, only the top probability is used. Cannot be used with top_k sampling with k > 1 - + """ - def __init__(self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10): + + def __init__( + self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10 + ): super().__init__() self.num_positions = num_positions if num_tokens is None: @@ -204,30 +203,45 @@ def __init__(self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10): self.num_tracks = num_tracks self.chunk_size = chunk_size self.register_buffer("vocab", torch.arange(num_tokens)) - self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) + self.register_buffer( + "vocab_ternary", + tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + + 1, + ) self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) def forward(self, logits): batch_size, max_len, num_positions, _ = logits.shape logits = logits.softmax(-1) - logits = logits.reshape(batch_size, max_len, self.num_tracks, 1, num_positions // self.num_tracks, 3) + logits = logits.reshape( + batch_size, + max_len, + self.num_tracks, + 1, + num_positions // self.num_tracks, + 3, + ) chunks = logits.chunk( - dim=1, - chunks=math.ceil(logits.size(1) / self.chunk_size) + dim=1, chunks=math.ceil(logits.size(1) / self.chunk_size) ) token_logits_chunks = [] for chunk in chunks: - token_logits_raw = torch.where( - self.vocab_ternary[:, None, None, :, :, None] == self.idx, - chunk, - torch.ones_like(chunk) - ).prod(-1).log().sum(-1).exp() + token_logits_raw = ( + torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + chunk, + torch.ones_like(chunk), + ) + .prod(-1) + .log() + .sum(-1) + .exp() + ) token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) - token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) - token_logits = torch.cat( - token_logits_chunks, - dim=1 - ) + token_logits_chunks.append( + (token_logits_raw / token_logits_raw_sum).squeeze(2) + ) + token_logits = torch.cat(token_logits_chunks, dim=1) return token_logits @@ -248,17 +262,17 @@ 
class SaveableGenerator: Arguments --------- generators : list, optional - A list of generator objects. If not provided, + A list of generator objects. If not provided, """ def __init__(self, generators=None): if generators is None: - generators = { - "default": torch.default_generator - } + generators = {"default": torch.default_generator} if torch.cuda.is_available(): for idx in range(torch.cuda.device_count()): - generators[f"cuda:{idx}"] = _CudaDefaultGeneratorWrapper(idx) + generators[f"cuda:{idx}"] = _CudaDefaultGeneratorWrapper( + idx + ) self.generators = generators @@ -281,11 +295,15 @@ def _recover(self, path, end_of_epoch): match = re.match(r"cuda:(\d+)", key) if match: if not torch.cuda.is_available(): - logger.warn("Unable to restore RNG for %s, CUDA unavailable", key) + logger.warn( + "Unable to restore RNG for %s, CUDA unavailable", key + ) continue idx = int(match.group(1)) if idx > torch.cuda.device_count() - 1: - logger.warn("Unable to restore RNG for %s, device not found", key) + logger.warn( + "Unable to restore RNG for %s, device not found", key + ) continue self.generators[key].set_state(state) @@ -300,6 +318,7 @@ class _CudaDefaultGeneratorWrapper: --------- device : int|str The device index or identifier""" + def __init__(self, device): self.device = device diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 2c52ee8ac..e5c1ea970 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -126,7 +126,9 @@ def build_codec_model(self, config): exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) device = next(iter(scalar_codec.parameters())).device - parameter_dict = torch.load(self.ckpt_path, map_location=device, weights_only=False) + parameter_dict = torch.load( + self.ckpt_path, map_location=device, weights_only=False + ) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec @@ -1290,6 +1292,7 @@ class TernaryEmbedding(nn.Module): --------- num_digits : int The number of ternary digits""" + def __init__(self, num_digits, emb_size=512, flat=False): super().__init__() self.num_digits = num_digits @@ -1338,7 +1341,9 @@ def decimal_to_ternary_matrix(decimals, D): corresponds to a batch, and each column is represented as a ternary number. 
""" B, T = decimals.shape - ternary_matrix = torch.zeros((B, D, T), dtype=torch.long, device=decimals.device) + ternary_matrix = torch.zeros( + (B, D, T), dtype=torch.long, device=decimals.device + ) for pos in range(D): ternary_matrix[:, pos, :] = decimals % 3 # Modulo operation decimals //= 3 # Floor division for next ternary digit @@ -1403,13 +1408,17 @@ def ternary_matrix_to_decimal_torch(matrix): ) = ( matrix.shape ) # B is the batch size, D is the number of digits, N is the number of ternary numbers - powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] + powers_of_three = 3 ** torch.arange( + D, device=matrix.device + ) # [3^0, 3^1, ..., 3^(D-1)] # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] powers_of_three = powers_of_three[:, None] # Shape [D, 1] # Compute dot product using broadcasting: matrix * powers_of_three along D axis - decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + decimals = torch.sum( + matrix * powers_of_three, axis=1 + ) # Sum along the D axis return decimals @@ -1442,7 +1451,7 @@ def ternary_to_decimal(ternary, n_codebook=4): (Batch x Length x num_positions) - ternary digits n_codebooks : torch.Tensor The number of codebooks - + Returns ------- result: torch.Tensor @@ -1473,7 +1482,9 @@ def ternary_logits_to_tokens(logits, n_codebook=4): Token IDs """ ternary_matrix = logits_to_ternary(logits) - tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2), n_codebook=n_codebook) + tokens = ternary_to_decimal( + ternary_matrix.transpose(-1, -2), n_codebook=n_codebook + ) return tokens @@ -1498,10 +1509,9 @@ def tokens_to_ternary(tokens, D=9): batch_size = tokens.size(0) n_codebook = tokens.size(2) tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() - ternary_matrix = torch.cat([ - decimal_to_ternary_matrix(item, D=D) - 1 - for item in tokens - ], dim=1) + ternary_matrix = torch.cat( + [decimal_to_ternary_matrix(item, D=D) - 1 for item in tokens], dim=1 + ) ternary_matrix = ternary_matrix.transpose(1, 2) if not has_batch: ternary_matrix = ternary_matrix[0] @@ -1525,7 +1535,15 @@ def logits_to_ternary(logits): return ternary -def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", num_positions=9, reduction="mean"): +def ternary_loss( + predictions, + targets, + length=None, + mask=None, + targets_type="ternary", + num_positions=9, + reduction="mean", +): if targets.dim() < 3: targets = targets.unsqueeze(-1) if targets_type == "tokens": @@ -1534,15 +1552,10 @@ def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ter targets_cat = targets + 1 predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() loss = nn.functional.nll_loss( - predictions_loss, - targets_cat, - reduction="none" + predictions_loss, targets_cat, reduction="none" ) if length is not None: - mask = length_to_mask( - length * max_len, - max_len - ) + mask = length_to_mask(length * max_len, max_len) mask = mask.unsqueeze(-1) if mask is not None: loss = loss * mask diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 1e098463a..340dcdc0f 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -135,7 +135,7 @@ def __init__( n_layer=ar_layer, qk_norm=qk_norm, dropout=dropout, - target_dropout=target_dropout + target_dropout=target_dropout, ) if nq > 1: # NOTE: An NAR encoder is not needed if there is only one track @@ -217,9 +217,13 @@ def forward( :, 1: ] # [B, T, V] max_len = 
dec_seq.size(1) - mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() + mask = length_to_mask( + dec_seq_lengths * max_len - 1, max_len - 1 + ).bool() mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] - h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) + h_nar = self.nar_decoder( + input_nar_emb, nar_level_idx - 1, mask=mask + ) # Logits logits_ar, logits_nar = None, None @@ -255,32 +259,7 @@ def prepare_input(self, dec_seq_emb, prefix_len, level): mask = torch.logical_or(level_mask, prefix_mask) return dec_seq_emb.masked_fill(~mask, 0.0).sum(2) - @torch.no_grad() - def inference( - self, prefix, opts, enc_seq=None, suffix=None, - ): - """Vall-E Inference. - - Arguments - --------- - prefix : torch.Tensor - Prefix part of dec_seq (B, T, nq). - opts : SpeechLMInferenceOptions - inference options. - enc_seq : torch.Tensor - Encoder token sequence (B, T, nq). - suffix : torch.Tensor - suffix part of dec_seq (B, T, nq), - usually the target sequence for teacher-forcing. - - Returns - ------- - gen_tokens_list : list - Generated tokens - gen_scores_list : list - The scores associated with the generated tokens - """ - + def _init_inference(self, prefix, opts, enc_seq, suffix): # (1) initialization cache = self.ar_decoder.init() @@ -324,6 +303,59 @@ def inference( if is_flattened: prev_tok = prev_tok.expand(1, tracks) mask_cache = [] + return ( + prefix_emb, + generated, + finish_idx, + cache, + modality_index, + mask, + mask_cache, + prev_tok, + minlen, + maxlen, + is_flattened, + ) + + @torch.inference_mode() + def inference( + self, prefix, opts, enc_seq=None, suffix=None, + ): + """Vall-E Inference. + + Arguments + --------- + prefix : torch.Tensor + Prefix part of dec_seq (B, T, nq). + opts : SpeechLMInferenceOptions + inference options. + enc_seq : torch.Tensor + Encoder token sequence (B, T, nq). + suffix : torch.Tensor + suffix part of dec_seq (B, T, nq), + usually the target sequence for teacher-forcing. + + Returns + ------- + gen_tokens_list : list + Generated tokens + gen_scores_list : list + The scores associated with the generated tokens + """ + ( + prefix_emb, + generated, + finish_idx, + cache, + modality_index, + mask, + mask_cache, + prev_tok, + minlen, + maxlen, + is_flattened, + ) = self._init_inference(prefix, opts, enc_seq, suffix) + modality_tokens = torch.tensor( list(opts.masks.keys()), device=prefix.device ) @@ -334,15 +366,13 @@ def inference( prev_tok = prev_tok.unsqueeze(1) prev_emb = self.emb(prev_tok).squeeze(2) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) - logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] + logits = self.logits_to_probs( + self.apply_lm_head(h_ar, 0) + ) # [B, 1, V] if logits.dim() < 4: logits = logits.unsqueeze(-2) gen_tok, gen_score = logits_to_tokens( - logits, - opts, - mask, - allow_eos=step >= minlen, - nq_level=0, + logits, opts, mask, allow_eos=step >= minlen, nq_level=0, ) # [B, 1, 1] -> [B, 1] gen_tok, gen_score = gen_tok.squeeze(1), gen_score.squeeze(1) @@ -403,10 +433,12 @@ def inference( valid_idx = finish_idx.ne(-1).nonzero(as_tuple=True)[0] if len(valid_idx) == 0: self.ar_decoder.reset() - logging.warning(f"No valid examples. Return None") + logging.warning("No valid examples. 
Return None") return [], [] elif len(valid_idx) < prefix.size(0): - logging.info(f"Only {len(valid_idx)} of {prefix.size(0)} are valid") + logging.info( + "Only %d of %d are valid", len(valid_idx), prefix.size(0) + ) finish_idx = finish_idx[valid_idx] prefix_emb = prefix_emb[valid_idx] @@ -426,70 +458,18 @@ def inference( self.ar_decoder.reset() # (4) non-auto-regressive loop on the remained code layers - # (4.1) NAR initialization - if opts.search_algo == "teacher_force": - prev_tok = suffix[:, :, 0] - else: - prev_tok = gen_tokens_ar[:, :, 0] - start_token = torch.tensor( - [opts.start], device=prefix.device - )[None, None, :] - - # (4.2) NAR loop if self.nq > 1: - start_emb = self.emb(start_token).squeeze().tile( - len(valid_idx), 1, 1 - ) # [B, 1, D] - prev_emb = torch.cat( - [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 - ) # [B, T, D] - - ones = torch.ones_like(valid_idx) - mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() - mask = mask.unsqueeze(1).unsqueeze(1) - generated = {"token": [], "score": []} - - mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache - vocab_mask = torch.cat(mask_cache, dim=1) - - for step in range(1, opts.nq): - h_nar = self.nar_decoder( - prev_emb, ones * step - 1, mask=mask - ) # [B, T, D] - - logits = self.apply_lm_head(h_nar, step) - logits = self.logits_to_probs(logits) - gen_tok, gen_score = logits_to_tokens( - logits.unsqueeze(2), - opts, - vocab_mask, - search_algo="greedy_search", - allow_eos=False, - nq_level=step, - ) - gen_tok, gen_score = ( - gen_tok.squeeze(2), - gen_score.squeeze(2), - ) # [B, T] - - generated["token"].append(gen_tok[:, prefix.size(1) :]) - generated["score"].append(gen_score[:, prefix.size(1) :]) - - if opts.search_algo == "teacher_force": - prev_tok = suffix[:, :, step] - else: - prev_tok = generated["token"][-1] - prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] - prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb - - # (5) combine AR and NAR results - gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] - gen_scores_nar = torch.stack(generated["score"], dim=2) - - gen_tokens = torch.cat( - [gen_tokens_ar, gen_tokens_nar], dim=2 - ) # [B, T, nq] - gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + gen_tokens, gen_scores = self._nar_inference( + opts, + gen_tokens_ar, + gen_scores_ar, + valid_idx, + prefix_emb, + prefix, + suffix, + finish_idx, + mask_cache, + ) else: gen_tokens = gen_tokens_ar gen_scores = gen_scores_ar @@ -501,6 +481,83 @@ def inference( gen_scores_list.append(gen_scores[b][:item_finish_idx]) return gen_tokens_list, gen_scores_list + def _nar_inference( + self, + opts, + gen_tokens_ar, + gen_scores_ar, + valid_idx, + prefix_emb, + prefix, + suffix, + finish_idx, + mask_cache, + ): + # (4.1) NAR initialization + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, 0] + else: + prev_tok = gen_tokens_ar[:, :, 0] + start_token = torch.tensor([opts.start], device=prefix.device)[ + None, None, : + ] + + start_emb = ( + self.emb(start_token).squeeze().tile(len(valid_idx), 1, 1) + ) # [B, 1, D] + prev_emb = torch.cat( + [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 + ) # [B, T, D] + + ones = torch.ones_like(valid_idx) + mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) + generated = {"token": [], "score": []} + + mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache + vocab_mask = torch.cat(mask_cache, dim=1) + + # (4.2) NAR loop + for step in 
range(1, opts.nq): + h_nar = self.nar_decoder( + prev_emb, ones * step - 1, mask=mask + ) # [B, T, D] + + logits = self.apply_lm_head(h_nar, step) + logits = self.logits_to_probs(logits) + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + vocab_mask, + search_algo="greedy_search", + allow_eos=False, + nq_level=step, + ) + gen_tok, gen_score = ( + gen_tok.squeeze(2), + gen_score.squeeze(2), + ) # [B, T] + + generated["token"].append(gen_tok[:, prefix.size(1) :]) + generated["score"].append(gen_score[:, prefix.size(1) :]) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, step] + else: + prev_tok = generated["token"][-1] + prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] + prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb + + # (5) combine AR and NAR results + gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] + gen_scores_nar = torch.stack(generated["score"], dim=2) + + gen_tokens = torch.cat( + [gen_tokens_ar, gen_tokens_nar], dim=2 + ) # [B, T, nq] + gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + return gen_tokens, gen_scores + def apply_lm_head(self, x, track): """Applies the language model head @@ -630,7 +687,8 @@ class TransformerDecoder(nn.Module): The target dropout probability layer_class : type The layer type to be used - """ + """ + def __init__( self, n_ctx, @@ -1279,7 +1337,7 @@ def select(self, tokens, scores, text): class WhisperASRSampleSelector(SampleSelector): """A selector implemented using Whisper - + Arguments --------- tokenizer: BaseTokenizer @@ -1307,6 +1365,7 @@ class WhisperASRSampleSelector(SampleSelector): Additional arguments for the tokenizer decoding function """ + def __init__( self, tokenizer, @@ -1358,12 +1417,14 @@ def select(self, tokens, scores, text): if self.offsets is not None: tokens_shift = tokens_shift - self.offsets tokens_shift = tokens_shift.clip(0) - wav = self.tokenizer.tokens_to_sig(tokens_shift, **self.token_model_kwargs) + wav = self.tokenizer.tokens_to_sig( + tokens_shift, **self.token_model_kwargs + ) if self.sample_rate != self.tokenizer_sample_rate: wav = torchaudio.functional.resample( wav, orig_freq=self.tokenizer_sample_rate, - new_freq=self.sample_rate + new_freq=self.sample_rate, ) wav = undo_padding_tensor(wav, length) metric = ErrorRateStats() @@ -1377,7 +1438,7 @@ def select(self, tokens, scores, text): "Ground truth text: %s, sample scores: %s, best: #%d", text, sample_scores, - idx + idx, ) if self.debug: sio = StringIO() @@ -1391,13 +1452,15 @@ def predict(self, wav): wav = self.model.pad_or_trim(wav) mels = self.model.log_mel_spectrogram(wav) enc_out = self.model.forward_encoder(mels) - pred, _, _, _ = self.searcher(enc_out.detach(), torch.tensor(1., device=wav.device)) + pred, _, _, _ = self.searcher( + enc_out.detach(), torch.tensor(1.0, device=wav.device) + ) pred = self.model.tokenizer.batch_decode( pred, skip_special_tokens=True )[0] pred = self.normalize(pred) return pred - + def normalize(self, text): """Performs text normalization (uppercase, remove whitespace, remove punctuation) @@ -1416,4 +1479,3 @@ def normalize(self, text): text = text.strip() text = RE_PUNCTUATION.sub("", text) return text - diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 1694355ec..5d90069ef 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -250,7 +250,7 @@ def __call__(self, wavs, length): class ASRSpeechEvaluator(SpeechEvaluator): """A superclass for ASR speech evaluators - + 
Arguments --------- sample_rate : int @@ -365,10 +365,12 @@ def compute_diff_rate(self, details, device): cer_metric.append(ids, pred, pred_ref) count = len(ids) dwer = torch.tensor( - [score["WER"] for score in wer_metric.scores[-count:]], device=device + [score["WER"] for score in wer_metric.scores[-count:]], + device=device, ) dcer = torch.tensor( - [score["WER"] for score in cer_metric.scores[-count:]], device=device + [score["WER"] for score in cer_metric.scores[-count:]], + device=device, ) return {"dwer": dwer, "dcer": dcer} @@ -460,7 +462,9 @@ def __init__( self.unbatch = unbatch self.to(device) - def evaluate_samples(self, wavs, length, text, sample_rate, metric_key="regular"): + def evaluate_samples( + self, wavs, length, text, sample_rate, metric_key="regular" + ): """Evaluates a batch of samples Arguments @@ -524,7 +528,7 @@ def _evaluate_samples(self, wavs, length, text, sample_rate, metric_key): sample_rate : int The sample rate of the waveforms metric_key : bool - Whether to compute the metrics + Whether to compute the metrics Returns ------- @@ -550,10 +554,12 @@ def _evaluate_samples(self, wavs, length, text, sample_rate, metric_key): cer_metric.append(ids, predicted_words_split, text_split) count = len(ids) wer = torch.tensor( - [score["WER"] for score in wer_metric.scores[-count:]], device=wavs.device + [score["WER"] for score in wer_metric.scores[-count:]], + device=wavs.device, ) cer = torch.tensor( - [score["WER"] for score in cer_metric.scores[-count:]], device=wavs.device + [score["WER"] for score in cer_metric.scores[-count:]], + device=wavs.device, ) result = { "wer": wer, diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index 1ba9bc21a..bc2a43966 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -7,6 +7,8 @@ --------- * Pooneh Mousavi, 2024 """ + +import importlib import sys import os import torch @@ -559,17 +561,18 @@ def _load(self): filename=self.model_ckpt, source=self.source, savedir=str(self.save_path), - save_filename=str(Path(self.model_ckpt).name) + save_filename=str(Path(self.model_ckpt).name), ) config_file_name = fetch( filename=self.model_config, source=self.source, savedir=str(self.save_path), - save_filename="config.yaml" + save_filename="config.yaml", ) with open(config_file_name) as config_file: config = yaml.safe_load(config_file) from espnet2.gan_codec.encodec.encodec import Encodec as ESPNetEncodec + self.encodec = ESPNetEncodec(**config["codec_conf"]) device = next(iter(self.encodec.parameters())).device state_dict = torch.load(ckpt_file_name, map_location=device) @@ -581,7 +584,7 @@ def _load(self): def _load_espnet(self): try: - import espnet2 + importlib.import_module("espnet2") except ModuleNotFoundError: self._download_espnet() @@ -590,13 +593,17 @@ def _download_espnet(self): espnet_path = self.save_path / "espnet" if not espnet_path.exists(): logger.info("Cloining %s into %s", self.espnet_repo, espnet_path) - cmd = shlex.join(["git", "clone", self.espnet_repo, str(espnet_path)]) + cmd = shlex.join( + ["git", "clone", self.espnet_repo, str(espnet_path)] + ) run_shell(cmd) else: logger.info("%s already exists", espnet_path) if self.espnet_commit: logger.info("Checking out %s", self.espnet_commit) - cmd = shlex.join(["git", "-C", str(espnet_path), "checkout", self.espnet_commit]) + cmd = shlex.join( + ["git", "-C", str(espnet_path), "checkout", self.espnet_commit] + ) run_shell(cmd) logger.info("Installing") cmd = 
shlex.join(["pip", "install", "-e", str(espnet_path)]) @@ -609,7 +616,7 @@ def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): if signal.dim() < 3: signal = signal.unsqueeze(1) tokens = self.encodec.encode(signal) - return tokens.permute(1, 2, 0)[:, :, :self.n_codebook] + return tokens.permute(1, 2, 0)[:, :, : self.n_codebook] @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): @@ -628,4 +635,4 @@ def get_pretrained_embeddings( """ raise ValueError( "ESPNet Encodec does not have any trainable quantizer or embedding since it uses scalar quantization." - ) \ No newline at end of file + ) From c6a20d85d8f956be8174450e9bcf10d352e3a2a7 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 8 Jul 2025 00:24:27 -0400 Subject: [PATCH 268/270] DASB: TTS: Fix docstrings --- .../LJSpeech/TTS/tokotron/audio_tokens.py | 1 - .../DASB/LJSpeech/TTS/valle/evaluation.py | 5 + benchmarks/DASB/LJSpeech/TTS/valle/train.py | 13 ++ .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 53 ++++++ .../DASB/LibriTTS/TTS/tokotron/train.py | 20 +++ .../DASB/LibriTTS/TTS/valle/evaluation.py | 5 + .../DASB/LibriTTS/TTS/valle/inference_fit.py | 130 +++++++++++++- .../LibriTTS/TTS/valle/tokenizer_prepare.py | 2 - benchmarks/DASB/LibriTTS/TTS/valle/train.py | 26 +++ benchmarks/DASB/model/Tokotron.py | 26 +++ benchmarks/DASB/model/valle.py | 170 +++++++++++++++--- benchmarks/DASB/utils/eval.py | 28 +++ 12 files changed, 444 insertions(+), 35 deletions(-) delete mode 120000 benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py deleted file mode 120000 index e34e113e5..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py +++ /dev/null @@ -1 +0,0 @@ -../../../utils/audio_tokens.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py index d5aaa649d..9700e8363 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -1,3 +1,8 @@ +"""TTS evaluation tools + +Authors + * Artem Ploujnikov 2024 +""" import json import torch import logging diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 986eb9a7c..899c0f159 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -518,6 +518,19 @@ def _get_inference_opts(self): ) def save_samples(self, batch, wav, length, stage): + """Saves audio samples + + Arguments + --------- + batch : PaddedBatch + An audio batch + wav : torch.Tensor + Generated audio + length : torch.Tensor + Relative lengths + stage : speechbrain.Stage + The training stage + """ output_folder = self._get_eval_output_folder(stage) samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index aa7ee2c4b..9b2801ee8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -169,6 +169,23 @@ def create_reports(self): self.perf_writer.writeheader() def infer(self, tokens, tokens_length, emb): + """Performs inference + + Arguments + --------- + tokens : torch.Tensor + A token sequence + tokens_length : torch.Tensor + Relative lengths + emb : dict + Embeddings for conditioning + + Returns + ------- + wav : 
torch.Tensor + The waveform + stats : dict + Statistics""" stats = {} if self.hparams.eval_perf: flop_counter = FlopCounterMode() @@ -190,6 +207,21 @@ def infer(self, tokens, tokens_length, emb): return infer_out, stats def vocoder(self, infer_out, emb): + """Runs the vocoder to create a waveform + + Arguments + --------- + infer_out : Tokotron.TokotronInfernceOutput + Inference output + emb : dict + Embeddings for conditioning + + Returns + ------- + wav : torch.Tensor + The waveform + stats : dict + Statistics""" stats = {} if self.hparams.eval_perf: flop_counter = FlopCounterMode() @@ -363,6 +395,14 @@ def write_summary(self): json.dump(summary, output_file, indent=4) def write_perf_stats(self, uttid, details): + """Outputs performance statistics + + Arguments + --------- + uttid : list + A list of utterance IDs + details : dict + Performance details""" self.perf_writer.writerow({"uttid": " ".join(uttid), **details}) self.perf_file.flush() @@ -408,6 +448,19 @@ def flatten(value): def ascii_only(values): + """Retains only ASCII characters from the values in a + dictionary + + Arguments + --------- + values : dict + a key/value dictionary + + Returns + ------- + result : dict + The same dictionary but with non-ASCII characters + """ return { key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value for key, value in values.items() diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 7dc4c4ab2..7926a3f04 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -134,6 +134,26 @@ def compute_forward(self, batch, stage): return predictions, features def prepare_features(self, batch): + """Prepares Tokotron TTS features + + Arguments + --------- + batch : PaddedBatch + A batch of data + + Returns + ------- + audio_bos : torch.Tensor + Audio represnetations (discrete or continuous) with the BOS marker + audio_bos_length : torch.Tensor + Relative lengths of audio representations with the BOS marker + audio_tgt : torch.Tensor + Audio prediction targets + audio_tgt_length : torch.Tensor + Audio prediction targets - relative lengths + spk_emb : torch.Tensor + Speaker embeddings + """ if self.hparams.spk_emb_shuffle: wav, wav_length = batch.spk_emb_random_match else: diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py index 58fdd5abb..017a5c367 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -1,3 +1,8 @@ +"""TTS evaluation tools + +Authors + * Artem Ploujnikov 2024 +""" import json import torch import logging diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index f81252b3a..759014220 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -34,7 +34,15 @@ class InferenceFit: - """An inference fit wrapper""" + """A wrapper class for hyperparameter fitting + + Arguments + --------- + hparams : dict + Parsed hyperparameters + run_opts : dict + Parsed run options + """ def __init__(self, hparams, run_opts): device = run_opts.get("device", "cpu") @@ -64,8 +72,8 @@ def fit(self, dataset): Arguments --------- - dataset: DynamicItemDataset - a dataset + dataset: speechbrain.dataio.dataset.DynamicItemDataset + A dataset instance Returns ------- @@ -86,11 +94,35 @@ def fit(self, dataset): return self.result, 
self.best def is_completed(self, params): + """Determines whether the fitting run has been completed + + Arguments + --------- + params : torch.Tensor + the parameters to evaluate + + Returns + ------- + result : bool + Whether the run has been completed + """ folder_name = params_to_folder_name(params) path = self.output_folder / folder_name / "summary.json" return path.exists() def get_result(self, params): + """Retrieves the result for a completed run + + Arguments + --------- + params : torch.Tensor + A hyperparameter search entry + + Returns + ------- + result : dict + The result of the run + """ params_str = format_params(params) logger.info("Retrieving params for completed run %s", params_str) folder_name = params_to_folder_name(params) @@ -104,6 +136,13 @@ def get_result(self, params): return result def find_best(self): + """Finds the best run result based on the metric chosen + + Returns + ------- + result : dict + The best result + """ best = self.result[0] op = ( operator.lt @@ -117,9 +156,31 @@ def find_best(self): return best def enumerate_param_space(self): + """Enumerates the parameter space + + Returns + ------- + result : generator + The parameter space (each element is a dictionary of hyperparameters) + """ return enumerate_space(self.space) def evaluate(self, dataset, params): + """Performs evaluation at a particular point + in the hyperparameter space + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset instance + params : dict + The hyperparameter dictionary + + Returns + ------- + metrics : dictionary + a key/value dictionary with the metrics computed + """ dataloader = sb.dataio.dataloader.make_dataloader(dataset) params_str = format_params(params) logger.info("Starting evaluation of %s", params_str) @@ -141,6 +202,14 @@ def evaluate(self, dataset, params): return metrics def evaluate_batch(self, batch, params): + """Evaluates a single batch + + Arguments + --------- + batch : PaddedBatch + A single batch of data + params : dict + A set of hyperparameters to try""" batch = batch.to(self.device) audio_tokens, audio_length = self.inference(batch, params) wav = self.create_waveform(audio_tokens, audio_length) @@ -155,6 +224,7 @@ def evaluate_batch(self, batch, params): ) def write_report(self): + """Outputs the hyperparameter fitting report""" if self.result is None: logger.warning("Nothing to report") return @@ -286,6 +356,7 @@ def _get_inference_opts(self, params): ) def recover(self): + """Recovers a checkpoint according to the settings specified""" test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] kwargs = {f"{test_key_kind}_key": test_key} @@ -298,6 +369,24 @@ def recover(self): def enumerate_space(space, entry=None, points=None): + """Enumerates the hyperparameter space for a full + grid search + + Arguments + --------- + space : dict + A key -> value dictionary with hyperparameter names as keys + and sets of values to try as values + entry : dict + The entry being constructed + points : list + The list of points being constructed + + Returns + ------- + result : list + All configurations to try + """ if points is None: points = [] if not space: @@ -314,16 +403,51 @@ def enumerate_space(space, entry=None, points=None): def format_space(space): + """Formats a hyperparameter space for display + + Arguments + --------- + space : dict + A space definition + + Returns + ------- + result : str + A formatted space for display""" return ", ".join( f"{parameter}: {values}" for parameter, values in 
space.items() ) def format_params(params): + """Formats a set of hyperparameters (a single point in the hyperparameter + space) for display + + Arguments + --------- + params : dict + A dictionary of hyperparameter values + + Returns + ------- + result : str + A formatted hyperparameter dictionary + """ return ", ".join(f"{key}={value}" for key, value in params.items()) def params_to_folder_name(params): + """Formats a dictionary of hyperparameters as a folder name (for ease of reading) + + Arguments + --------- + params : dict + A dictionary of hyperparameter values + + Returns + ------- + result : str + The corresponding folder name""" params_str = "-".join(f"{key}-{value}" for key, value in params.items()) return f"eval-{params_str}" diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py index 3fe83556b..896e6e4f5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py @@ -3,9 +3,7 @@ """ import json -import os import re -import speechbrain as sb from pathlib import Path from speechbrain.lobes.models.g2p.dataio import build_token_char_map diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 13efdaf26..6cf1c7eca 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -575,6 +575,19 @@ def _get_inference_opts(self): ) def save_samples(self, batch, wav, length, stage): + """Saves audio samples + + Arguments + --------- + batch : PaddedBatch + An audio batch + wav : torch.Tensor + Generated audio + length : torch.Tensor + Relative lengths + stage : speechbrain.Stage + The training stage + """ output_folder = self._get_eval_output_folder(stage) samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): @@ -611,6 +624,19 @@ def _get_eval_output_folder(self, stage): return output_folder def fit_batch(self, batch): + """Fit one batch, using the default implementation with per-step + annealing + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for training. Default implementation assumes + this batch has two elements: inputs and targets. + + Returns + ------- + detached loss + """ loss = super().fit_batch(batch) if self.hparams.lr_annealing_mode == "step": self.hparams.lr_annealing(self.optimizer) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index bb414b0d6..833e98da4 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -78,16 +78,22 @@ class EosMode(Enum): + """The method of determining end-of-sequence""" + GATE = "gate" TOKEN = "token" class DecoderMode(Enum): + """The method of determining what type of decoder to use""" + AUTOREGRESSIVE = "autoregressive" FORWARD = "forward" class RepresentationMode(Enum): + """Inidcates the type of representations to use for audio (discrete or continuous)""" + DISCRETE = "discrete" CONTINUOUS = "continuous" @@ -1817,10 +1823,30 @@ def __call__(self, opt): class PositionalEncoding(TransformerPositionalEncoding): + """A wrapper for the positional encoding that does not try + to be loaded from state dictionaries""" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def load_state_dict(self, state_dict, strict=True, assign=False): + """Copy parameters and buffers from :attr:`state_dict` into this module and its descendants. 
+ + Arguments + --------- + state_dict : dict + A dict containing parameters and persistent buffers. + strict : (bool, optional) + Whether to strictly enforce that the keys + assign (bool, optional): whether to assign items in the state + dictionary to their corresponding keys in the module + + Returns + ------- + ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields: + * **missing_keys** is a list of str containing the missing keys + * **unexpected_keys** is a list of str containing the unexpected keys + """ pass diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 340dcdc0f..d1e1c33bf 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -235,6 +235,23 @@ def forward( return logits_ar, logits_nar def prepare_input(self, dec_seq_emb, prefix_len, level): + """Prepares the input sequence by adding up + embeddings that are not masked + + Arguments + --------- + dec_seq_emb : torch.Tensor + The decoder sequence embedding + prefix_len : torch.Tensor + The prefix lengths + level : int | torch.Tensor + The level number or a level mask + + Returns + ------- + result : torch.Tensor + The combined embedding + """ # NOTE(Jinchuan): have to use "expand" here but maybe lead to extra memory usage. # This is because both prefix_mask and level_mask are broadcastable and will # trigger user warning. @@ -790,7 +807,23 @@ def forward(self, x): class Linear(nn.Linear): + """A linear layer wrapper that performs automatic + type conversions + """ + def forward(self, x: Tensor) -> Tensor: + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The input data + + Returns + ------- + result : torch.Tensor + The result + """ return F.linear( x, self.weight.to(x.dtype), @@ -873,6 +906,31 @@ def forward( class ValleNARDecoder(TransformerDecoder): + """The VALL-E non-autoregressive decoder + + Arguments + --------- + n_level : int + The number of levels + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of attention heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + layer_class : type + The layer class to use + """ + def __init__( self, n_level, @@ -885,30 +943,6 @@ def __init__( dropout=0.0, layer_class=ResidualAttentionBlockAdaLN, ): - """The VALL-E non-autoregressive decoder - - Arguments - --------- - n_level : int - The number of levels - n_ctx : int - The context length - n_state : int - The number of states - n_head : int - The number of attention heads - n_layer : int - The number of layers - causal : bool - Whether to operate in causal mode (i.e. 
avoid attending
-            to future steps)
-        qk_norm : bool
-            Queries/Keys Normalization
-        dropout : float
-            The dropout probability
-        layer_class : type
-            The layer class to use
-        """
         super().__init__(
             n_ctx=n_ctx,
             n_state=n_state,
@@ -1125,6 +1159,20 @@ def install_kv_cache_hook(model, cache):
     hooks = []

     def save_to_cache(module, _, output):
+        """Saves the output in the module cache
+
+        Arguments
+        ---------
+        module : torch.nn.Module
+            A module instance
+        output : torch.Tensor
+            The module output
+
+        Returns
+        -------
+        result : torch.Tensor
+            Concatenated outputs
+        """
         if module not in cache:
             # save as-is, for the first token or cross attention
             cache[module] = output
@@ -1132,8 +1180,15 @@ def save_to_cache(module, _, output):
             cache[module] = torch.cat([cache[module], output], dim=1).detach()
         return cache[module]

-    def install_hooks(layer: torch.nn.Module):
-        if isinstance(layer, MultiHeadAttention):
+    def install_hooks(layer):
+        """Installs the forward hooks
+
+        Arguments
+        ---------
+        layer : torch.nn.Module
+            A layer instance
+        """
+        if isinstance(layer, MultiHeadAttention):
             hooks.append(layer.key.register_forward_hook(save_to_cache))
             hooks.append(layer.value.register_forward_hook(save_to_cache))

@@ -1255,8 +1310,22 @@ def install_continuous_features(


 def modality_index_to_mask(
-    modality_index: torch.Tensor, inference_opts: SpeechLMInferenceOptions,
+    modality_index, inference_opts,
 ):
+    """Converts a modality index to a mask
+
+    Arguments
+    ---------
+    modality_index : torch.Tensor
+        The modality index
+    inference_opts : SpeechLMInferenceOptions
+        The inference options
+
+    Returns
+    -------
+    result : torch.Tensor
+        The modality mask
+    """
     assert modality_index.dim() == 1
     modality_index = modality_index.cpu().tolist()
     mask = torch.stack(
@@ -1305,7 +1374,7 @@ def masked_nll_loss(
 class SampleSelector:
     """A base class for sample selectors"""

-    def select(self, tokens, scores, label):
+    def select(self, tokens, scores, text):
         """Performs selection

         Arguments
         ---------
@@ -1316,17 +1385,33 @@ def select(self, tokens, scores, label):
         scores : list
             The scores

-        label : str
+        text : str
             The label for the sample
         """
         raise NotImplementedError()


 class DefaultSampleSelector(SampleSelector):
+    """A default no-op sample selector that simply selects the
+    first sample (useful only when nbest=1)"""
+
     def __init__(self, **kwargs):
         pass

     def select(self, tokens, scores, text):
+        """Performs selection
+
+        Arguments
+        ---------
+        tokens : list
+            The generated tokens
+
+        scores : list
+            The scores
+
+        text : str
+            The label for the sample
+        """
         return tokens[0]
@@ -1364,6 +1449,8 @@ class WhisperASRSampleSelector(SampleSelector):
     token_model_kwargs : dict
         Additional arguments for the tokenizer
         decoding function
+    device : str | torch.device
+        The target device
     """

     def __init__(
@@ -1412,6 +1499,19 @@ def __init__(
         tokenizer.codec_vocoder.device = device

     def select(self, tokens, scores, text):
+        """Performs selection
+
+        Arguments
+        ---------
+        tokens : list
+            The generated tokens
+
+        scores : list
+            The scores
+
+        text : str
+            The label for the sample
+        """
         tokens, length = batch_pad_right(tokens)
         tokens_shift = tokens - self.token_shift
         if self.offsets is not None:
@@ -1447,6 +1547,18 @@ def select(self, tokens, scores, text):
         return tokens[idx]

     def predict(self, wav):
+        """Makes an ASR prediction
+
+        Arguments
+        ---------
+        wav : torch.Tensor
+            A raw waveform
+
+        Returns
+        -------
+        text : str
+            The text predicted by the ASR
+        """
         if wav.dim() < 2:
             wav = wav.unsqueeze(0)
         wav = self.model.pad_or_trim(wav)
diff --git
a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 5d90069ef..1e9c7d2ed 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -213,6 +213,12 @@ def on_evaluation_end(self): pass def global_metrics(self): + """Returns global metrics (not tied to a specific sample) + + Returns + ------- + metrics : dict + A dictionary of metrics""" return {} @@ -266,6 +272,7 @@ def __init__(self, sample_rate=16000, metric_mode="macro"): self.metrics = {} def on_evaluation_start(self): + """Invoked when evaluation starts""" self.metrics = {} def evaluate( @@ -375,6 +382,21 @@ def compute_diff_rate(self, details, device): return {"dwer": dwer, "dcer": dcer} def get_asr_metrics(self, kind="regular"): + """Returns the ASR metrics + + Arguments + --------- + kind : the kind of metrics to obtain + 'regular' - a new metric for each sample + 'micro' - a global shared metric + + Returns + ------- + wer_metric : ErrorRateStats + the Word Error Rate (WER) metric + cer_metric : ErrorRateStats + the Character Error Rate (CER) metric + """ if self.metric_mode == "micro": if kind not in self.metrics: metrics = init_asr_metrics() @@ -394,6 +416,12 @@ def _replace_blanks(self, preds): return [" " if item == "" else item for item in preds] def global_metrics(self): + """Returns global metrics (not tied to a specific sample) + + Returns + ------- + metrics : dict + A dictionary of metrics""" global_metrics = {} if self.metric_mode == "micro": wer_metric, cer_metric = self.get_asr_metrics("regular") From ac7d6d6a9908235b85b78dc194804df0a9982fa5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 10 Jul 2025 17:46:45 -0400 Subject: [PATCH 269/270] DASB: Update a docstring --- benchmarks/DASB/model/valle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index d1e1c33bf..98f7067a8 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -1,4 +1,4 @@ -"""An adaptation of ESPNET VALL-E +"""An adaptation of ESPNET VALL-E for SpeechBrain Originally by Jinchuan Tian https://github.com/espnet/espnet From 17bde9de272428a41d30dcb1cdaf0871316ea842 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 18 Jul 2025 16:20:05 -0400 Subject: [PATCH 270/270] DASB: Cosmetic changes to pass pre-commit --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 2 +- benchmarks/DASB/LJSpeech/extraction/extract.py | 2 +- benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py | 2 +- benchmarks/DASB/LibriSpeech/extraction/extract.py | 2 +- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 4 ++-- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 229d645fe..161a1fd93 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -28,7 +28,7 @@ base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) -from model.Tokotron import ( +from model.Tokotron import ( # noqa: E402 get_silence_token, use_silence_padding, feature_pad_to, diff --git a/benchmarks/DASB/LJSpeech/extraction/extract.py b/benchmarks/DASB/LJSpeech/extraction/extract.py index 556d8a9d0..bb25afa87 100644 --- a/benchmarks/DASB/LJSpeech/extraction/extract.py +++ b/benchmarks/DASB/LJSpeech/extraction/extract.py @@ -80,7 +80,7 @@ if hparams["save_embedding"]: save_folder = pl.Path(hparams["save_folder"]) - 
logger.info(f"Saving embeddings ...")
+    logger.info("Saving embeddings ...")
     tokens_extractor.save_pretrained_embeddings(
         (save_folder / "embeddings").as_posix(),
         vocab_size=hparams["vocab_size"],
diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py
index 938ce8b96..098986565 100644
--- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py
+++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py
@@ -387,7 +387,7 @@ def text_pipeline(wrd):
     )
     hparams["train_logger"].log_stats(
         stats_meta={
-            f"Codec parameters/buffers (M)": f"{codec_params / 1e6:.2f}",
+            "Codec parameters/buffers (M)": f"{codec_params / 1e6:.2f}",
             "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}",
         },
     )
diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py
index 814d252be..3a649d24f 100644
--- a/benchmarks/DASB/LibriSpeech/extraction/extract.py
+++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py
@@ -88,7 +88,7 @@ if hparams["save_embedding"]:

     save_folder = pl.Path(hparams["save_folder"])
-    logger.info(f"Saving embeddings ...")
+    logger.info("Saving embeddings ...")
     tokens_extractor.save_pretrained_embeddings(
         (save_folder / "embeddings").as_posix(),
         vocab_size=hparams["vocab_size"],
diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py
index 7926a3f04..abb2cda88 100644
--- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py
+++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py
@@ -30,13 +30,13 @@
 base_dir = str(Path(__file__).resolve().parent.parent.parent.parent)
 sys.path.append(base_dir)

-from model.Tokotron import (
+from model.Tokotron import (  # noqa: E402
     RepresentationMode,
     get_silence_repr,
     get_silence_token,
     use_silence_padding,
     feature_pad_to,
-)  # noqa: E402
+)
 from evaluate import TokotronEvaluator  # noqa: E402

 logger = logging.getLogger(__name__)
diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh
index c0b06b09a..554ed10f0 100755
--- a/benchmarks/DASB/run_hparam_optimization.sh
+++ b/benchmarks/DASB/run_hparam_optimization.sh
@@ -224,7 +224,7 @@ while [[ $# -gt 0 ]]; do
         eval_run_additional_flags+="$name $value "
       fi
       additional_flags+="$name $value " # store additional flags
-      fi 
+      fi
       shift # past argument
       ;;