From a4f38bd2a9cbb76e8bd56c944c66a8fccf0a7c04 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 5 Nov 2024 09:31:52 -0500 Subject: [PATCH 001/270] add tokenizer_interface --- benchmarks/DASB/model/tokenizer_interface.py | 164 +++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 benchmarks/DASB/model/tokenizer_interface.py diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py new file mode 100644 index 000000000..892bef6b3 --- /dev/null +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -0,0 +1,164 @@ + +""" +Unified interface for tokenizers, standardizing the output shape of encode and decode functions. + +This class reshapes the outputs of various tokenizers to ensure consistency, simplifying integration with recipes and workflows. + +Authors +--------- +* Pooneh Mousavi, 2024 +""" + +import torch + +from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.discrete.dac import DAC +from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface + + +class Tokenizer_Encodec(Encodec): + @torch.no_grad() + def sig_to_toks(self, sig, lens,**kwargs): + # sig: [B, T] + self.eval() + toks, _ = self.encode(sig, lens) # [B, N, K] + return toks + + @torch.no_grad() + def toks_to_sig(self, toks,**kwargs): + # toks: [B, N, K] + self.eval() + sig = self.decode(toks)[:, 0] # [B, T] + return sig + +class Tokenizer_DAC(DAC): + @torch.no_grad() + def sig_to_toks(self, sig, lens,**kwargs): + # sig: [B, T] + self.eval() + toks, _ = self( + sig[:, None], n_quantizers=kwargs['num_codebooks'] + ) # [B, K, N] + toks = toks.movedim(-1, -2) # [B, N, K] + return toks + + @torch.no_grad() + def toks_to_sig(self, toks,**kwargs): + # toks: [B, N, K] + self.eval() + qfeats, _, _ = self.quantizer.from_codes( + toks.movedim(-1, -2) # [B, K, N] + ) + sig = self.decode(qfeats)[:, 0] # [B, T] + return sig + +class Tokenizer_SpeechTokenizer(SpeechTokenizer_interface): + @torch.no_grad() + def sig_to_toks(self, sig, lens,**kwargs): + # sig: [B, T] + self.eval() + toks = self(sig)[ + : kwargs['num_codebooks'] + ] # [K, B, N] + toks = toks.movedim(-3, -1) # [B, N, K] + return toks + + @torch.no_grad() + def toks_to_sig(self, toks,**kwargs): + # toks: [B, N, K] + self.eval() + toks = toks.movedim(-1, -3) # [K, B, N] + sig = self.decode(toks) # [B, T] + return sig + +class Tokenizer_DiscreteSSL(DiscreteSSL): + @torch.no_grad() + def sig_to_toks(self, sig, lens): + # sig: [B, T] + self.hparams.codec_quantizer.to(self.device).eval() + toks, _, _ = self.hparams.codec_quantizer( + sig, + lens, + SSL_layers=self.hparams.SSL_layers, + deduplicates=[False] * len(self.hparams.SSL_layers), + bpe_tokenizers=[None] * len(self.hparams.SSL_layers), + ) # [B, N, K] + return toks + + @torch.no_grad() + def toks_to_sig(self, toks): + # toks: [B, N, K] + self.hparams.codec_vocoder.device = self.device + self.hparams.codec_vocoder.to(self.device).eval() + + # Add offset for embedding layer + all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids + # TODO: remove after testing + assert tuple(all_layer_ids) == (1, 3, 7, 12, 18, 23) + offsets = torch.arange( + 0, + len(all_layer_ids) * self.hparams.vocab_size, + self.hparams.vocab_size, + device=self.device, + ) + offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] + offsets = offsets[offset_idxes] + toks = toks + offsets + 1 
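+        # Tokens from the k-th layer in all_layer_ids are shifted into their own
+        # vocab_size-wide slice of the vocoder's shared embedding table; the extra +1
+        # is assumed to reserve index 0 (e.g. for padding).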
+ + # Handle missing codebooks + if len(self.hparams.SSL_layers) < len(all_layer_ids): + full_toks = torch.zeros( + *toks.shape[:2], + len(all_layer_ids), + dtype=toks.dtype, + device=self.device, + ) + for i, idx in enumerate(offset_idxes): + full_toks[..., idx] = toks[..., i] + toks = full_toks + + self.hparams.codec_vocoder.tokenize = False + sig = self.hparams.codec_vocoder(toks)[:, 0] # [B, T] + return sig + +class Tokenizer: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + @torch.no_grad() + def encode(self,sig, lens,**kwargs): + toks = self.tokenizer.sig_to_toks(sig, lens,**kwargs) + return toks + + @torch.no_grad() + def decode(self,sig,**kwargs): + sig = self.tokenizer.toks_to_sig(sig,**kwargs) + return sig + + +# model_hub = "facebook/encodec_24khz" +# save_path = "savedir" +# model = Tokenizer_Encodec(model_hub, save_path) +# from speechbrain.lobes.models.huggingface_transformers.hubert import (HuBERT) +# inputs = torch.rand([3, 2000]) +# model_hub = "facebook/hubert-large-ll60k" +# save_path = "savedir" +# ssl_layer_num = [7,23] +# deduplicate =[False, True] +# bpe_tokenizers=[None, None] +# kmeans_repo_id = "speechbrain/SSL_Quantization" +# kmeans_dataset = "LJSpeech" +# num_clusters = 1000 +# ssl_model = HuBERT(model_hub, save_path,output_all_hiddens=True) +# model = DiscreteSSL(save_path, ssl_model, kmeans_repo_id=kmeans_repo_id, kmeans_dataset=kmeans_dataset,num_clusters=num_clusters) +model_hub = "fnlp/SpeechTokenizer" +save_path = "savedir" +model =Tokenizer_SpeechTokenizer(model_hub, save_path) # doctest: +SKIP +tokenizer= Tokenizer(model) +audio = torch.randn(4, 1000) +length = torch.tensor([1.0, .5, .75, 1.0]) +tokens = tokenizer.encode(audio, length,num_codebooks=2) +print(tokens.shape) +rec = tokenizer.decode(tokens) +print(rec.shape) \ No newline at end of file From 0c2b751c595c9a63a2bd66b14f32e2faa13478d8 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 6 Nov 2024 17:53:55 -0500 Subject: [PATCH 002/270] add reactored version of ASR --- .../ASR-refactor/hparams/LSTM/dac.yaml | 232 +++++++++ .../ASR-refactor/hparams/LSTM/encodec.yaml | 232 +++++++++ .../hparams/LSTM/speech_tokenizer.yaml | 222 +++++++++ .../ASR-refactor/hparams/contextnet/dac.yaml | 225 +++++++++ .../hparams/contextnet/encodec.yaml | 223 +++++++++ .../hparams/contextnet/speech_tokenizer.yaml | 213 +++++++++ .../ASR-refactor/librispeech_prepare.py | 1 + .../DASB/LibriSpeech/ASR-refactor/train.py | 447 ++++++++++++++++++ benchmarks/DASB/model/ __init__.py | 1 + benchmarks/DASB/model/custom_model.py | 17 +- benchmarks/DASB/model/tokenizer_interface.py | 231 ++++----- 11 files changed, 1933 insertions(+), 111 deletions(-) create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml create mode 120000 benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor/train.py create mode 100644 benchmarks/DASB/model/ __init__.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml 
b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml new file mode 100644 index 000000000..4accc2241 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
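+# For the DAC 24khz model used here the embedding dim is 1024 (see the embedding_dim
+# list above); the pretrained weights are loaded via tokenizer.get_pretrained_embeddings()
+# in train.py and copied into the discrete_embedding_layer.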
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml new file mode 100644 index 000000000..03c29ddbb --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
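+# Note: num_codebooks must be consistent with the chosen bandwidth
+# (1.5 kbps -> 2, 3.0 -> 4, 6.0 -> 8, 12.0 -> 16, 24.0 -> 32; see the table above).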
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml new file mode 100644 index 000000000..8105204a5 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -0,0 +1,222 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: LSTM Encoder +# Decoder: CTC 
beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
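+# SpeechTokenizer operates on 16 kHz audio, hence sample_rate: 16000 above; input
+# waveforms are resampled to this rate in the data pipeline defined in train.py.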
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml new file mode 100644 index 000000000..eabeef113 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml @@ -0,0 +1,225 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: DAC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
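+# The attention_mlp below pools the num_codebooks embedding vectors of every frame into
+# a single encoder_dim-dimensional feature before it is fed to the ContextNet encoder.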
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml new file mode 100644 index 000000000..c0411bd76 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml @@ -0,0 +1,223 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of 
yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/Contexnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + + +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml new file mode 100644 index 000000000..77ef2c540 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml @@ -0,0 +1,213 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: SpeechTokenizer +# Encoder: Contextnet Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: 
character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speechtokenizer/contextnet/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
+pretrain_embeddings: False +freeze_embedding: False + +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 640 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py new file mode 100644 index 000000000..61b6c56f4 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -0,0 +1,447 @@ +#!/usr/bin/env/python3 +"""Recipe for training an discrete tokens ctc ASR system with librispeech. 
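+To run this recipe, pass one of the hparams files, e.g.:
+> python train.py hparams/LSTM/encodec.yaml --data_folder /path/to/LibriSpeech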
+ +Decoding is performed with greedy decoding at validation time. +At test time, beamsearch is used with an optional external language model. + +Authors + * Pooneh Mousavi 2024 +""" + +import os +import sys +import torch +import torchaudio +import logging +import speechbrain as sb +from speechbrain.utils.distributed import run_on_main, if_main_process +from speechbrain.tokenizers.SentencePiece import SentencePiece +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + + +logger = logging.getLogger(__name__) + +_CACHE = {"size": 0} + +# Define training procedure +class ASR(sb.Brain): + def compute_forward(self, batch, stage): + """Forward computations from the waveform batches to the output probabilities.""" + batch = batch.to(self.device) + wavs, wav_lens = batch.sig + + + # Add waveform augmentation if specified. + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] + + current_epoch = self.hparams.epoch_counter.current + + # compute features + # Extract tokens (cache them at first epoch if augmentation is disabled) + key = tuple(sorted(batch.id)) + try: + in_toks = _CACHE[key] + in_toks = in_toks.to(self.device) + except KeyError: + with torch.no_grad(): + self.hparams.tokenizer.eval().to(self.device) + in_toks = self.hparams.tokenizer.sig_to_tokens(wavs, wav_lens,num_codebooks=hparams['num_codebooks']) #[B, T, N-Q] + if stage != sb.Stage.TRAIN or ( + stage == sb.Stage.TRAIN and (not hasattr(self.hparams, "wav_augment")) + ): + if _CACHE["size"] < self.hparams.cache_size: + _CACHE[key] = in_toks.cpu() + _CACHE["size"] += in_toks.numel() + + # Extract embeddings + in_embs = self.modules.discrete_embedding_layer(in_toks) #[B, T, N-Q, D] + + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) #[B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze(-2) #[B, T, D] + + # forward modules + if type(self.modules.encoder).__name__ == "ContextNet": + enc_out = self.modules.encoder(in_embs) + + elif type(self.modules.encoder).__name__ == "LSTM": + enc_out, _ = self.modules.encoder( + in_embs + ) + + else: + raise NotImplementedError + + # output layer for ctc log-probabilities + logits = self.modules.ctc_lin(enc_out) + p_ctc = self.hparams.log_softmax(logits) + + p_tokens = None + if stage == sb.Stage.VALID: + p_tokens = sb.decoders.ctc_greedy_decode( + p_ctc, wav_lens, blank_id=self.hparams.blank_index + ) + elif stage == sb.Stage.TEST: + p_tokens = test_searcher(p_ctc, wav_lens) + + return p_ctc, wav_lens, p_tokens + + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (CTC+NLL) given predictions and targets.""" + + p_ctc, wav_lens, predicted_tokens = predictions + ids = batch.id + tokens, tokens_lens = batch.tokens + + + # Label Augmentation + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) + elif stage == sb.Stage.TEST: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + target_words = [wrd.split(" 
") for wrd in batch.wrd] + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.wer_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of a epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + current_epoch = self.hparams.epoch_counter.current + valid_search_interval = self.hparams.valid_search_interval + if ( + current_epoch % valid_search_interval == 0 + or stage == sb.Stage.TEST + ): + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # log stats and save checkpoint at end-of-epoch + if stage == sb.Stage.VALID: + if type(self.hparams.scheduler).__name__ == "NewBobScheduler": + lr, new_lr = self.hparams.scheduler( + stage_stats["loss"] + ) + sb.nnet.schedulers.update_learning_rate( + self.optimizer, new_lr + ) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr = self.hparams.scheduler.current_lr + steps = self.optimizer_step + + else: + raise NotImplementedError + + optimizer = self.optimizer.__class__.__name__ + epoch_stats = { + "epoch": epoch, + "lr": lr, + "optimizer": optimizer, + } + self.hparams.train_logger.log_stats( + stats_meta=epoch_stats, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"], "epoch": epoch}, + min_keys=["WER"], + num_to_keep=self.hparams.avg_checkpoints, + ) + + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open(self.hparams.output_wer_folder, "w", encoding="utf-8") as w: + self.wer_metric.write_stats(w) + + def on_fit_batch_end(self, batch, outputs, loss, should_step): + if should_step and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + self.hparams.scheduler(self.optimizer) + + + +def dataio_prepare(hparams, tokenizer): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions.""" + data_folder = hparams["data_folder"] + + train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, + ) + + if hparams["sorting"] == "ascending": + # we sort training data to speed up training and get better results. + train_data = train_data.filtered_sorted(sort_key="duration") + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + train_data = train_data.filtered_sorted( + sort_key="duration", reverse=True + ) + # when sorting do not shuffle in dataloader ! 
otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, + ) + valid_data = valid_data.filtered_sorted(sort_key="duration") + + # test is separate + test_datasets = {} + for csv_file in hparams["test_csv"]: + name = Path(csv_file).stem + test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_file, replacements={"data_root": data_folder} + ) + test_datasets[name] = test_datasets[name].filtered_sorted( + sort_key="duration" + ) + + datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] + + # 2. Define audio pipeline: + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + info = torchaudio.info(wav) + resampled = torchaudio.transforms.Resample( + info.sample_rate, hparams["sample_rate"], + )(sig) + #resampled = resampled.unsqueeze(0) + return resampled + + sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) + + # 3. Define text pipeline: + @sb.utils.data_pipeline.takes("wrd") + @sb.utils.data_pipeline.provides( + "wrd", "char_list", "tokens_list", "tokens" + ) + def text_pipeline(wrd): + yield wrd + char_list = list(wrd) + yield char_list + tokens_list = tokenizer.sp.encode_as_ids(wrd) + yield tokens_list + tokens = torch.LongTensor(tokens_list) + yield tokens + + sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) + + + # 4. Set output: + sb.dataio.dataset.set_output_keys( + datasets, ["id", "sig", "wrd", "char_list", "tokens"], + ) + + # 5. If Dynamic Batching is used, we instantiate the needed samplers. 
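+    # Batches are built by total audio duration (max_batch_length) rather than by a fixed
+    # number of examples, using the "duration" field of each item as its length.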
+ train_batch_sampler = None + valid_batch_sampler = None + if hparams["dynamic_batching"]: + from speechbrain.dataio.sampler import DynamicBatchSampler # noqa + + dynamic_hparams_train = hparams["dynamic_batch_sampler_train"] + dynamic_hparams_val = hparams["dynamic_batch_sampler_val"] + + train_batch_sampler = DynamicBatchSampler( + train_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_train, + ) + + valid_batch_sampler = DynamicBatchSampler( + valid_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_val, + ) + + return ( + train_data, + valid_data, + test_datasets, + train_batch_sampler, + valid_batch_sampler, + ) + + +if __name__ == "__main__": + + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # If distributed_launch=True then + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["output_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + # Defining tokenizer and loading it + tokenizer = SentencePiece( + model_dir=hparams["save_folder"], + vocab_size=hparams["output_neurons"], + annotation_train=hparams["train_csv"], + annotation_read="wrd", + model_type=hparams["token_type"], + character_coverage=hparams["character_coverage"], + bos_id=hparams["bos_index"], + eos_id=hparams["eos_index"], + ) + + # here we create the datasets objects as well as tokenization and encoding + ( + train_data, + valid_data, + test_datasets, + train_bsampler, + valid_bsampler, + ) = dataio_prepare(hparams, tokenizer) + + # Use pretrained embeddings + if hparams["pretrain_embeddings"]: + embs= hparams["tokenizer"].get_pretrained_embeddings(device=run_opts["device"],num_codebooks=hparams['num_codebooks'], vocab_size=hparams["vocab_size"]) + hparams["discrete_embedding_layer"].init_embedding(embs) + + + # Log number of parameters/buffers + codec_params = sum([x.numel() for x in hparams["tokenizer"].state_dict().values()]) + model_params = sum( + [ + x.numel() + for module in hparams["modules"].values() + for x in module.state_dict().values() + ] + ) + hparams["train_logger"].log_stats( + stats_meta={ + f"Codec parameters/buffers (M)": f"{codec_params / 1e6:.2f}", + "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}", + }, + ) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["model_opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # Adding objects to trainer. 
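+    # The SentencePiece tokenizer is attached for decoding, and its vocabulary is used
+    # below to build the CTC beam searcher employed at test time.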
+ asr_brain.tokenizer = tokenizer + vocab_list = [ + tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) + ] + + from speechbrain.decoders.ctc import CTCBeamSearcher + + test_searcher = CTCBeamSearcher( + **hparams["test_beam_search"], + vocab_list=vocab_list, + ) + + train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + + if train_bsampler is not None: + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if valid_bsampler is not None: + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py new file mode 100644 index 000000000..e7db8766a --- /dev/null +++ b/benchmarks/DASB/model/ __init__.py @@ -0,0 +1 @@ +from model.tokenizer_interface import EncodecTokenizer \ No newline at end of file diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index b6e11a0d2..d3bf3cc9f 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -57,9 +57,9 @@ def __init__( num_codebooks, vocab_size, emb_dim, - pad_index=0, init=False, freeze=False, + hidden_dim =None, ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size @@ -70,10 +70,17 @@ def __init__( ).requires_grad_(not self.freeze) self.init = init - def init_embedding(self, weights): - with torch.no_grad(): - self.embedding.weight = torch.nn.Parameter(weights) + # Add a linear layer to match dimensions if necessary + if hidden_dim is not None and hidden_dim != emb_dim: + self.proj_layer = torch.nn.Linear(emb_dim, hidden_dim) + else: + self.proj_layer = None + + def init_embedding(self, weights): + self.embedding.weight.data.copy_(weights) + + def forward(self, in_tokens): """Computes the embedding for discrete tokens. a sample. 
@@ -97,4 +104,6 @@ def forward(self, in_tokens): ) # Forward Pass to embedding and in_embs = self.embedding(in_tokens) + if self.proj_layer is not None: + in_embs = self.proj_layer(in_embs) return in_embs diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 892bef6b3..351652a57 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -10,92 +10,152 @@ """ import torch +from abc import ABC, abstractmethod +from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.discrete.dac import DAC +from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface -from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec -from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL -from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface +class BaseTokenizer(ABC): + @abstractmethod + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, **kwargs): + """Abstract method to encode a signal into tokens.""" + pass + + @abstractmethod + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + """Abstract method to decode tokens into a signal.""" + pass + + @abstractmethod + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + pass -class Tokenizer_Encodec(Encodec): +class EncodecTokenizer(Encodec, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks, _ = self.encode(sig, lens) # [B, N, K] - return toks + tokens, _ = self.encode(signal, lengths) # [B, T, N_Q] + return tokens @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - sig = self.decode(toks)[:, 0] # [B, T] - return sig - -class Tokenizer_DAC(DAC): + signal = self.decode(tokens)[:, 0] # [B, T] + return signal + @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + embeddings = self.vocabulary + return embeddings.reshape(-1, embeddings.shape[-1]) + +class DACTokenizer(DAC, BaseTokenizer): + @torch.no_grad() + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks, _ = self( - sig[:, None], n_quantizers=kwargs['num_codebooks'] - ) # [B, K, N] - toks = toks.movedim(-1, -2) # [B, N, K] - return toks + tokens, _ = self( + signal[:, None], n_quantizers=kwargs['num_codebooks'] + ) # [B, N_Q, T] + return tokens.movedim(-1, -2) # [B, T, N_Q] @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - qfeats, _, _ = self.quantizer.from_codes( - toks.movedim(-1, -2) # [B, K, N] + quantized_feats, _, _ = self.quantizer.from_codes( + tokens.movedim(-1, -2) # [B, N_Q, T] ) - sig = self.decode(qfeats)[:, 0] # [B, T] - return sig - -class Tokenizer_SpeechTokenizer(SpeechTokenizer_interface): + signal = self.decode(quantized_feats)[:, 0] # [B, T] + return signal + + @torch.no_grad() + def 
get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + # See https://github.com/descriptinc/descript-audio-codec/blob/c7cfc5d2647e26471dc394f95846a0830e7bec34/dac/nn/quantize.py#L200 + toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) + toks = ( + toks[:, None, None].expand(-1, kwargs["num_codebooks"], -1).clone() + ) # [C, K, 1] + self.to(kwargs["device"]).eval() + with torch.no_grad(): + z_q, z_p, _ = self.quantizer.from_codes(toks) + z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) # [C, D, 1] * K + z_qs = [] + for i, z_p_i in enumerate(z_ps): + with torch.no_grad(): + z_q_i = ( + self.quantizer.quantizers[i].out_proj(z_p_i) + ) # [C, H, 1] + z_qs.append(z_q_i) + assert (z_q == sum(z_qs)).all() + embeddings = torch.cat(z_qs)[:, :, 0] # [CK, H] + return embeddings + +class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens,**kwargs): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths, **kwargs): + # signal: [B, T] self.eval() - toks = self(sig)[ - : kwargs['num_codebooks'] - ] # [K, B, N] - toks = toks.movedim(-3, -1) # [B, N, K] - return toks + tokens = self(signal)[: kwargs['num_codebooks']] # [N_Q, B, T] + return tokens.movedim(-3, -1) # [B, T, N_Q] @torch.no_grad() - def toks_to_sig(self, toks,**kwargs): - # toks: [B, N, K] + def tokens_to_sig(self, tokens, **kwargs): + # tokens: [B, T, N_Q] self.eval() - toks = toks.movedim(-1, -3) # [K, B, N] - sig = self.decode(toks) # [B, T] - return sig - -class Tokenizer_DiscreteSSL(DiscreteSSL): + tokens = tokens.movedim(-1, -3) # [N_Q, B, T] + return self.decode(tokens) # [B, T] + + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + """Return pretrained codebook embedding.""" + # See https://github.com/ZhangXInFD/SpeechTokenizer/blob/a9f88dc72642b600654a62861e34342babae6c71/speechtokenizer/quantization/core_vq.py#L360 + toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) + toks = ( + toks[None, :, None].expand(kwargs["num_codebooks"], -1, -1).clone() + ) # [K, C, 1] + self.to(kwargs["device"]).eval() + embs = [] + for i, indices in enumerate(toks): + layer = self.model.quantizer.vq.layers[i] + with torch.no_grad(): + quantized = layer.decode(indices) # [C, H, 1] + embs.append(quantized) + assert ( + self.model.quantizer.decode(toks) == sum(embs) + ).all() + embeddings = torch.cat(embs)[:, :, 0] # [CK, H] + return embeddings + +class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): @torch.no_grad() - def sig_to_toks(self, sig, lens): - # sig: [B, T] + def sig_to_tokens(self, signal, lengths): + # signal: [B, T] self.hparams.codec_quantizer.to(self.device).eval() - toks, _, _ = self.hparams.codec_quantizer( - sig, - lens, + tokens, _, _ = self.hparams.codec_quantizer( + signal, + lengths, SSL_layers=self.hparams.SSL_layers, deduplicates=[False] * len(self.hparams.SSL_layers), bpe_tokenizers=[None] * len(self.hparams.SSL_layers), - ) # [B, N, K] - return toks + ) # [B, T, N_Q] + return tokens @torch.no_grad() - def toks_to_sig(self, toks): - # toks: [B, N, K] - self.hparams.codec_vocoder.device = self.device + def tokens_to_sig(self, tokens): + # tokens: [B, T, N_Q] self.hparams.codec_vocoder.to(self.device).eval() - # Add offset for embedding layer all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids - # TODO: remove after testing - assert tuple(all_layer_ids) == (1, 3, 7, 12, 18, 23) offsets = torch.arange( 0, len(all_layer_ids) * self.hparams.vocab_size, @@ -104,61 +164,18 @@ def 
toks_to_sig(self, toks): ) offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] offsets = offsets[offset_idxes] - toks = toks + offsets + 1 + tokens += offsets + 1 - # Handle missing codebooks if len(self.hparams.SSL_layers) < len(all_layer_ids): - full_toks = torch.zeros( - *toks.shape[:2], + full_tokens = torch.zeros( + *tokens.shape[:2], len(all_layer_ids), - dtype=toks.dtype, + dtype=tokens.dtype, device=self.device, ) for i, idx in enumerate(offset_idxes): - full_toks[..., idx] = toks[..., i] - toks = full_toks + full_tokens[..., idx] = tokens[..., i] + tokens = full_tokens self.hparams.codec_vocoder.tokenize = False - sig = self.hparams.codec_vocoder(toks)[:, 0] # [B, T] - return sig - -class Tokenizer: - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - @torch.no_grad() - def encode(self,sig, lens,**kwargs): - toks = self.tokenizer.sig_to_toks(sig, lens,**kwargs) - return toks - - @torch.no_grad() - def decode(self,sig,**kwargs): - sig = self.tokenizer.toks_to_sig(sig,**kwargs) - return sig - - -# model_hub = "facebook/encodec_24khz" -# save_path = "savedir" -# model = Tokenizer_Encodec(model_hub, save_path) -# from speechbrain.lobes.models.huggingface_transformers.hubert import (HuBERT) -# inputs = torch.rand([3, 2000]) -# model_hub = "facebook/hubert-large-ll60k" -# save_path = "savedir" -# ssl_layer_num = [7,23] -# deduplicate =[False, True] -# bpe_tokenizers=[None, None] -# kmeans_repo_id = "speechbrain/SSL_Quantization" -# kmeans_dataset = "LJSpeech" -# num_clusters = 1000 -# ssl_model = HuBERT(model_hub, save_path,output_all_hiddens=True) -# model = DiscreteSSL(save_path, ssl_model, kmeans_repo_id=kmeans_repo_id, kmeans_dataset=kmeans_dataset,num_clusters=num_clusters) -model_hub = "fnlp/SpeechTokenizer" -save_path = "savedir" -model =Tokenizer_SpeechTokenizer(model_hub, save_path) # doctest: +SKIP -tokenizer= Tokenizer(model) -audio = torch.randn(4, 1000) -length = torch.tensor([1.0, .5, .75, 1.0]) -tokens = tokenizer.encode(audio, length,num_codebooks=2) -print(tokens.shape) -rec = tokenizer.decode(tokens) -print(rec.shape) \ No newline at end of file + return self.hparams.codec_vocoder(tokens)[:, 0] # [B, T] From 17898c3472ec45ae2173b5894f7c7e550918d9d4 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:40:14 -0500 Subject: [PATCH 003/270] fix precommit --- .../ASR-refactor/hparams/LSTM/dac.yaml | 4 +- .../ASR-refactor/hparams/LSTM/encodec.yaml | 4 +- .../hparams/LSTM/speech_tokenizer.yaml | 2 +- .../ASR-refactor/hparams/contextnet/dac.yaml | 4 +- .../hparams/contextnet/encodec.yaml | 5 +- .../hparams/contextnet/speech_tokenizer.yaml | 4 +- .../DASB/LibriSpeech/ASR-refactor/train.py | 78 +++++++++-------- benchmarks/DASB/model/ __init__.py | 2 +- benchmarks/DASB/model/custom_model.py | 6 +- benchmarks/DASB/model/tokenizer_interface.py | 84 +++++++------------ 10 files changed, 84 insertions(+), 109 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml index 4accc2241..806305774 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml @@ -182,7 +182,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -193,7 +193,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True 
-scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml index 03c29ddbb..18d967244 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml @@ -182,7 +182,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -193,7 +193,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml index 8105204a5..55d7c3c91 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -172,7 +172,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml index eabeef113..aa7d2e141 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml @@ -175,7 +175,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -186,7 +186,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml index c0411bd76..a1b5262d3 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml @@ -107,7 +107,6 @@ encoder_dim: 1024 pretrain_embeddings: False freeze_embedding: False - output_neurons: 31 # BPE parameters @@ -173,7 +172,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -184,7 +183,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml 
b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml index 77ef2c540..c12d6f79f 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml @@ -163,7 +163,7 @@ test_beam_search: beam_size: !ref beam_prune_logp: !ref token_prune_min_logp: !ref - prune_history: !ref + prune_history: !ref alpha: 0.8 beta: 1.2 @@ -174,7 +174,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index 61b6c56f4..baa80c80e 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -18,6 +18,7 @@ from speechbrain.tokenizers.SentencePiece import SentencePiece from hyperpyyaml import load_hyperpyyaml from pathlib import Path + base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) sys.path.append(base_dir) @@ -32,11 +33,10 @@ def compute_forward(self, batch, stage): """Forward computations from the waveform batches to the output probabilities.""" batch = batch.to(self.device) wavs, wav_lens = batch.sig - # Add waveform augmentation if specified. if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): - wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] + wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] current_epoch = self.hparams.epoch_counter.current @@ -49,33 +49,38 @@ def compute_forward(self, batch, stage): except KeyError: with torch.no_grad(): self.hparams.tokenizer.eval().to(self.device) - in_toks = self.hparams.tokenizer.sig_to_tokens(wavs, wav_lens,num_codebooks=hparams['num_codebooks']) #[B, T, N-Q] + in_toks = self.hparams.tokenizer.sig_to_tokens( + wavs, wav_lens, num_codebooks=hparams["num_codebooks"] + ) # [B, T, N-Q] if stage != sb.Stage.TRAIN or ( - stage == sb.Stage.TRAIN and (not hasattr(self.hparams, "wav_augment")) + stage == sb.Stage.TRAIN + and (not hasattr(self.hparams, "wav_augment")) ): if _CACHE["size"] < self.hparams.cache_size: _CACHE[key] = in_toks.cpu() _CACHE["size"] += in_toks.numel() # Extract embeddings - in_embs = self.modules.discrete_embedding_layer(in_toks) #[B, T, N-Q, D] + in_embs = self.modules.discrete_embedding_layer( + in_toks + ) # [B, T, N-Q, D] - # Attention-Pooling - att_w = self.modules.attention_mlp(in_embs) #[B, T, N-Q, 1] - in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze(-2) #[B, T, D] + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) # [B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze( + -2 + ) # [B, T, D] # forward modules if type(self.modules.encoder).__name__ == "ContextNet": enc_out = self.modules.encoder(in_embs) elif type(self.modules.encoder).__name__ == "LSTM": - enc_out, _ = self.modules.encoder( - in_embs - ) + enc_out, _ = self.modules.encoder(in_embs) else: raise NotImplementedError - + # output layer for ctc log-probabilities logits = self.modules.ctc_lin(enc_out) p_ctc = self.hparams.log_softmax(logits) @@ -89,7 +94,6 @@ def compute_forward(self, batch, stage): p_tokens = test_searcher(p_ctc, wav_lens) return p_ctc, wav_lens, 
p_tokens - def compute_objectives(self, predictions, batch, stage): """Computes the loss (CTC+NLL) given predictions and targets.""" @@ -98,14 +102,13 @@ def compute_objectives(self, predictions, batch, stage): ids = batch.id tokens, tokens_lens = batch.tokens - # Label Augmentation if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): tokens = self.hparams.wav_augment.replicate_labels(tokens) tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) - + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - + if stage == sb.Stage.VALID: # Decode token terms to words predicted_words = self.tokenizer( @@ -149,19 +152,15 @@ def on_stage_end(self, stage, stage_loss, epoch): # log stats and save checkpoint at end-of-epoch if stage == sb.Stage.VALID: if type(self.hparams.scheduler).__name__ == "NewBobScheduler": - lr, new_lr = self.hparams.scheduler( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.optimizer, new_lr - ) - elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr, new_lr = self.hparams.scheduler(stage_stats["loss"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": lr = self.hparams.scheduler.current_lr steps = self.optimizer_step - + else: raise NotImplementedError - + optimizer = self.optimizer.__class__.__name__ epoch_stats = { "epoch": epoch, @@ -185,15 +184,19 @@ def on_stage_end(self, stage, stage_loss, epoch): test_stats=stage_stats, ) if if_main_process(): - with open(self.hparams.output_wer_folder, "w", encoding="utf-8") as w: + with open( + self.hparams.output_wer_folder, "w", encoding="utf-8" + ) as w: self.wer_metric.write_stats(w) def on_fit_batch_end(self, batch, outputs, loss, should_step): - if should_step and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + if ( + should_step + and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler" + ): self.hparams.scheduler(self.optimizer) - def dataio_prepare(hparams, tokenizer): """This function prepares the datasets to be used in the brain class. It also defines the data processing pipeline through user-defined functions.""" @@ -251,7 +254,7 @@ def audio_pipeline(wav): resampled = torchaudio.transforms.Resample( info.sample_rate, hparams["sample_rate"], )(sig) - #resampled = resampled.unsqueeze(0) + # resampled = resampled.unsqueeze(0) return resampled sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) @@ -272,7 +275,6 @@ def text_pipeline(wrd): sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - # 4. 
Set output: sb.dataio.dataset.set_output_keys( datasets, ["id", "sig", "wrd", "char_list", "tokens"], @@ -319,7 +321,6 @@ def text_pipeline(wrd): # create ddp_group with the right communication protocol sb.utils.distributed.ddp_init_group(run_opts) - # Create experiment directory sb.create_experiment_directory( experiment_directory=hparams["output_folder"], @@ -327,7 +328,6 @@ def text_pipeline(wrd): overrides=overrides, ) - # Dataset prep (parsing Librispeech) from librispeech_prepare import prepare_librispeech # noqa @@ -369,12 +369,17 @@ def text_pipeline(wrd): # Use pretrained embeddings if hparams["pretrain_embeddings"]: - embs= hparams["tokenizer"].get_pretrained_embeddings(device=run_opts["device"],num_codebooks=hparams['num_codebooks'], vocab_size=hparams["vocab_size"]) + embs = hparams["tokenizer"].get_pretrained_embeddings( + device=run_opts["device"], + num_codebooks=hparams["num_codebooks"], + vocab_size=hparams["vocab_size"], + ) hparams["discrete_embedding_layer"].init_embedding(embs) - # Log number of parameters/buffers - codec_params = sum([x.numel() for x in hparams["tokenizer"].state_dict().values()]) + codec_params = sum( + [x.numel() for x in hparams["tokenizer"].state_dict().values()] + ) model_params = sum( [ x.numel() @@ -407,8 +412,7 @@ def text_pipeline(wrd): from speechbrain.decoders.ctc import CTCBeamSearcher test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], - vocab_list=vocab_list, + **hparams["test_beam_search"], vocab_list=vocab_list, ) train_dataloader_opts = hparams["train_dataloader_opts"] diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py index e7db8766a..b59bcdfa5 100644 --- a/benchmarks/DASB/model/ __init__.py +++ b/benchmarks/DASB/model/ __init__.py @@ -1 +1 @@ -from model.tokenizer_interface import EncodecTokenizer \ No newline at end of file +from model.tokenizer_interface import EncodecTokenizer diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index d3bf3cc9f..1c655fc65 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -59,7 +59,7 @@ def __init__( emb_dim, init=False, freeze=False, - hidden_dim =None, + hidden_dim=None, ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size @@ -76,11 +76,9 @@ def __init__( else: self.proj_layer = None - def init_embedding(self, weights): self.embedding.weight.data.copy_(weights) - - + def forward(self, in_tokens): """Computes the embedding for discrete tokens. a sample. diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 351652a57..604e3a403 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -1,4 +1,3 @@ - """ Unified interface for tokenizers, standardizing the output shape of encode and decode functions. 
@@ -12,9 +11,13 @@ import torch from abc import ABC, abstractmethod from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec -from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import DiscreteSSL +from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( + DiscreteSSL, +) from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import SpeechTokenizer_interface +from speechbrain.lobes.models.discrete.speechtokenizer_interface import ( + SpeechTokenizer_interface, +) class BaseTokenizer(ABC): @@ -29,13 +32,14 @@ def sig_to_tokens(self, signal, lengths, **kwargs): def tokens_to_sig(self, tokens, **kwargs): """Abstract method to decode tokens into a signal.""" pass - + @abstractmethod @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" pass + class EncodecTokenizer(Encodec, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): @@ -50,20 +54,21 @@ def tokens_to_sig(self, tokens, **kwargs): self.eval() signal = self.decode(tokens)[:, 0] # [B, T] return signal - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" embeddings = self.vocabulary return embeddings.reshape(-1, embeddings.shape[-1]) + class DACTokenizer(DAC, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): # signal: [B, T] self.eval() tokens, _ = self( - signal[:, None], n_quantizers=kwargs['num_codebooks'] + signal[:, None], n_quantizers=kwargs["num_codebooks"] ) # [B, N_Q, T] return tokens.movedim(-1, -2) # [B, T, N_Q] @@ -76,7 +81,7 @@ def tokens_to_sig(self, tokens, **kwargs): ) signal = self.decode(quantized_feats)[:, 0] # [B, T] return signal - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" @@ -88,24 +93,25 @@ def get_pretrained_embeddings(self, **kwargs): self.to(kwargs["device"]).eval() with torch.no_grad(): z_q, z_p, _ = self.quantizer.from_codes(toks) - z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) # [C, D, 1] * K + z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) z_qs = [] for i, z_p_i in enumerate(z_ps): with torch.no_grad(): - z_q_i = ( - self.quantizer.quantizers[i].out_proj(z_p_i) + z_q_i = self.quantizer.quantizers[i].out_proj( + z_p_i ) # [C, H, 1] z_qs.append(z_q_i) assert (z_q == sum(z_qs)).all() - embeddings = torch.cat(z_qs)[:, :, 0] # [CK, H] + embeddings = torch.cat(z_qs)[:, :, 0] return embeddings + class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths, **kwargs): # signal: [B, T] self.eval() - tokens = self(signal)[: kwargs['num_codebooks']] # [N_Q, B, T] + tokens = self(signal)[: kwargs["num_codebooks"]] # [N_Q, B, T] return tokens.movedim(-3, -1) # [B, T, N_Q] @torch.no_grad() @@ -114,7 +120,7 @@ def tokens_to_sig(self, tokens, **kwargs): self.eval() tokens = tokens.movedim(-1, -3) # [N_Q, B, T] return self.decode(tokens) # [B, T] - + @torch.no_grad() def get_pretrained_embeddings(self, **kwargs): """Return pretrained codebook embedding.""" @@ -128,54 +134,22 @@ def get_pretrained_embeddings(self, **kwargs): for i, indices in enumerate(toks): layer = self.model.quantizer.vq.layers[i] with torch.no_grad(): - quantized = layer.decode(indices) # [C, H, 1] + quantized = layer.decode(indices) embs.append(quantized) - assert ( - self.model.quantizer.decode(toks) == sum(embs) - 
).all() - embeddings = torch.cat(embs)[:, :, 0] # [CK, H] + assert (self.model.quantizer.decode(toks) == sum(embs)).all() + embeddings = torch.cat(embs)[:, :, 0] return embeddings + class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): @torch.no_grad() def sig_to_tokens(self, signal, lengths): - # signal: [B, T] - self.hparams.codec_quantizer.to(self.device).eval() - tokens, _, _ = self.hparams.codec_quantizer( - signal, - lengths, - SSL_layers=self.hparams.SSL_layers, - deduplicates=[False] * len(self.hparams.SSL_layers), - bpe_tokenizers=[None] * len(self.hparams.SSL_layers), - ) # [B, T, N_Q] - return tokens + pass @torch.no_grad() def tokens_to_sig(self, tokens): - # tokens: [B, T, N_Q] - self.hparams.codec_vocoder.to(self.device).eval() - - all_layer_ids = self.hparams.codec_quantizer.ssl_layer_ids - offsets = torch.arange( - 0, - len(all_layer_ids) * self.hparams.vocab_size, - self.hparams.vocab_size, - device=self.device, - ) - offset_idxes = [all_layer_ids.index(x) for x in self.hparams.SSL_layers] - offsets = offsets[offset_idxes] - tokens += offsets + 1 - - if len(self.hparams.SSL_layers) < len(all_layer_ids): - full_tokens = torch.zeros( - *tokens.shape[:2], - len(all_layer_ids), - dtype=tokens.dtype, - device=self.device, - ) - for i, idx in enumerate(offset_idxes): - full_tokens[..., idx] = tokens[..., i] - tokens = full_tokens - - self.hparams.codec_vocoder.tokenize = False - return self.hparams.codec_vocoder(tokens)[:, 0] # [B, T] + pass + + @torch.no_grad() + def get_pretrained_embeddings(self, **kwargs): + pass From db1590ee346dab0896723cf8184ba8b1e12355b8 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:54:09 -0500 Subject: [PATCH 004/270] fix flake --- benchmarks/DASB/LibriSpeech/ASR-refactor/train.py | 5 +---- benchmarks/DASB/model/ __init__.py | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py index baa80c80e..99eeb81fe 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py @@ -27,6 +27,7 @@ _CACHE = {"size": 0} + # Define training procedure class ASR(sb.Brain): def compute_forward(self, batch, stage): @@ -38,8 +39,6 @@ def compute_forward(self, batch, stage): if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): wavs, wav_lens = self.hparams.wav_augment(wavs, wav_lens) # [B, T] - current_epoch = self.hparams.epoch_counter.current - # compute features # Extract tokens (cache them at first epoch if augmentation is disabled) key = tuple(sorted(batch.id)) @@ -156,8 +155,6 @@ def on_stage_end(self, stage, stage_loss, epoch): sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": lr = self.hparams.scheduler.current_lr - steps = self.optimizer_step - else: raise NotImplementedError diff --git a/benchmarks/DASB/model/ __init__.py b/benchmarks/DASB/model/ __init__.py index b59bcdfa5..e69de29bb 100644 --- a/benchmarks/DASB/model/ __init__.py +++ b/benchmarks/DASB/model/ __init__.py @@ -1 +0,0 @@ -from model.tokenizer_interface import EncodecTokenizer From 3361ac6e9c21e94d2957d76347c7c19bfeab88ad Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 8 Nov 2024 09:56:08 -0500 Subject: [PATCH 005/270] fix blank index --- .../LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml index 55d7c3c91..99d423b87 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml @@ -183,7 +183,7 @@ ctc_cost: !name:speechbrain.nnet.losses.ctc_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True -scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler initial_value: !ref improvement_threshold: 0.0025 annealing_factor: 0.8 From 2678a24761d92ba71745574a006b76c10113b828 Mon Sep 17 00:00:00 2001 From: Chaanks Date: Sat, 9 Nov 2024 14:36:05 +0100 Subject: [PATCH 006/270] add tokens extraction / loading --- benchmarks/DASB/utils/tokens.py | 249 ++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 benchmarks/DASB/utils/tokens.py diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py new file mode 100644 index 000000000..b334106f1 --- /dev/null +++ b/benchmarks/DASB/utils/tokens.py @@ -0,0 +1,249 @@ +import math +import logging +import pathlib as pl +import kaldiio +import torch +import numpy as np +from tqdm.auto import tqdm +import speechbrain as sb +from speechbrain.dataio.dataloader import make_dataloader +from speechbrain.dataio.dataset import DynamicItemDataset + + +logger = logging.getLogger(__name__) + + +def get_device(use_cuda): + logger.info("=" * 30) + logger.info(f"USE_CUDA SET TO: {use_cuda}") + logger.info(f"CUDA AVAILABLE?: {torch.cuda.is_available()}") + logger.info("=" * 30) + use_cuda = use_cuda and torch.cuda.is_available() + return torch.device("cuda" if use_cuda else "cpu") + + +class TokensExtractor: + """ + Extracts tokens from audio data using a tokenizer and saves them to a specified format. + + Arguments + --------- + tokenizer : torch.nn.Module + The tokenizer model to use for token extraction. + save_path : str + The directory where the tokens will be saved. + src_key : str, optional + The key in the dataset that contains the audio data (default: "wav"). + id_key : str, optional + The key in the dataset that contains unique identifiers (default: "id"). + save_format : str, optional + The format to save the tokens ('numpy', 'pickle', 'soundfile_flac') (default: "numpy"). + use_cuda : bool, optional + Whether to use CUDA for computation (default: True). + dataloader_opts : dict, optional + Options for the data loader (default: None). + pipelines : list, optional + List of data processing pipelines to apply (default: None). + save_name : str, optional + Base name for the saved token files (default: "tokens"). + """ + + def __init__( + self, + tokenizer, + save_path, + src_key="wav", + id_key="id", + save_format="numpy", + use_cuda=True, + dataloader_opts=None, + pipelines=None, + save_name="tokens", + ): + """ + Initializes the TokensExtractor. + + Arguments + --------- + tokenizer : torch.nn.Module + The tokenizer model to use for token extraction. + save_path : str + The directory where the tokens will be saved. + src_key : str, optional + The key in the dataset that contains the audio data (default: "wav"). + id_key : str, optional + The key in the dataset that contains unique identifiers (default: "id"). + save_format : str, optional + The format to save the tokens ('numpy', 'pickle', 'soundfile_flac') (default: "numpy"). 
+ use_cuda : bool, optional + Whether to use CUDA for computation (default: True). + dataloader_opts : dict, optional + Options for the data loader (default: None). + pipelines : list, optional + List of data processing pipelines to apply (default: None). + save_name : str, optional + Base name for the saved token files (default: "tokens"). + + Raises + ------ + ValueError + If an unsupported save_format is provided. + """ + self.save_path = pl.Path(save_path).absolute() + self.save_path.mkdir(parents=True, exist_ok=True) + self.save_name = save_name + + self.id_key = id_key + self.src_key = src_key + + self.device = get_device(use_cuda) + self.tokenizer = tokenizer.to(self.device) + + if save_format not in ["numpy", "pickle", "soundfile_flac"]: + raise ValueError(f"Unsupported save_format: {save_format}") + self.save_format = save_format + + if not dataloader_opts: + dataloader_opts = {} + self.dataloader_opts = dataloader_opts + self.pipelines = pipelines if pipelines is not None else [] + + self.wspecifier = f"ark,scp,t:{self.save_path}/{self.save_name}.ark,{self.save_path}/{self.save_name}.scp" + self.writer = kaldiio.WriteHelper( + self.wspecifier, write_function="numpy" + ) + + def extract(self, dataset): + """ + Extracts tokens from the dataset and saves them to the specified format. + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset or dict + The dataset from which to extract tokens. Can be a DynamicItemDataset or a dictionary. + """ + if isinstance(dataset, dict): + dataset = DynamicItemDataset(dataset) + dataset.set_output_keys([self.src_key, self.id_key]) + for pipeline in self.pipelines: + dataset.add_dynamic_item(pipeline) + + dataloader = make_dataloader(dataset, **self.dataloader_opts) + batch_size = self.dataloader_opts.get("batch_size", 1) + batch_count = int(math.ceil(len(dataset) / batch_size)) + for batch in tqdm(dataloader, total=batch_count): + batch = batch.to(self.device) + x, x_lengths = batch[self.src_key] + ids = batch[self.id_key] + batch_tokens = self.tokenizer.sig_to_tokens(x, x_lengths) + batch_tokens = sb.utils.data_utils.undo_padding( + batch_tokens, x_lengths + ) + self.process_batch(batch_tokens, ids) + + def process_batch(self, batch, ids): + """ + Processes a batch of tokens and writes them to the output files. + + Arguments + --------- + batch : list + A list of tokens for each item in the batch. + ids : list + A list of unique identifiers corresponding to each item in the batch. + """ + for tokens, utt_id in zip(batch, ids): + tokens = np.array(tokens) + self.writer(utt_id, tokens) + + def __del__(self): + """ + Close the writer. + """ + self.writer.close() + + +class TokensLoader: + """ + A loader class for retrieving tokens corresponding to utterance IDs. + + Arguments + --------- + data_path: str + The path to the data directory containing the token files. + save_name: str, optional + The base name of the tokens files (default: "tokens"). + """ + + def __init__( + self, + data_path, + save_name="tokens", + ): + """ + Initializes the TokensLoader. + + Arguments + --------- + data_path: str + The path to the data directory containing the token files. + save_name: str, optional + The base name of the tokens files (default: "tokens"). 
+ """ + self.data_path = pl.Path(data_path) + if not self.data_path.exists(): + raise ValueError( + f"Data folder not found: {self.data_path.as_posix()}" + ) + self.tokens = self._load(data_path, save_name) + + def tokens_by_uttid(self, utt_id): + """ + Retrieves the tokens corresponding to a given utterance ID. + + Arguments + --------- + utt_id: str + The utterance ID to retrieve tokens for. + + Returns + ------- + result: torch.LongTensor [T, N_Q] + The tokens associated with the utterance ID. + + Raises + ------ + KeyError + If the utterance ID is not found in the tokens. + """ + if utt_id not in self.tokens: + raise KeyError(f"Utterance ID '{utt_id}' not found in tokens.") + tokens_path = self.tokens[utt_id] + tokens = kaldiio.load_mat(tokens_path) + tokens = torch.from_numpy(tokens).long() + return tokens + + def _load(self, data_path, save_name): + """ + Loads the mapping from utterance IDs to token file paths. + + Arguments + --------- + data_path: str + The path to the data directory containing the token files. + save_name: str + The base name of the tokens files. + + Returns + ------- + utt2toks: dict + A dictionary mapping utterance IDs to their corresponding token file paths. + """ + scp_path = f"{data_path}/{save_name}.scp" + with open(scp_path, "r") as f: + utt2toks = { + line.strip().split(None, 1)[0]: line.strip().split(None, 1)[1] + for line in f + if line.strip() + } + return utt2toks From 0694249f678572e455305f790fa751859d750193 Mon Sep 17 00:00:00 2001 From: Chaanks Date: Sat, 9 Nov 2024 18:57:21 +0100 Subject: [PATCH 007/270] update tokens extraction script --- benchmarks/DASB/utils/tokens.py | 192 +++++++++++++++++++++++--------- 1 file changed, 137 insertions(+), 55 deletions(-) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index b334106f1..3762457ec 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -3,14 +3,17 @@ import pathlib as pl import kaldiio import torch +import torchaudio import numpy as np from tqdm.auto import tqdm import speechbrain as sb from speechbrain.dataio.dataloader import make_dataloader from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.dataio.dataio import load_pkl, save_pkl logger = logging.getLogger(__name__) +OPT_FILE = "opt_extract.pkl" def get_device(use_cuda): @@ -30,8 +33,8 @@ class TokensExtractor: --------- tokenizer : torch.nn.Module The tokenizer model to use for token extraction. - save_path : str - The directory where the tokens will be saved. + sample_rate : int + The sample rate of the audio data. src_key : str, optional The key in the dataset that contains the audio data (default: "wav"). id_key : str, optional @@ -42,62 +45,36 @@ class TokensExtractor: Whether to use CUDA for computation (default: True). dataloader_opts : dict, optional Options for the data loader (default: None). - pipelines : list, optional - List of data processing pipelines to apply (default: None). - save_name : str, optional - Base name for the saved token files (default: "tokens"). + + Raises + ------ + ValueError + If an unsupported save_format is provided. + ValueError + If the tokenizer's sample rate does not match the provided sample_rate. """ def __init__( self, tokenizer, - save_path, + sample_rate, src_key="wav", id_key="id", save_format="numpy", use_cuda=True, dataloader_opts=None, - pipelines=None, - save_name="tokens", ): - """ - Initializes the TokensExtractor. 
- - Arguments - --------- - tokenizer : torch.nn.Module - The tokenizer model to use for token extraction. - save_path : str - The directory where the tokens will be saved. - src_key : str, optional - The key in the dataset that contains the audio data (default: "wav"). - id_key : str, optional - The key in the dataset that contains unique identifiers (default: "id"). - save_format : str, optional - The format to save the tokens ('numpy', 'pickle', 'soundfile_flac') (default: "numpy"). - use_cuda : bool, optional - Whether to use CUDA for computation (default: True). - dataloader_opts : dict, optional - Options for the data loader (default: None). - pipelines : list, optional - List of data processing pipelines to apply (default: None). - save_name : str, optional - Base name for the saved token files (default: "tokens"). - - Raises - ------ - ValueError - If an unsupported save_format is provided. - """ - self.save_path = pl.Path(save_path).absolute() - self.save_path.mkdir(parents=True, exist_ok=True) - self.save_name = save_name - self.id_key = id_key self.src_key = src_key self.device = get_device(use_cuda) self.tokenizer = tokenizer.to(self.device) + self.sample_rate = sample_rate + + if tokenizer.sample_rate != self.sample_rate: + raise ValueError( + f"Sample rate mismatch: {self.sample_rate} != {tokenizer.sample_rate}" + ) if save_format not in ["numpy", "pickle", "soundfile_flac"]: raise ValueError(f"Unsupported save_format: {save_format}") @@ -106,14 +83,9 @@ def __init__( if not dataloader_opts: dataloader_opts = {} self.dataloader_opts = dataloader_opts - self.pipelines = pipelines if pipelines is not None else [] - - self.wspecifier = f"ark,scp,t:{self.save_path}/{self.save_name}.ark,{self.save_path}/{self.save_name}.scp" - self.writer = kaldiio.WriteHelper( - self.wspecifier, write_function="numpy" - ) + self.pipelines = self._make_pipelines() - def extract(self, dataset): + def extract_tokens(self, dataset, save_path, save_name="tokens"): """ Extracts tokens from the dataset and saves them to the specified format. @@ -122,9 +94,30 @@ def extract(self, dataset): dataset : speechbrain.dataio.dataset.DynamicItemDataset or dict The dataset from which to extract tokens. Can be a DynamicItemDataset or a dictionary. 
""" + conf = { + "sample_rate": self.sample_rate, + "save_folder": save_path, + "dataset_length": len(dataset), + } + + save_path = pl.Path(save_path).absolute() + save_path.mkdir(parents=True, exist_ok=True) + + # Check if the extraction is already done (if so, skip it) + if _skip(save_path, save_name, conf): + logger.info("Skipping preparation, completed in previous run.") + return + + self.wspecifier = ( + f"ark,scp,t:{save_path}/{save_name}.ark,{save_path}/{save_name}.scp" + ) + self.writer = kaldiio.WriteHelper( + self.wspecifier, write_function="numpy" + ) + if isinstance(dataset, dict): dataset = DynamicItemDataset(dataset) - dataset.set_output_keys([self.src_key, self.id_key]) + dataset.set_output_keys([self.src_key, self.id_key, "sig"]) for pipeline in self.pipelines: dataset.add_dynamic_item(pipeline) @@ -133,7 +126,7 @@ def extract(self, dataset): batch_count = int(math.ceil(len(dataset) / batch_size)) for batch in tqdm(dataloader, total=batch_count): batch = batch.to(self.device) - x, x_lengths = batch[self.src_key] + x, x_lengths = batch["sig"] ids = batch[self.id_key] batch_tokens = self.tokenizer.sig_to_tokens(x, x_lengths) batch_tokens = sb.utils.data_utils.undo_padding( @@ -141,6 +134,11 @@ def extract(self, dataset): ) self.process_batch(batch_tokens, ids) + logger.info("Extraction completed.") + + save_opt = save_path / OPT_FILE + save_pkl(conf, save_opt.as_posix()) + def process_batch(self, batch, ids): """ Processes a batch of tokens and writes them to the output files. @@ -156,6 +154,32 @@ def process_batch(self, batch, ids): tokens = np.array(tokens) self.writer(utt_id, tokens) + def _make_pipelines(self): + """ + Creates the data processing pipeline for audio data. + + The pipeline reads audio files, resamples them to the desired sample rate, and provides + the processed signal under the key "sig". + + Returns + ------- + pipeline : list + A list containing the audio processing pipeline function. + """ + + @sb.utils.data_pipeline.takes(self.src_key) + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + info = torchaudio.info(wav) + sig = sb.dataio.dataio.read_audio(wav) + sig = torchaudio.transforms.Resample( + info.sample_rate, + self.sample_rate, + )(sig) + return sig + + return [audio_pipeline] + def __del__(self): """ Close the writer. @@ -163,6 +187,46 @@ def __del__(self): self.writer.close() +def _skip(save_path, save_name, conf): + """ + Detects if the dataset extraction has been already done. + If the extraction has been done, we can skip it. + + Arguments + --------- + save_path : str + The path to the directory containing extracted tokens. + conf : dict + Configuration to match against saved config. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. + """ + skip = True + + # Checking ark,scp files + for ext in [".ark", ".scp"]: + save_file = save_path / f"{save_name}{ext}" + if not save_file.exists: + skip = False + + # Checking saved options + save_opt = save_path / OPT_FILE + if skip is True: + if save_opt.exists(): + opts_old = load_pkl(save_opt.as_posix()) + if opts_old == conf: + skip = True + else: + skip = False + else: + skip = False + return skip + + class TokensLoader: """ A loader class for retrieving tokens corresponding to utterance IDs. 
@@ -197,30 +261,48 @@ def __init__( ) self.tokens = self._load(data_path, save_name) - def tokens_by_uttid(self, utt_id): + def tokens_by_uttid(self, utt_id, num_codebooks=None): """ Retrieves the tokens corresponding to a given utterance ID. Arguments --------- - utt_id: str + utt_id : str The utterance ID to retrieve tokens for. + num_codebooks : int, optional + The number of codebooks to retrieve from the tokens. If specified, the tokens will be truncated + to include only the first `num_codebooks` codebooks. If not specified, all codebooks are returned. Returns ------- - result: torch.LongTensor [T, N_Q] - The tokens associated with the utterance ID. + result : torch.LongTensor [T, N_Q] + The tokens associated with the utterance ID, possibly truncated to `num_codebooks` codebooks. Raises ------ KeyError If the utterance ID is not found in the tokens. + ValueError + If `num_codebooks` is invalid or exceeds the number of available codebooks. """ if utt_id not in self.tokens: raise KeyError(f"Utterance ID '{utt_id}' not found in tokens.") tokens_path = self.tokens[utt_id] tokens = kaldiio.load_mat(tokens_path) tokens = torch.from_numpy(tokens).long() + + if num_codebooks is not None: + if not isinstance(num_codebooks, int) or num_codebooks <= 0: + raise ValueError( + f"Invalid num_codebooks value: {num_codebooks}. It must be a positive integer." + ) + if num_codebooks > tokens.size(-1): + raise ValueError( + f"Invalid number of codebooks: {num_codebooks}. " + f"Available codebooks: {tokens.size(-1)}." + ) + tokens = tokens[:, :num_codebooks] + return tokens def _load(self, data_path, save_name): From 2c30adeec940b80b988b01fdfef0fd65148faa73 Mon Sep 17 00:00:00 2001 From: Chaanks Date: Mon, 11 Nov 2024 01:04:36 +0100 Subject: [PATCH 008/270] update tokens extraction script --- .../DASB/LibriSpeech/extraction/extract.py | 96 +++++++++++++++++++ .../extraction/hparams/encodec.yaml | 57 +++++++++++ .../extraction/librispeech_prepare.py | 1 + benchmarks/DASB/utils/tokens.py | 73 +++++++++++--- 4 files changed, 216 insertions(+), 11 deletions(-) create mode 100644 benchmarks/DASB/LibriSpeech/extraction/extract.py create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml create mode 120000 benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py new file mode 100644 index 000000000..935c013bd --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -0,0 +1,96 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting a discrete tokens with librispeech. 
+ +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import torch +import torchaudio +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +print(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["output_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid"]: + csv_path = hparams[f"{split}_csv"] + name = pl.Path(csv_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_path, + replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + for split in hparams["test_csv"]: + name = pl.Path(split).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=split, + replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, (save_folder / "librispeech").as_posix() + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info(f"Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix() + ) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..815b8aae6 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref 
/train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 32 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +save_embedding: True + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 3762457ec..493a0598a 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -1,3 +1,11 @@ +""" +Unified interface for token extraction and pretrained embeddings handling for speech tokenizers. + +Authors +--------- +* Jarod Duret, 2024 +""" + import math import logging import pathlib as pl @@ -105,7 +113,7 @@ def extract_tokens(self, dataset, save_path, save_name="tokens"): # Check if the extraction is already done (if so, skip it) if _skip(save_path, save_name, conf): - logger.info("Skipping preparation, completed in previous run.") + logger.info("Skipping extraction, completed in previous run.") return self.wspecifier = ( @@ -180,6 +188,29 @@ def audio_pipeline(wav): return [audio_pipeline] + def save_pretrained_embeddings(self, save_path, save_name="embeddings"): + """ + Saves the pretrained embeddings of the tokenizer to a specified directory. + + This method retrieves the pretrained embeddings from the tokenizer, + converts them to a NumPy array, and saves them as a `.npy` file. + + Parameters + ---------- + save_path : str or pathlib.Path + The directory where the pretrained embeddings will be saved. + If the directory does not exist, it will be created. + save_name : str, optional + The base name of the saved embeddings file (default is "embeddings"). + The embeddings will be saved as `.npy` in the specified directory. + """ + save_path = pl.Path(save_path).absolute() + save_path.mkdir(parents=True, exist_ok=True) + + embeddings = self.tokenizer.get_pretrained_embeddings() + embeddings = embeddings.cpu().numpy() + np.save(save_path / save_name, embeddings) + def __del__(self): """ Close the writer. @@ -196,6 +227,8 @@ def _skip(save_path, save_name, conf): --------- save_path : str The path to the directory containing extracted tokens. + save_name : str + The base name of the saved tokens file. conf : dict Configuration to match against saved config. @@ -244,16 +277,6 @@ def __init__( data_path, save_name="tokens", ): - """ - Initializes the TokensLoader. - - Arguments - --------- - data_path: str - The path to the data directory containing the token files. - save_name: str, optional - The base name of the tokens files (default: "tokens"). 
- """ self.data_path = pl.Path(data_path) if not self.data_path.exists(): raise ValueError( @@ -329,3 +352,31 @@ def _load(self, data_path, save_name): if line.strip() } return utt2toks + + def load_pretrained_embeddings(self, data_path, save_name="embeddings"): + """ + Loads pretrained embeddings from a specified path. + + Arguments + --------- + data_path : str + The directory where the embeddings are saved. + save_name : str, optional + The name of the embeddings file (default: "embeddings"). + + Returns + ------- + embeddings : torch.Tensor + The loaded embeddings as a PyTorch tensor. + + Raises + ------ + FileNotFoundError + If the embeddings file does not exist at the specified path. + """ + data_path = pl.Path(data_path).absolute() + if not self.data_path.exists(): + raise ValueError(f"Data folder not found: {data_path.as_posix()}") + embeddings = np.load(data_path / save_name) + embeddings = torch.from_numpy(embeddings) + return embeddings From 336dd64f561d3174fc1cc31ab7fdd9253df9cabd Mon Sep 17 00:00:00 2001 From: Chaanks Date: Tue, 12 Nov 2024 16:48:10 +0100 Subject: [PATCH 009/270] update LibriSpeech ASR recipe --- .../hparams/LSTM/train.yaml | 239 ++++++++++ .../hparams/contextnet/train.yaml | 232 ++++++++++ .../librispeech_prepare.py | 1 + .../LibriSpeech/ASR-refactor-tokens/train.py | 438 ++++++++++++++++++ .../extraction/hparams/encodec.yaml | 2 +- benchmarks/DASB/utils/tokens.py | 2 +- 6 files changed, 912 insertions(+), 2 deletions(-) create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml create mode 120000 benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py create mode 100644 benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml new file mode 100644 index 000000000..7ae90ad4e --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml @@ -0,0 +1,239 @@ +# ############################################################################ +# Model: E2E ASR with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100"] #["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + +tokens_folder: !PLACEHOLDER +pretain_embeddings_folder: !PLACEHOLDER # Optional + + +####################### Training Parameters 
#################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32 +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +# bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
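For readers unfamiliar with SpeechBrain's duration-based batching, here is a minimal, self-contained sketch of how the dynamic_batch_sampler_train block above is consumed; it mirrors the sampler construction that appears in train.py later in this patch, with a made-up CSV path:

import speechbrain as sb
from speechbrain.dataio.sampler import DynamicBatchSampler

dynamic_hparams_train = {       # same keys as dynamic_batch_sampler_train above
    "max_batch_length": 850,    # total duration budget per batch
    "num_buckets": 200,
    "shuffle": False,
    "batch_ordering": "random",
    "max_batch_ex": 128,
}

train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
    csv_path="train.csv",       # hypothetical manifest with a "duration" column
    replacements={"data_root": "/path/to/LibriSpeech"},
)

# Batches are grouped by total duration rather than a fixed number of examples,
# then handed to the DataLoader through batch_sampler.
train_sampler = DynamicBatchSampler(
    train_data,
    length_func=lambda x: x["duration"],
    **dynamic_hparams_train,
)
train_dataloader_opts = {"batch_sampler": train_sampler, "num_workers": 8}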
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +# tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +# source: facebook/encodec_24khz # Only the 24kHz version supports mono audio +# save_path: !ref +# sample_rate: !ref +# bandwidth: !ref +# flat_embeddings: False +# freeze: True +# renorm_embeddings: False + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + # tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml new file mode 100644 index 000000000..c28fdead0 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml @@ -0,0 +1,232 @@ +# ############################################################################ +# Model: E2E ASR 
with CTC +# Auido Tokenizer: Encodec +# Encoder: LSTM Encoder +# Decoder: CTC beam searcher and greedy searcher +# Tokens: character +# Training: Librispeech 960h +# Authors: Pooneh Mousavi 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/enocdec/LSTM/ +output_wer_folder: !ref /wer.txt +save_folder: !ref /save +train_log: !ref /train_log.txt + + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES +# then data_folder_rirs should be /localscratch/xxx_corpus +# otherwise the dataset will automatically be downloaded +# data_folder_rirs: !ref +train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /dev-clean.csv + - !ref /test-clean.csv + + +####################### Training Parameters #################################### +number_of_epochs: 20 +batch_size: 4 # This works for 2x GPUs with 32GB +test_batch_size: 1 +grad_accumulation_factor: 2 +max_grad_norm: 5.0 +sorting: descending #random +num_workers: 8 +loss_reduction: batchmean +precision: fp32 # bf16, fp16 or fp32loss_reduction: batchmean +valid_search_interval: 1 +avg_checkpoints: 10 # Number of checkpoints to average for evaluation +cache_size: 1.e+10 + +lr_model: 0.001 +weight_decay: 0.0005 + + +# Training parameters +# To make Transformers converge, the global bath size should be large enough. +# The global batch size is max_batch_len * n_gpus * gradient_accumulation. +# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. +# Please, set your parameters accordingly. +dynamic_batching: True +max_batch_length_train: 850 +max_batch_len_val: 100 +num_bucket: 200 +shuffle: False # if true re-creates batches at each epoch shuffling examples. +max_batch_ex: 128 +batch_ordering: random + +dynamic_batch_sampler_train: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +dynamic_batch_sampler_val: + max_batch_length: !ref + num_buckets: !ref + shuffle: !ref + batch_ordering: !ref + max_batch_ex: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +valid_dataloader_opts: + batch_size: !ref + +test_dataloader_opts: + batch_size: !ref + + +####################### Model parameters ########################### +# Tokenizer parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +vocab_size: 1024 +# bandwidth: 1.5 +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
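Since Discrete_EmbeddingLayer lives in model/custom_model.py and is not shown in this patch, the following is only a rough functional sketch of what initializing it from extracted codebook vectors amounts to; the per-codebook offset and the flattened [num_codebooks * vocab_size, emb_dim] table are assumptions about how the recipe wires pretrained embeddings, not a verbatim copy of the class:

import torch
import torch.nn as nn

vocab_size, num_codebooks, emb_dim = 1024, 2, 128   # EnCodec values from this YAML

# One row per (codebook, code) pair, as saved by the extraction recipe.
pretrained = torch.randn(num_codebooks * vocab_size, emb_dim)  # stand-in for embeddings.npy

embedding = nn.Embedding(num_codebooks * vocab_size, emb_dim)
with torch.no_grad():
    embedding.weight.copy_(pretrained)       # rough equivalent of init_embedding(...)
embedding.weight.requires_grad = False       # what freeze_embedding: True would imply

# Codes from codebook k are shifted by k * vocab_size so that each codebook
# addresses its own block of rows in the flattened table.
toks = torch.randint(0, vocab_size, (4, 50, num_codebooks))   # [B, T, K]
offsets = torch.arange(num_codebooks) * vocab_size
embs = embedding(toks + offsets)                               # [B, T, K, D]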
+pretrain_embeddings: False +freeze_embedding: False + + +# LSTM +activation: !name:torch.nn.Sigmoid +dnn_layers: 2 +dnn_neurons: 1024 +dropout: 0.2 +output_neurons: 31 + +# BPE parameters +# BPE parameters +token_type: char # ["unigram", "bpe", "char"] +character_coverage: 1.0 +blank_index: 0 +bos_index: 1 +eos_index: 2 + +# Decoding parameters +beam_size: 100 +beam_prune_logp: -12.0 +token_prune_min_logp: -1.2 +prune_history: False + +############################## models ################################ +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +# tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +# source: facebook/encodec_24khz # Only the 24kHz version supports mono audio +# save_path: !ref +# sample_rate: !ref +# bandwidth: !ref +# flat_embeddings: False +# freeze: True +# renorm_embeddings: False + +discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer + num_codebooks: !ref + vocab_size: !ref + emb_dim: !ref + # hidden_dim: !ref + freeze: !ref + init: !ref + +attention_mlp: !new:model.custom_model.AttentionMLP + input_dim: !ref + hidden_dim: !ref + +encoder: !new:speechbrain.nnet.RNN.LSTM + input_shape: [Null, Null, !ref ] + num_layers: !ref + bidirectional: True + dropout: !ref + hidden_size: !ref + +ctc_lin: !new:speechbrain.nnet.linear.Linear + input_size: 2048 + n_neurons: !ref + +modules: + encoder: !ref + ctc_lin: !ref + attention_mlp: !ref + # tokenizer: !ref + discrete_embedding_layer: !ref + + +model: !new:torch.nn.ModuleList + - [!ref , !ref , !ref , !ref ] + +####################### Decoding & optimiser ########################### +# Decoding parameters +test_beam_search: + blank_index: !ref + beam_size: !ref + beam_prune_logp: !ref + token_prune_min_logp: !ref + prune_history: !ref + alpha: 0.8 + beta: 1.2 + +ctc_cost: !name:speechbrain.nnet.losses.ctc_loss + blank_index: !ref + + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler + initial_value: !ref + improvement_threshold: 0.0025 + annealing_factor: 0.8 + patient: 0 +# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler +# lr_initial: !ref +# n_warmup_steps: 7500 +# n_keep_steps: 36000 + +model_opt_class: !name:torch.optim.AdamW + lr: !ref + betas: (0.9, 0.98) + eps: 0.000000001 + weight_decay: !ref + +############################## Logging and Pretrainer ########################## +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + scheduler: !ref + counter: !ref + + +# Functions and classes +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + +cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats + split_tokens: True +wer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py new file mode 120000 index 000000000..a3126ec94 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py @@ -0,0 +1 @@ +../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py new file 
mode 100644 index 000000000..927d7ea84 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py @@ -0,0 +1,438 @@ +#!/usr/bin/env/python3 +"""Recipe for training an discrete tokens ctc ASR system with librispeech. + +Decoding is performed with greedy decoding at validation time. +At test time, beamsearch is used with an optional external language model. + +Authors + * Pooneh Mousavi 2024 +""" + +import os +import sys +import torch +import torchaudio +import logging +import speechbrain as sb +from speechbrain.utils.distributed import run_on_main, if_main_process +from speechbrain.tokenizers.SentencePiece import SentencePiece +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + + +logger = logging.getLogger(__name__) + +_CACHE = {"size": 0} + + +# Define training procedure +class ASR(sb.Brain): + def compute_forward(self, batch, stage): + """Forward computations from the waveform batches to the output probabilities.""" + batch = batch.to(self.device) + wavs, wav_lens = batch.sig + in_toks, _ = batch.speech_tokens + + # Extract embeddings + in_embs = self.modules.discrete_embedding_layer( + in_toks + ) # [B, T, N-Q, D] + + # Attention-Pooling + att_w = self.modules.attention_mlp(in_embs) # [B, T, N-Q, 1] + in_embs = torch.matmul(att_w.transpose(2, -1), in_embs).squeeze( + -2 + ) # [B, T, D] + + # forward modules + if type(self.modules.encoder).__name__ == "ContextNet": + enc_out = self.modules.encoder(in_embs) + + elif type(self.modules.encoder).__name__ == "LSTM": + enc_out, _ = self.modules.encoder(in_embs) + + else: + raise NotImplementedError + + # output layer for ctc log-probabilities + logits = self.modules.ctc_lin(enc_out) + p_ctc = self.hparams.log_softmax(logits) + + p_tokens = None + if stage == sb.Stage.VALID: + p_tokens = sb.decoders.ctc_greedy_decode( + p_ctc, wav_lens, blank_id=self.hparams.blank_index + ) + elif stage == sb.Stage.TEST: + p_tokens = test_searcher(p_ctc, wav_lens) + + return p_ctc, wav_lens, p_tokens + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss (CTC+NLL) given predictions and targets.""" + + p_ctc, wav_lens, predicted_tokens = predictions + ids = batch.id + tokens, tokens_lens = batch.tokens + + # Label Augmentation + if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"): + tokens = self.hparams.wav_augment.replicate_labels(tokens) + tokens_lens = self.hparams.wav_augment.replicate_labels(tokens_lens) + + loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) + + if stage == sb.Stage.VALID: + # Decode token terms to words + predicted_words = self.tokenizer( + predicted_tokens, task="decode_from_list" + ) + elif stage == sb.Stage.TEST: + predicted_words = [ + hyp[0].text.split(" ") for hyp in predicted_tokens + ] + + if stage != sb.Stage.TRAIN: + target_words = [wrd.split(" ") for wrd in batch.wrd] + self.wer_metric.append(ids, predicted_words, target_words) + self.cer_metric.append(ids, predicted_words, target_words) + + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch""" + if stage != sb.Stage.TRAIN: + self.cer_metric = self.hparams.cer_computer() + self.wer_metric = self.hparams.wer_computer() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of a epoch.""" + # Compute/store important stats + stage_stats = {"loss": stage_loss} + if stage == sb.Stage.TRAIN: + 
self.train_stats = stage_stats + else: + stage_stats["CER"] = self.cer_metric.summarize("error_rate") + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + current_epoch = self.hparams.epoch_counter.current + valid_search_interval = self.hparams.valid_search_interval + if ( + current_epoch % valid_search_interval == 0 + or stage == sb.Stage.TEST + ): + stage_stats["WER"] = self.wer_metric.summarize("error_rate") + + # log stats and save checkpoint at end-of-epoch + if stage == sb.Stage.VALID: + if type(self.hparams.scheduler).__name__ == "NewBobScheduler": + lr, new_lr = self.hparams.scheduler(stage_stats["loss"]) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + elif type(self.hparams.scheduler).__name__ == "LinearNoamScheduler": + lr = self.hparams.scheduler.current_lr + else: + raise NotImplementedError + + optimizer = self.optimizer.__class__.__name__ + epoch_stats = { + "epoch": epoch, + "lr": lr, + "optimizer": optimizer, + } + self.hparams.train_logger.log_stats( + stats_meta=epoch_stats, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={"WER": stage_stats["WER"], "epoch": epoch}, + min_keys=["WER"], + num_to_keep=self.hparams.avg_checkpoints, + ) + + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) + if if_main_process(): + with open( + self.hparams.output_wer_folder, "w", encoding="utf-8" + ) as w: + self.wer_metric.write_stats(w) + + def on_fit_batch_end(self, batch, outputs, loss, should_step): + if ( + should_step + and type(self.hparams.scheduler).__name__ == "LinearNoamScheduler" + ): + self.hparams.scheduler(self.optimizer) + + +def dataio_prepare(hparams, tokenizer): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + """ + data_folder = hparams["data_folder"] + + train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["train_csv"], + replacements={"data_root": data_folder}, + ) + + if hparams["sorting"] == "ascending": + # we sort training data to speed up training and get better results. + train_data = train_data.filtered_sorted(sort_key="duration") + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + train_data = train_data.filtered_sorted( + sort_key="duration", reverse=True + ) + # when sorting do not shuffle in dataloader ! otherwise is pointless + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=hparams["valid_csv"], + replacements={"data_root": data_folder}, + ) + valid_data = valid_data.filtered_sorted(sort_key="duration") + + # test is separate + test_datasets = {} + for csv_file in hparams["test_csv"]: + name = Path(csv_file).stem + test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( + csv_path=csv_file, replacements={"data_root": data_folder} + ) + test_datasets[name] = test_datasets[name].filtered_sorted( + sort_key="duration" + ) + + datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] + + # 1. 
Define tokens pipeline: + tokens_loader = hparams["tokens_loader"] + num_codebooks = hparams["num_codebooks"] + + @sb.utils.data_pipeline.takes("id") + @sb.utils.data_pipeline.provides("speech_tokens") + def tokens_pipeline(id): + tokens = tokens_loader.tokens_by_uttid(id, num_codebooks=num_codebooks) + return tokens + + sb.dataio.dataset.add_dynamic_item(datasets, tokens_pipeline) + + # 2. Define audio pipeline: + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + info = torchaudio.info(wav) + resampled = torchaudio.transforms.Resample( + info.sample_rate, + hparams["sample_rate"], + )(sig) + # resampled = resampled.unsqueeze(0) + return resampled + + sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) + + # 3. Define text pipeline: + @sb.utils.data_pipeline.takes("wrd") + @sb.utils.data_pipeline.provides( + "wrd", "char_list", "tokens_list", "tokens" + ) + def text_pipeline(wrd): + yield wrd + char_list = list(wrd) + yield char_list + tokens_list = tokenizer.sp.encode_as_ids(wrd) + yield tokens_list + tokens = torch.LongTensor(tokens_list) + yield tokens + + sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) + + # 4. Set output: + sb.dataio.dataset.set_output_keys( + datasets, + ["id", "sig", "wrd", "char_list", "tokens", "speech_tokens"], + ) + + # 5. If Dynamic Batching is used, we instantiate the needed samplers. + train_batch_sampler = None + valid_batch_sampler = None + if hparams["dynamic_batching"]: + from speechbrain.dataio.sampler import DynamicBatchSampler # noqa + + dynamic_hparams_train = hparams["dynamic_batch_sampler_train"] + dynamic_hparams_val = hparams["dynamic_batch_sampler_val"] + + train_batch_sampler = DynamicBatchSampler( + train_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_train, + ) + + valid_batch_sampler = DynamicBatchSampler( + valid_data, + length_func=lambda x: x["duration"], + **dynamic_hparams_val, + ) + + return ( + train_data, + valid_data, + test_datasets, + train_batch_sampler, + valid_batch_sampler, + ) + + +if __name__ == "__main__": + + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # If distributed_launch=True then + # create ddp_group with the right communication protocol + sb.utils.distributed.ddp_init_group(run_opts) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from librispeech_prepare import prepare_librispeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_librispeech, + kwargs={ + "data_folder": hparams["data_folder"], + "tr_splits": hparams["train_splits"], + "dev_splits": hparams["dev_splits"], + "te_splits": hparams["test_splits"], + "save_folder": hparams["output_folder"], + "merge_lst": hparams["train_splits"], + "merge_name": "train.csv", + "skip_prep": hparams["skip_prep"], + }, + ) + + # Defining tokenizer and loading it + tokenizer = SentencePiece( + model_dir=hparams["save_folder"], + vocab_size=hparams["output_neurons"], + annotation_train=hparams["train_csv"], + annotation_read="wrd", + model_type=hparams["token_type"], + character_coverage=hparams["character_coverage"], + bos_id=hparams["bos_index"], + eos_id=hparams["eos_index"], + ) + + # here we create the datasets objects as 
well as tokenization and encoding + ( + train_data, + valid_data, + test_datasets, + train_bsampler, + valid_bsampler, + ) = dataio_prepare(hparams, tokenizer) + + # Use pretrained embeddings + if hparams["pretrain_embeddings"]: + tokens_loader = hparams["tokens_loader"] + embs = tokens_loader.load_pretrained_embeddings( + hparams["pretain_embeddings_folder"] + ) + hparams["discrete_embedding_layer"].init_embedding(embs) + + # Log number of parameters/buffers + model_params = sum( + [ + x.numel() + for module in hparams["modules"].values() + for x in module.state_dict().values() + ] + ) + hparams["train_logger"].log_stats( + stats_meta={ + "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}", + }, + ) + + # Trainer initialization + asr_brain = ASR( + modules=hparams["modules"], + opt_class=hparams["model_opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # Adding objects to trainer. + asr_brain.tokenizer = tokenizer + vocab_list = [ + tokenizer.sp.id_to_piece(i) for i in range(tokenizer.sp.vocab_size()) + ] + + from speechbrain.decoders.ctc import CTCBeamSearcher + + test_searcher = CTCBeamSearcher( + **hparams["test_beam_search"], + vocab_list=vocab_list, + ) + + train_dataloader_opts = hparams["train_dataloader_opts"] + valid_dataloader_opts = hparams["valid_dataloader_opts"] + + if train_bsampler is not None: + train_dataloader_opts = { + "batch_sampler": train_bsampler, + "num_workers": hparams["num_workers"], + } + + if valid_bsampler is not None: + valid_dataloader_opts = {"batch_sampler": valid_bsampler} + + # Training + asr_brain.fit( + asr_brain.hparams.epoch_counter, + train_data, + valid_data, + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 815b8aae6..f68ab9b37 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -23,7 +23,7 @@ test_csv: - !ref /test-clean.csv - !ref /test-other.csv -batch_size: 32 +batch_size: 8 num_workers: 8 src_key: wav id_key: id diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 493a0598a..272e01ebe 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -377,6 +377,6 @@ def load_pretrained_embeddings(self, data_path, save_name="embeddings"): data_path = pl.Path(data_path).absolute() if not self.data_path.exists(): raise ValueError(f"Data folder not found: {data_path.as_posix()}") - embeddings = np.load(data_path / save_name) + embeddings = np.load(data_path / f"{save_name}.npy") embeddings = torch.from_numpy(embeddings) return embeddings From cf4041207ff844c75f714b0dd1950c7baf7d69ff Mon Sep 17 00:00:00 2001 From: Chaanks Date: Tue, 3 Dec 2024 00:29:49 +0100 Subject: [PATCH 010/270] update LibriSpeech ASR recipe --- .../hparams/LSTM/train.yaml | 20 +-- .../LibriSpeech/ASR-refactor-tokens/train.py | 3 - .../DASB/LibriSpeech/extraction/extract.py 
| 4 +- .../LibriSpeech/extraction/hparams/dac.yaml | 65 +++++++ .../hparams/discrete_ssl_wavlm.yaml | 83 +++++++++ .../extraction/hparams/encodec.yaml | 2 +- .../extraction/hparams/speech_tokenizer.yaml | 54 ++++++ benchmarks/DASB/model/tokenizer_interface.py | 170 ++++++++++-------- benchmarks/DASB/utils/tokens.py | 14 +- 9 files changed, 319 insertions(+), 96 deletions(-) create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml index 7ae90ad4e..89d347862 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml @@ -11,7 +11,8 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/enocdec/LSTM/ +run_name: !PLACEHOLDER +output_folder: !ref results/LSTM// output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt @@ -39,7 +40,7 @@ pretain_embeddings_folder: !PLACEHOLDER # Optional ####################### Training Parameters #################################### number_of_epochs: 20 -batch_size: 4 # This works for 2x GPUs with 32GB +batch_size: 4 test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -56,10 +57,6 @@ weight_decay: 0.0005 # Training parameters -# To make Transformers converge, the global bath size should be large enough. -# The global batch size is max_batch_len * n_gpus * gradient_accumulation. -# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. -# Please, set your parameters accordingly. 
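The run_name placeholder introduced above is resolved by HyperPyYAML when the recipe loads its hyperparameters; a minimal, self-contained illustration of that mechanism follows (the keys and values are made up for the example, not the recipe's exact references):

from hyperpyyaml import load_hyperpyyaml

yaml_string = """
run_name: !PLACEHOLDER
output_folder: !ref results/LSTM/<run_name>
save_folder: !ref <output_folder>/save
"""

# !PLACEHOLDER entries must be supplied via overrides (or on the command line
# when going through speechbrain.parse_arguments).
hparams = load_hyperpyyaml(yaml_string, overrides={"run_name": "encodec_2cb"})
print(hparams["save_folder"])  # -> results/LSTM/encodec_2cb/save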
dynamic_batching: True max_batch_length_train: 850 max_batch_len_val: 100 @@ -134,20 +131,9 @@ token_prune_min_logp: -1.2 prune_history: False ############################## models ################################ -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -# tokenizer: !new:model.tokenizer_interface.EncodecTokenizer -# source: facebook/encodec_24khz # Only the 24kHz version supports mono audio -# save_path: !ref -# sample_rate: !ref -# bandwidth: !ref -# flat_embeddings: False -# freeze: True -# renorm_embeddings: False - tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref - discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer num_codebooks: !ref vocab_size: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py index 927d7ea84..746a068e1 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py @@ -25,8 +25,6 @@ logger = logging.getLogger(__name__) -_CACHE = {"size": 0} - # Define training procedure class ASR(sb.Brain): @@ -36,7 +34,6 @@ def compute_forward(self, batch, stage): wavs, wav_lens = batch.sig in_toks, _ = batch.speech_tokens - # Extract embeddings in_embs = self.modules.discrete_embedding_layer( in_toks ) # [B, T, N-Q, D] diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 935c013bd..ef3e677b5 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -85,7 +85,9 @@ save_folder = pl.Path(hparams["save_folder"]) logger.info("Extracting dataset tokens ...") tokens_extractor.extract_tokens( - merged_dataset, (save_folder / "librispeech").as_posix() + merged_dataset, + hparams["num_codebooks"], + (save_folder / "librispeech").as_posix(), ) if hparams["save_embedding"]: diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml new file mode 100644 index 000000000..c380f0478 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -0,0 +1,65 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 
1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 2 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:model.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml new file mode 100644 index 000000000..2263547c5 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml @@ -0,0 +1,83 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: wavml +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +num_clusters: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +# ssl_layer_num: [3, 7, 12, 23] +# deduplicate: [False, False, False, False] +# bpe_tokenizer_path: [null , null, null, null] +ssl_layer_num: [1, 3, 7, 12, 18, 23] +num_codebooks: 6 +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index f68ab9b37..81cbd0fb2 100644 --- 
a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -37,7 +37,7 @@ dataloader_opts: bandwidth: 1.5 num_codebooks: 2 sample_rate: 24000 -save_embedding: True +save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:model.tokenizer_interface.EncodecTokenizer diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..176768d5e --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,54 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 2 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:model.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 604e3a403..652fa53e1 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -21,135 +21,161 @@ class BaseTokenizer(ABC): + def __init__(self): + super().__init__() + @abstractmethod @torch.no_grad() - def sig_to_tokens(self, signal, lengths, **kwargs): - """Abstract method to encode a signal into tokens.""" + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + """Encode signal into tokens.""" pass @abstractmethod @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - """Abstract method to decode tokens into a signal.""" + """Decode tokens to signal.""" pass @abstractmethod @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - """Return pretrained codebook embedding.""" + def get_pretrained_embeddings( + self, vocab_size, num_codebooks, device="cpu", **kwargs + ): + """Get codebook embeddings.""" pass class EncodecTokenizer(Encodec, BaseTokenizer): + def __init__(self, source, **kwargs): + Encodec.__init__(self, source=source, **kwargs) + BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths, 
**kwargs): - # signal: [B, T] + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): self.eval() - tokens, _ = self.encode(signal, lengths) # [B, T, N_Q] + tokens, _ = self.encode(signal, lengths) + if num_codebooks: + if tokens.shape[-1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[..., :num_codebooks] return tokens @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - # tokens: [B, T, N_Q] self.eval() - signal = self.decode(tokens)[:, 0] # [B, T] + signal = self.decode(tokens)[:, 0] return signal @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - """Return pretrained codebook embedding.""" + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, device=None, **kwargs + ): embeddings = self.vocabulary return embeddings.reshape(-1, embeddings.shape[-1]) class DACTokenizer(DAC, BaseTokenizer): + def __init__(self, *args, **kwargs): + DAC.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths, **kwargs): - # signal: [B, T] + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): self.eval() - tokens, _ = self( - signal[:, None], n_quantizers=kwargs["num_codebooks"] - ) # [B, N_Q, T] - return tokens.movedim(-1, -2) # [B, T, N_Q] + tokens, _ = self(signal[:, None], n_quantizers=num_codebooks) + return tokens.movedim(-1, -2) @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - # tokens: [B, T, N_Q] self.eval() quantized_feats, _, _ = self.quantizer.from_codes( - tokens.movedim(-1, -2) # [B, N_Q, T] + tokens.movedim(-1, -2) ) - signal = self.decode(quantized_feats)[:, 0] # [B, T] - return signal + return self.decode(quantized_feats)[:, 0] @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - """Return pretrained codebook embedding.""" - # See https://github.com/descriptinc/descript-audio-codec/blob/c7cfc5d2647e26471dc394f95846a0830e7bec34/dac/nn/quantize.py#L200 - toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) - toks = ( - toks[:, None, None].expand(-1, kwargs["num_codebooks"], -1).clone() - ) # [C, K, 1] - self.to(kwargs["device"]).eval() - with torch.no_grad(): - z_q, z_p, _ = self.quantizer.from_codes(toks) + def get_pretrained_embeddings( + self, vocab_size, num_codebooks, device="cpu", **kwargs + ): + toks = torch.arange(vocab_size, device=device) + toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() + self.to(device).eval() + z_q, z_p, _ = self.quantizer.from_codes(toks) z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) - z_qs = [] - for i, z_p_i in enumerate(z_ps): - with torch.no_grad(): - z_q_i = self.quantizer.quantizers[i].out_proj( - z_p_i - ) # [C, H, 1] - z_qs.append(z_q_i) - assert (z_q == sum(z_qs)).all() - embeddings = torch.cat(z_qs)[:, :, 0] - return embeddings + z_qs = [ + self.quantizer.quantizers[i].out_proj(z_p_i) + for i, z_p_i in enumerate(z_ps) + ] + return torch.cat(z_qs)[:, :, 0] class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): + def __init__(self, *args, **kwargs): + SpeechTokenizer_interface.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths, **kwargs): - # signal: [B, T] + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): self.eval() - tokens = self(signal)[: kwargs["num_codebooks"]] # [N_Q, B, T] - return tokens.movedim(-3, -1) # [B, T, N_Q] + tokens 
= self(signal) + if num_codebooks: + if len(tokens) < num_codebooks: + raise ValueError( + f"Model only outputs {len(tokens)} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[:num_codebooks] + return tokens.movedim(-3, -1) @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - # tokens: [B, T, N_Q] self.eval() - tokens = tokens.movedim(-1, -3) # [N_Q, B, T] - return self.decode(tokens) # [B, T] + return self.decode(tokens.movedim(-1, -3)) @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - """Return pretrained codebook embedding.""" - # See https://github.com/ZhangXInFD/SpeechTokenizer/blob/a9f88dc72642b600654a62861e34342babae6c71/speechtokenizer/quantization/core_vq.py#L360 - toks = torch.arange(kwargs["vocab_size"], device=kwargs["device"]) - toks = ( - toks[None, :, None].expand(kwargs["num_codebooks"], -1, -1).clone() - ) # [K, C, 1] - self.to(kwargs["device"]).eval() - embs = [] - for i, indices in enumerate(toks): - layer = self.model.quantizer.vq.layers[i] - with torch.no_grad(): - quantized = layer.decode(indices) - embs.append(quantized) - assert (self.model.quantizer.decode(toks) == sum(embs)).all() - embeddings = torch.cat(embs)[:, :, 0] - return embeddings + def get_pretrained_embeddings( + self, vocab_size, num_codebooks, device="cpu", **kwargs + ): + toks = torch.arange(vocab_size, device=device) + toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() + self.to(device).eval() + embs = [ + self.model.quantizer.vq.layers[i].decode(indices) + for i, indices in enumerate(toks) + ] + return torch.cat(embs)[:, :, 0] class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): + def __init__(self, *args, **kwargs): + DiscreteSSL.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths): - pass + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + self.eval() + tokens, _, _ = self.encode(signal, lengths) + if num_codebooks: + if tokens.shape[-1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[..., :num_codebooks] + return tokens @torch.no_grad() - def tokens_to_sig(self, tokens): - pass + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + return self.decode(tokens) @torch.no_grad() - def get_pretrained_embeddings(self, **kwargs): - pass + def get_pretrained_embeddings( + self, vocab_size, num_codebooks, device="cpu", **kwargs + ): + toks = torch.arange(vocab_size, device=device) + toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() + self.to(device).eval() + return torch.cat( + [self.quantizer.codebooks[i] for i in range(num_codebooks)] + ) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 272e01ebe..705184d80 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -93,7 +93,9 @@ def __init__( self.dataloader_opts = dataloader_opts self.pipelines = self._make_pipelines() - def extract_tokens(self, dataset, save_path, save_name="tokens"): + def extract_tokens( + self, dataset, num_codebooks, save_path, save_name="tokens" + ): """ Extracts tokens from the dataset and saves them to the specified format. @@ -101,6 +103,12 @@ def extract_tokens(self, dataset, save_path, save_name="tokens"): --------- dataset : speechbrain.dataio.dataset.DynamicItemDataset or dict The dataset from which to extract tokens. Can be a DynamicItemDataset or a dictionary. 
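To make the refactored tokenizer interface above concrete, here is a usage sketch of the EnCodec variant; it assumes the benchmarks/DASB directory is on sys.path and that the facebook/encodec_24khz checkpoint can be fetched, and the cache directory name is a placeholder:

import torch
from model.tokenizer_interface import EncodecTokenizer  # assumes benchmarks/DASB on sys.path

tokenizer = EncodecTokenizer(
    source="facebook/encodec_24khz",   # same checkpoint as the extraction YAML
    save_path="pretrained_models",     # hypothetical cache directory
    sample_rate=24000,
    bandwidth=1.5,
    freeze=True,
)

signal = torch.randn(2, 24000)   # [B, T], one second of audio per example
lengths = torch.ones(2)          # relative lengths
tokens = tokenizer.sig_to_tokens(signal, lengths, num_codebooks=2)  # [B, N, 2]
resynth = tokenizer.tokens_to_sig(tokens)                           # [B, T'] waveform
codebook_table = tokenizer.get_pretrained_embeddings()              # flattened [K * V, D]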
+ num_codebooks: int + The number of codebooks to retrieve from the tokens. + save_path: str + The path where tokens will be saved. + save_name: str + The name of the .scp and .ark files. """ conf = { "sample_rate": self.sample_rate, @@ -136,7 +144,9 @@ def extract_tokens(self, dataset, save_path, save_name="tokens"): batch = batch.to(self.device) x, x_lengths = batch["sig"] ids = batch[self.id_key] - batch_tokens = self.tokenizer.sig_to_tokens(x, x_lengths) + batch_tokens = self.tokenizer.sig_to_tokens( + x, x_lengths, num_codebooks=num_codebooks + ) batch_tokens = sb.utils.data_utils.undo_padding( batch_tokens, x_lengths ) From 973e12b97a502af4dd04c8f2738c82d1462d6939 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Fri, 20 Dec 2024 13:34:46 -0500 Subject: [PATCH 011/270] change name --- .../{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/dac.yaml | 0 .../{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/encodec.yaml | 0 .../hparams/LSTM/speech_tokenizer.yaml | 0 .../{ASR-refactor => ASR-on-the-fly}/hparams/contextnet/dac.yaml | 0 .../hparams/contextnet/encodec.yaml | 0 .../hparams/contextnet/speech_tokenizer.yaml | 0 .../{ASR-refactor => ASR-on-the-fly}/librispeech_prepare.py | 0 .../DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/train.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/dac.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/encodec.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/LSTM/speech_tokenizer.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/contextnet/dac.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/contextnet/encodec.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/hparams/contextnet/speech_tokenizer.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/librispeech_prepare.py (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor => ASR-on-the-fly}/train.py (100%) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/dac.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/encodec.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/LSTM/speech_tokenizer.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/dac.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml diff --git 
a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/encodec.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/hparams/contextnet/speech_tokenizer.yaml rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/librispeech_prepare.py similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/librispeech_prepare.py rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/librispeech_prepare.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor/train.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor/train.py rename to benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py From 8dca49daf10eb89a86922287e6d8a016f15cb249 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 12:09:28 -0500 Subject: [PATCH 012/270] add discrete_ssl, reorgnaize folder --- .../DASB/LibriSpeech/ASR/LSTM/custom_model.py | 1 - .../ASR/LSTM/hparams/train_dac.yaml | 178 --------- .../ASR/LSTM/hparams/train_discrete_ssl.yaml | 216 ----------- .../ASR/LSTM/hparams/train_encodec.yaml | 183 ---------- .../LSTM/hparams/train_speech_tokenizer.yaml | 169 --------- .../ASR/LSTM/hparams/train_weighted_ssl.yaml | 162 --------- .../ASR/LSTM/librispeech_prepare.py | 1 - .../DASB/LibriSpeech/ASR/LSTM/train_dac.py | 335 ----------------- .../ASR/LSTM/train_discrete_ssl.py | 333 ----------------- .../LibriSpeech/ASR/LSTM/train_encodec.py | 340 ------------------ .../ASR/LSTM/train_speech_tokenizer.py | 335 ----------------- .../ASR/LSTM/train_weighted_ssl.py | 322 ----------------- .../ASR/contextnet/custom_model.py | 1 - .../ASR/contextnet/hparams/train_dac.yaml | 172 --------- .../hparams/train_discrete_ssl.yaml | 214 ----------- .../ASR/contextnet/hparams/train_encodec.yaml | 178 --------- .../hparams/train_speech_tokenizer.yaml | 160 --------- .../hparams/train_weighted_ssl.yaml | 157 -------- .../ASR/contextnet/librispeech_prepare.py | 1 - .../LibriSpeech/ASR/contextnet/train_dac.py | 321 ----------------- .../ASR/contextnet/train_discrete_ssl.py | 319 ---------------- .../ASR/contextnet/train_encodec.py | 316 ---------------- .../ASR/contextnet/train_speech_tokenizer.py | 319 ---------------- .../ASR/contextnet/train_weighted_ssl.py | 318 ---------------- .../hparams/LSTM/train.yaml | 0 .../hparams/contextnet/train.yaml | 0 .../librispeech_prepare.py | 0 .../{ASR-refactor-tokens => ASR}/train.py | 0 .../DASB/LibriSpeech/extraction/extract.py | 6 +- benchmarks/DASB/model/tokenizer_interface.py | 50 +-- benchmarks/DASB/utils/tokens.py | 7 +- 31 files changed, 30 insertions(+), 5084 deletions(-) delete mode 120000 benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml delete mode 100644 
benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml delete mode 120000 benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py delete mode 120000 benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml delete mode 120000 benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py delete mode 100644 benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py rename benchmarks/DASB/LibriSpeech/{ASR-refactor-tokens => ASR}/hparams/LSTM/train.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor-tokens => ASR}/hparams/contextnet/train.yaml (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor-tokens => ASR}/librispeech_prepare.py (100%) rename benchmarks/DASB/LibriSpeech/{ASR-refactor-tokens => ASR}/train.py (100%) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py deleted file mode 120000 index 4b3f08ebb..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/custom_model.py +++ /dev/null @@ -1 +0,0 @@ -../../../model/custom_model.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml deleted file mode 100644 index 0b00db1f7..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_dac.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
-# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/dac/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - - -### Config for Tokenizer -# DAC parameters -# model_type: [16khz, 24khz, 44khz, 44khz] -# vocab_size: [1024, 1024, 1024, 1024] -# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] -# max_num_codebooks: [12, 32, 9, 18] -# embedding_dim: [1024, 1024, 1024, 128] -model_type: 24khz -vocab_size: 1024 -model_bitrate: 8kbps -num_codebooks: 2 # NOTE: must be smaller or equal to the maximum number of codebooks for the given model type -sample_rate: 24000 -encoder_dim: 1024 - - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 768 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# Modules -# DAC model (see https://github.com/descriptinc/descript-audio-codec) -codec: !new:speechbrain.lobes.models.discrete.dac.DAC - model_type: !ref - model_bitrate: !ref - load_pretrained: True - tag: latest - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - 
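-# Note: `model` above gathers only the trainable modules handed to
-# model_opt_class below; the pretrained codec is excluded and is used frozen
-# during training.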
-model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml deleted file mode 100644 index c5a920693..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_discrete_ssl.yaml +++ /dev/null @@ -1,216 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/discrete_ssl/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -### Configuration for discrete SSL model -# ssl_model_type: hubert, wavlm, wav2vec2 -# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: hubert # hubert, wavml or wav2vec2 -ssl_hub: facebook/hubert-large-ll60k -ssl_folder: !ref /ssl_checkpoint -kmeans_repo_id: speechbrain/SSL_Quantization -kmeans_cache_dir: !ref /kmeans_checkpoint -kmeans_dataset: LibriSpeech-100-360-500 -freeze_ssl: True -freeze_feature_extractor: True -num_clusters: 1000 - -### Config for Tokenizer -# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 -deduplicate: [False, False, False, False, False, False] -bpe_tokenizer_path: [null, null, null, null, null, null] -sample_rate: 16000 -encoder_dim: 1024 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: 
!name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 1024 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer_config: - SSL_layers: !ref - deduplicates: !ref - bpe_tokenizers: !ref - -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - -codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - save_path: !ref - ssl_model: !ref - kmeans_dataset: !ref - kmeans_repo_id: !ref - num_clusters: !ref - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml 
b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml deleted file mode 100644 index e2477819a..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_encodec.yaml +++ /dev/null @@ -1,183 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: data # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - - -### Config for Tokenizer -# EnCodec parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] -vocab_size: 1024 -bandwidth: 1.5 -num_codebooks: 2 -sample_rate: 24000 -# Feature parameters -encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. 
-init_embedding: False -freeze_embedding: False - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 1024 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - freeze: !ref - init: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml deleted file mode 100644 index eda9a2bad..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_speech_tokenizer.yaml +++ /dev/null @@ -1,169 +0,0 @@ -# ################################ -# Recipe for training an discrete-input ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
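-# SpeechTokenizer operates on 16 kHz audio; this recipe keeps only the first
-# num_codebooks token streams returned by the codec as the discrete input features.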
-# -# Authors -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/speech_tokenizer/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - - -### Config for Tokenizer -vocab_size: 1024 -num_codebooks: 2 -sample_rate: 16000 - -# Feature parameters - -encoder_dim: 1024 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 1024 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - 
improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - scheduler_model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml deleted file mode 100644 index bcfbe8d50..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/hparams/train_weighted_ssl.yaml +++ /dev/null @@ -1,162 +0,0 @@ -# ################################ -# Recipe for training an SSL-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * Salah Zaiem 2023 -# * Youcef Kemiche 2023 -# * Pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-LSTM/weighted_ssl/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] - -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -ssl_hub: microsoft/wavlm-large -ssl_folder: !ref /ssl_checkpoints -encoder_dim: 1024 - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -lr_weights: 0.01 -sorting: ascending -precision: fp32 -sample_rate: 16000 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 768 -freeze_encoder: True - -# Outputs -output_neurons: 30 # BPE size, index(blank/eos/bos) = 0 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -# -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -weighted_ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.WeightedSSLModel # yamllint disable-line rule:line-length - hub: !ref - save_path: !ref - 
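-# The weighted SSL front-end learns one weight per hidden layer of the WavLM
-# model configured above and feeds their weighted sum to the LSTM encoder below;
-# these layer weights get their own optimizer (weights_opt_class, lr_weights).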
-enc: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: 2 - bidirectional: True - dropout: 0.2 - hidden_size: 1024 - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - weighted_ssl_model: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -weights_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -lr_annealing_weights: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.9 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - ssl_model: !ref - scheduler_model: !ref - scheduler_encoder: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py deleted file mode 120000 index cf4adfd79..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/librispeech_prepare.py +++ /dev/null @@ -1 +0,0 @@ -../../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py deleted file mode 100644 index 479d6719b..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_dac.py +++ /dev/null @@ -1,335 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
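-
-DAC returns codes of shape [batch, codebook, time]; the forward pass below moves
-them to [batch, time, codebook], embeds each codebook stream, and lets an attention
-MLP pool the codebook embeddings into one feature vector per frame before the LSTM.
-
-Illustrative invocation (adjust paths and overrides to your setup):
-
-> python train_dac.py hparams/train_dac.yaml --data_folder /path/to/LibriSpeech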
- -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec( - wavs.unsqueeze(1), n_quantizers=self.hparams.num_codebooks - ) - embeddings = self.modules.discrete_embedding_layer( - tokens.movedim(-2, -1) - ) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - # "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py deleted file mode 100644 index 2aac19193..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_discrete_ssl.py +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _, _ = self.hparams.codec( - wavs, wav_lens, **self.hparams.tokenizer_config - ) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, 
target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - # "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! 
otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. 
Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py deleted file mode 100644 index d2215ce45..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_encodec.py +++ /dev/null @@ -1,340 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
- -Authors - * Adel Moumen 2024 - * Salah Zaiem 2023 - * Youcef Kemiche 2023 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec.encode(wavs, wav_lens) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - if hparams["discrete_embedding_layer"].init: - hparams["discrete_embedding_layer"].init_embedding( - hparams["codec"] - .vocabulary[: hparams["num_codebooks"], :, :] - .flatten(0, 1) - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py deleted file mode 100644 index 1493b5972..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_speech_tokenizer.py +++ /dev/null @@ -1,335 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Adel Moumen 2024 - * Salah Zaiem 2023 - * Youcef Kemiche 2023 -""" - -import os -import sys -import torch -import torchaudio -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens = self.hparams.codec(wavs).permute(1, 2, 0)[ - :, :, : self.hparams.num_codebooks - ] - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - 
self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - # old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - # stage_stats["loss"] - # ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - # sb.nnet.schedulers.update_learning_rate( - # self.weights_optimizer, new_lr_weights - # ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - # self.weights_optimizer = self.hparams.weights_opt_class( - # self.hparams.attention_mlp.parameters() - # ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - # "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - # self.checkpointer.add_recoverable( - # "weights_opt", self.weights_optimizer - # ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! 
otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. 
Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py deleted file mode 100644 index 4a7aed382..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/LSTM/train_weighted_ssl.py +++ /dev/null @@ -1,322 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. - -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
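# A self-contained sketch of the test-time beam search mentioned above, using
# dummy log-probabilities; in the recipe, vocab_list comes from the label
# encoder and the search settings from hparams["test_beam_search"].
import torch
from speechbrain.decoders.ctc import CTCBeamSearcher

vocab_list = ["-", "a", "b", " "]        # index 0 plays the role of the CTC blank here
searcher = CTCBeamSearcher(blank_index=0, vocab_list=vocab_list, beam_size=10)
p_ctc = torch.randn(1, 50, len(vocab_list)).log_softmax(dim=-1)      # [B, T, V]
hyps = searcher(p_ctc, torch.tensor([1.0]))                          # relative lengths
best_text = hyps[0][0].text              # top hypothesis for the first utterance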
- -Authors - * Adel Moumen 2024 - * Salah Zaiem 2023 - * Youcef Kemiche 2023 - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - feats = self.modules.weighted_ssl_model(wavs) - y = self.modules.enc(feats) - y = y[0] # As it is an RNN output - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
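# A minimal sketch of the annealing step that follows: each NewBobScheduler is
# queried with the current validation loss and the learning rate it returns is
# pushed into the matching optimizer. The loss value and parameters are dummies.
import torch
from speechbrain.nnet.schedulers import NewBobScheduler, update_learning_rate

scheduler = NewBobScheduler(
    initial_value=0.0002, improvement_threshold=0.0025, annealing_factor=0.8, patient=0
)
optimizer = torch.optim.Adam([torch.nn.Parameter(torch.zeros(1))], lr=0.0002)
old_lr, new_lr = scheduler(0.95)         # 0.95 stands in for stage_stats["loss"]
update_learning_rate(optimizer, new_lr)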
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - sb.nnet.schedulers.update_learning_rate( - self.weights_optimizer, new_lr_weights - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - self.weights_optimizer = self.hparams.weights_opt_class( - [self.modules.weighted_ssl_model.weights] - ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - self.checkpointer.add_recoverable( - "weights_opt", self.weights_optimizer - ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - return sig - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. 
Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # Loading the SSL model - # We dynamicaly add the tokenizer to our brain class. 
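# A toy illustration of what the character-level pipeline above produces: in
# these recipes blank and unk take indices 0 and 1, and every character seen in
# the training transcripts gets its own id (the real inventory is built and
# stored by CTCTextEncoder.load_or_create).
blank_index, unk_index = 0, 1
char2index = {"<blank>": blank_index, "<unk>": unk_index}
for ch in "HELLO WORLD":
    char2index.setdefault(ch, len(char2index))
tokens = [char2index[ch] for ch in "HELLO WORLD"]
# "HELLO WORLD" -> [2, 3, 4, 4, 5, 6, 7, 5, 8, 4, 9]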
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py deleted file mode 120000 index 4b3f08ebb..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/custom_model.py +++ /dev/null @@ -1 +0,0 @@ -../../../model/custom_model.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml deleted file mode 100644 index 4533e2e8d..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_dac.yaml +++ /dev/null @@ -1,172 +0,0 @@ -# ################################ -# Recipe for training an dac-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/dac/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -### Config for Tokenizer -# DAC parameters -# model_type: [16khz, 24khz, 44khz, 44khz] -# vocab_size: [1024, 1024, 1024, 1024] -# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] -# max_num_codebooks: [12, 32, 9, 18] -# embedding_dim: [1024, 1024, 1024, 128] -model_type: 24khz -vocab_size: 1024 -model_bitrate: 8kbps -num_codebooks: 2 # NOTE: must be smaller or equal to the maximum number of codebooks for the given model type -sample_rate: 24000 -encoder_dim: 1024 - - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: 
!name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.discrete.dac.DAC - model_type: !ref - model_bitrate: !ref - load_pretrained: True - tag: latest - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml deleted file mode 100644 index c394c73c1..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_discrete_ssl.yaml +++ /dev/null @@ -1,214 +0,0 @@ -# ################################ -# Recipe for training an discrete_ssl-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
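# A minimal sketch of how hparams files like these are consumed: after loading,
# `!new:` entries are already-instantiated objects, `!name:` entries are
# callables, and `!ref` values are resolved in place. The path and the override
# below are placeholders (data_folder is a !PLACEHOLDER and must be supplied).
from hyperpyyaml import load_hyperpyyaml

with open("hparams/train_dac.yaml") as fin:
    hparams = load_hyperpyyaml(fin, {"data_folder": "/path/to/LibriSpeech"})

epoch_counter = hparams["epoch_counter"]      # EpochCounter instance (!new:)
model_opt_class = hparams["model_opt_class"]  # callable wrapping torch.optim.Adam (!name:)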
-# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) - -### Configuration for discrete SSL model -# ssl_model_type: hubert, wavlm, wav2vec2 -# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: hubert # hubert, wavml or wav2vec2 -ssl_hub: facebook/hubert-large-ll60k -ssl_folder: !ref /ssl_checkpoint -kmeans_repo_id: speechbrain/SSL_Quantization -kmeans_cache_dir: !ref /kmeans_checkpoint -kmeans_dataset: LibriSpeech-100-360-500 -freeze_ssl: True -freeze_feature_extractor: True -num_clusters: 1000 - -### Config for Tokenizer -# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 -deduplicate: [False, False, False, False, False, False] -bpe_tokenizer_path: [null, null, null, null, null, null] -sample_rate: 16000 -encoder_dim: 1024 - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer_config: - SSL_layers: !ref - deduplicates: !ref - bpe_tokenizers: !ref - -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - 
output_all_hiddens: True - save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - -codec: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - save_path: !ref - ssl_model: !ref - kmeans_dataset: !ref - kmeans_repo_id: !ref - num_clusters: !ref - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml deleted file mode 100644 index 6163550e9..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml +++ /dev/null @@ -1,178 +0,0 @@ -# ################################ -# Recipe for training an encodec-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
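# A plain-Python stand-in for the `!apply:speechbrain.utils.hparams.choice`
# block above: the ssl_model_type string selects which SSL encoder (and hub)
# is used. The helper below is a local illustration, not the library function.
def choice(value, choices):
    return choices[value]

ssl_hub = choice(
    value="hubert",
    choices={
        "hubert": "facebook/hubert-large-ll60k",
        "wavlm": "microsoft/wavlm-large",
        "wav2vec2": "facebook/wav2vec2-large",
    },
)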
-# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -### Config for Tokenizer -# EnCodec parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] -vocab_size: 1024 -bandwidth: 1.5 -num_codebooks: 2 -sample_rate: 24000 -# Feature parameters -encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. -init_embedding: False -freeze_embedding: False - -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False - -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - freeze: !ref - init: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: 
!new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml deleted file mode 100644 index aef1307ec..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_speech_tokenizer.yaml +++ /dev/null @@ -1,160 +0,0 @@ -# ################################ -# Recipe for training an speech_tokenizer-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. -# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/speech_tokenizer/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -### Config for Tokenizer -vocab_size: 1024 -num_codebooks: 2 -sample_rate: 16000 - -encoder_dim: 1024 -# Training parameters -number_of_epochs: 20 -lr: 0.0002 -sorting: ascending -precision: fp32 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here 
https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -codec: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref -discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer - num_codebooks: !ref - vocab_size: !ref - emb_dim: !ref - -attention_mlp: !new:custom_model.AttentionMLP - input_dim: !ref - hidden_dim: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref , !ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - attention_mlp: !ref - codec: !ref - discrete_embedding_layer: !ref - scheduler_model: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml b/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml deleted file mode 100644 index 6d806f0a5..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_weighted_ssl.yaml +++ /dev/null @@ -1,157 +0,0 @@ -# ################################ -# Recipe for training an encodec-based ctc ASR system with librispeech. -# Decoding is performed with ctc greedy or LM-rescored decoder. 
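# A minimal sketch of the checkpointing pattern used throughout these recipes:
# everything listed under `recoverables` is snapshotted together and only the
# checkpoint with the lowest WER is kept. The directory, module and metric
# value are dummies.
import torch
from speechbrain.utils.checkpoints import Checkpointer

model = torch.nn.Linear(4, 4)
checkpointer = Checkpointer("results/demo/save", recoverables={"model": model})
checkpointer.save_and_keep_only(meta={"WER": 12.3}, min_keys=["WER"])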
-# -# Authors -# * pooneh Mousavi 2024 -# ################################ - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/MP3S-contextnet/encodec/ -output_wer_folder: !ref / -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Data files -data_folder: !PLACEHOLDER # e,g./path/to/LibriSpeech -# noise/ris dataset will automatically be downloaded -# data_folder_rirs: !ref -train_splits: ["train-clean-100"] -dev_splits: ["dev-clean"] -test_splits: ["test-clean", "test-other"] -skip_prep: False -ckpt_interval_minutes: 25 # save checkpoint every N min -train_csv: !ref /train-clean-100.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -num_layers_ssl: 25 #Number of layers in the SSL model (should be 25 for large) -ssl_hub: microsoft/wavlm-large -ssl_folder: !ref /ssl_checkpoints -encoder_dim: 1024 - -# Training parameters -number_of_epochs: 2 -lr: 0.0002 -lr_weights: 0.01 -sorting: ascending -precision: fp32 -sample_rate: 16000 - -# With data_parallel batch_size is split into N jobs -# With DDP batch_size is multiplied by N jobs -# Must be 3 per GPU to fit 32GB of VRAM -batch_size: 4 -test_batch_size: 1 - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - -valid_dataloader_opts: - batch_size: !ref - -test_dataloader_opts: - batch_size: !ref - -# Model parameters -activation: !name:torch.nn.Sigmoid -dnn_layers: 1 -dnn_neurons: 640 -freeze_encoder: True - -# Outputs -output_neurons: 30 - -# Decoding parameters -blank_index: 0 -unk_index: 1 - -test_beam_search: - beam_size: 143 - topk: 1 - blank_index: !ref - space_token: ' ' # make sure this is the same as the one used in the tokenizer - beam_prune_logp: -12.0 - token_prune_min_logp: -1.2 - prune_history: True - alpha: 0.8 - beta: 1.2 - # can be downloaded from here https://www.openslr.org/11/ or trained with kenLM - # It can either be a .bin or .arpa ; note: .arpa is much slower at loading - # If you don't want to use an LM, comment it out or set it to null - kenlm_model_path: null - -# Functions and classes -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -weighted_ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.WeightedSSLModel # yamllint disable-line rule:line-length - hub: !ref - save_path: !ref - -enc: !new:speechbrain.lobes.models.ContextNet.ContextNet - input_shape: [null, null, !ref ] - strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - -# only unitary strides to keep the frame rate - - -ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 640 - n_neurons: !ref - -log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True - -ctc_cost: !name:speechbrain.nnet.losses.ctc_loss - blank_index: !ref - -modules: - enc: !ref - ctc_lin: !ref - weighted_ssl_model: !ref - -model: !new:torch.nn.ModuleList - - [!ref , !ref ] - -model_opt_class: !name:torch.optim.Adam - lr: !ref - -weights_opt_class: !name:torch.optim.Adam - lr: !ref - -lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.8 - patient: 0 - -lr_annealing_weights: !new:speechbrain.nnet.schedulers.NewBobScheduler - initial_value: !ref - improvement_threshold: 0.0025 - annealing_factor: 0.9 - patient: 0 - -label_encoder: !new:speechbrain.dataio.encoder.CTCTextEncoder -checkpointer: 
!new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - ssl_model: !ref - scheduler_model: !ref - scheduler_encoder: !ref - counter: !ref - tokenizer: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - -cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats - split_tokens: True diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py deleted file mode 120000 index cf4adfd79..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/librispeech_prepare.py +++ /dev/null @@ -1 +0,0 @@ -../../librispeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py deleted file mode 100644 index a177e48a5..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_dac.py +++ /dev/null @@ -1,321 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec( - wavs.unsqueeze(1), n_quantizers=self.hparams.num_codebooks - ) - embeddings = self.modules.discrete_embedding_layer( - tokens.movedim(-2, -1) - ) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def 
on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
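# A shape-level sketch of the attention pooling used in compute_forward above:
# per-frame weights over the K codebook embeddings collapse [B, N, K, D] into a
# single feature stream [B, N, D]. The softmax weights stand in for the
# AttentionMLP module.
import torch

B, N, K, D = 2, 50, 2, 1024
embeddings = torch.randn(B, N, K, D)                     # discrete-token embeddings
att_w = torch.softmax(torch.randn(B, N, K, 1), dim=2)    # stand-in for attention_mlp
feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2)  # [B, N, D]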
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py deleted file mode 100644 index 640f6a220..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_discrete_ssl.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _, _ = self.hparams.codec( - wavs, wav_lens, **self.hparams.tokenizer_config - ) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - 
self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py deleted file mode 100644 index eb7232303..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_encodec.py +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens, _ = self.hparams.codec.encode(wavs, wav_lens) - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def 
on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. 
Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py deleted file mode 100644 index cd784c80c..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_speech_tokenizer.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an discrete tokens + ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. - -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path -import torchaudio - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - # Feature extraction and attention pooling - with torch.no_grad(): - self.hparams.codec.to(self.device).eval() - tokens = self.hparams.codec(wavs).permute(1, 2, 0)[ - :, :, : self.hparams.num_codebooks - ] - embeddings = self.modules.discrete_embedding_layer(tokens) - att_w = self.modules.attention_mlp(embeddings) - feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, 
target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. - if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the model optimizer" - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! 
otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - info = torchaudio.info(wav) - resampled = torchaudio.transforms.Resample( - info.sample_rate, hparams["sample_rate"], - )(sig) - # resampled = resampled.unsqueeze(0) - return resampled - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. 
Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. - asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py b/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py deleted file mode 100644 index 6d053fceb..000000000 --- a/benchmarks/DASB/LibriSpeech/ASR/contextnet/train_weighted_ssl.py +++ /dev/null @@ -1,318 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training an SSL-based ctc ASR system with librispeech. -Decoding is performed with greedy decoding at validation time. -At test time, beamsearch is used with an optional external language model. 
- -Authors - * Pooneh Mousavi 2024 -""" - -import os -import sys -import torch -import logging -import speechbrain as sb -from speechbrain.utils.distributed import run_on_main, if_main_process -from hyperpyyaml import load_hyperpyyaml -from pathlib import Path - -logger = logging.getLogger(__name__) - - -# Define training procedure -class ASR(sb.Brain): - def compute_forward(self, batch, stage): - """Forward computations from the waveform batches to the output probabilities.""" - batch = batch.to(self.device) - wavs, wav_lens = batch.sig - - # Forward pass - feats = self.modules.weighted_ssl_model(wavs) - y = self.modules.enc(feats) - - # Compute outputs - p_tokens = None - logits = self.modules.ctc_lin(y) - p_ctc = self.hparams.log_softmax(logits) - - if stage == sb.Stage.VALID: - p_tokens = sb.decoders.ctc_greedy_decode( - p_ctc, wav_lens, blank_id=self.hparams.blank_index - ) - elif stage == sb.Stage.TEST: - p_tokens = test_searcher(p_ctc, wav_lens) - - return p_ctc, wav_lens, p_tokens - - def compute_objectives(self, predictions, batch, stage): - """Computes the loss (CTC+NLL) given predictions and targets.""" - - p_ctc, wav_lens, predicted_tokens = predictions - ids = batch.id - tokens, tokens_lens = batch.tokens - loss = self.hparams.ctc_cost(p_ctc, tokens, wav_lens, tokens_lens) - - if stage == sb.Stage.VALID: - # Decode token terms to words - predicted_words = [ - "".join(self.tokenizer.decode_ndim(utt_seq)).split(" ") - for utt_seq in predicted_tokens - ] - elif stage == sb.Stage.TEST: - predicted_words = [ - hyp[0].text.split(" ") for hyp in predicted_tokens - ] - - if stage != sb.Stage.TRAIN: - target_words = [wrd.split(" ") for wrd in batch.wrd] - self.wer_metric.append(ids, predicted_words, target_words) - self.cer_metric.append(ids, predicted_words, target_words) - - return loss - - def on_stage_start(self, stage, epoch): - """Gets called at the beginning of each epoch""" - if stage != sb.Stage.TRAIN: - self.cer_metric = self.hparams.cer_computer() - self.wer_metric = self.hparams.error_rate_computer() - - def on_stage_end(self, stage, stage_loss, epoch): - """Gets called at the end of an epoch.""" - # Compute/store important stats - stage_stats = {"loss": stage_loss} - if stage == sb.Stage.TRAIN: - self.train_stats = stage_stats - else: - stage_stats["CER"] = self.cer_metric.summarize("error_rate") - stage_stats["WER"] = self.wer_metric.summarize("error_rate") - - # Perform end-of-iteration things, like annealing, logging, etc. 
- if stage == sb.Stage.VALID: - old_lr_model, new_lr_model = self.hparams.lr_annealing_model( - stage_stats["loss"] - ) - old_lr_weights, new_lr_weights = self.hparams.lr_annealing_weights( - stage_stats["loss"] - ) - sb.nnet.schedulers.update_learning_rate( - self.model_optimizer, new_lr_model - ) - sb.nnet.schedulers.update_learning_rate( - self.weights_optimizer, new_lr_weights - ) - - self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr_model": old_lr_model}, - train_stats=self.train_stats, - valid_stats=stage_stats, - ) - self.checkpointer.save_and_keep_only( - meta={"WER": stage_stats["WER"]}, min_keys=["WER"], - ) - elif stage == sb.Stage.TEST: - self.hparams.train_logger.log_stats( - stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, - test_stats=stage_stats, - ) - if if_main_process(): - with open(self.hparams.test_wer_file, "w") as w: - self.wer_metric.write_stats(w) - - def init_optimizers(self): - "Initializes the weights optimizer and model optimizer" - self.weights_optimizer = self.hparams.weights_opt_class( - [self.modules.weighted_ssl_model.weights] - ) - self.model_optimizer = self.hparams.model_opt_class( - self.hparams.model.parameters() - ) - self.optimizers_dict = { - "weights_optimizer": self.weights_optimizer, - "model_optimizer": self.model_optimizer, - } - # Initializing the weights - if self.checkpointer is not None: - self.checkpointer.add_recoverable("modelopt", self.model_optimizer) - self.checkpointer.add_recoverable( - "weights_opt", self.weights_optimizer - ) - - -def dataio_prepare(hparams): - """This function prepares the datasets to be used in the brain class. - It also defines the data processing pipeline through user-defined functions.""" - data_folder = hparams["data_folder"] - - train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, - ) - - if hparams["sorting"] == "ascending": - # we sort training data to speed up training and get better results. - train_data = train_data.filtered_sorted(sort_key="duration") - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "descending": - train_data = train_data.filtered_sorted( - sort_key="duration", reverse=True - ) - # when sorting do not shuffle in dataloader ! otherwise is pointless - hparams["train_dataloader_opts"]["shuffle"] = False - - elif hparams["sorting"] == "random": - pass - - else: - raise NotImplementedError( - "sorting must be random, ascending or descending" - ) - - valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, - ) - valid_data = valid_data.filtered_sorted(sort_key="duration") - - # test is separate - test_datasets = {} - for csv_file in hparams["test_csv"]: - name = Path(csv_file).stem - test_datasets[name] = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_file, replacements={"data_root": data_folder} - ) - test_datasets[name] = test_datasets[name].filtered_sorted( - sort_key="duration" - ) - - datasets = [train_data, valid_data] + [i for k, i in test_datasets.items()] - - # 2. Define audio pipeline: - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - sig = sb.dataio.dataio.read_audio(wav) - return sig - - sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline) - label_encoder = sb.dataio.encoder.CTCTextEncoder() - - # 3. 
Define text pipeline: - @sb.utils.data_pipeline.takes("wrd") - @sb.utils.data_pipeline.provides( - "wrd", "char_list", "tokens_list", "tokens" - ) - def text_pipeline(wrd): - yield wrd - char_list = list(wrd) - yield char_list - tokens_list = label_encoder.encode_sequence(char_list) - yield tokens_list - tokens = torch.LongTensor(tokens_list) - yield tokens - - sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline) - - lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt") - special_labels = { - "blank_label": hparams["blank_index"], - "unk_label": hparams["unk_index"], - } - label_encoder.load_or_create( - path=lab_enc_file, - from_didatasets=[train_data], - output_key="char_list", - special_labels=special_labels, - sequence_input=True, - ) - - # 4. Set output: - sb.dataio.dataset.set_output_keys( - datasets, ["id", "sig", "wrd", "char_list", "tokens"], - ) - return train_data, valid_data, test_datasets, label_encoder - - -if __name__ == "__main__": - - # CLI: - hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) - - # If distributed_launch=True then - # create ddp_group with the right communication protocol - sb.utils.distributed.ddp_init_group(run_opts) - - with open(hparams_file) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Create experiment directory - sb.create_experiment_directory( - experiment_directory=hparams["output_folder"], - hyperparams_to_save=hparams_file, - overrides=overrides, - ) - - # Dataset prep (parsing Librispeech) - from librispeech_prepare import prepare_librispeech # noqa - - # multi-gpu (ddp) save data preparation - run_on_main( - prepare_librispeech, - kwargs={ - "data_folder": hparams["data_folder"], - "tr_splits": hparams["train_splits"], - "dev_splits": hparams["dev_splits"], - "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], - "merge_lst": hparams["train_splits"], - "merge_name": "train.csv", - "skip_prep": hparams["skip_prep"], - }, - ) - - # here we create the datasets objects as well as tokenization and encoding - train_data, valid_data, test_datasets, label_encoder = dataio_prepare( - hparams - ) - - # Trainer initialization - asr_brain = ASR( - modules=hparams["modules"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) - - # We dynamicaly add the tokenizer to our brain class. 
- asr_brain.tokenizer = label_encoder - - ind2lab = label_encoder.ind2lab - vocab_list = [ind2lab[x] for x in range(len(ind2lab))] - - from speechbrain.decoders.ctc import CTCBeamSearcher - - test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], vocab_list=vocab_list, - ) - - # Training - asr_brain.fit( - asr_brain.hparams.epoch_counter, - train_data, - valid_data, - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) - - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.test_wer_file = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/LSTM/train.yaml rename to benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/hparams/contextnet/train.yaml rename to benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py b/benchmarks/DASB/LibriSpeech/ASR/librispeech_prepare.py similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/librispeech_prepare.py rename to benchmarks/DASB/LibriSpeech/ASR/librispeech_prepare.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py similarity index 100% rename from benchmarks/DASB/LibriSpeech/ASR-refactor-tokens/train.py rename to benchmarks/DASB/LibriSpeech/ASR/train.py diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index ef3e677b5..62d45cfec 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -62,16 +62,14 @@ csv_path = hparams[f"{split}_csv"] name = pl.Path(csv_path).stem dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=csv_path, - replacements={"data_root": data_folder}, + csv_path=csv_path, replacements={"data_root": data_folder}, ) datasets.append(dataset) for split in hparams["test_csv"]: name = pl.Path(split).stem dataset = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=split, - replacements={"data_root": data_folder}, + csv_path=split, replacements={"data_root": data_folder}, ) datasets.append(dataset) diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 652fa53e1..3499bba9e 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -39,15 +39,15 @@ def tokens_to_sig(self, tokens, **kwargs): @abstractmethod @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size, num_codebooks, device="cpu", **kwargs + self, vocab_size, num_codebooks, **kwargs ): """Get codebook embeddings.""" pass class EncodecTokenizer(Encodec, BaseTokenizer): - def __init__(self, source, **kwargs): - 
Encodec.__init__(self, source=source, **kwargs) + def __init__(self, *args, **kwargs): + Encodec.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) @torch.no_grad() @@ -70,7 +70,7 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size=None, num_codebooks=None, device=None, **kwargs + self, vocab_size=None, num_codebooks=None, **kwargs ): embeddings = self.vocabulary return embeddings.reshape(-1, embeddings.shape[-1]) @@ -97,7 +97,7 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size, num_codebooks, device="cpu", **kwargs + self, vocab_size=None, num_codebooks=None , **kwargs ): toks = torch.arange(vocab_size, device=device) toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() @@ -135,11 +135,11 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size, num_codebooks, device="cpu", **kwargs + self, vocab_size=None, num_codebooks=None , **kwargs ): - toks = torch.arange(vocab_size, device=device) + toks = torch.arange(vocab_size) toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() - self.to(device).eval() + self.eval() embs = [ self.model.quantizer.vq.layers[i].decode(indices) for i, indices in enumerate(toks) @@ -153,29 +153,31 @@ def __init__(self, *args, **kwargs): BaseTokenizer.__init__(self) @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths, num_codebooks=None,**kwargs): self.eval() - tokens, _, _ = self.encode(signal, lengths) - if num_codebooks: - if tokens.shape[-1] < num_codebooks: - raise ValueError( - f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" - ) - tokens = tokens[..., :num_codebooks] + tokens, _, _ = self.encode(signal, lengths, SSL_layers=num_codebooks,**kwargs) return tokens @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): self.eval() - return self.decode(tokens) + return self.decode(tokens, **kwargs) @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size, num_codebooks, device="cpu", **kwargs + self, vocab_size=None, num_codebooks=None, **kwargs ): - toks = torch.arange(vocab_size, device=device) - toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() - self.to(device).eval() - return torch.cat( - [self.quantizer.codebooks[i] for i in range(num_codebooks)] - ) + embs = [] + for layer_num, vocabulary in zip( + self.ssl_layer_ids, + self.vocabularies, + ): + if layer_num not in num_codebooks: + continue + embs.append( + torch.as_tensor( + vocabulary, dtype=torch.float32 + ) + ) + embs = torch.cat(embs) + return embs diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 705184d80..7090325db 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -191,8 +191,7 @@ def audio_pipeline(wav): info = torchaudio.info(wav) sig = sb.dataio.dataio.read_audio(wav) sig = torchaudio.transforms.Resample( - info.sample_rate, - self.sample_rate, + info.sample_rate, self.sample_rate, )(sig) return sig @@ -283,9 +282,7 @@ class TokensLoader: """ def __init__( - self, - data_path, - save_name="tokens", + self, data_path, save_name="tokens", ): self.data_path = pl.Path(data_path) if not self.data_path.exists(): From e317d3a95e1839e86503799269f458abce62794a Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 13:25:00 -0500 Subject: [PATCH 013/270] clean code and fix 
speechtokenzier bug --- .../extraction/hparams/discrete_ssl.yaml | 100 ++++++++++++++++++ benchmarks/DASB/extra_requirements.txt | 1 + benchmarks/DASB/model/tokenizer_interface.py | 1 + 3 files changed, 102 insertions(+) create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..6a58b0135 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,100 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: wavml +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +num_clusters: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +# ssl_layer_num: [3, 7, 12, 23] +# deduplicate: [False, False, False, False] +# bpe_tokenizer_path: [null , null, null, null] +ssl_layer_num: [1, 3, 7, 12, 18, 23] +num_codebooks: 6 +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: 
!ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref \ No newline at end of file diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index 4d1d241c3..db9ae4376 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -8,3 +8,4 @@ speechtokenizer>=0.1.2 tensorboard tgt unidecode +kaldiio diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 3499bba9e..68fdf4221 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -115,6 +115,7 @@ class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): def __init__(self, *args, **kwargs): SpeechTokenizer_interface.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) + self.sample_rate = 16000 @torch.no_grad() def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): From fcb5209e80ae7f4588ec4c205884f4d14a06bbc0 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 14:29:59 -0500 Subject: [PATCH 014/270] fix discrete_ssl bug --- .../extraction/hparams/discrete_ssl.yaml | 2 +- .../hparams/discrete_ssl_wavlm.yaml | 83 ------------------- 2 files changed, 1 insertion(+), 84 deletions(-) delete mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 6a58b0135..d6715c54e 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -54,7 +54,7 @@ save_embedding: False # deduplicate: [False, False, False, False] # bpe_tokenizer_path: [null , null, null, null] ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 +num_codebooks: [1, 3, 7, 12, 18, 23] deduplicate: [False, False, False, False, False, False] bpe_tokenizer_path: [null, null, null, null, null, null] sample_rate: 16000 diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml deleted file mode 100644 index 2263547c5..000000000 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl_wavlm.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# ############################################################################ -# Auido Tokenizer: WavLM -# Extraction: Librispeech 960h -# Authors: Jarod Duret 2024 -# ############################################################################ -# Seed needs to be set at top of yaml, before objects with parameters are made - -seed: 1986 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/wavlm -save_folder: !ref /save -train_log: !ref /extraction_log.txt - -# Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" -dev_splits: ["dev-clean"] -test_splits: ["dev-clean", "test-clean", "test-other"] -skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv -test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv - -batch_size: 8 -num_workers: 8 -src_key: wav -id_key: id - -# Dataloader options -dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref - -### Configuration for discrete SSL model -# ssl_model_type: hubert, wavlm, wav2vec2 -# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large 
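The discrete SSL extraction config added above (hparams/discrete_ssl.yaml) wires the WavLM SSL model, the k-means quantizer and the vocoder into a single `tokenizer` object that `tokens_extractor` consumes. A minimal sketch of how the pieces resolve when the file is loaded directly with hyperpyyaml — the LibriSpeech path is a placeholder, and the working directory is assumed to be the benchmark root so that `model` and `utils` are importable:

    from hyperpyyaml import load_hyperpyyaml

    # data_folder is declared as !PLACEHOLDER in the yaml, so it must be overridden here
    with open("hparams/discrete_ssl.yaml") as fin:
        hparams = load_hyperpyyaml(fin, {"data_folder": "/path/to/LibriSpeech"})

    tokenizer = hparams["tokenizer"]          # DiscreteSSLTokenizer (WavLM + k-means + vocoder)
    extractor = hparams["tokens_extractor"]   # TokensExtractor wrapping the tokenizer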
-ssl_model_type: wavml -ssl_hub: microsoft/wavlm-large -ssl_folder: !ref /ssl_checkpoint -kmeans_cache_dir: !ref /kmeans_checkpoint -kmeans_dataset: LibriSpeech -vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS -freeze_ssl: True -freeze_feature_extractor: True -num_clusters: 1000 -save_embedding: False - -### Config for Tokenizer -# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] -num_codebooks: 6 -deduplicate: [False, False, False, False, False, False] -bpe_tokenizer_path: [null, null, null, null, null, null] -sample_rate: 16000 -encoder_dim: 1024 - -ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - -tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref - -tokens_extractor: !new:utils.tokens.TokensExtractor - tokenizer: !ref - sample_rate: !ref - src_key: !ref - id_key: !ref - dataloader_opts: !ref \ No newline at end of file From 0d575d43cb4e818d5bf01d90906d9844e21d8a05 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 17:55:16 -0500 Subject: [PATCH 015/270] fix bug --- .../DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index d6715c54e..9ce170b66 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -37,7 +37,7 @@ dataloader_opts: ### Configuration for discrete SSL model # ssl_model_type: hubert, wavlm, wav2vec2 # ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: wavml +ssl_model_type: wavlm ssl_hub: microsoft/wavlm-large ssl_folder: !ref /ssl_checkpoint kmeans_cache_dir: !ref /kmeans_checkpoint From 447844c3fcb5ff22e9bc5212725afad02d648d76 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 20:52:21 -0500 Subject: [PATCH 016/270] fix bug --- benchmarks/DASB/model/tokenizer_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 68fdf4221..6c5ab5acd 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -99,9 +99,9 @@ def tokens_to_sig(self, tokens, **kwargs): def get_pretrained_embeddings( self, vocab_size=None, num_codebooks=None , **kwargs ): - toks = torch.arange(vocab_size, device=device) + toks = torch.arange(vocab_size) toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() - self.to(device).eval() + self.eval() z_q, z_p, _ = self.quantizer.from_codes(toks) z_ps = z_p.split(z_p.shape[1] // toks.shape[1], dim=1) z_qs = [ From 8aeaeb92a8238cece88e619cb0674809368409d9 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 21:04:39 -0500 Subject: [PATCH 017/270] fix discrete_ssl train.py for specifiying which layer to use --- benchmarks/DASB/utils/tokens.py | 37 
+++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 7090325db..994490958 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -299,9 +299,10 @@ def tokens_by_uttid(self, utt_id, num_codebooks=None): --------- utt_id : str The utterance ID to retrieve tokens for. - num_codebooks : int, optional - The number of codebooks to retrieve from the tokens. If specified, the tokens will be truncated - to include only the first `num_codebooks` codebooks. If not specified, all codebooks are returned. + num_codebooks : int or list, optional + The number of codebooks to retrieve from the tokens. If specified as an int, the tokens + will be truncated to include only the first `num_codebooks` codebooks. If specified as a list, + the tokens will include only the codebooks at the specified indices. If not specified, all codebooks are returned. Returns ------- @@ -322,16 +323,26 @@ def tokens_by_uttid(self, utt_id, num_codebooks=None): tokens = torch.from_numpy(tokens).long() if num_codebooks is not None: - if not isinstance(num_codebooks, int) or num_codebooks <= 0: - raise ValueError( - f"Invalid num_codebooks value: {num_codebooks}. It must be a positive integer." - ) - if num_codebooks > tokens.size(-1): - raise ValueError( - f"Invalid number of codebooks: {num_codebooks}. " - f"Available codebooks: {tokens.size(-1)}." - ) - tokens = tokens[:, :num_codebooks] + if isinstance(num_codebooks, int): + if num_codebooks <= 0: + raise ValueError( + f"Invalid num_codebooks value: {num_codebooks}. It must be a positive integer." + ) + if num_codebooks > tokens.size(-1): + raise ValueError( + f"Invalid number of codebooks: {num_codebooks}. " + f"Available codebooks: {tokens.size(-1)}." + ) + tokens = tokens[:, :num_codebooks] + elif isinstance(num_codebooks, list): + if not all(isinstance(idx, int) and 0 <= idx < tokens.size(-1) for idx in num_codebooks): + raise ValueError( + f"Invalid indices in num_codebooks list: {num_codebooks}. " + f"All indices must be integers within the range [0, {tokens.size(-1) - 1}]." 
+ ) + tokens = tokens[:, num_codebooks] + else: + raise ValueError("num_codebooks must be an int or a list.") return tokens From c831e609dc78d251089433a01baf39ec2beccc24 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 21:18:05 -0500 Subject: [PATCH 018/270] fix discrete_ssl --- benchmarks/DASB/model/custom_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 1c655fc65..3ad6830c6 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -63,7 +63,7 @@ def __init__( ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size - self.num_codebooks = num_codebooks + self.num_codebooks = len(num_codebooks) if isinstance(num_codebooks, list) else num_codebooks self.freeze = freeze self.embedding = torch.nn.Embedding( num_codebooks * vocab_size, emb_dim From ecf761a99e2593a5fa9d28a97339864dc9247878 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 21:19:46 -0500 Subject: [PATCH 019/270] fix bug introduced in last commit --- benchmarks/DASB/model/custom_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 3ad6830c6..01ff586df 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -66,7 +66,7 @@ def __init__( self.num_codebooks = len(num_codebooks) if isinstance(num_codebooks, list) else num_codebooks self.freeze = freeze self.embedding = torch.nn.Embedding( - num_codebooks * vocab_size, emb_dim + self.num_codebooks * vocab_size, emb_dim ).requires_grad_(not self.freeze) self.init = init From 0d2e30989e772e44faeb94e2bdb841b7fa26c9cf Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 22:40:51 -0500 Subject: [PATCH 020/270] fix bug in saving pretrained embedding --- benchmarks/DASB/LibriSpeech/extraction/extract.py | 4 +++- benchmarks/DASB/utils/tokens.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 62d45cfec..93b309ff5 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -92,5 +92,7 @@ save_folder = pl.Path(hparams["save_folder"]) logger.info(f"Saving embeddings ...") tokens_extractor.save_pretrained_embeddings( - (save_folder / "embeddings").as_posix() + (save_folder / "embeddings").as_posix(), + hparams["num_codebooks"], + hparams["vocab_size"] ) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 994490958..474ec496f 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -197,7 +197,7 @@ def audio_pipeline(wav): return [audio_pipeline] - def save_pretrained_embeddings(self, save_path, save_name="embeddings"): + def save_pretrained_embeddings(self, save_path, save_name="embeddings",num_codebooks=None, vocab_size=None): """ Saves the pretrained embeddings of the tokenizer to a specified directory. 
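The change to `tokens_by_uttid` above lets `num_codebooks` be either an int (keep the first K codebooks) or a list of explicit codebook indices, which is what the discrete SSL recipes need in order to pick specific SSL layers. A minimal usage sketch, assuming tokens were already extracted to a hypothetical data path and using a made-up utterance ID:

    from utils.tokens import TokensLoader

    loader = TokensLoader(data_path="results/wavlm/save/librispeech")
    toks_first2 = loader.tokens_by_uttid("1089-134686-0000", num_codebooks=2)          # first two codebooks
    toks_picked = loader.tokens_by_uttid("1089-134686-0000", num_codebooks=[0, 2, 5])  # codebooks 0, 2 and 5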
@@ -216,7 +216,7 @@ def save_pretrained_embeddings(self, save_path, save_name="embeddings"): save_path = pl.Path(save_path).absolute() save_path.mkdir(parents=True, exist_ok=True) - embeddings = self.tokenizer.get_pretrained_embeddings() + embeddings = self.tokenizer.get_pretrained_embeddings(num_codebooks,vocab_size) embeddings = embeddings.cpu().numpy() np.save(save_path / save_name, embeddings) From 4729007d099d3d474c4a47f89602ed64a18214f0 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 22:44:08 -0500 Subject: [PATCH 021/270] fix --- benchmarks/DASB/LibriSpeech/extraction/extract.py | 3 ++- benchmarks/DASB/utils/tokens.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 93b309ff5..5ee2bbbba 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -93,6 +93,7 @@ logger.info(f"Saving embeddings ...") tokens_extractor.save_pretrained_embeddings( (save_folder / "embeddings").as_posix(), + hparams["vocab_size"], hparams["num_codebooks"], - hparams["vocab_size"] + ) diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 474ec496f..930b10253 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -197,7 +197,7 @@ def audio_pipeline(wav): return [audio_pipeline] - def save_pretrained_embeddings(self, save_path, save_name="embeddings",num_codebooks=None, vocab_size=None): + def save_pretrained_embeddings(self, save_path, save_name="embeddings",vocab_size=None,num_codebooks=None): """ Saves the pretrained embeddings of the tokenizer to a specified directory. @@ -216,7 +216,7 @@ def save_pretrained_embeddings(self, save_path, save_name="embeddings",num_codeb save_path = pl.Path(save_path).absolute() save_path.mkdir(parents=True, exist_ok=True) - embeddings = self.tokenizer.get_pretrained_embeddings(num_codebooks,vocab_size) + embeddings = self.tokenizer.get_pretrained_embeddings(vocab_size,num_codebooks) embeddings = embeddings.cpu().numpy() np.save(save_path / save_name, embeddings) From 7a0ecc2875db91f06301c11adc5ef080a2b3647c Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 23:00:46 -0500 Subject: [PATCH 022/270] fix bug intriduced in prev commit --- benchmarks/DASB/LibriSpeech/extraction/extract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 5ee2bbbba..7310a1469 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -93,7 +93,7 @@ logger.info(f"Saving embeddings ...") tokens_extractor.save_pretrained_embeddings( (save_folder / "embeddings").as_posix(), - hparams["vocab_size"], - hparams["num_codebooks"], + vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], ) From 73dfa4d32429dc64f18be78201779c39771d2c44 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 23:30:09 -0500 Subject: [PATCH 023/270] fix bug for saveing embeedng --- .../DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml | 4 ++-- benchmarks/DASB/model/tokenizer_interface.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 9ce170b66..6d38e285c 100644 
--- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -45,7 +45,7 @@ kmeans_dataset: LibriSpeech vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS freeze_ssl: True freeze_feature_extractor: True -num_clusters: 1000 +vocab_size: 1000 save_embedding: False ### Config for Tokenizer @@ -90,7 +90,7 @@ tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer ssl_model: !ref vocoder_repo_id: !ref kmeans_dataset: !ref - num_clusters: !ref + num_clusters: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 6c5ab5acd..a4d3ae111 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -138,7 +138,7 @@ def tokens_to_sig(self, tokens, **kwargs): def get_pretrained_embeddings( self, vocab_size=None, num_codebooks=None , **kwargs ): - toks = torch.arange(vocab_size) + toks = torch.arange(vocab_size).to(next(self.parameters()).device) toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() self.eval() embs = [ @@ -181,4 +181,4 @@ def get_pretrained_embeddings( ) ) embs = torch.cat(embs) - return embs + return embs \ No newline at end of file From a9e8f3b8b2e76831c7c460fde426479b95b2c769 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 23:35:46 -0500 Subject: [PATCH 024/270] add vocab_size to encodec --- benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 81cbd0fb2..255914c86 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -36,6 +36,7 @@ dataloader_opts: bandwidth: 1.5 num_codebooks: 2 +vocab_size: 1024 sample_rate: 24000 save_embedding: False From 4237bacf10713ffa5281447909fb6e6a51230af7 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 23 Dec 2024 23:48:45 -0500 Subject: [PATCH 025/270] fix bug --- benchmarks/DASB/model/tokenizer_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index a4d3ae111..91dea8042 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -99,7 +99,7 @@ def tokens_to_sig(self, tokens, **kwargs): def get_pretrained_embeddings( self, vocab_size=None, num_codebooks=None , **kwargs ): - toks = torch.arange(vocab_size) + toks = torch.arange(vocab_size).to(next(self.parameters()).device) toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() self.eval() z_q, z_p, _ = self.quantizer.from_codes(toks) From 867228ebcb1ca586944315c91ecea312650cf7ac Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 00:24:15 -0500 Subject: [PATCH 026/270] fix embedding loading for train.py --- benchmarks/DASB/LibriSpeech/ASR/train.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index 746a068e1..b6a9f712e 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -361,6 +361,12 @@ def text_pipeline(wrd): embs = tokens_loader.load_pretrained_embeddings( 
hparams["pretain_embeddings_folder"] ) + if isinstance(hparams['num_codebooks'], int): + embs= embs[:hparams['num_codebooks']*hparams['vocab_size'],] + elif isinstance(hparams['num_codebooks'], list): + indices = [i for codebook_idx in hparams['num_codebooks'] for i in range(codebook_idx * hparams['vocab_size'], (codebook_idx + 1) * hparams['vocab_size'])] + indices = torch.tensor(indices, dtype=torch.long) + embs = embs[indices] hparams["discrete_embedding_layer"].init_embedding(embs) # Log number of parameters/buffers From 3570b636309becb4318371f546deefece4599d6d Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 10:45:25 -0500 Subject: [PATCH 027/270] fix precommit --- .../LibriSpeech/ASR/hparams/LSTM/train.yaml | 2 +- benchmarks/DASB/LibriSpeech/ASR/train.py | 32 ++++++---- .../DASB/LibriSpeech/extraction/extract.py | 3 - .../LibriSpeech/extraction/hparams/dac.yaml | 16 ++--- .../extraction/hparams/discrete_ssl.yaml | 64 +++++++++---------- .../extraction/hparams/encodec.yaml | 22 +++---- .../extraction/hparams/speech_tokenizer.yaml | 12 ++-- benchmarks/DASB/extra_requirements.txt | 2 +- benchmarks/DASB/model/custom_model.py | 6 +- benchmarks/DASB/model/tokenizer_interface.py | 25 +++----- benchmarks/DASB/utils/tokens.py | 17 ++++- 11 files changed, 106 insertions(+), 95 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 89d347862..0f807c937 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -132,7 +132,7 @@ prune_history: False ############################## models ################################ tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer num_codebooks: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index b6a9f712e..d7b86f659 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -175,8 +175,7 @@ def dataio_prepare(hparams, tokenizer): data_folder = hparams["data_folder"] train_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["train_csv"], - replacements={"data_root": data_folder}, + csv_path=hparams["train_csv"], replacements={"data_root": data_folder}, ) if hparams["sorting"] == "ascending": @@ -201,8 +200,7 @@ def dataio_prepare(hparams, tokenizer): ) valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv( - csv_path=hparams["valid_csv"], - replacements={"data_root": data_folder}, + csv_path=hparams["valid_csv"], replacements={"data_root": data_folder}, ) valid_data = valid_data.filtered_sorted(sort_key="duration") @@ -238,8 +236,7 @@ def audio_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) info = torchaudio.info(wav) resampled = torchaudio.transforms.Resample( - info.sample_rate, - hparams["sample_rate"], + info.sample_rate, hparams["sample_rate"], )(sig) # resampled = resampled.unsqueeze(0) return resampled @@ -264,8 +261,7 @@ def text_pipeline(wrd): # 4. Set output: sb.dataio.dataset.set_output_keys( - datasets, - ["id", "sig", "wrd", "char_list", "tokens", "speech_tokens"], + datasets, ["id", "sig", "wrd", "char_list", "tokens", "speech_tokens"], ) # 5. If Dynamic Batching is used, we instantiate the needed samplers. 
@@ -361,10 +357,19 @@ def text_pipeline(wrd): embs = tokens_loader.load_pretrained_embeddings( hparams["pretain_embeddings_folder"] ) - if isinstance(hparams['num_codebooks'], int): - embs= embs[:hparams['num_codebooks']*hparams['vocab_size'],] - elif isinstance(hparams['num_codebooks'], list): - indices = [i for codebook_idx in hparams['num_codebooks'] for i in range(codebook_idx * hparams['vocab_size'], (codebook_idx + 1) * hparams['vocab_size'])] + if isinstance(hparams["num_codebooks"], int): + embs = embs[ + : hparams["num_codebooks"] * hparams["vocab_size"], + ] + elif isinstance(hparams["num_codebooks"], list): + indices = [ + i + for codebook_idx in hparams["num_codebooks"] + for i in range( + codebook_idx * hparams["vocab_size"], + (codebook_idx + 1) * hparams["vocab_size"], + ) + ] indices = torch.tensor(indices, dtype=torch.long) embs = embs[indices] hparams["discrete_embedding_layer"].init_embedding(embs) @@ -401,8 +406,7 @@ def text_pipeline(wrd): from speechbrain.decoders.ctc import CTCBeamSearcher test_searcher = CTCBeamSearcher( - **hparams["test_beam_search"], - vocab_list=vocab_list, + **hparams["test_beam_search"], vocab_list=vocab_list, ) train_dataloader_opts = hparams["train_dataloader_opts"] diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 7310a1469..3979ba731 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -7,8 +7,6 @@ import os import sys -import torch -import torchaudio import logging import pathlib as pl import speechbrain as sb @@ -95,5 +93,4 @@ (save_folder / "embeddings").as_posix(), vocab_size=hparams["vocab_size"], num_codebooks=hparams["num_codebooks"], - ) diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml index c380f0478..13356cf63 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -30,9 +30,9 @@ id_key: id # Dataloader options dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref + batch_size: !ref + shuffle: True + num_workers: !ref ####################### Model parameters ########################### # Tokenizer parameters @@ -52,14 +52,14 @@ encoder_dim: 1024 save_embedding: False tokenizer: !new:model.tokenizer_interface.DACTokenizer - model_type: !ref - model_bitrate: !ref - load_pretrained: True - tag: latest + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref sample_rate: !ref src_key: !ref id_key: !ref - dataloader_opts: !ref \ No newline at end of file + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 6d38e285c..847038dd2 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -30,9 +30,9 @@ id_key: id # Dataloader options dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref + batch_size: !ref + shuffle: True + num_workers: !ref ### Configuration for discrete SSL model # ssl_model_type: hubert, wavlm, wav2vec2 @@ -61,40 +61,40 @@ sample_rate: 16000 encoder_dim: 1024 ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: 
!new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - output_norm: False - freeze: !ref - freeze_feature_extractor: !ref - output_all_hiddens: True - save_path: !ref + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref sample_rate: !ref src_key: !ref id_key: !ref - dataloader_opts: !ref \ No newline at end of file + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 255914c86..3cd3b691a 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -30,9 +30,9 @@ id_key: id # Dataloader options dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref + batch_size: !ref + shuffle: True + num_workers: !ref bandwidth: 1.5 num_codebooks: 2 @@ -42,17 +42,17 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:model.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref sample_rate: !ref src_key: !ref id_key: !ref - dataloader_opts: !ref \ No newline at end of file + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 176768d5e..7726422f3 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -30,9 +30,9 @@ id_key: id # Dataloader options dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref + batch_size: !ref + shuffle: True + num_workers: !ref vocab_size: 1024 
num_codebooks: 2 @@ -43,12 +43,12 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:model.tokenizer_interface.SpeechTokenizer - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref sample_rate: !ref src_key: !ref id_key: !ref - dataloader_opts: !ref \ No newline at end of file + dataloader_opts: !ref diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index db9ae4376..e04ccf781 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -1,5 +1,6 @@ beartype jsonlines +kaldiio librosa>=0.9.2 onnxruntime>=1.16.3 scikit-learn @@ -8,4 +9,3 @@ speechtokenizer>=0.1.2 tensorboard tgt unidecode -kaldiio diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 01ff586df..972d35c66 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -63,7 +63,11 @@ def __init__( ): super(Discrete_EmbeddingLayer, self).__init__() self.vocab_size = vocab_size - self.num_codebooks = len(num_codebooks) if isinstance(num_codebooks, list) else num_codebooks + self.num_codebooks = ( + len(num_codebooks) + if isinstance(num_codebooks, list) + else num_codebooks + ) self.freeze = freeze self.embedding = torch.nn.Embedding( self.num_codebooks * vocab_size, emb_dim diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/model/tokenizer_interface.py index 91dea8042..f63ddd6aa 100644 --- a/benchmarks/DASB/model/tokenizer_interface.py +++ b/benchmarks/DASB/model/tokenizer_interface.py @@ -38,9 +38,7 @@ def tokens_to_sig(self, tokens, **kwargs): @abstractmethod @torch.no_grad() - def get_pretrained_embeddings( - self, vocab_size, num_codebooks, **kwargs - ): + def get_pretrained_embeddings(self, vocab_size, num_codebooks, **kwargs): """Get codebook embeddings.""" pass @@ -97,7 +95,7 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size=None, num_codebooks=None , **kwargs + self, vocab_size=None, num_codebooks=None, **kwargs ): toks = torch.arange(vocab_size).to(next(self.parameters()).device) toks = toks[:, None, None].expand(-1, num_codebooks, -1).clone() @@ -136,7 +134,7 @@ def tokens_to_sig(self, tokens, **kwargs): @torch.no_grad() def get_pretrained_embeddings( - self, vocab_size=None, num_codebooks=None , **kwargs + self, vocab_size=None, num_codebooks=None, **kwargs ): toks = torch.arange(vocab_size).to(next(self.parameters()).device) toks = toks[None, :, None].expand(num_codebooks, -1, -1).clone() @@ -154,9 +152,11 @@ def __init__(self, *args, **kwargs): BaseTokenizer.__init__(self) @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None,**kwargs): + def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): self.eval() - tokens, _, _ = self.encode(signal, lengths, SSL_layers=num_codebooks,**kwargs) + tokens, _, _ = self.encode( + signal, lengths, SSL_layers=num_codebooks, **kwargs + ) return tokens @torch.no_grad() @@ -170,15 +170,10 @@ def get_pretrained_embeddings( ): embs = [] for layer_num, vocabulary in zip( - self.ssl_layer_ids, - self.vocabularies, + self.ssl_layer_ids, self.vocabularies, ): if layer_num not in num_codebooks: continue - embs.append( - 
torch.as_tensor( - vocabulary, dtype=torch.float32 - ) - ) + embs.append(torch.as_tensor(vocabulary, dtype=torch.float32)) embs = torch.cat(embs) - return embs \ No newline at end of file + return embs diff --git a/benchmarks/DASB/utils/tokens.py b/benchmarks/DASB/utils/tokens.py index 930b10253..03ea5049c 100644 --- a/benchmarks/DASB/utils/tokens.py +++ b/benchmarks/DASB/utils/tokens.py @@ -197,7 +197,13 @@ def audio_pipeline(wav): return [audio_pipeline] - def save_pretrained_embeddings(self, save_path, save_name="embeddings",vocab_size=None,num_codebooks=None): + def save_pretrained_embeddings( + self, + save_path, + save_name="embeddings", + vocab_size=None, + num_codebooks=None, + ): """ Saves the pretrained embeddings of the tokenizer to a specified directory. @@ -216,7 +222,9 @@ def save_pretrained_embeddings(self, save_path, save_name="embeddings",vocab_siz save_path = pl.Path(save_path).absolute() save_path.mkdir(parents=True, exist_ok=True) - embeddings = self.tokenizer.get_pretrained_embeddings(vocab_size,num_codebooks) + embeddings = self.tokenizer.get_pretrained_embeddings( + vocab_size, num_codebooks + ) embeddings = embeddings.cpu().numpy() np.save(save_path / save_name, embeddings) @@ -335,7 +343,10 @@ def tokens_by_uttid(self, utt_id, num_codebooks=None): ) tokens = tokens[:, :num_codebooks] elif isinstance(num_codebooks, list): - if not all(isinstance(idx, int) and 0 <= idx < tokens.size(-1) for idx in num_codebooks): + if not all( + isinstance(idx, int) and 0 <= idx < tokens.size(-1) + for idx in num_codebooks + ): raise ValueError( f"Invalid indices in num_codebooks list: {num_codebooks}. " f"All indices must be integers within the range [0, {tokens.size(-1) - 1}]." From 3ef996451bd4c189bb2b9d0032068f89824a58c1 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 10:51:32 -0500 Subject: [PATCH 028/270] move tokenizer_interface to util --- .../DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml | 2 +- .../DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml | 2 +- .../ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml | 2 +- .../DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml | 2 +- .../LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml | 2 +- .../ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml | 2 +- benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml | 2 +- .../DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml | 2 +- benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml | 2 +- .../DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml | 2 +- benchmarks/DASB/{model => utils}/tokenizer_interface.py | 0 11 files changed, 10 insertions(+), 10 deletions(-) rename benchmarks/DASB/{model => utils}/tokenizer_interface.py (100%) diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml index 806305774..ff1749fab 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml @@ -135,7 +135,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.DACTokenizer +tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref model_bitrate: !ref load_pretrained: True diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml 
b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml index 18d967244..dd4f62bf4 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml @@ -132,7 +132,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio save_path: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml index 99d423b87..bb0b32a43 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml @@ -127,7 +127,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml index aa7d2e141..b60b32604 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml @@ -131,7 +131,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.DACTokenizer +tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref model_bitrate: !ref load_pretrained: True diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml index a1b5262d3..7c0dcfc45 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml @@ -125,7 +125,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio save_path: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml index c12d6f79f..3dcd7eea7 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml @@ -121,7 +121,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see 
https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml index 13356cf63..3f3d7e92f 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -51,7 +51,7 @@ sample_rate: 24000 encoder_dim: 1024 save_embedding: False -tokenizer: !new:model.tokenizer_interface.DACTokenizer +tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref model_bitrate: !ref load_pretrained: True diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 847038dd2..12b738bfd 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -85,7 +85,7 @@ ssl_model: !apply:speechbrain.utils.hparams.choice output_all_hiddens: True save_path: !ref -tokenizer: !new:model.tokenizer_interface.DiscreteSSLTokenizer +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer save_path: !ref ssl_model: !ref vocoder_repo_id: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 3cd3b691a..1e226c45b 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -41,7 +41,7 @@ sample_rate: 24000 save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.EncodecTokenizer +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio save_path: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 7726422f3..acd292a19 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -42,7 +42,7 @@ freeze_embedding: False save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:model.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/model/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py similarity index 100% rename from benchmarks/DASB/model/tokenizer_interface.py rename to benchmarks/DASB/utils/tokenizer_interface.py From ca05ac6189d9c77ee6c3272328c235e4f5b42c39 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 11:13:24 -0500 Subject: [PATCH 029/270] update extract doc and comments and set to highest bitrate --- .../LibriSpeech/extraction/hparams/dac.yaml | 2 +- .../extraction/hparams/discrete_ssl.yaml | 18 ++++++++++-------- .../extraction/hparams/encodec.yaml | 9 +++++++-- .../extraction/hparams/speech_tokenizer.yaml | 2 +- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git 
a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml index 3f3d7e92f..d2d935ed0 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -45,7 +45,7 @@ dataloader_opts: model_type: 24khz vocab_size: 1024 model_bitrate: 8kbps -num_codebooks: 2 +num_codebooks: 32 sample_rate: 24000 # Feature parameters encoder_dim: 1024 diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 12b738bfd..7d4938625 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -35,9 +35,15 @@ dataloader_opts: num_workers: !ref ### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + # ssl_model_type: hubert, wavlm, wav2vec2 # ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large -ssl_model_type: wavlm +ssl_model_type: WavLM ssl_hub: microsoft/wavlm-large ssl_folder: !ref /ssl_checkpoint kmeans_cache_dir: !ref /kmeans_checkpoint @@ -50,10 +56,6 @@ save_embedding: False ### Config for Tokenizer # Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) -# ssl_layer_num: [3, 7, 12, 23] -# deduplicate: [False, False, False, False] -# bpe_tokenizer_path: [null , null, null, null] -ssl_layer_num: [1, 3, 7, 12, 18, 23] num_codebooks: [1, 3, 7, 12, 18, 23] deduplicate: [False, False, False, False, False, False] bpe_tokenizer_path: [null, null, null, null, null, null] @@ -63,21 +65,21 @@ encoder_dim: 1024 ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM source: !ref output_norm: False freeze: !ref freeze_feature_extractor: !ref output_all_hiddens: True save_path: !ref - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT source: !ref output_norm: False freeze: !ref freeze_feature_extractor: !ref output_all_hiddens: True save_path: !ref - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 source: !ref output_norm: False freeze: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index 1e226c45b..ee0a7e910 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -34,8 +34,13 @@ dataloader_opts: shuffle: True num_workers: !ref -bandwidth: 1.5 -num_codebooks: 2 +# EnCodec parameters +# 
sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 vocab_size: 1024 sample_rate: 24000 save_embedding: False diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index acd292a19..5d897a782 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -35,7 +35,7 @@ dataloader_opts: num_workers: !ref vocab_size: 1024 -num_codebooks: 2 +num_codebooks: 8 sample_rate: 16000 encoder_dim: 1024 freeze_embedding: False From a08891eb9647e14ede66844c46cd8c0231b6363d Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 13:52:11 -0500 Subject: [PATCH 030/270] add run_script.sh --- .../LibriSpeech/ASR/hparams/LSTM/train.yaml | 32 ++- .../ASR/hparams/contextnet/train.yaml | 62 ++---- benchmarks/DASB/LibriSpeech/ASR/train.py | 3 + benchmarks/DASB/run_experiment.sh | 203 ++++++++++++++++++ benchmarks/DASB/utils/aggregate_results.py | 147 +++++++++++++ 5 files changed, 389 insertions(+), 58 deletions(-) create mode 100644 benchmarks/DASB/run_experiment.sh create mode 100644 benchmarks/DASB/utils/aggregate_results.py diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 0f807c937..69e74ca54 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -5,7 +5,9 @@ # Decoder: CTC beam searcher and greedy searcher # Tokens: character # Training: Librispeech 960h -# Authors: Pooneh Mousavi 2024 +# Authors: +# - Pooneh Mousavi 2024 +# - Jarod Duret 2024 # ############################################################################ # Seed needs to be set at top of yaml, before objects with parameters are made @@ -20,6 +22,7 @@ train_log: !ref /train_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache # If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES # then data_folder_rirs should be /localscratch/xxx_corpus # otherwise the dataset will automatically be downloaded @@ -28,15 +31,14 @@ train_splits: ["train-clean-100"] #["train-clean-100", "train-clean-360", "train dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /dev-clean.csv - - !ref /test-clean.csv - -tokens_folder: !PLACEHOLDER -pretain_embeddings_folder: !PLACEHOLDER # Optional + - !ref /dev-clean.csv + - !ref /test-clean.csv +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved. 
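Because tokens_folder and pretrain_embeddings_folder above point at artifacts produced by the separate extraction recipe, a quick shape check of the saved embedding file can catch a vocab_size / num_codebooks / encoder_dim mismatch before launching a run. The sketch below is illustrative only: it assumes the default "embeddings" save name used by the extraction utilities (np.save, hence an .npy file) and the flattened one-block-per-codebook layout, and the path is a placeholder.

import numpy as np

vocab_size = 1024
num_codebooks = 2
encoder_dim = 128  # e.g. Encodec embeddings when pretrain_embeddings is True

embs = np.load("path/to/embeddings/embeddings.npy")
# Whole codebook blocks, at least as many as requested for training.
assert embs.shape[0] % vocab_size == 0, embs.shape
assert embs.shape[0] >= num_codebooks * vocab_size, embs.shape
assert embs.shape[1] == encoder_dim, embs.shape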
####################### Training Parameters #################################### number_of_epochs: 20 @@ -94,21 +96,17 @@ test_dataloader_opts: ####################### Model parameters ########################### # Tokenizer parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] +# These parameters should be set according to the tokenizer used to extract tokens saved in . vocab_size: 1024 -# bandwidth: 1.5 num_codebooks: 2 sample_rate: 24000 + # Feature parameters encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. +# If set to True, encoder_dim should match the dimension of the tokenizer. For Encodec, it is 128. pretrain_embeddings: False freeze_embedding: False - # LSTM activation: !name:torch.nn.Sigmoid dnn_layers: 2 @@ -191,10 +189,6 @@ scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler improvement_threshold: 0.0025 annealing_factor: 0.8 patient: 0 -# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler -# lr_initial: !ref -# n_warmup_steps: 7500 -# n_keep_steps: 36000 model_opt_class: !name:torch.optim.AdamW lr: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml index c28fdead0..dcedc415d 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml @@ -5,13 +5,16 @@ # Decoder: CTC beam searcher and greedy searcher # Tokens: character # Training: Librispeech 960h -# Authors: Pooneh Mousavi 2024 +# Authors: +# - Pooneh Mousavi 2024 +# - Jarod Duret 2024 # ############################################################################ # Seed needs to be set at top of yaml, before objects with parameters are made seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/enocdec/LSTM/ +run_name: !PLACEHOLDER +output_folder: !ref results/LSTM// output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt @@ -19,24 +22,27 @@ train_log: !ref /train_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache # If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES # then data_folder_rirs should be /localscratch/xxx_corpus # otherwise the dataset will automatically be downloaded # data_folder_rirs: !ref -train_splits: ["train-clean-100", "train-clean-360", "train-other-500"] +train_splits: ["train-clean-100"] #["train-clean-100", "train-clean-360", "train-other-500"] dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /dev-clean.csv - - !ref /test-clean.csv + - !ref /dev-clean.csv + - !ref /test-clean.csv +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved. 
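The Discrete_EmbeddingLayer built by these recipes allocates a single table of num_codebooks * vocab_size embeddings, so each codebook presumably owns its own offset range of ids inside that table; the per-codebook vectors can then be weighted and merged (likely the role of the AttentionMLP defined alongside it) before reaching the encoder. A toy illustration of that indexing, under that assumption rather than the layer's actual forward code:

import torch

vocab_size, num_codebooks, emb_dim = 5, 3, 4  # toy sizes for illustration
emb = torch.nn.Embedding(num_codebooks * vocab_size, emb_dim)

# tokens: [batch, time, num_codebooks], each id in [0, vocab_size)
tokens = torch.randint(0, vocab_size, (2, 7, num_codebooks))
offsets = torch.arange(num_codebooks) * vocab_size  # one offset per codebook
flat_ids = tokens + offsets  # broadcasts over [batch, time, num_codebooks]
feats = emb(flat_ids)  # [batch, time, num_codebooks, emb_dim]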
####################### Training Parameters #################################### number_of_epochs: 20 -batch_size: 4 # This works for 2x GPUs with 32GB +batch_size: 4 test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -53,10 +59,6 @@ weight_decay: 0.0005 # Training parameters -# To make Transformers converge, the global bath size should be large enough. -# The global batch size is max_batch_len * n_gpus * gradient_accumulation. -# Empirically, we used 850 * 8 A40 45G GPUs * 2 or 1700 * 4 A100 80G * 2. -# Please, set your parameters accordingly. dynamic_batching: True max_batch_length_train: 850 max_batch_len_val: 100 @@ -94,21 +96,17 @@ test_dataloader_opts: ####################### Model parameters ########################### # Tokenizer parameters -# sample_rate: [24000, 24000, 24000, 24000] -# vocab_size: [1024, 1024, 1024, 1024] -# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] -# num_codebooks: [2, 4, 8, 16, 32] +# These parameters should be set according to the tokenizer used to extract tokens saved in . vocab_size: 1024 -# bandwidth: 1.5 num_codebooks: 2 sample_rate: 24000 + # Feature parameters encoder_dim: 1024 -# If set to True, the encoder_dim should be set to the dim of the tokenizer. For encodec it is 128. +# If set to True, encoder_dim should match the dimension of the tokenizer. For Encodec, it is 128. pretrain_embeddings: False freeze_embedding: False - # LSTM activation: !name:torch.nn.Sigmoid dnn_layers: 2 @@ -131,15 +129,8 @@ token_prune_min_logp: -1.2 prune_history: False ############################## models ################################ -# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -# tokenizer: !new:model.tokenizer_interface.EncodecTokenizer -# source: facebook/encodec_24khz # Only the 24kHz version supports mono audio -# save_path: !ref -# sample_rate: !ref -# bandwidth: !ref -# flat_embeddings: False -# freeze: True -# renorm_embeddings: False +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref discrete_embedding_layer: !new:model.custom_model.Discrete_EmbeddingLayer num_codebooks: !ref @@ -153,15 +144,12 @@ attention_mlp: !new:model.custom_model.AttentionMLP input_dim: !ref hidden_dim: !ref -encoder: !new:speechbrain.nnet.RNN.LSTM - input_shape: [Null, Null, !ref ] - num_layers: !ref - bidirectional: True - dropout: !ref - hidden_size: !ref +encoder: !new:speechbrain.lobes.models.ContextNet.ContextNet + input_shape: [null, null, !ref ] + strides: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ctc_lin: !new:speechbrain.nnet.linear.Linear - input_size: 2048 + input_size: 640 n_neurons: !ref modules: @@ -198,10 +186,6 @@ scheduler: !new:speechbrain.nnet.schedulers.NewBobScheduler improvement_threshold: 0.0025 annealing_factor: 0.8 patient: 0 -# scheduler: !new:speechbrain.nnet.schedulers.LinearNoamScheduler -# lr_initial: !ref -# n_warmup_steps: 7500 -# n_keep_steps: 36000 model_opt_class: !name:torch.optim.AdamW lr: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index d7b86f659..2758eb0eb 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -6,6 +6,7 @@ Authors * Pooneh Mousavi 2024 + * Jarod Duret 2024 """ import os @@ -361,6 +362,8 @@ def text_pipeline(wrd): embs = embs[ : hparams["num_codebooks"] * hparams["vocab_size"], ] + # For discrete SSL, num_codebooks is a list used to determine which layers to use. 
+ # It is not sequential and can be, for example, [0, 1] or [1, 4]. elif isinstance(hparams["num_codebooks"], list): indices = [ i diff --git a/benchmarks/DASB/run_experiment.sh b/benchmarks/DASB/run_experiment.sh new file mode 100644 index 000000000..35a3ba4bc --- /dev/null +++ b/benchmarks/DASB/run_experiment.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +########################################################### +# Script to run downstream evaluation training, optionally with multiple seeds. +# This script loops over seeds and trains different models. +# At the end, the final performance is computed with the aggregate_results.py script that provides the average performance. +# +# Usage: +# ./run_experiments.sh --hparams=hparams/MotorImagery/BNCI2014001/EEGNet.yaml --data_folder=eeg_data \ +# --output_folder=results/MotorImagery/BNCI2014001/EEGNet --nsbj=9 --nsess=2 --seed=1986 --nruns=2 --number_of_epochs=10 +# +# Authors: +# - Pooneh Mousavi (2024) +########################################################### + +# Initialize variables +hparams="" +data_folder="" +cached_data_folder="" +output_folder="" +task="" +dataset="" +seed="" +nruns="" +eval_metric="acc" +eval_set="test" +rnd_dir=False +additional_flags="" + + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --hparams hparams_path Hparam YAML file" + echo " --data_folder data_folder_path Data folder path" + echo " --cached_data_folder cache_path Cached data folder path" + echo " --output_folder output_path Output folder path" + echo " --task task downstream task" + echo " --dataset dataset dataset" + echo " --seed random_seed Seed (random if not specified)" + echo " --nruns num_runs Number of runs" + echo " --eval_metric metric Evaluation metric (e.g., acc or WER)" + echo " --eval_set dev or test Evaluation set. Default: test" + echo " --rnd_dir If True the results are stored in a subdir of the output folder with a random name (useful to store all the results of an hparam tuning). Default: False" + exit 1 +} + + +# Parse command line +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --hparams) + hparams="$2" + shift + shift + ;; + + --data_folder) + data_folder="$2" + shift + shift + ;; + + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --task) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + --eval_set) + eval_set="$2" + shift + shift + ;; + + --rnd_dir) + rnd_dir="$2" + shift + shift + ;; + + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$hparams" ] ||[ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." 
+ print_argument_descriptions +fi + +# Manage Seed (optional argument) +seed="${seed:-$RANDOM}" + + +if [ "$rnd_dir" = True ]; then + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" +fi + +# Make sure the output_folder is created +mkdir -p $output_folder + +# Print command line arguments and save to file +{ + echo "hparams: $hparams" + echo "data_folder: $data_folder" + echo "cached_data_folder: $cached_data_folder" + echo "output_folder: $output_folder" + echo "task: $task" + echo "dataset: $dataset" + echo "seed: $seed" + echo "nruns: $nruns" + echo "eval_metric: $eval_metric" + echo "eval_set: $eval_set" + echo "rnd_dir: $rnd_dir" + echo "additional flags: $additional_flags" +} | tee "$output_folder/flags.txt" + + +# Creating output folder +mkdir -p $output_folder +mkdir -p $data_folder +mkdir -p $cached_data_folder + +# Function to run the training experiment +run_experiment() { + +python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ +$additional_flags + +} + +# Run multiple training experiments (with different seeds) +for i in $(seq 0 1 $(( nruns - 1 ))); do + ((run_idx = i + 1)) + run_name=run"$run_idx" + output_folder_exp="$output_folder"/"$run_name"/$seed + + run_experiment $output_folder_exp + + + # Changing Random seed + seed=$((seed+1)) +done + + +echo 'Final Results (Performance Aggregation)' +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt \ No newline at end of file diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py new file mode 100644 index 000000000..be30bdb85 --- /dev/null +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -0,0 +1,147 @@ +#!/usr/bin/python +""" +Snippet to aggregate the results over multiple runs of the same experiment. +This is useful when we run multiple experiments with different seeds and we +want to compute the average performance. The script also reports the final +metric to Orion (when needed for hyperparameter tuning). + +The script searches for the result files (_results.txt) and computes the mean +and the standard deviation of the given evaluation metrics (e.g., acc or f1). +The results must have an identical format (with only different performance +numbers). + +To run this script: + + > python aggregate_results.py your_result_folder acc + +Author +------ +Pooneh Mousavi 2024 +""" + +import sys +import re +import numpy as np +from orion.client import report_objective +from speechbrain.utils.data_utils import get_all_files + + +def get_prototype(res_file, eval_metric): + """Parses a result file and adds a placeholder where the aggregated metrics + should be printed. It also returns the number of detected metrics. + + Arguments + --------- + res_file: path + Path of the result file to parse. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + prototype: list + List of the lines of the result file (with as placeholder). + n_metrics: int + Number of metrics to replace in the result files. 
+ """ + prototype = [] + n_metrics = 0 + + # Open the first res file and figure out where the metrics are + with open(res_file) as file_in: + for line in file_in: + if eval_metric in line: + line = line.split(eval_metric)[0] + # The placeholder for the metric is + line = line + eval_metric + " " + n_metrics = n_metrics + 1 + prototype.append(line) + return prototype, n_metrics + + +def get_metrics(res_files, eval_metric): + """Summarizes the metrics of interest in a matrix. + + Arguments + --------- + res_files: list + List of all the result files. + eval_metric: path + Metric of interest (e.g, acc or f1). + + Returns + --------- + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. + """ + + # Metric initialization + metrics = np.zeros([n_metrics, len(res_files)]) + + # Loop over files + for i in range(len(res_files)): + cnt = 0 + # Metric extraction + with open(res_files[i]) as file_in: + for line in file_in: + if eval_metric in line: + # Use regex to find the test WER value + match = re.search(rf'{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)', line) + if match: + value = match.group(1) + value = float(value) + metrics[cnt, i] = value + cnt = cnt + 1 + return metrics + + +def aggregate_metrics(prototype, metrics): + """Prints the aggregated metrics.It replaces the placeholders with + the corresponding metrics. + + Arguments + --------- + prototype: list + List of the lines of the result file (with as placeholder). + metrics: np.array + Matrix (n_metrics, n_files) containing the metrics of interest. + """ + cnt = 0 + for line in prototype: + if eval_metric in line: + values_line = "[" + for i in range(len(res_files)): + values_line = values_line + "%f " % float(metrics[cnt, i]) + values_line = values_line[:-1] + values_line = values_line + "] avg: %f ± %f " % ( + float(metrics[cnt, :].mean()), + float(metrics[cnt, :].std()), + ) + line = line.replace("", values_line) + cnt = cnt + 1 + print(line) + + +if __name__ == "__main__": + output_folder = sys.argv[1] + eval_metric = sys.argv[2] + + # Getting the list of the result files in the output folder + res_files = get_all_files(output_folder, match_and=["train_log.txt"]) + + # Gettin a prototype file + prototype, n_metrics = get_prototype(res_files[0], eval_metric) + + # Extracting the metrics of interest + metrics = get_metrics(res_files, eval_metric) + + # print aggregated metrics + aggregate_metrics(prototype, metrics) + + final_metric = metrics[-1, :].mean() + + # Report final metric to Orion + # Remember: orion expects metrics to be minimized! 
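+    # (Example: with eval_metric "acc" and an average accuracy of 0.93, the
+    # reported objective is 1 - 0.93 = 0.07, so minimizing the objective
+    # maximizes accuracy; error metrics such as WER are reported unchanged.)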
+ if eval_metric == "acc" or eval_metric == "f1": + final_metric = 1 - final_metric + report_objective(final_metric) \ No newline at end of file From d41c6e4a51591bd2cd57d9a8a0fd5f11e0756ddb Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 18:28:34 -0500 Subject: [PATCH 031/270] fix run_experiments.sh bug --- .../DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml | 2 +- .../LibriSpeech/ASR/hparams/contextnet/train.yaml | 2 +- benchmarks/DASB/LibriSpeech/ASR/train.py | 4 ++-- benchmarks/DASB/extra_requirements.txt | 1 + .../DASB/{run_experiment.sh => run_experiments.sh} | 11 ++++++----- benchmarks/DASB/utils/aggregate_results.py | 8 +++++--- 6 files changed, 16 insertions(+), 12 deletions(-) rename benchmarks/DASB/{run_experiment.sh => run_experiments.sh} (93%) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 69e74ca54..eb0d98d4b 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -5,7 +5,7 @@ # Decoder: CTC beam searcher and greedy searcher # Tokens: character # Training: Librispeech 960h -# Authors: +# Authors: # - Pooneh Mousavi 2024 # - Jarod Duret 2024 # ############################################################################ diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml index dcedc415d..aaca2668d 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml @@ -5,7 +5,7 @@ # Decoder: CTC beam searcher and greedy searcher # Tokens: character # Training: Librispeech 960h -# Authors: +# Authors: # - Pooneh Mousavi 2024 # - Jarod Duret 2024 # ############################################################################ diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index 2758eb0eb..19aa43786 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -324,7 +324,7 @@ def text_pipeline(wrd): "tr_splits": hparams["train_splits"], "dev_splits": hparams["dev_splits"], "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], + "save_folder": hparams["cached_data_folder"], "merge_lst": hparams["train_splits"], "merge_name": "train.csv", "skip_prep": hparams["skip_prep"], @@ -333,7 +333,7 @@ def text_pipeline(wrd): # Defining tokenizer and loading it tokenizer = SentencePiece( - model_dir=hparams["save_folder"], + model_dir=hparams["cached_data_folder"], vocab_size=hparams["output_neurons"], annotation_train=hparams["train_csv"], annotation_read="wrd", diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index e04ccf781..1068c2b2a 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -3,6 +3,7 @@ jsonlines kaldiio librosa>=0.9.2 onnxruntime>=1.16.3 +orion scikit-learn speechbrain>=1.0.0 speechtokenizer>=0.1.2 diff --git a/benchmarks/DASB/run_experiment.sh b/benchmarks/DASB/run_experiments.sh similarity index 93% rename from benchmarks/DASB/run_experiment.sh rename to benchmarks/DASB/run_experiments.sh index 35a3ba4bc..e0f848aef 100644 --- a/benchmarks/DASB/run_experiment.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -6,8 +6,9 @@ # At the end, the final performance is computed with the aggregate_results.py script that provides the average 
performance. # # Usage: -# ./run_experiments.sh --hparams=hparams/MotorImagery/BNCI2014001/EEGNet.yaml --data_folder=eeg_data \ -# --output_folder=results/MotorImagery/BNCI2014001/EEGNet --nsbj=9 --nsess=2 --seed=1986 --nruns=2 --number_of_epochs=10 +# ./run_experiments.sh --hparams benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml --data_folder LibriSpeech --cached_data_folder cache/ \ +# --output_folder results/LibriSpeech/ASR/encodec/LSTM --task ASR --dataset LibriSpeech --seed 1986 --nruns 2 --eval_metric WER --tokens_folder LibriSpeech/extraction-emb/speech_tokenizer/save/librispeech/ + # # Authors: # - Pooneh Mousavi (2024) @@ -75,13 +76,13 @@ while [[ $# -gt 0 ]]; do shift shift ;; - + --task) task="$2" shift shift ;; - + --dataset) dataset="$2" shift @@ -181,7 +182,7 @@ mkdir -p $cached_data_folder run_experiment() { python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ -$additional_flags +$additional_flags } diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index be30bdb85..0df315b7e 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -86,7 +86,9 @@ def get_metrics(res_files, eval_metric): for line in file_in: if eval_metric in line: # Use regex to find the test WER value - match = re.search(rf'{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)', line) + match = re.search( + rf"{eval_metric}: (\d+\.\d+(?:e[+-]?\d+)?)", line + ) if match: value = match.group(1) value = float(value) @@ -125,7 +127,7 @@ def aggregate_metrics(prototype, metrics): if __name__ == "__main__": output_folder = sys.argv[1] eval_metric = sys.argv[2] - + # Getting the list of the result files in the output folder res_files = get_all_files(output_folder, match_and=["train_log.txt"]) @@ -144,4 +146,4 @@ def aggregate_metrics(prototype, metrics): # Remember: orion expects metrics to be minimized! if eval_metric == "acc" or eval_metric == "f1": final_metric = 1 - final_metric - report_objective(final_metric) \ No newline at end of file + report_objective(final_metric) From 04ea1e62a1b74310466891a3e5f8e0fd950364ea Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 18:48:32 -0500 Subject: [PATCH 032/270] add bash script for token extraction --- .../DASB/run_discriminative_benchmark.sh | 36 ------ benchmarks/DASB/run_extraction.sh | 114 ++++++++++++++++++ benchmarks/DASB/run_generative_benchmark.sh | 67 ---------- 3 files changed, 114 insertions(+), 103 deletions(-) delete mode 100644 benchmarks/DASB/run_discriminative_benchmark.sh create mode 100644 benchmarks/DASB/run_extraction.sh delete mode 100644 benchmarks/DASB/run_generative_benchmark.sh diff --git a/benchmarks/DASB/run_discriminative_benchmark.sh b/benchmarks/DASB/run_discriminative_benchmark.sh deleted file mode 100644 index 79383deb2..000000000 --- a/benchmarks/DASB/run_discriminative_benchmark.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Please consult the README.md file for instructions on how to run the benchmark. 
- -tokenizer_name=$1 -if [[ "$tokenizer_name" == "" ]]; then - echo "Usage: run_generative_benchmark.sh " - exit 1 -fi - -output_folder='/path/to/output' -declare -a DatasetsFolders=('path/to/LibriSpeech' 'path/to/CommonVoice' 'path/to/IEMOCAP' 'path/to/SLURP' 'path/to/Google-speech-commands' 'path/to/VoiceCeleb1') -declare -a ConsideredTasks=('LibriSpeech/ASR' 'CommonVoice/ASR' 'IEMOCAP/emotion_recognition' 'SLURP/intent_classification' 'Google-speech-commands/keyword-spotting' 'VoiceCeleb1/speaker_ver') -declare -a DownStreams=('LSTM' 'LSTM' 'ecapa_tdnn' 'LSTM_linear' 'Xvector','Xvector') -declare -a Locales=('cy' 'eu') -declare -a LocalesVobSize=(100 200) - -shift -script_args="$@" - -for i in "${!ConsideredTasks[@]}"; do - task=${ConsideredTasks[i]} - downstream=${DownStreams[i]} - dataset_folder=${DatasetsFolders[i]} - recipe_extra_args="$script_args" - set -- "$recipe_extra_args" - if [[ "$task" == "CommonVoice/ASR" ]]; then - echo "${tokenizer_name}/${task}/${downstream}" - for j in "${!Locales[@]}"; do - locale=${Locales[j]} - vocab=${LocalesVobSize[j]} - python $task/$downstream/train_$tokenizer_name.py $task/$downstream/hparams/train_$tokenizer_name.yaml --output_folder $output_folder/$tokenizer_name/$task/$downstream/$locale --data_folder $dataset_folder/$locale --language $locale --output_neurons $vocab $@ - done - else - python $task/$downstream/train_$tokenizer_name.py $task/$downstream/hparams/train_$tokenizer_name.yaml --output_folder $output_folder/$tokenizer_name/$task/$downstream --data_folder $dataset_folder $@ - fi -done diff --git a/benchmarks/DASB/run_extraction.sh b/benchmarks/DASB/run_extraction.sh new file mode 100644 index 000000000..2d419bac5 --- /dev/null +++ b/benchmarks/DASB/run_extraction.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +########################################################### +# Script to extracts and save tokens from dataset. +# +# Usage: +# ./ $run_extraction.sh --data_folder LibriSpeech --output_folder results/LibriSpeech/ASR/encodec/LSTM --tokenizer encidec --dataset LibriSpeech + +# Authors: +# - Pooneh Mousavi (2024) +########################################################### + +# Initialize variables +data_folder="" +output_folder="" +tokenizer="" +dataset="" +save_embedding=False +additional_flags="" + + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --data_folder data_folder_path Data folder path" + echo " --output_folder output_path Output folder path" + echo " --tokenizer tokenizer tokenizer" + echo " --dataset dataset dataset" + echo " --save_embedding save_embedding If True the the embedding are saved. Default: False" + exit 1 +} + + +# Parse command line +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + --data_folder) + data_folder="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --tokenizer) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --save_embedding) + save_embedding="$2" + shift + shift + ;; + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$tokenizer" ] ||[ -z "$data_folder" ] || [ -z "$output_folder" ] || [ -z "$dataset" ]; then + echo "ERROR: Missing required arguments! 
Please provide all required options." + print_argument_descriptions +fi + + +# Make sure the output_folder is created +mkdir -p $output_folder + +# Print command line arguments and save to file +{ + echo "data_folder: $data_folder" + echo "output_folder: $output_folder" + echo "tokenizer: $tokenizer" + echo "dataset: $dataset" + echo "save_embedding: $save_embedding" + echo "additional flags: $additional_flags" +} | tee "$output_folder/flags.txt" + + +# Creating output folder +mkdir -p $output_folder +mkdir -p $data_folder + +python $dataset/extraction/extract.py $dataset/extraction/hparams/$tokenizer.yaml --data_folder=$data_folder --output_folder=$output_folder --save_embedding=$save_embedding \ +$additional_flags diff --git a/benchmarks/DASB/run_generative_benchmark.sh b/benchmarks/DASB/run_generative_benchmark.sh deleted file mode 100644 index d5dc0d1d4..000000000 --- a/benchmarks/DASB/run_generative_benchmark.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# Please consult the README.md file for instructions on how to run the benchmark. - -tokenizer_name=$1 -if [[ "$tokenizer_name" == "" ]]; then - echo "Usage: run_generative_benchmark.sh " - exit 1 -fi - -output_folder='path/to/output' -librimix_path='path/to/Libri2Mix' -voicebank_path='path/to/VoiceBank' -ljspeech_path='path/to/ljspeech' -utmos_path='path/to/utmos' -tts_args="--token_list_file_text %recipe_root%/hparams/char_en.txt --utmos_model_path $utmos_path" - -declare -a DatasetsFolders=(\ - "$librimix_path" \ - "$voicebank_path" \ - "$ljspeech_path" \ - "$ljspeech_path" \ -) -declare -a ConsideredTasks=(\ - 'Libri2Mix/separation' \ - 'VoiceBank/enhancement' \ - 'LJSpeech/TTS' \ - 'LJSpeech/TTS' \ -) -declare -a DownStreams=(\ - 'conformer' \ - 'conformer' \ - 'tokotron' \ - 'tokotron' \ -) -declare -a ExtraArgs=(\ - '' \ - '' \ - "$tts_args" \ - "$tts_args --enc_num_layers 3 --dec_num_layers 6" \ -) - -declare -a OutputSuffix=(\ - '' \ - '' \ - '' \ - '-small' -) - -shift -script_args="$@" - -for i in "${!ConsideredTasks[@]}"; do - task=${ConsideredTasks[i]} - downstream=${DownStreams[i]} - dataset_folder=${DatasetsFolders[i]} - extra_args=${ExtraArgs[i]} - suffix=${OutputSuffix[i]} - recipe_root="$task/$downstream" - recipe_extra_args="$script_args ${extra_args//%recipe_root%/$recipe_root}" - set -- "$recipe_extra_args" - echo "${tokenizer_name}/${task}/${downstream}" - python $task/$downstream/train_$tokenizer_name.py \ - $task/$downstream/hparams/train_$tokenizer_name.yaml \ - --output_folder $output_folder/$tokenizer_name/$task/$downstream$suffix \ - --data_folder $dataset_folder \ - $@ -done From 95333cf4c9ab0a19c5254840fb6a7d14505eefd7 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Tue, 24 Dec 2024 18:56:21 -0500 Subject: [PATCH 033/270] fix bug --- benchmarks/DASB/run_extraction.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/run_extraction.sh b/benchmarks/DASB/run_extraction.sh index 2d419bac5..e121c35cb 100644 --- a/benchmarks/DASB/run_extraction.sh +++ b/benchmarks/DASB/run_extraction.sh @@ -49,13 +49,13 @@ while [[ $# -gt 0 ]]; do shift ;; - --tokenizer) - task="$2" + --tokenizer) + tokenizer="$2" shift shift ;; - --dataset) + --dataset) dataset="$2" shift shift From 096fc43c659122952e7be36257ac6a4d7f75ce39 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 13:48:17 -0500 Subject: [PATCH 034/270] add hyperparam tuning --- .../LibriSpeech/ASR/hparams/LSTM/train.yaml | 8 +- benchmarks/DASB/extra_requirements.txt | 1 + 
benchmarks/DASB/orion/hparams_tpe.yaml | 6 + benchmarks/DASB/run_hparam_optimization.sh | 422 ++++++++++++++++++ 4 files changed, 433 insertions(+), 4 deletions(-) create mode 100644 benchmarks/DASB/orion/hparams_tpe.yaml create mode 100644 benchmarks/DASB/run_hparam_optimization.sh diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index eb0d98d4b..98ba22d23 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -42,7 +42,8 @@ pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, th ####################### Training Parameters #################################### number_of_epochs: 20 -batch_size: 4 +batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)" +batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -54,7 +55,7 @@ valid_search_interval: 1 avg_checkpoints: 10 # Number of checkpoints to average for evaluation cache_size: 1.e+10 -lr_model: 0.001 +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" weight_decay: 0.0005 @@ -109,8 +110,7 @@ freeze_embedding: False # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 -dnn_neurons: 1024 +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 4,discrete=True)"dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index 1068c2b2a..e97e16b28 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -4,6 +4,7 @@ kaldiio librosa>=0.9.2 onnxruntime>=1.16.3 orion +orion[profet] scikit-learn speechbrain>=1.0.0 speechtokenizer>=0.1.2 diff --git a/benchmarks/DASB/orion/hparams_tpe.yaml b/benchmarks/DASB/orion/hparams_tpe.yaml new file mode 100644 index 000000000..cf2f6fd54 --- /dev/null +++ b/benchmarks/DASB/orion/hparams_tpe.yaml @@ -0,0 +1,6 @@ +experiment: + algorithms: + tpe: + seed: 1986 + n_initial_points: 20 + n_ei_candidates: 24 \ No newline at end of file diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh new file mode 100644 index 000000000..de5110b96 --- /dev/null +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -0,0 +1,422 @@ +#!/bin/bash + +########################################################### +# Hyperparameter Tuning Script for EEG Model with Orion +########################################################### + +# Description: +# This script facilitates hyperparameter tuning for a given EEG model and dataset using Orion. +# It supports leave-one-subject-out and/or leave-one-session-out training strategies. + +# Usage: +# ./run_hparam_optimization.sh --exp_name 'EEGNet_BNCI2014001_hopt' \ +# --output_folder results/MotorImagery/BNCI2014001/EEGNet/hopt \ +# --data_folder eeg_data/ \ +# --hparams hparams/MotorImagery/BNCI2014001/EEGNet.yaml \ +# --nruns 1 --nruns_eval 10 \ +# --eval_metric acc \ +# --exp_max_trials 50 \ +# --store_all True \ +# --device 'cpu' +# +# Optimization Steps: +# The script supports multiple hyperparameter optimization steps. +# We found it convenient to first optimize training and model hyperparameters, +# and then optimize data augmentation hyperparameters in a separate step. + +# Script Workflow: +# 1. Search for the orion flags in the specified hparam file. +# 2. Run the orion-hunt command for hyperparameter tuning. 
+# By default, TPE (Tree-structured Parzen Estimator) hyperparameter tuning is +# performed, as specified in the default orion config file at hparams/orion/hparams_tpe.yaml. +# 3. Save the best hyperparameters, which can be viewed using torch-info. +# 4. Loop until flags like @orion_step are found in the YAML file. +# +# Final Performance Evaluation: +# At the end of the optimization process, the script computes the final performance +# using the best hyperparameters on the test set. +# This is done by averaging over nruns_eval different seeds. +# +# Note: More detailed information can be found in the README.md file. + +# Authors: +# - Pooneh Mousavi 2024 +########################################################### + +# Initialize variables +exp_name="hopt" +output_folder="" +data_folder="" +cached_data_folder="" +task="" +dataset="" +hparams="" +nruns="" +nruns_eval=10 +eval_metric="acc" +seed=1986 +config_file="orion/hparams_tpe.yaml" +mne_dir="" +orion_db_address="" +orion_db_type="PickledDB" +exp_max_trials=50 +store_all=True +compress_exp=True + +# Function to print argument descriptions and exit +print_argument_descriptions() { + echo "Usage: $0 [options]" + echo "Options:" + echo " --exp_name Name Name that Orion gives to the experiment" + echo " --output_folder output_path Output folder were the results will be stored" + echo " --data_folder data_path Folder were the data are stored. If not available, they will be downloaded there." + echo " --cached_data_folder path [Optional] Folder were the data in pkl format will be cached." + echo " --task task downstream task" + echo " --dataset dataset dataset" + echo " --hparms hparam_file YAML file containing the hparam to optimize. The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" + echo " --nruns num_runs Number of runs for each hparam selection." + echo " --nruns_eval num_runs Number of runs for the final evaluation (with best hparams) on the test set" + echo " --eval_metric metric [Optional] Evaluation metric description. Default:acc" + echo " --seed random_seed [Optional] Seed (random if not specified)" + echo " --config_file config_file [Optional] Orion config file. Default: hparams/orion/hparams_tpe.yaml" + echo " --mne_dir mne_dir [Optional] MNE directory. Need it different from your home (see notes on MNE in README.md)" + echo " --orion_db_address [Optional] Path of the database where orion will store hparams and performance" + echo " --orion_db_type db_type [Optional] Type of the dataset that orion will use. Default: PickledDB" + echo " --exp_max_trials int [Optional] Maximum number of hparam trials for each oprimization step. Default:50" + echo " --store_all Bool [Optional] When set to True, the output folders of all hparam trials will be stored in randomly named folders. Default: False" + echo " --compress_exp Bool [Optional] When set to True, this option compresses the output folders of all hyperparameter trials into a single tar.gz file. This is particularly useful when store_all is set to True, as it helps prevent the accumulation of a large number of files. 
Default: False" + exit 1 +} + +POSITIONAL_ARGS=() + +while [[ $# -gt 0 ]]; do + case $1 in + + --exp_name) + exp_name="$2" + shift + shift + ;; + + --output_folder) + output_folder="$2" + shift + shift + ;; + + --data_folder) + data_folder="$2" + shift + shift + ;; + + --hparams) + hparams="$2" + shift + shift + ;; + + --cached_data_folder) + cached_data_folder="$2" + shift + shift + ;; + + --task) + task="$2" + shift + shift + ;; + + --dataset) + dataset="$2" + shift + shift + ;; + + --seed) + seed="$2" + shift + shift + ;; + + --nruns) + nruns="$2" + shift + shift + ;; + + --nruns_eval) + nruns_eval="$2" + shift + shift + ;; + + + --eval_metric) + eval_metric="$2" + shift + shift + ;; + + + + --config_file) + config_file="$2" + shift + shift + ;; + + --mne_dir) + mne_dir="$2" + shift + shift + ;; + + --orion_db_address) + orion_db_address="$2" + shift + shift + ;; + + --orion_db_type) + orion_db_type="$2" + shift + shift + ;; + + --exp_max_trials) + exp_max_trials="$2" + shift + shift + ;; + + --store_all) + store_all="$2" + shift + shift + ;; + + --compress_exp) + compress_exp="$2" + shift + shift + ;; + + --help) + print_argument_descriptions + ;; + + -*|--*) + additional_flags+="$1 $2 " # store additional flags + shift # past argument + ;; + + + *) + POSITIONAL_ARGS+=("$1") # save positional arg + shift # past argument + ;; + esac +done + + +# Check for required arguments +if [ -z "$output_folder" ] || [ -z "$data_folder" ] || [ -z "$hparams" ] || [ -z "$nruns" ]; then + echo "ERROR: Missing required arguments! Please provide all required options." + print_argument_descriptions +fi + +# Set mne_dir if specified +if [ "$mne_dir" ]; then + export _MNE_FAKE_HOME_DIR=$mne_dir +fi + +# Assign default value to cached_data_folder +if [ -z "$cached_data_folder" ]; then + cached_data_folder="$data_folder/cache" +fi + + +# Set orion db address if specified +if [ -z "$orion_db_address" ]; then + orion_db_address=$output_folder'/'$exp_name'.pkl' +fi +export ORION_DB_ADDRESS=$orion_db_address +export ORION_DB_TYPE=$orion_db_type + +echo "-------------------------------------" +echo "Experiment Name: $exp_name" +echo "hparams: $hparams" +echo "Output Folder: $output_folder" +echo "Data Folder: $data_folder" +echo "Cached Data Folder: $cached_data_folder" +echo "task: $task" +echo "dataset: $dataset" +echo "Hparam File: $hparams" +echo "Number of Runs: $nruns" +echo "Number of Eval Runs: $nruns_eval" +echo "Eval Metric: $eval_metric" +echo "Seed: $seed" +echo "Additional Flags: $additional_flags" +echo "Orion Config File: $config_file" +echo "Orion Database type: $orion_db_type" +echo "Orion Database file: $orion_db_address" +echo "Experiment Max Trials: $exp_max_trials" +echo "-------------------------------------" + + +# This function will extract all the optimization flags added in the yaml file +# The input is a text file (e.g, a yaml file) and a pattern (e.g, "@orion_step1:") +# The ouput are the detected flags (e.g., --dropout~"uniform(0.0, 0.5)"). +get_flag() { + local file_path="$1" + local pattern="$2" + + # Check if the file exists + if [ ! -f "$file_path" ]; then + echo "Error: File '$file_path' not found." 
+        return 1
+    fi
+
+    # Use grep to find all lines containing the pattern and then extract the flags using sed
+    grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | tr -d '\n'
+}
+
+
+# Function for updating the hparam yaml file with the best hparams found at step 1
+update_hparams() {
+    local best_hparams_file="$1"
+    local hparams_yaml_file="$2"
+    local output_yaml_file="$3"
+
+    # Read the values from best_hparams.txt into an associative array
+    declare -A best_hparams
+    while IFS=": " read -r key value; do
+        best_hparams["$key"]=$value
+    done < "$best_hparams_file"
+
+
+    # Read the hparams.yaml file into a variable
+    local hparams_content=$(cat "$hparams_yaml_file")
+
+    # Update values in hparams_content using values from best_hparams
+    for key in "${!best_hparams[@]}"; do
+        local pattern="^$key: .*"
+        local replacement="$key: ${best_hparams[$key]}"
+        hparams_content=$(sed "s/$pattern/$replacement/g" <<< "$hparams_content")
+    done
+
+    # Write the updated content to a new YAML file
+    echo "$hparams_content" > "$output_yaml_file"
+}
+
+# Function for extracting the best hparams from orion-info
+function extract_best_params() {
+    local input_file="$1"
+    local best_trial_line=$(grep -n "best trial:" "$input_file" | cut -d ":" -f 1)
+    local params_lines=$(tail -n +$best_trial_line "$input_file" | awk '/params:/{flag=1;next}/start time:/{flag=0}flag')
+    local formatted_params=$(echo "$params_lines" | sed -e 's/^[[:space:]]*//' -e 's/: /: /' -e '/^$/d' -e 's#^/##')
+    echo "$formatted_params"
+}
+
+# Running hparam tuning (loop over multiple steps)
+step_id=1
+hparams_step=$hparams
+pattern="@orion_step1:"
+opt_flags=$(get_flag "$hparams_step" "$pattern")
+
+# Check if the string is empty and exit with an error if it is
+if [ -z "$opt_flags" ]; then
+    echo "Error: Optimization flags not found in '$hparams'"
+    echo "Please ensure that the Orion optimization flags are set in the hparam file using in-line comments like:"
+    echo "# @orion_step1: --dropout~\"uniform(0.0, 0.5)\""
+    exit 1 # Exit with a non-zero error code
+fi
+
+
+while [ -n "$opt_flags" ]; do
+    # Set up the output folder and experiment name for this optimization step
+    output_folder_step="$output_folder"/step"$step_id"
+    mkdir -p $output_folder_step
+    exp_name_step="$exp_name"_step"$step_id"
+
+    echo
+    echo "**********************************************************************************************"
+    echo "Running hparam tuning (step $step_id)..."
+    echo "- This might take several hours!"
+    echo "- The best set of hparams will be saved in $output_folder_step"
+    echo "- You can monitor the evolution of the hparam optimization with: orion status -n $exp_name"
+    echo "......"
+    echo "**********************************************************************************************"
+    echo
+    # Setting up orion command
+    orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \
+    ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder $cached_data_folder \
+    --output_folder $output_folder_step/exp --task $task --dataset $dataset --seed $seed --nruns $nruns \
+    --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags"
+
+
+    # Appending the optimization flags
+    orion_hunt_command="$orion_hunt_command $opt_flags"
+
+    echo $orion_hunt_command &> "$output_folder_step/orion_hunt_command.txt"
+
+    # Execute the command for hparam tuning
+    eval $orion_hunt_command
+
+    # Compress the exp folder (if required)
+    if [ "$compress_exp" = True ]; then
+        tar -czf "$output_folder_step/exp.tar.gz" "$output_folder_step/exp"
+        if [ -d "$output_folder_step/exp" ]; then
+            rm -rf "$output_folder_step/exp"
+        fi
+
+    fi
+
+    # Storing best hparams
+    orion info --name $exp_name_step &> $output_folder_step/orion-info.txt
+
+    # Extract list of the best hparams from orion-info
+    # Find the line number where "best trial:" appears
+    best_trial_line=$(grep -n "best trial:" $output_folder_step/orion-info.txt | cut -d ":" -f 1)
+
+    # Extract and store the best set of hparams
+    best_params_output=$(extract_best_params "$output_folder_step/orion-info.txt")
+    best_hparams_file="$output_folder_step/best_hparams.txt"
+    echo "$best_params_output" > $best_hparams_file
+
+    # Store the current best yaml file
+    best_yaml_file="$output_folder_step/best_hparams.yaml"
+    update_hparams "$best_hparams_file" "$hparams_step" "$best_yaml_file"
+
+    # Update best hparam step
+    hparams_step=$best_yaml_file
+
+    # Update step variable
+    ((step_id++))
+
+    # Update search pattern
+    pattern="@orion_step$step_id:"
+
+    # update optimization flags pattern
+    opt_flags=$(get_flag "$hparams_step" "$pattern")
+done
+
+echo
+echo "**********************************************************************************************"
+echo "Running Final Evaluation on the best hparams (test-set)..."
+echo "**********************************************************************************************" +echo + +final_yaml_file="$output_folder/best_hparams.yaml" +scp $best_yaml_file $final_yaml_file + +# Running evaluation on the test set for the best models +./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ + --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ + --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ + --rnd_dir $store_all $additional_flags + +echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From 8dc0161dd7c088e97faf8d7e22429646678d535b Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 13:50:59 -0500 Subject: [PATCH 035/270] fix precommit --- benchmarks/DASB/orion/hparams_tpe.yaml | 2 +- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/orion/hparams_tpe.yaml b/benchmarks/DASB/orion/hparams_tpe.yaml index cf2f6fd54..fb6a7c9b0 100644 --- a/benchmarks/DASB/orion/hparams_tpe.yaml +++ b/benchmarks/DASB/orion/hparams_tpe.yaml @@ -3,4 +3,4 @@ experiment: tpe: seed: 1986 n_initial_points: 20 - n_ei_candidates: 24 \ No newline at end of file + n_ei_candidates: 24 diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index de5110b96..39766018f 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -128,7 +128,7 @@ while [[ $# -gt 0 ]]; do shift shift ;; - + --dataset) dataset="$2" shift From c0f4feeafaad74e6e9dea038129917277e76f756 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 13:57:56 -0500 Subject: [PATCH 036/270] modify hparams.sh input order --- benchmarks/DASB/run_hparam_optimization.sh | 31 ++++++++++------------ 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 39766018f..3c84f5ad4 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -45,16 +45,16 @@ # Initialize variables exp_name="hopt" -output_folder="" +hparams="" data_folder="" cached_data_folder="" +output_folder="" task="" dataset="" -hparams="" +seed=1986 nruns="" nruns_eval=10 eval_metric="acc" -seed=1986 config_file="orion/hparams_tpe.yaml" mne_dir="" orion_db_address="" @@ -68,16 +68,16 @@ print_argument_descriptions() { echo "Usage: $0 [options]" echo "Options:" echo " --exp_name Name Name that Orion gives to the experiment" - echo " --output_folder output_path Output folder were the results will be stored" + echo " --hparms hparam_file YAML file containing the hparam to optimize. The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" echo " --data_folder data_path Folder were the data are stored. If not available, they will be downloaded there." echo " --cached_data_folder path [Optional] Folder were the data in pkl format will be cached." - echo " --task task downstream task" - echo " --dataset dataset dataset" - echo " --hparms hparam_file YAML file containing the hparam to optimize. 
The hyperparameters decorated with @orion_step1 or @orion_step1 in the YAML file will be used" + echo " --output_folder output_path Output folder were the results will be stored" + echo " --task task downstream task" + echo " --dataset dataset dataset" + echo " --seed random_seed [Optional] Seed (random if not specified)" echo " --nruns num_runs Number of runs for each hparam selection." echo " --nruns_eval num_runs Number of runs for the final evaluation (with best hparams) on the test set" echo " --eval_metric metric [Optional] Evaluation metric description. Default:acc" - echo " --seed random_seed [Optional] Seed (random if not specified)" echo " --config_file config_file [Optional] Orion config file. Default: hparams/orion/hparams_tpe.yaml" echo " --mne_dir mne_dir [Optional] MNE directory. Need it different from your home (see notes on MNE in README.md)" echo " --orion_db_address [Optional] Path of the database where orion will store hparams and performance" @@ -99,8 +99,8 @@ while [[ $# -gt 0 ]]; do shift ;; - --output_folder) - output_folder="$2" + --hparams) + hparams="$2" shift shift ;; @@ -111,14 +111,14 @@ while [[ $# -gt 0 ]]; do shift ;; - --hparams) - hparams="$2" + --cached_data_folder) + cached_data_folder="$2" shift shift ;; - --cached_data_folder) - cached_data_folder="$2" + --output_folder) + output_folder="$2" shift shift ;; @@ -153,15 +153,12 @@ while [[ $# -gt 0 ]]; do shift ;; - --eval_metric) eval_metric="$2" shift shift ;; - - --config_file) config_file="$2" shift From a595cf6172f4c035cc3c2e9cc880ce411bf47a4c Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 16:13:54 -0500 Subject: [PATCH 037/270] only applying testing for final run HT --- .../LibriSpeech/ASR/hparams/LSTM/train.yaml | 2 +- benchmarks/DASB/LibriSpeech/ASR/train.py | 27 ++++++++++--------- benchmarks/DASB/run_hparam_optimization.sh | 9 +++---- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 98ba22d23..1be23bc0c 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -18,7 +18,7 @@ output_folder: !ref results/LSTM// output_wer_folder: !ref /wer.txt save_folder: !ref /save train_log: !ref /train_log.txt - +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
# Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index 19aa43786..49d2248cb 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -433,16 +433,17 @@ def text_pipeline(wrd): valid_loader_kwargs=hparams["valid_dataloader_opts"], ) - # Testing - if not os.path.exists(hparams["output_wer_folder"]): - os.makedirs(hparams["output_wer_folder"]) - - for k in test_datasets.keys(): # keys are test_clean, test_other etc - asr_brain.hparams.output_wer_folder = os.path.join( - hparams["output_wer_folder"], f"wer_{k}.txt" - ) - asr_brain.evaluate( - test_datasets[k], - test_loader_kwargs=hparams["test_dataloader_opts"], - min_key="WER", - ) + if hparams["testing"]: + # Testing + if not os.path.exists(hparams["output_wer_folder"]): + os.makedirs(hparams["output_wer_folder"]) + + for k in test_datasets.keys(): # keys are test_clean, test_other etc + asr_brain.hparams.output_wer_folder = os.path.join( + hparams["output_wer_folder"], f"wer_{k}.txt" + ) + asr_brain.evaluate( + test_datasets[k], + test_loader_kwargs=hparams["test_dataloader_opts"], + min_key="WER", + ) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 3c84f5ad4..390177b28 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -5,8 +5,7 @@ ########################################################### # Description: -# This script facilitates hyperparameter tuning for a given EEG model and dataset using Orion. -# It supports leave-one-subject-out and/or leave-one-session-out training strategies. +# This script facilitates hyperparameter tuning for a given audio tokenizer, dowsnteram model and dataset using Orion. # Usage: # ./run_hparam_optimization.sh --exp_name 'EEGNet_BNCI2014001_hopt' \ @@ -21,8 +20,6 @@ # # Optimization Steps: # The script supports multiple hyperparameter optimization steps. -# We found it convenient to first optimize training and model hyperparameters, -# and then optimize data augmentation hyperparameters in a separate step. # Script Workflow: # 1. Search for the orion flags in the specified hparam file. 
@@ -352,7 +349,7 @@ while [ -n "$opt_flags" ]; do orion_hunt_command="orion hunt -n $exp_name_step -c $config_file --exp-max-trials $exp_max_trials \ ./run_experiments.sh --hparams $hparams_step --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder_step/exp --task $task --dataset $dataset --seed $seed --nruns $nruns \ - --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all $additional_flags" + --eval_metric $eval_metric --eval_set dev --rnd_dir $store_all --testing False $additional_flags" # Appending the optimization flags @@ -414,6 +411,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all $additional_flags + --rnd_dir $store_all --testing False $additional_flags echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From 78da6c14e9c2b58d5cfce9a4341707689a9eab7e Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 21:08:03 -0500 Subject: [PATCH 038/270] fix bug --- benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 1be23bc0c..8b9581dc9 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -110,7 +110,8 @@ freeze_embedding: False # LSTM activation: !name:torch.nn.Sigmoid -dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 4,discrete=True)"dnn_neurons: 1024 +dnn_layers: 2 # @orion_step1: --dnn_layers~"uniform(1, 4,discrete=True)" +dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 From 6a3a7a5127c7f63534bc879305b602bb9170670e Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 21:26:45 -0500 Subject: [PATCH 039/270] fix bug --- benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml index 8b9581dc9..be5c18d5b 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml @@ -41,7 +41,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved. 
 ####################### Training Parameters ####################################
-number_of_epochs: 20
+number_of_epochs: 20 # @orion_step1: --number_of_epochs~"fidelity(5, 20, base=4)"
 batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)"
 batch_size: !ref 2 **
 test_batch_size: 1

From e9ff250486b8b1ed7adde95cc07e36a46c4b1441 Mon Sep 17 00:00:00 2001
From: poonehmousavi
Date: Wed, 25 Dec 2024 21:32:24 -0500
Subject: [PATCH 040/270] add hupertun for contextnet

---
 .../DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml   |  2 +-
 .../LibriSpeech/ASR/hparams/contextnet/train.yaml  | 12 +++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml
index be5c18d5b..8b9581dc9 100644
--- a/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml
+++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml
@@ -41,7 +41,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav
 pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, this should be set to the path where the pretrained embeddings are saved.

 ####################### Training Parameters ####################################
-number_of_epochs: 20 # @orion_step1: --number_of_epochs~"fidelity(5, 20, base=4)"
+number_of_epochs: 20
 batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)"
 batch_size: !ref 2 **
 test_batch_size: 1
diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml
index aaca2668d..cd45d7d9a 100644
--- a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml
+++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml
@@ -18,7 +18,7 @@ output_folder: !ref results/LSTM//
 output_wer_folder: !ref /wer.txt
 save_folder: !ref /save
 train_log: !ref /train_log.txt
-
+testing: True # If set to True, the test evaluation is done, otherwise skipped.
# Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -42,7 +42,8 @@ pretrain_embeddings_folder: none # Optional: If pretrain_embeddings is True, th ####################### Training Parameters #################################### number_of_epochs: 20 -batch_size: 4 +batch_size_exponent: 4 # @orion_step1: --batch_size_exponent~"uniform(2, 4,discrete=True)" +batch_size: !ref 2 ** test_batch_size: 1 grad_accumulation_factor: 2 max_grad_norm: 5.0 @@ -107,11 +108,8 @@ encoder_dim: 1024 pretrain_embeddings: False freeze_embedding: False -# LSTM -activation: !name:torch.nn.Sigmoid -dnn_layers: 2 -dnn_neurons: 1024 -dropout: 0.2 +# Contextnet + output_neurons: 31 # BPE parameters From 3e2fe0c89050745a2375f4c10c6d5654059d1f96 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 21:52:00 -0500 Subject: [PATCH 041/270] add etsting to average run --- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 390177b28..5cbde3b20 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -411,6 +411,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing False $additional_flags + --rnd_dir $store_all --testing True $additional_flags echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From f378aeca46d2439d4bc747c8f656ec09173d24be Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 21:52:00 -0500 Subject: [PATCH 042/270] add lr for HT for contextnet --- benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml | 2 +- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml index cd45d7d9a..eab197c68 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR/hparams/contextnet/train.yaml @@ -55,7 +55,7 @@ valid_search_interval: 1 avg_checkpoints: 10 # Number of checkpoints to average for evaluation cache_size: 1.e+10 -lr_model: 0.001 +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" weight_decay: 0.0005 diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 390177b28..5cbde3b20 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -411,6 +411,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing False $additional_flags + --rnd_dir $store_all --testing True $additional_flags echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From b2bd3165bfa497b1742961eba8dae405171f0d77 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Wed, 25 Dec 2024 
22:29:14 -0500 Subject: [PATCH 043/270] add measuring time --- benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py | 9 +++++++++ benchmarks/DASB/LibriSpeech/ASR/train.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py index 99eeb81fe..9fa3e3f3d 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py @@ -10,6 +10,7 @@ import os import sys +import time import torch import torchaudio import logging @@ -423,6 +424,8 @@ def text_pipeline(wrd): if valid_bsampler is not None: valid_dataloader_opts = {"batch_sampler": valid_bsampler} + # Measure time + start_time = time.time() # Start the timer # Training asr_brain.fit( @@ -433,6 +436,12 @@ def text_pipeline(wrd): valid_loader_kwargs=hparams["valid_dataloader_opts"], ) + end_time = time.time() # End the timer + # Calculate elapsed time + elapsed_time = end_time - start_time + hparams["train_logger"].log_stats( + stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, + ) # Testing if not os.path.exists(hparams["output_wer_folder"]): os.makedirs(hparams["output_wer_folder"]) diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index 49d2248cb..a66c0c5bf 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -11,6 +11,7 @@ import os import sys +import time import torch import torchaudio import logging @@ -424,6 +425,8 @@ def text_pipeline(wrd): if valid_bsampler is not None: valid_dataloader_opts = {"batch_sampler": valid_bsampler} + # Measure time + start_time = time.time() # Start the timer # Training asr_brain.fit( asr_brain.hparams.epoch_counter, @@ -433,6 +436,12 @@ def text_pipeline(wrd): valid_loader_kwargs=hparams["valid_dataloader_opts"], ) + end_time = time.time() # End the timer + # Calculate elapsed time + elapsed_time = end_time - start_time + hparams["train_logger"].log_stats( + stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, + ) if hparams["testing"]: # Testing if not os.path.exists(hparams["output_wer_folder"]): From 9de693453ac350cd9f4997a380d41ca4a4537b4d Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Thu, 26 Dec 2024 01:14:50 -0500 Subject: [PATCH 044/270] add time measure --- benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py | 7 ++++--- benchmarks/DASB/LibriSpeech/ASR/train.py | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py index 9fa3e3f3d..938ce8b96 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py @@ -439,9 +439,10 @@ def text_pipeline(wrd): end_time = time.time() # End the timer # Calculate elapsed time elapsed_time = end_time - start_time - hparams["train_logger"].log_stats( - stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, - ) + logger.info(f"Model execution time: {elapsed_time:.6f} seconds") + # hparams["train_logger"].log_stats( + # stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, + # ) # Testing if not os.path.exists(hparams["output_wer_folder"]): os.makedirs(hparams["output_wer_folder"]) diff --git a/benchmarks/DASB/LibriSpeech/ASR/train.py b/benchmarks/DASB/LibriSpeech/ASR/train.py index a66c0c5bf..ec6ac1b42 100644 --- a/benchmarks/DASB/LibriSpeech/ASR/train.py +++ 
b/benchmarks/DASB/LibriSpeech/ASR/train.py @@ -439,9 +439,8 @@ def text_pipeline(wrd): end_time = time.time() # End the timer # Calculate elapsed time elapsed_time = end_time - start_time - hparams["train_logger"].log_stats( - stats_meta={f"Model execution time: {elapsed_time:.6f} seconds"}, - ) + logger.info(f"Model execution time: {elapsed_time:.6f} seconds") + if hparams["testing"]: # Testing if not os.path.exists(hparams["output_wer_folder"]): From c4e273852c2b1078bfb70b5d515867788c982218 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:24:50 -0500 Subject: [PATCH 045/270] update readme + minor changes --- .../ASR-on-the-fly/hparams/LSTM/dac.yaml | 1 - .../ASR-on-the-fly/hparams/LSTM/encodec.yaml | 1 - .../hparams/LSTM/speech_tokenizer.yaml | 1 - .../hparams/contextnet/dac.yaml | 1 - .../hparams/contextnet/encodec.yaml | 1 - .../hparams/contextnet/speech_tokenizer.yaml | 1 - benchmarks/DASB/README.md | 176 +++++++++++++++--- benchmarks/DASB/run_extraction.sh | 2 +- benchmarks/DASB/run_hparam_optimization.sh | 24 ++- benchmarks/DASB/utils/tokenizer_interface.py | 82 +++++++- 10 files changed, 240 insertions(+), 50 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml index ff1749fab..605b772b5 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/dac.yaml @@ -119,7 +119,6 @@ dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml index dd4f62bf4..f13e3cb53 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/encodec.yaml @@ -116,7 +116,6 @@ dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml index bb0b32a43..d0e9aae5b 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml @@ -111,7 +111,6 @@ dnn_neurons: 1024 dropout: 0.2 output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml index b60b32604..8e73e3601 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/dac.yaml @@ -115,7 +115,6 @@ freeze_embedding: False # LSTM output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml index 7c0dcfc45..4d88a7978 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml +++ 
b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/encodec.yaml @@ -109,7 +109,6 @@ freeze_embedding: False output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml index 3dcd7eea7..7fdbf8d51 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml @@ -105,7 +105,6 @@ freeze_embedding: False output_neurons: 31 -# BPE parameters # BPE parameters token_type: char # ["unigram", "bpe", "char"] character_coverage: 1.0 diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index c3e42bf64..78d780739 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -29,10 +29,11 @@ For detailed information, refer to [paper](https://arxiv.org/pdf/2406.14294): - [Installation](#-installation) - [Discrete Audio Encoder](#-Discrete-Audio-Encoder) - [Datasets and Recipes](#-Datasets-and-Recipes) -- [Quickstart](#-quickstart) - - [Running a single task](#Running-a-single-task) - - [Running multiple tasks](#Runnin-multiple-tasks) +- [Training Scenarios](#-Training-Scenarios) + - [On-the-FlybToken Extraction](#On-the-Fly-Token-Extraction) + - [Offline-Token-Extraction](#Offline-Token-Extraction) - [‍Incorporating Your Audio Tokenizer](#-Incorporating-Your-Audio-Tokenizer) +- [Hyperparameter Tuning](#Hyperparameter-Tuning) - [Results](#-results) - [Contact](#-contact) - [Citing](#-citing) @@ -98,51 +99,166 @@ To set up SpeechBrain-DASB, follow these steps: | Libri2Mix | Speech Separation | Conformer | CRDNN | [github.com/JorisCos/LibriMix](https://github.com/JorisCos/LibriMix) | | LJSpeech | Text-to-Speech | Shallow Transformer | Deep Transformer | [keithito.com/LJ-Speech-Dataset/](https://keithito.com/LJ-Speech-Dataset/) | -# ▶️ Quickstart +# 📖 Training Scenarios -## Running a single task +We offer two different training scenarios: **on-the-fly token extraction** and **offline token extraction**. -If you have specific discrete model and want to benchmark it for a specific task, you need to run the following command: - ``` - python LibriSpeech/ASR/LSTM/train_[tokenzier_name].py LibriSpeech/ASR/LSTM/hparams/train_[tokenzier_name].yaml --output_folder my-output-folder --data_folder mypath/to/LibriSpeech - ``` +## 1. On-the-Fly Token Extraction +In this scenario, audio tokens are extracted dynamically during training. To enhance efficiency, we use a caching mechanism where tokens are saved in memory during the first epoch and retrieved for subsequent epochs. However, this approach has some limitations: +- It works best when the dataset is small, the bitrate is low, and batching is sorted (not random). +- It is unsuitable when data augmentation is required. -## Running multiple tasks +You can also disable the caching mechanism if needed. -To run all tasks, make the following changes: +Currently, the on-the-fly token extraction is applied only in the recipe located at: +`LibriSpeech/ASR-on-the-fly` -1. Edit the `run_discriminative_benchmark.sh` and `run_genarative_benchmark.sh` files and modify tokenizer related values for example the bitrate , number of codebooks, and etc. -2. 
Choose a set of tasks from the provided list and, for each task, select a downstream architecture from the available options (see list below). -3. Update the variables defined in `run_benchmark.sh` with two lists of equal size. In the `ConsideredTasks` list, specify the tasks you want to run (e.g., `'LibriSpeechASR' 'LibriSpeechASR' 'IEMOCAP'`). In the `Downstreams` list, specify the corresponding downstream architecture for each task (e.g., `'BiLSTM'`, `contextnet`, `'ecapa_tdnn'`). +If you wish to adapt this strategy for your own recipe, you can copy and modify the existing recipe as needed. Here's how to run the on-the-fly recipe: - For example, if you set `ConsideredTasks=('LibriSpeechASR' 'LibriSpeechASR' 'IEMOCAP')` and `Downstreams=('BiLSTM', 'contextnet', 'ecapa_tdnn')`, the benchmark will be executed as follows: - - LibriSpeechASR with BiLSTM as the probing head - - LibriSpeechASR with contextnet as the probing head - - IEMOCAP with ecapa_tdnn as the probing head. +```bash +python LibriSpeech/ASR-on-the-fly/train.py LibriSpeech/ASR-on-the-fly/hparams/LSTM/{TOKENIZER}.yaml --data_folder=path/LibriSpeech --output_folder=path/results/LibriSpeech/ASR/{TOKENIZER}/LSTM +``` -3. Run the following command: - ``` - bash run_discriminative_benchmark.sh [tokenzier_name] - bash run_genarative_benchmark.sh [tokenzier_name] - ``` - You could also pass extra arguments as far as they are consistent across all tasks. +> **Note:** On-the-fly extraction can be time-consuming, which is why we also provide an alternative approach: **offline token extraction**. + +--- + +## 2. Offline Token Extraction +In this scenario, all tokens are pre-extracted in a separate recipe. We recommend using the highest number of codebooks available for token extraction and then choosing the desired settings during training. + +### Token Extraction Command +To extract tokens, use the following command: + +```bash +python LibriSpeech/extraction/extract.py benchmarks/DASB/LibriSpeech/extraction/hparams/{tokenizer}.yaml --data_folder=path/LibriSpeech --num_codebooks=32 +``` + +If you wish to initialize your embedding layer with the tokenizer's embeddings while training your downstream model, set the flag `save_embedding` to `True`. For discrete SSL tokenizers, you can specify a list of layers for `--num_codebooks` instead of a single number (e.g., `--num_codebooks=[3,7,12]`). + +### Training with Pre-Extracted Tokens +Once tokens are extracted and saved, you can train a downstream model using the following command: + +```bash +bash run_experiments.sh --hparams benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/train.yaml --data_folder LibriSpeech --cached_data_folder cache/ --output_folder results/LibriSpeech/ASR/encodec/LSTM --task ASR --dataset LibriSpeech --seed 1986 --nruns 2 --eval_metric WER --tokens_folder LibriSpeech/extraction-emb/speech_tokenizer/save/librispeech/ +``` + +--- + +This workflow ensures flexibility, efficiency, and reproducibility for both training scenarios. Adapt the recipes as needed for your specific requirements! + +Here's a polished and formatted version for clarity, suitable for a README or documentation: + + + +# 🎛️ Hyperparameter Tuning + +Efficient hyperparameter tuning is critical when introducing novel models or experimenting with diverse datasets. Our benchmark establishes a standardized protocol for hyperparameter tuning, leveraging [Orion](https://orion.readthedocs.io/en/stable/) to ensure fair and consistent model comparisons. 
+ +--- + +## **Overview** + +Hyperparameter tuning is managed using the `./run_hparam_optimization.sh` script. This script coordinates multiple hyperparameter trials via `run_experiments.sh`. + + + +## **Incorporating Orion Flags in Hparam Files** + +To enable tuning, Orion flags should be directly embedded in the YAML hparam file using comments. For example, to optimize the learning rate (`lr`) parameter within a defined range, include the following line in the YAML file: + +```yaml +lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)" +``` + + + +## **Workflow of the Script** + +The script operates as follows: + +1. **Scans** the YAML hparam file for Orion flags. +2. **Executes** hyperparameter tuning using the `orion-hunt` command. +3. **Saves** the best hyperparameters for reference via `torch-info`. +4. **Iterates** until encountering flags such as `@orion_step` in the YAML file. + + + +## **Running Hyperparameter Optimization** + +You can perform hyperparameter optimization using a command like this: + +```bash +bash run_hparam_optimization.sh \ + --exp_name 'ASR-encodec-LSTM_hopt' \ + --hparams LibriSpeech/ASR/hparams/LSTM/train.yaml \ + --data_folder path/LibriSpeech \ + --cached_data_folder path/cache/ \ + --output_folder results/LibriSpeech/ASR/encodec/LSTM \ + --task ASR \ + --dataset LibriSpeech \ + --seed 1986 \ + --nruns 1 \ + --nruns_eval 5 \ + --eval_metric WER \ + --exp_max_trials 50 \ + --tokens_folder results/LibriSpeech/extraction-emb/encodec/save/librispeech/ \ + --run_name encodec +``` + +For more details on the arguments and customization options, refer to `./run_hparam_optimization.sh`. + + +### **Notes** + +1. **Execution Time**: + - Hyperparameter tuning may take several hours or even days, depending on the model complexity and dataset. + +2. **GPU vs. CPU**: + - By default, models are trained on GPU. To train on CPU instead, include the `--device cpu` flag. + +3. **Monitoring Progress**: + - Use the following command to monitor optimization status: + ```bash + orion status --all + ``` + - Ensure that Orion-specific environment variables are set in your bash environment. For example: + ```bash + export ORION_DB_ADDRESS=results/LibriSpeech/ASR/encodec/LSTM/hopt/ASR-encodec-LSTM_hopt.pkl + export ORION_DB_TYPE=pickleddb + ``` + Adjust `ORION_DB_ADDRESS` according to your experiment. + +4. **Resuming Optimization**: + - You can interrupt the script at any point. It will resume from the last completed trial. + +5. **Repetition of Optimization**: + - For multiple repetitions of the same hyperparameter optimization, modify the `--exp_name` parameter. + +6. **System Requirements**: + - The script is designed for Linux-based systems. A bash script is provided instead of Python due to its ability to manage diverse training loops across various subjects and sessions. + +--- + +This protocol ensures fair model comparison across diverse tasks and datasets. All reported results are derived using this standardized hyperparameter tuning methodology, enabling consistent assessments across models. - For generative task, make sure to set the `utmos_path` required for TTS evaluation. # 📝 ‍Incorporating Your Audio Tokenizer Let's now assume you've designed an audio and speech tokenizer in PyTorch and wish to integrate it into our benchmark. You're in luck because we've made this step as simple as possible for you! Here are the steps you should follow: -1. 
Write your model's code in a Python library saved in `benchmarks/DASB/model` (e.g., `benchmarks/MOABB/models/my_model.py`). -2. Create a YAML and py file for each task you want to experiment with. Thankfully, you don't have to start from scratch. For example, if you're working with LibriSpeech/ASR/LSTM, copy `benchmarks/DASB/LibriSpeech/ASR/contextnet/hparams/train_encodec.yaml` and save it in the same folder with a different name (e.g., `train_my_model.yaml` and `train_my_model.py`). +1. Write your model's code in a Python library saved in `benchmarks/DASB/model` (e.g., `benchmarks/DASB/models/my_model.py`). + +2. Add the tokenizer to `utils/tokenizer_interface.py` and ensure the `encode` and `decode` functions are consistent in functionality and output shape with the other tokenizers. + +3. Create a YAML and Python file for each task you want to experiment with. Thankfully, you don't have to start from scratch. For example, you can copy `LibriSpeech/extraction/hparams/encodec.yaml`, adapt it based on your needs, and save it in the same folder with a different name (e.g., `LibriSpeech/extraction/hparams/{YOUR_TOKENIZER_NAME}.yaml`). -3. Edit the relevant section of your `train_my_model.yaml` and `train_my_model.py`. Redefine the `codec:` to reference your custom model (e.g., `codec: !new:models.my_model.my_model`). +4. Edit the relevant sections of your `{YOUR_TOKENIZER_NAME}.yaml`. Redefine the `tokenizer:` field to reference your custom model (e.g., `tokenizer: !new:tokenizer_interface.your_tokenizer`). -4. Ensure you include the hyperparameters specific to your model. +5. Ensure you include the hyperparameters specific to your model. -5. Now, follow the instructions above to run an experiments across tasks. +6. Now, follow the instructions provided earlier to run experiments across tasks. **Note**: If you're not familiar with YAML, you can refer to our [HyperPyYAML tutorial](https://speechbrain.github.io/tutorial_basics.html) on the SpeechBrain website for guidance. # 📈 Results diff --git a/benchmarks/DASB/run_extraction.sh b/benchmarks/DASB/run_extraction.sh index e121c35cb..92cc81381 100644 --- a/benchmarks/DASB/run_extraction.sh +++ b/benchmarks/DASB/run_extraction.sh @@ -4,7 +4,7 @@ # Script to extracts and save tokens from dataset. # # Usage: -# ./ $run_extraction.sh --data_folder LibriSpeech --output_folder results/LibriSpeech/ASR/encodec/LSTM --tokenizer encidec --dataset LibriSpeech +# ./ $run_extraction.sh --data_folder LibriSpeech --output_folder results/LibriSpeech/ASR/encodec/LSTM --tokenizer encodec --dataset LibriSpeech # Authors: # - Pooneh Mousavi (2024) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 5cbde3b20..2ad1dddf3 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -8,16 +8,20 @@ # This script facilitates hyperparameter tuning for a given audio tokenizer, dowsnteram model and dataset using Orion. 
# Usage: -# ./run_hparam_optimization.sh --exp_name 'EEGNet_BNCI2014001_hopt' \ -# --output_folder results/MotorImagery/BNCI2014001/EEGNet/hopt \ -# --data_folder eeg_data/ \ -# --hparams hparams/MotorImagery/BNCI2014001/EEGNet.yaml \ -# --nruns 1 --nruns_eval 10 \ -# --eval_metric acc \ -# --exp_max_trials 50 \ -# --store_all True \ -# --device 'cpu' -# +# ./run_hparam_optimization.sh --exp_name 'ASR-encodec-LSTM_hopt' \ + # --hparams LibriSpeech/ASR/hparams/LSTM/train.yaml \ + # --data_folder path/LibriSpeech \ + # --cached_data_folder path/cache/ \ + # --output_folder results/LibriSpeech/ASR/encodec/LSTM \ + # --task ASR \ + # --dataset LibriSpeech \ + # --seed 1986 \ + # --nruns 1 \ + # --nruns_eval 5 \ + # --eval_metric WER \ + # --exp_max_trials 50 \ + # --tokens_folder results/LibriSpeech/extraction-emb/encodec/save/librispeech/ \ + # --run_name encodec # Optimization Steps: # The script supports multiple hyperparameter optimization steps. diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index f63ddd6aa..ff1194968 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -21,25 +21,101 @@ class BaseTokenizer(ABC): + """ + Abstract base class for tokenizers that encode signals into discrete tokens + and decode tokens back into signals. + + This class defines the essential methods that any tokenizer must implement, + including encoding, decoding, and retrieving pretrained embeddings. + + Naming Convenstion + ------------------ + B : int + Batch size. + T : int + Sequence length in the time domain. + N : int + Sequence length in the token domain. + C : int + Vocabulary size, assuming each codebook has the same number of tokens. + K : int + Number of codebooks. + """ + def __init__(self): + """ + Initialize the BaseTokenizer. + + This is a base constructor that other tokenizers can extend. + """ super().__init__() @abstractmethod @torch.no_grad() def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): - """Encode signal into tokens.""" + """ + Encode a signal into discrete tokens. + + Arguments + --------- + signal : torch.Tensor + Input signal with shape [B, T]. + lengths : torch.Tensor + Lengths of each sequence in the batch, with shape [B]. + num_codebooks : int, optional + Number of codebooks to use for encoding. If None, all codebooks are used (default: None). + If specified as an int, the tokens will be truncated to include only the first `num_codebooks` codebooks. If specified as a list, + the tokens will include only the codebooks at the specified indices. + **kwargs : dict + Additional arguments for the tokenizer. + + Returns + ------- + tokens : torch.Tensor + Discretized tokens with shape [B, N, K]. + """ pass @abstractmethod @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): - """Decode tokens to signal.""" + """ + Decode discrete tokens back into a signal. + + Arguments + --------- + tokens : torch.Tensor + Input tokens with shape [B, N, K]. + **kwargs : dict + Additional arguments for the tokenizer. + + Returns + ------- + signal : torch.Tensor + Reconstructed signal with shape [B, T]. + """ pass @abstractmethod @torch.no_grad() def get_pretrained_embeddings(self, vocab_size, num_codebooks, **kwargs): - """Get codebook embeddings.""" + """ + Retrieve pretrained embeddings for the tokenizer. + + Arguments + --------- + vocab_size : int + Number of tokens in each codebook. + num_codebooks : int + Number of codebooks. 
+ **kwargs : dict + Additional arguments for embedding retrieval. + + Returns + ------- + embeddings : torch.Tensor + Pretrained embedding weights with shape [K, C, H], where H is the embedding dimension. + """ pass From 279e48b001fedc21ba69acc1150a9099b114d5a7 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:35:57 -0500 Subject: [PATCH 046/270] fix link in readme --- README.md | 2 +- benchmarks/DASB/README.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a6defc05b..fc0b33c4d 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ The SpeechBrain Benchmarks currently include the following: - [MOABB](https://github.com/speechbrain/benchmarks/tree/main/benchmarks/MOABB) - A benchmark designed for evaluating neural models in well-known EEG tasks like motor imagery, P300, and SSVEP. -- [DASB](https://github.com/speechbrain/benchmarks/tree/main/benchmarks/DASB) - A benchmark designed for evaluating discrete audio tokens across a wide range of discriminative +- [DASB](https://github.com/speechbrain/benchmarks/tree/DASB/benchmarks/DASB) - A benchmark designed for evaluating discrete audio tokens across a wide range of discriminative and generative tasks. diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index 78d780739..445232337 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -30,10 +30,10 @@ For detailed information, refer to [paper](https://arxiv.org/pdf/2406.14294): - [Discrete Audio Encoder](#-Discrete-Audio-Encoder) - [Datasets and Recipes](#-Datasets-and-Recipes) - [Training Scenarios](#-Training-Scenarios) - - [On-the-FlybToken Extraction](#On-the-Fly-Token-Extraction) - - [Offline-Token-Extraction](#Offline-Token-Extraction) + - [On-the-Fly Token Extraction](#-On-the-Fly-Token-Extraction) + - [Offline-Token-Extraction](#-Offline-Token-Extraction) - [‍Incorporating Your Audio Tokenizer](#-Incorporating-Your-Audio-Tokenizer) -- [Hyperparameter Tuning](#Hyperparameter-Tuning) +- [Hyperparameter Tuning](#-Hyperparameter-Tuning) - [Results](#-results) - [Contact](#-contact) - [Citing](#-citing) From 7f32f1bf3f3c7c7e844b5134f82845ce29cdbea7 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:40:01 -0500 Subject: [PATCH 047/270] update table of contnet --- benchmarks/DASB/README.md | 43 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index 445232337..684459083 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -25,18 +25,31 @@ For detailed information, refer to [paper](https://arxiv.org/pdf/2406.14294): # Table of Contents -- [Table of Contents](#table-of-contents) -- [Installation](#-installation) -- [Discrete Audio Encoder](#-Discrete-Audio-Encoder) -- [Datasets and Recipes](#-Datasets-and-Recipes) -- [Training Scenarios](#-Training-Scenarios) - - [On-the-Fly Token Extraction](#-On-the-Fly-Token-Extraction) - - [Offline-Token-Extraction](#-Offline-Token-Extraction) -- [‍Incorporating Your Audio Tokenizer](#-Incorporating-Your-Audio-Tokenizer) -- [Hyperparameter Tuning](#-Hyperparameter-Tuning) -- [Results](#-results) -- [Contact](#-contact) -- [Citing](#-citing) +Here’s the updated **Table of Contents** for your GitHub README with corrections and better alignment: + +--- + +# 📑 Table of Contents + +- [DASB - Discrete Audio and Speech Benchmark](#dasb---discrete-audio-and-speech-benchmark) +- [🛠️ Installation](#-installation) 
+- [🎌 Discrete Audio Encoder](#-discrete-audio-encoder) +- [⚡ Datasets and Recipes](#-datasets-and-recipes) +- [📖 Training Scenarios](#-training-scenarios) + - [On-the-Fly Token Extraction](#on-the-fly-token-extraction) + - [Offline Token Extraction](#offline-token-extraction) +- [🎛️ Hyperparameter Tuning](#-hyperparameter-tuning) +- [📝 Incorporating Your Audio Tokenizer](#-incorporating-your-audio-tokenizer) +- [📈 Results](#-results) + - [Ranking](#ranking) + - [Benchmarking Results for Discriminative Tasks](#benchmarking-results-for-discriminative-tasks) + - [Benchmarking Results for Generative Tasks](#benchmarking-results-for-generative-tasks) +- [📧 Contact](#-contact) +- [📖 Citing](#-citing) + +--- + +This structure provides a clear and logical flow, ensuring users can easily navigate the document. Each major section is linked appropriately, with sub-sections for detailed content. Let me know if additional adjustments are required! # 🛠️ Installation @@ -103,7 +116,7 @@ To set up SpeechBrain-DASB, follow these steps: We offer two different training scenarios: **on-the-fly token extraction** and **offline token extraction**. -## 1. On-the-Fly Token Extraction +## On-the-Fly Token Extraction In this scenario, audio tokens are extracted dynamically during training. To enhance efficiency, we use a caching mechanism where tokens are saved in memory during the first epoch and retrieved for subsequent epochs. However, this approach has some limitations: - It works best when the dataset is small, the bitrate is low, and batching is sorted (not random). - It is unsuitable when data augmentation is required. @@ -121,9 +134,8 @@ python LibriSpeech/ASR-on-the-fly/train.py LibriSpeech/ASR-on-the-fly/hparams/LS > **Note:** On-the-fly extraction can be time-consuming, which is why we also provide an alternative approach: **offline token extraction**. ---- -## 2. Offline Token Extraction +## Offline Token Extraction In this scenario, all tokens are pre-extracted in a separate recipe. We recommend using the highest number of codebooks available for token extraction and then choosing the desired settings during training. ### Token Extraction Command @@ -149,7 +161,6 @@ This workflow ensures flexibility, efficiency, and reproducibility for both trai Here's a polished and formatted version for clarity, suitable for a README or documentation: - # 🎛️ Hyperparameter Tuning Efficient hyperparameter tuning is critical when introducing novel models or experimenting with diverse datasets. Our benchmark establishes a standardized protocol for hyperparameter tuning, leveraging [Orion](https://orion.readthedocs.io/en/stable/) to ensure fair and consistent model comparisons. 
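To make the multi-step flag convention more concrete, here is a minimal sketch of how a two-step search could be declared directly in a recipe's YAML file. The `lr_model` line mirrors the `@orion_step1` example shown earlier in this README; the `dropout` parameter, its search range, and the `@orion_step2` flag are illustrative assumptions rather than settings taken from an actual DASB recipe:

```yaml
# Step 1: tune the learning rate on a log-uniform scale
lr_model: 0.0001 # @orion_step1: --lr_model~"loguniform(0.00001,0.5)"

# Step 2 (hypothetical): tune dropout once the best learning rate has been fixed
dropout: 0.1 # @orion_step2: --dropout~"uniform(0.0,0.5)"
```

With such a file, `run_hparam_optimization.sh` would run one `orion-hunt` search per step, moving on to the next round only while a flag for the following `@orion_step` exists, consistent with the workflow described earlier in this README.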
From 30fc2d691aef1ff6c9edf4eb97fd1a3bb58d8a77 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:42:14 -0500 Subject: [PATCH 048/270] fix --- benchmarks/DASB/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index 684459083..a3fdedb56 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -38,7 +38,7 @@ Here’s the updated **Table of Contents** for your GitHub README with correctio - [📖 Training Scenarios](#-training-scenarios) - [On-the-Fly Token Extraction](#on-the-fly-token-extraction) - [Offline Token Extraction](#offline-token-extraction) -- [🎛️ Hyperparameter Tuning](#-hyperparameter-tuning) +- [🎛️ Hyperparameter Tuning](#hyperparameter-tuning) - [📝 Incorporating Your Audio Tokenizer](#-incorporating-your-audio-tokenizer) - [📈 Results](#-results) - [Ranking](#ranking) From a576ba7fe63d4b1e93792030210bbb8e6d4f3c1a Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 28 Dec 2024 11:43:35 -0500 Subject: [PATCH 049/270] fix --- benchmarks/DASB/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index a3fdedb56..0ad632979 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -38,7 +38,7 @@ Here’s the updated **Table of Contents** for your GitHub README with correctio - [📖 Training Scenarios](#-training-scenarios) - [On-the-Fly Token Extraction](#on-the-fly-token-extraction) - [Offline Token Extraction](#offline-token-extraction) -- [🎛️ Hyperparameter Tuning](#hyperparameter-tuning) +- [🎛️ Hyperparameter Tuning](#%EF%B8%8F-hyperparameter-tuning) - [📝 Incorporating Your Audio Tokenizer](#-incorporating-your-audio-tokenizer) - [📈 Results](#-results) - [Ranking](#ranking) From 0fafc1cfcc3685d6382399015dea49d619a97238 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 30 Dec 2024 17:19:26 -0500 Subject: [PATCH 050/270] Tokotron LJSpeech: Update to work with the new tokenizer pipeline --- .../hparams/train_continuous_ssl.yaml | 9 + .../TTS/tokotron/hparams/train_dac.yaml | 42 +-- .../tokotron/hparams/train_discrete_ssl.yaml | 42 ++- .../TTS/tokotron/hparams/train_encodec.yaml | 32 +-- .../hparams/train_speech_tokenizer.yaml | 14 + .../DASB/LJSpeech/TTS/tokotron/train.py | 250 ++++++++---------- .../DASB/LJSpeech/TTS/tokotron/train_dac.py | 4 +- .../TTS/tokotron/train_discrete_ssl.py | 2 +- .../DASB/LJSpeech/extraction/extract.py | 88 ++++++ .../DASB/LJSpeech/extraction/hparams/dac.yaml | 63 +++++ .../extraction/hparams/discrete_ssl.yaml | 100 +++++++ .../LJSpeech/extraction/hparams/encodec.yaml | 62 +++++ .../extraction/hparams/speech_tokenizer.yaml | 52 ++++ .../LJSpeech/extraction/ljspeech_prepare.py | 1 + benchmarks/DASB/LJSpeech/ljspeech_prepare.py | 187 +------------ benchmarks/DASB/model/Tokotron.py | 141 ---------- benchmarks/DASB/utils/audio_tokens.py | 27 +- 17 files changed, 593 insertions(+), 523 deletions(-) create mode 100644 benchmarks/DASB/LJSpeech/extraction/extract.py create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml create mode 120000 benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml index ac80bdac0..087eb6cf9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -257,6 +257,7 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- audio_token_shift: !ref decoder_mode: !ref scale_factor: !ref + audio_dim: !ref representation_mode: continuous @@ -264,6 +265,7 @@ modules: model: !ref vocoder: !ref compute_cost: !ref + ssl_model: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam @@ -306,3 +308,10 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index cd4f338bc..4f50c7ed2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -17,6 +17,9 @@ g2p_src: flexthink/soundchoice-g2p vocoder_type: encodec vocoder_src: "charactr/vocos-encodec-24khz" +# Model type +representation_mode: discrete + # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared/dac @@ -35,6 +38,14 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +token_model_kwargs: + n_quantizers: !ref + splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -101,17 +112,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref -# Token model (pretrained) -dac: !new:speechbrain.lobes.models.discrete.dac.DAC - sample_rate: !ref - model_type: !ref - model_bitrate: !ref - load_pretrained: True - -# Token model (pretrained) -token_model: !new:Tokotron.DACFeatureExtractor - dac: !ref - n_quantizers: !ref # Dataloader options train_dataloader_opts: @@ -143,13 +143,6 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer @@ -174,6 +167,8 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 2 bandwidth: 1.5 +model_shape: BHL +model_needs_channel: True attention_type: regularMHA ############################## models ################################ @@ -198,9 +193,17 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + modules: model: !ref - dac: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam @@ -235,3 +238,4 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + diff --git 
a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index f8a0ee622..e3b549549 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -11,6 +11,7 @@ __set_seed: !apply:torch.manual_seed [!ref ] # Model Type ssl_model_type: wavlm +representation_mode: discrete output_folder: !ref results/tokotron/// save_folder: !ref /save @@ -37,6 +38,11 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + freeze_token_model: True token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref @@ -47,7 +53,9 @@ token_model_src: !apply:speechbrain.utils.hparams.choice g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization -token_model_kmeans_dataset: LibriSpeech-100-360-500 +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref token_offset: 1 @@ -161,14 +169,6 @@ ssl_model: !apply:speechbrain.utils.hparams.choice output_all_hiddens: True -token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - ssl_model: !ref - kmeans_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref - save_path: !ref - layers_num: !ref - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa @@ -206,18 +206,6 @@ sample_dataloader_opts: token_model_kwargs: SSL_layers: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - token_model_kwargs: !ref - ssl_model: !ref - ssl_model_layers: !ref - token_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - ####################### Model parameters ########################### # Transformer @@ -229,7 +217,7 @@ d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU -audio_num_tokens: 1000 +vocab_size: 1000 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False @@ -254,7 +242,7 @@ vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref - audio_num_tokens: !ref + audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref @@ -281,6 +269,7 @@ modules: model: !ref vocoder: !ref compute_cost: !ref + ssl_model: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam @@ -323,3 +312,10 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index f5e82c309..0082e20db 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -17,6 +17,9 @@ g2p_src: 
flexthink/soundchoice-g2p vocoder_type: encodec vocoder_src: "charactr/vocos-encodec-24khz" +# Model type +representation_mode: discrete + # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared/encodec @@ -35,6 +38,11 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -96,13 +104,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref -# Token model (pretrained) -token_model: !new:speechbrain.lobes.models.huggingface_transformers.Encodec - source: !ref - save_path: !ref - bandwidth: !ref - flat_embeddings: True - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -133,13 +134,6 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - ####################### Model parameters ########################### # Transformer @@ -190,7 +184,6 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line modules: model: !ref - token_model: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam @@ -225,3 +218,12 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 103d584ed..ec6de9bb2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -17,6 +17,9 @@ g2p_src: flexthink/soundchoice-g2p vocoder_type: encodec vocoder_src: "charactr/vocos-encodec-24khz" +# Model type +representation_mode: discrete + # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared/st @@ -35,6 +38,12 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -167,6 +176,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 2 bandwidth: 1.5 +model_shape: HBL attention_type: regularMHA ############################## models ################################ @@ -228,3 +238,7 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 3dddf48dc..0c80cc5c2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -59,6 +59,8 @@ def __init__( create_waveform_fn=self.create_waveform, device=self.device, ) + self.representation_mode = RepresentationMode(self.hparams.representation_mode) + def compute_forward(self, batch, stage): """Runs all the computation of the Tokotron TTS @@ -77,7 +79,8 @@ def compute_forward(self, batch, stage): """ batch = batch.to(self.device) tokens, tokens_length = batch.tokens - audio, audio_length = batch.audio_bos + features = self.prepare_features(batch) + audio, audio_length, _, _ = features emb = None if self.use_spk_emb: emb = {"spk": batch.spk_emb.data.squeeze(1)} @@ -90,7 +93,48 @@ def compute_forward(self, batch, stage): emb=emb, ) - return predictions + return predictions, features + + def prepare_features(self, batch): + """Prepares features, depending on the configuration + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation + + Returns + ------- + audio_bos : torch.Tensor + Audio features, with BOS + audio_bos_length : torch.Tensor + Relative lengths of the audio features, with BOS + audio_tgt : torch.Tensor + Target audio features (for loss computation) + audio_tgt_length : torch.Tensor + Relative lengths of the target audio features + """ + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like( + audio[:, :1, :, :] + ).reshape(batch_size, self.hparams.bos_width, heads, dim) + audio_bos = torch.concatenate( + [bos, audio], + dim=1 + ) + audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length @torch.no_grad() def evaluate_batch(self, batch, stage): @@ -140,19 +184,20 @@ def compute_objectives(self, predictions, batch, stage): A one-element tensor used for backpropagating the gradient. 
""" batch = batch.to(self.device) - audio, audio_length = batch.audio_pad + predictions, features = predictions + _, _, audio_tgt, audio_tgt_length = features loss_details = self.hparams.compute_cost( predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, ) self.loss_metric.append( batch.uttid, predictions=predictions, - audio=audio, - audio_length=audio_length, + audio=audio_tgt, + audio_length=audio_tgt_length, input_tokens=batch.tokens.data, input_length=batch.tokens.lengths, reduction="batch", @@ -281,11 +326,7 @@ def fit_batch(self, batch): def init_optimizers(self): """Custom optimizer initialization """ - representation_mode = getattr( - self.hparams, "representation_mode", RepresentationMode.DISCRETE - ) - representation_mode = RepresentationMode(representation_mode) - if representation_mode == RepresentationMode.CONTINUOUS: + if self.representation_mode == RepresentationMode.CONTINUOUS: audio_emb_params = self.modules.model.decoder.audio_emb.parameters() audio_emb_params_set = set(audio_emb_params) model_params = [ @@ -368,9 +409,7 @@ def dataio_prepare(hparams): the token used for silence """ - representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) + representation_mode = RepresentationMode(hparams["representation_mode"]) # Define datasets from json data manifest file # Define datasets sorted by ascending lengths for efficiency @@ -407,7 +446,7 @@ def audio_ref_pipeline(wav): Arguments --------- - wav : str + wav : strƒnum_ The file path Returns @@ -422,49 +461,43 @@ def audio_ref_pipeline(wav): if representation_mode == RepresentationMode.DISCRETE: layers_key = "token_model_layers" - model_key = "token_model" - audio_features = "audio_tokens" + model_key = "tokenizer" else: layers_key = "ssl_model_layers" model_key = "ssl_model" - audio_features = "audio_ssl" audio_tokens_per_step = ( len(hparams[layers_key]) if layers_key in hparams else hparams["audio_tokens_per_step"] ) - if use_silence_padding: - silence_token, silence_emb = get_silence_token( + if use_silence_padding and representation_mode == RepresentationMode.DISCRETE: + silence_token, _ = get_silence_token( hparams[model_key], - extract_emb=representation_mode == RepresentationMode.CONTINUOUS, model_kwargs=hparams.get("token_model_kwargs"), + extract_emb=False, + model_shape=hparams.get("model_shape", "BLH"), + unsqueeze=hparams.get("model_needs_channel", False) ) else: silence_token = ( torch.ones(hparams["audio_tokens_per_step"], dtype=torch.int64) * hparams["eos_index"] ) - silence_token = silence_token.cpu() - silence_padding = ( - silence_token - if representation_mode == RepresentationMode.DISCRETE - else silence_emb - ) + silence_padding = silence_token.cpu() + silence_padding = silence_padding[:audio_tokens_per_step] silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) audio_bos_prefix = ( torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] ) - if representation_mode == RepresentationMode.CONTINUOUS: - audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat( - 1, 1, hparams["audio_dim"] - ) - @sb.utils.data_pipeline.takes(audio_features) + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") - def audio_pipeline(audio): - audio = torch.from_numpy(audio) + def 
audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, num_codebooks=audio_tokens_per_step) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -480,21 +513,20 @@ def audio_pipeline(audio): ] init_sequence_encoder(hparams) - use_spk_emb = hparams.get("use_spk_emb", False) - prepared_features = [audio_features] output_keys = [ "uttid", "tokens", - "audio_pad", - "audio_bos", "label_norm_eval", ] - if use_spk_emb: - prepared_features.append("spk_emb") - output_keys.append("spk_emb") + if representation_mode == RepresentationMode.DISCRETE: + output_keys += [ + "audio_pad", + "audio_bos", + ] + else: + output_keys.append("sig") eval_output_keys = [*output_keys, "sig"] - for dataset in data_info: if dataset == "train": dataset_output_keys = output_keys @@ -508,13 +540,6 @@ def audio_pipeline(audio): output_keys=dataset_output_keys, ) - add_prepared_features( - dataset=dynamic_dataset, - save_path=Path(hparams["prepare_save_folder"]) / "features", - id_key="uttid", - features=prepared_features, - ) - datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False @@ -540,50 +565,9 @@ def audio_pipeline(audio): "sorting must be random, ascending or descending" ) - datasets["sample"] = select_sample(hparams, datasets) return datasets, silence_padding -def select_sample(hparams, datasets): - """Selects a sample of files for sample generation, freezing the sample if - requested to persist across multiple experiments - - Arguments - --------- - hparams : dict - experiment hyperparameters - datasets : dict - a dictionary of datasets - - Returns - ------- - dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset - the sample dataset - """ - sample_path = hparams.get("sample_path") - dataset = None - if sample_path is not None: - sample_path = Path(sample_path) - if sample_path.exists(): - with open(sample_path, "r") as sample_file: - data_ids = [line.strip() for line in sample_file] - dataset = FilteredSortedDynamicItemDataset( - datasets["valid"], data_ids - ) - - if dataset is None: - dataset = ( - datasets["valid"] - .batch_shuffle(1) - .filtered_sorted(select_n=hparams["num_audio_samples"]) - ) - if sample_path is not None: - with open(sample_path, "w") as sample_file: - for data_id in dataset.data_ids: - print(data_id, file=sample_file) - return dataset - - def init_sequence_encoder(hparams): """Initialize a sequence encoder @@ -682,7 +666,6 @@ def apply_overfit_test(hparams, dataset): "train": dataset_train, "valid": dataset_eval, "test": dataset_eval, - "sample": dataset_eval, } else: result = dataset.overfit_test( @@ -736,40 +719,25 @@ def run_experiment(brain_cls): from ljspeech_prepare import prepare_ljspeech # Data preparation, to be run on only one process. 
- representation_mode = RepresentationMode( - hparams.get("representation_mode", RepresentationMode.DISCRETE) - ) - audio_features = ( - "audio_tokens" - if representation_mode == RepresentationMode.DISCRETE - else "audio_ssl" - ) - extract_features = [audio_features] - if hparams.get("use_spk_emb", False): - extract_features.append("spk_emb") - if not hparams["skip_prep"]: - with hparams["freezer"]: - run_on_main( - prepare_ljspeech, - kwargs={ - "data_folder": hparams["data_folder"], - "save_folder": hparams["prepare_save_folder"], - "splits": hparams["splits"], - "split_ratio": hparams["split_ratio"], - "seed": hparams["seed"], - "extract_features": extract_features, - "extract_features_opts": hparams["extract_features_opts"], - "extract_phonemes": hparams["input"] == "phonemes", - "model_name": "tokotron", - "g2p_src": hparams["g2p_src"], - "skip_ignore_folders": hparams[ - "prepare_skip_ignore_folders" - ], - "frozen_split_path": hparams.get("frozen_split_path"), - "device": run_opts.get("device", "cpu"), - }, - ) + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams[ + "prepare_skip_ignore_folders" + ], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) # We can now directly create the datasets for training, valid, and test datasets, silence_padding = dataio_prepare(hparams) @@ -786,31 +754,43 @@ def run_experiment(brain_cls): run_opts=run_opts, checkpointer=hparams["checkpointer"], ) - tts_brain.sample_data = datasets["sample"] # The `fit()` method iterates the training loop, calling the methods # necessary to update the parameters of the model. Since all objects # with changing state are managed by the Checkpointer, training can be # stopped at any point, and will be resumed on next call. 
+ + dataloader_opts = [ + hparams[f"{key}_dataloader_opts"] + for key in ["train", "valid", "test"] + ] + representation_mode = RepresentationMode(hparams["representation_mode"]) + if representation_mode == RepresentationMode.DISCRETE: + dataloader_opts = [ + use_silence_padding( + opts, silence_padding, audio_keys + ) + for opts in dataloader_opts + ] + ( + train_dataloader_opts, + valid_dataloader_opts, + test_dataloader_opts + ) = dataloader_opts + tts_brain.fit( tts_brain.hparams.epoch_counter, datasets["train"], datasets["valid"], - train_loader_kwargs=use_silence_padding( - hparams["train_dataloader_opts"], silence_padding, audio_keys - ), - valid_loader_kwargs=use_silence_padding( - hparams["valid_dataloader_opts"], silence_padding, audio_keys - ), + train_loader_kwargs=train_dataloader_opts, + valid_loader_kwargs=valid_dataloader_opts, ) # Load best checkpoint for evaluation tts_brain.evaluate( test_set=datasets["test"], min_key="loss", - test_loader_kwargs=use_silence_padding( - hparams["test_dataloader_opts"], silence_padding, audio_keys - ), + test_loader_kwargs=test_dataloader_opts, ) # Save final checkpoint (fixed name) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py index d0bc9f4f7..83b9ff538 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py @@ -33,10 +33,10 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - z, _, _ = self.modules.dac.quantizer.from_codes( + z, _, _ = self.modules.tokenizer.quantizer.from_codes( audio.transpose(1, 2).int() ) - wav = self.modules.dac.decode(z).squeeze(1) + wav = self.modules.tokenizer.decode(z).squeeze(1) clean_padding_(wav, length) return wav diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py index f9fc764cd..aa2c57681 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py @@ -34,7 +34,7 @@ def compute_offset(self): str(layer) for layer in (layers_set - available_layers_set) ) raise ValueError(f"Layers {unavailable_layers} are not supported") - self.num_units = self.hparams.audio_num_tokens + self.num_units = self.hparams.vocab_size _, layers_idx = torch.where( torch.tensor( self.hparams.vocoder_available_layers, device=self.device diff --git a/benchmarks/DASB/LJSpeech/extraction/extract.py b/benchmarks/DASB/LJSpeech/extraction/extract.py new file mode 100644 index 000000000..556d8a9d0 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/extract.py @@ -0,0 +1,88 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting a discrete tokens with librispeech. 
+ +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +print(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech) + from ljspeech_prepare import prepare_ljspeech # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["output_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid", "test"]: + json_path = hparams[f"{split}_json"] + name = pl.Path(json_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=json_path, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, + hparams["num_codebooks"], + (save_folder / "ljspeech").as_posix(), + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info(f"Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix(), + vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], + ) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml new file mode 100644 index 000000000..ebf155bb2 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters 
########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..c4c01f527 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,100 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + 
freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..0b07a6b1f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml @@ -0,0 +1,62 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..54da4f210 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,52 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save 
+train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py new file mode 120000 index 000000000..2de5a21a8 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/ljspeech_prepare.py @@ -0,0 +1 @@ +../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index e88b92eb6..bfd1b3743 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -13,13 +13,11 @@ import json import random import logging -from types import SimpleNamespace import torch import torchaudio import numpy as np import tgt import re -import speechbrain as sb from tqdm import tqdm from pathlib import Path from speechbrain.utils.data_utils import download_file @@ -27,10 +25,6 @@ from speechbrain.inference.text import GraphemeToPhoneme from unidecode import unidecode from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations -from speechbrain.dataio.batch import PaddedData -from speechbrain.dataio.dataset import DynamicItemDataset -from preparation import FeatureExtractor -from torchaudio.functional import resample logger = logging.getLogger(__name__) @@ -179,7 +173,7 @@ def prepare_ljspeech( os.makedirs(duration_folder) # extract pitch for both Fastspeech2 and FastSpeech2WithAligner models - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: pitch_folder = os.path.join(data_folder, "pitch") if not os.path.exists(pitch_folder): os.makedirs(pitch_folder) @@ -200,16 +194,6 @@ def prepare_ljspeech( data_folder, splits, split_ratio, frozen_split_path ) - extract_features_context = None - extract_features_folder = None - if extract_features: - extract_features_context = get_context( - extract_features=extract_features, - extract_features_opts=extract_features_opts or {}, - device=device, - ) - extract_features_folder = Path(save_folder) / "features" - if "train" in splits: prepare_json( model_name, @@ -226,10 +210,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -250,10 +230,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -274,10 +250,6 @@ def prepare_ljspeech( pitch_min_f0, pitch_max_f0, use_custom_cleaner, - extract_features, - extract_features_context, - 
extract_features_folder, - extract_features_opts, extract_phonemes, g2p_src, device, @@ -500,7 +472,7 @@ def prepare_json( g2p = GraphemeToPhoneme.from_hparams( g2p_src, run_opts={"device": device} ) - if "FastSpeech2" in model_name: + if model_name is not None and "FastSpeech2" in model_name: logger.info( "Computing pitch as required for FastSpeech2. This may take a while." ) @@ -649,19 +621,6 @@ def prepare_json( # Updates data for the utterance json_dict[id].update({"phonemes": phonemes}) - # Feature Extraction - if extract_features: - extract_features_folder.mkdir(exist_ok=True) - prepare_features( - data=json_dict, - data_folder=data_folder, - save_path=extract_features_folder, - features=extract_features, - context=extract_features_context, - options=extract_features_opts, - device=device, - ) - # Writing the dictionary to the json file with open(json_file, mode="w") as json_f: json.dump(json_dict, json_f, indent=2) @@ -839,145 +798,3 @@ def custom_clean(text, model_name): text = re.sub(regex, replacement, text) return text - -INLINE_FEATURES = ["audio_ssl_len"] - - -def prepare_features( - data, data_folder, save_path, features, context, options=None, device="cpu" -): - """Performs feature extraction - - Arguments - --------- - data: dict - a preprocessed dataset - data_folder : str - the data folder - save_folder : str - the folder where features will be saved - context : dict - context data - features: list - the list of feature extractions to be performed - """ - dataset = DynamicItemDataset(data) - feature_extractor = FeatureExtractor( - save_path=save_path, - src_keys=["sig"], - id_key="uttid", - dataloader_opts=options.get("dataloader_opts", {}), - device=device, - ) - token_model_kwargs = options.get("token_model_kwargs", {}) - ssl_layers = options.get("ssl_model_layers") or options.get( - "token_model_layers" - ) - - @sb.utils.data_pipeline.takes("wav") - @sb.utils.data_pipeline.provides("sig") - def audio_pipeline(wav): - """Load the audio signal. 
""" - wav = wav.replace("{data_root}", data_folder) - sig = sb.dataio.dataio.read_audio(wav) - - yield sig - - dataset.add_dynamic_item(audio_pipeline) - - @sb.utils.data_pipeline.takes("sig") - @sb.utils.data_pipeline.provides("sig_resampled") - def resample_pipeline(sig): - sig_data = resample( - waveform=sig.data, - orig_freq=options["sample_rate"], - new_freq=options["model_sample_rate"], - ) - return PaddedData(sig_data, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_tokens", "audio_emb") - def token_pipeline(sig): - with torch.no_grad(): - result = context.token_model( - sig.data, sig.lengths, **token_model_kwargs - ) - # TODO: Clean this up - if torch.is_tensor(result): - tokens = result - # Note: Dummy embedding - meaning embeddings are not available - emb = torch.zeros((len(sig.data), 1, 1), device=sig.data.device) - else: - tokens, emb = result[:2] - tokens = tokens.int() - if tokens.dim() < 3: - tokens = tokens.unsqueeze(-1) - yield PaddedData(tokens, sig.lengths) - yield PaddedData(emb, sig.lengths) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("spk_emb") - def spk_emb_pipeline(sig): - mel_spec = context.spk_emb_model.mel_spectogram(audio=sig.data) - return context.spk_emb_model.encode_mel_spectrogram_batch( - mel_spec, sig.lengths - ) - - @sb.utils.data_pipeline.takes("sig_resampled") - @sb.utils.data_pipeline.provides("audio_ssl", "audio_ssl_len") - def ssl_pipeline(sig): - ssl_raw = context.ssl_model(sig.data, sig.lengths) - ssl = ssl_raw[ssl_layers].permute(1, 2, 0, 3) - yield PaddedData(ssl, sig.lengths) - yield (sig.lengths * ssl.size(1)).tolist() - - dynamic_items = [ - resample_pipeline, - token_pipeline, - ssl_pipeline, - spk_emb_pipeline, - ] - for dynamic_item in dynamic_items: - feature_extractor.add_dynamic_item(dynamic_item) - - feature_keys = [key for key in features if key not in INLINE_FEATURES] - inline_keys = [key for key in features if key in INLINE_FEATURES] - feature_extractor.set_output_features(feature_keys, inline_keys=inline_keys) - feature_extractor.extract(dataset, data) - - -def get_context(extract_features, extract_features_opts, device): - """ - Gets the context (pretrained models, etc) for feature extraction - - Arguments - --------- - extract_features : list - A list of features to extract - Available features: - audio_tokens - raw tokens - audio_emb - embeddings from the model - extract_features_opts : dict - Options for feature extraction - device : str|torch.Device - The device on which extraction will be run - - Returns - -------- - context: SimpleNamespace - The context object - """ - context = {} - if ( - any(key in extract_features for key in ["audio_tokens", "audio_emb"]) - and "token_model" in extract_features_opts - ): - context["token_model"] = extract_features_opts["token_model"].to(device) - if "audio_ssl" in extract_features: - context["ssl_model"] = extract_features_opts["ssl_model"].to(device) - if "spk_emb" in extract_features: - context["spk_emb_model"] = extract_features_opts["spk_emb_model"]( - run_opts={"device": device} - ) - - return SimpleNamespace(**context) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 5238beacd..804227d55 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -802,147 +802,6 @@ def decode(self, enc_out, length): ) -class TokotronForwardInference(nn.Module): - """A beam search-based inference implementation - - All keyword 
arguments will be passed on to the underlying - beam search - - Arguments - --------- - scale_factor : float - The scaling factor for encoder representations - gate_threshold : float - The threshold for gate activation - min_length : int - The minimum length for generating sequences, in tokens - """ - - def __init__( - self, - scale_factor=5.0, - gate_threshold=0.5, - min_length=16, - eos_mode=EosMode.GATE, - eos_index=0, - representation_mode=RepresentationMode.DISCRETE, - ): - super().__init__() - self.scale_factor = scale_factor - self.gate_threshold = gate_threshold - self.min_length = min_length - self.decoder = None - self.gate = None - self.eos_mode = EosMode(eos_mode) - self.eos_index = eos_index - self.representation_mode = RepresentationMode(representation_mode) - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - self.decoder = model.decoder - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - max_len = enc_out.size(1) - src_key_padding_mask = length_to_mask( - length * max_len, max_len, - ).logical_not() - tgt = scale(enc_out, self.scale_factor) - dec_out = self.decoder( - enc_out=enc_out, - tgt=tgt, - tgt_length=length, - src_length=length, - src_key_padding_mask=src_key_padding_mask, - pos_embs_src=None, - ) - if self.eos_mode == EosMode.GATE: - p_eos, eos = self.get_length_gate(dec_out) - else: - p_eos, eos = self.get_length_token(dec_out) - - infer_length_abs = eos.max(dim=1).indices - infer_length_abs_nonzero = infer_length_abs[infer_length_abs > 0] - if len(infer_length_abs_nonzero) > 0: - infer_length_max = infer_length_abs_nonzero.max() - else: - infer_length_max = 0 - if infer_length_max == 0: - infer_length_max = p_eos.size(1) - infer_length_abs = torch.where( - infer_length_abs == 0, infer_length_max, infer_length_abs - ) - infer_length_abs = infer_length_abs.clip(min=self.min_length) - infer_length = infer_length_abs / infer_length_max - - audio = dec_out.out[:, :infer_length_max].argmax(-1) - if self.representation_mode == RepresentationMode.DISCRETE: - audio = audio.argmax(-1) - return TokotronDecoderInfernceOutput( - audio=audio, - length=infer_length, - dec_self_attn=dec_out.dec_self_attn, - dec_attn=dec_out.dec_attn, - alignments=get_alignments(dec_out.dec_attn), - p_eos=p_eos, - ) - - def get_length_gate(self, dec_out): - """Infers lengths using the gate module - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - - Returns - ------- - p_eos : torch.Tensor - EOS probabilities (as estimated by the gate) - eos : torch.Tensor - a Boolean tensor where positions indicate whether - the gate has activated - """ - p_eos = dec_out.gate_out.sigmoid() - eos = p_eos > self.gate_threshold - return p_eos, eos - - def get_length_token(self, dec_out): - """Infers lengths using an EOS token - - Arguments - --------- - dec_out : TokotronDecoderOutput - The decoder output - eos : torch.Tensor - A Boolean tensor indicating whether EOS has been reached - """ - p_seq = dec_out.out[:, :, 0].softmax(dim=-1) - p_eos = p_seq[:, :, self.eos_index].softmax(-1) - eos = p_seq.argmax(dim=-1) == self.eos_index - return p_eos, eos - - class 
TokotronTransformerModel(nn.Module): """An end-to-end Tokotron model receiving characters or phonemes as inputs and outputting audio tokens diff --git a/benchmarks/DASB/utils/audio_tokens.py b/benchmarks/DASB/utils/audio_tokens.py index 9dc4014c4..9dcc922cd 100644 --- a/benchmarks/DASB/utils/audio_tokens.py +++ b/benchmarks/DASB/utils/audio_tokens.py @@ -14,6 +14,8 @@ def get_silence_token( model, sample_length=100000, extract_emb=True, + model_shape="BLH", + unsqueeze=False, device=None, model_kwargs=None, ): @@ -28,6 +30,13 @@ def get_silence_token( The length of the sample extract_emb : bool Whether to extract embeddings + model_shape : str + The shape of tokens output by the model + BLH: Batch x Length x Heads (Discrete SSL, Encodec) + BHL: Batch x Heads x Length (DAC) + HBL: Heads x Batch x Length (SpeechTokenizer) + unsqueeze: bool + Whether to add an extra dimension to the audio (needed for DAC) device : str | torch.Device The device to use model_kwargs : dict @@ -48,10 +57,24 @@ def get_silence_token( model_kwargs = {} audio = torch.zeros(1, sample_length, device=device) + if unsqueeze: + audio = audio.unsqueeze(1) length = torch.ones(1, device=device) + model_training = model.training + model.eval() result = model(audio, length, **model_kwargs) - tokens = result[0] - silence_tokens = tokens.squeeze(0).mode(0).values + if model_training: + model.train() + tokens = result if torch.is_tensor(result) else result[0] + if model_shape == "HBL": + tokens = tokens.permute(1, 2, 0) + elif model_shape == "BHL": + tokens = tokens.transpose(-1, -2) + + tokens = tokens.squeeze(0) + if unsqueeze: + tokens = tokens.squeeze(0) + silence_tokens = tokens.mode(0).values silence_emb = None if extract_emb: if hasattr(model, "embeddings"): From e66a00e5763491173017912fcb9cf64c500fea42 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 11 Jan 2025 20:49:54 -0500 Subject: [PATCH 051/270] Tokotron: Add Tokotron integration for LibriTTS (multi-speaker recipes) --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 33 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 13 +- .../DASB/LibriTTS/TTS/tokotron/Tokotron.py | 1 + .../LibriTTS/TTS/tokotron/custom_model.py | 1 + benchmarks/DASB/LibriTTS/TTS/tokotron/data.py | 1 + benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py | 1 + .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 603 +++++++++ .../LibriTTS/TTS/tokotron/hparams/arpabet.txt | 50 + .../LibriTTS/TTS/tokotron/hparams/char_en.txt | 38 + .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 66 + .../hparams/train_continuous_ssl.yaml | 488 ++++++++ .../TTS/tokotron/hparams/train_dac.yaml | 330 +++++ .../tokotron/hparams/train_discrete_ssl.yaml | 396 ++++++ .../TTS/tokotron/hparams/train_encodec.yaml | 352 ++++++ .../hparams/train_speech_tokenizer.yaml | 329 +++++ .../LibriTTS/TTS/tokotron/libritts_prepare.py | 1 + .../DASB/LibriTTS/TTS/tokotron/train.py | 1077 +++++++++++++++++ .../TTS/tokotron/train_continuous_ssl.py | 47 + .../DASB/LibriTTS/TTS/tokotron/train_dac.py | 47 + .../TTS/tokotron/train_discrete_ssl.py | 79 ++ .../LibriTTS/TTS/tokotron/train_encodec.py | 46 + .../TTS/tokotron/train_speech_tokenizer.py | 46 + .../DASB/LibriTTS/extraction/extract.py | 90 ++ .../DASB/LibriTTS/extraction/hparams/dac.yaml | 63 + .../extraction/hparams/discrete_ssl.yaml | 101 ++ .../LibriTTS/extraction/hparams/encodec.yaml | 63 + .../extraction/hparams/speech_tokenizer.yaml | 53 + .../LibriTTS/extraction/libritts_prepare.py | 1 + benchmarks/DASB/LibriTTS/libritts_prepare.py | 331 +++++ benchmarks/DASB/model/Tokotron.py | 421 +++---- 
benchmarks/DASB/utils/audio_tokens.py | 216 ---- benchmarks/DASB/utils/eval.py | 545 +++++---- 32 files changed, 5266 insertions(+), 663 deletions(-) create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/data.py create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml create mode 120000 benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py create mode 100644 benchmarks/DASB/LibriTTS/extraction/extract.py create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml create mode 120000 benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py create mode 100644 benchmarks/DASB/LibriTTS/libritts_prepare.py delete mode 100644 benchmarks/DASB/utils/audio_tokens.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index bdf6c0f75..bad9ce7c1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -9,12 +9,22 @@ eval_asr_source: !apply:speechbrain.utils.hparams.choice whisper: openai/whisper-small evaluations: utmos,asr tmp_folder: null -utmos_batch_size: 8 -utmos_model_path: ./utmos -utmos_ckpt_name: epoch=3-step=7459.ckpt -utmos_ckpt_path: !ref / -utmos_use_python: True -utmos_script: predict.py +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: false + + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref eval_asr: !apply:speechbrain.utils.hparams.choice @@ -31,18 +41,9 @@ eval_asr: !apply:speechbrain.utils.hparams.choice savedir: !ref evaluators: + utmos: !ref asr: !ref -bulk_evaluators: - utmos: !name:eval.UTMOSSpeechEvaluator - model_path: !ref - output_folder: !ref - ckpt_path: 
!ref - batch_size: !ref - script: !ref - use_python: !ref - tmp_folder: !ref - eval_summary: asr: descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 0c80cc5c2..deb8a3236 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -22,10 +22,8 @@ import string from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset from speechbrain.utils.distributed import run_on_main -from preparation import add_prepared_features -from audio_tokens import ( +from Tokotron import ( get_silence_token, use_silence_padding, feature_pad_to, @@ -683,6 +681,14 @@ def apply_overfit_test(hparams, dataset): def run_experiment(brain_cls): + """Starts the experiement + + Arguments + --------- + brain_cls : type + The brain class to instantiate + """ + # Reading command line arguments hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) @@ -789,7 +795,6 @@ def run_experiment(brain_cls): # Load best checkpoint for evaluation tts_brain.evaluate( test_set=datasets["test"], - min_key="loss", test_loader_kwargs=test_dataloader_opts, ) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py new file mode 120000 index 000000000..097a6d488 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/Tokotron.py @@ -0,0 +1 @@ +../../../model/Tokotron.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py new file mode 120000 index 000000000..4b3f08ebb --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/custom_model.py @@ -0,0 +1 @@ +../../../model/custom_model.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py new file mode 120000 index 000000000..d65702b6c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/data.py @@ -0,0 +1 @@ +../../../utils/data.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py new file mode 120000 index 000000000..0ca6d4644 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/eval.py @@ -0,0 +1 @@ +../../../utils/eval.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py new file mode 100644 index 000000000..d72df92aa --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -0,0 +1,603 @@ +"""Evaluates a checkpoint using an MOS estimation tool + +Authors +* Artem Ploujnikov 2024 +""" + +#TODO: There are too many evaluation scripts. 
Refactor to extract common +# features + +import speechbrain as sb +import json +import logging +import math +import sys +import csv +import torch +import torchaudio +import string +import re +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from types import SimpleNamespace +from torch.nn import ModuleDict +from tqdm.auto import tqdm +from data import undo_batch +from eval import vocoder_to_device +from torch.utils.flop_counter import FlopCounterMode +from contextlib import nullcontext + + +logger = logging.getLogger(__name__) + + +class TokotronEvaluator: + """An evaluator class for the TTS model + + Arguments + --------- + hparams: dict + hyperparameters (as a dictionary) + device : str | torch.device + the device + """ + def __init__(self, hparams, create_waveform_fn, device): + self.hparams = SimpleNamespace(**hparams) + self.create_waveform_fn = create_waveform_fn + self.device = device + modules = self.hparams.modules + self.modules = ModuleDict(modules).to(self.device) + self.spk_emb_model = self.hparams.spk_emb_model( + run_opts={"device": device} + ) + self.modules.model.vocoder = None + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.get("evaluators", {}) + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + bulk_evaluators = getattr(self.hparams, "bulk_evaluators", {}) + if bulk_evaluators: + self.bulk_evaluators = { + key: evaluator_f() + for key, evaluator_f in bulk_evaluators.items() + if key in self.enabled_evaluators + } + else: + self.bulk_evaluators = {} + + if not self.evaluators and not self.bulk_evaluators: + logger.warning("No evaluators were defined - this run will produce samples only") + + self.attention = [] + + def on_evaluate_start(self, stage, epoch): + """Invoked when evaluation starts + + Arguments + --------- + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + self.stage = stage + self.epoch = epoch + self.output_folder = self.get_output_folder(stage, epoch) + self.samples_folder = self.output_folder / "samples" + self.samples_folder.mkdir(parents=True, exist_ok=True) + logger.info( + "Starting evaluation, results will be saved in %s", + self.output_folder, + ) + self.create_reports() + self.modules.model.show_inference_progress = False + self.item_ids = [] + details_keys = list(self.evaluators.keys()) + list( + self.bulk_evaluators.keys() + ) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.sample_text = [] + self.sample_file_names = [] + self.ref_file_names = [] + if hasattr(self.modules, "vocoder"): + vocoder_to_device(self.modules.vocoder, self.device) + + def get_output_folder(self, stage, epoch): + """Computes the output folder of evaluation results + for the specified stage and epoch. + + If the folder does not exists, it will be created. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ + Returns + ------- + """ + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(parents=True, exist_ok=True) + return output_folder + + + def evaluate(self, dataset): + """Runs evaluation on a dataset + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + a dataset + """ + logger.info("Recovering the checkpoint") + ckpt = self.hparams.checkpointer.recover_if_possible() + if not ckpt: + raise ValueError("Unable to recover the checkpoint") + self.modules.model.eval() + if self.hparams.eval_samples is not None: + dataset = dataset.filtered_sorted(select_n=self.hparams.eval_samples) + loader = sb.dataio.dataloader.make_dataloader(dataset, batch_size=self.hparams.batch_size) + loader_it = iter(loader) + self.create_reports() + self.modules.model.show_inference_progress = False + self.item_ids = [] + details_keys = list(self.evaluators.keys()) + list(self.bulk_evaluators.keys()) + self.details = { + evaluator_key: [] + for evaluator_key in details_keys + } + self.read_reports() + self.sample_text = [] + self.sample_file_names = [] + self.ref_file_names = [] + logger.info("Starting evaluation") + batch_count = math.ceil(len(dataset) / self.hparams.batch_size) + for batch in tqdm(loader_it, desc="Evaluation", total=batch_count): + self.evaluate_batch(batch) + self.evaluate_bulk() + self.write_summary() + logger.info("Evaluation done") + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + if self.hparams.eval_perf: + self.perf_file = open(self.output_folder / "perf.csv", "w") + self.perf_writer = csv.DictWriter( + self.perf_file, + [ + "uttid", + "infer_flops", + "steps", + "infer_flops_per_step", + "vocoder_flops", + "total_flops", + "total_flops_per_step", + ] + ) + self.perf_writer.writeheader() + + def infer(self, tokens, tokens_length, emb): + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + infer_out = self.modules.model.infer( + input_tokens=tokens, input_length=tokens_length, emb=emb + ) + if self.hparams.eval_perf: + steps = (infer_out.length * infer_out.audio.size(1)).sum().item() + total_flops = flop_counter.get_total_flops() + stats = { + "infer_flops": total_flops, + "steps": steps, + "infer_flops_per_step": total_flops / steps, + } + return infer_out, stats + + def vocoder(self, infer_out, emb): + stats = {} + if self.hparams.eval_perf: + flop_counter = FlopCounterMode() + else: + flop_counter = nullcontext() + + with flop_counter: + wav = self.create_waveform_fn( + infer_out.audio, + length=infer_out.length, + emb=emb + ) + if wav.dim() > 2: + wav = wav.squeeze(1) + + if self.hparams.eval_perf: + flops = flop_counter.get_total_flops() + stats = { + "vocoder_flops": flops + } + return wav, stats + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder 
/ f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = {key : handle_number(value) for key, value in row.items()} + self.details[evaluator_key].append(row) + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1., 1.], device=self.device) + if evaluator_key in self.evaluators: + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + else: + bogus_file_name = self.output_folder / "bogus.wav" + evaluator = self.bulk_evaluators[evaluator_key] + sb.dataio.dataio.write_audio( + str(bogus_file_name), + bogus_wavs[0].cpu(), + samplerate=self.hparams.model_sample_rate, + ) + result = evaluator.evaluate_files( + file_names=[bogus_file_name], + text=["BOGUS"], + file_names_ref=[bogus_file_name], + ) + + return ["uttid"] + list(result.details.keys()) + + def evaluate_batch(self, batch): + """Runs evaluation on a single batch of speech + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch to be evaluated""" + with torch.no_grad(): + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + vocoder_to_device(self.modules.vocoder, self.device) + if hasattr(self.modules.vocoder, "device"): + self.modules.vocoder.device = self.device + audio_resampled = torchaudio.functional.resample( + batch.sig.data, + self.hparams.sample_rate, + self.hparams.model_sample_rate, + ) + mel_spec = self.spk_emb_model.mel_spectogram( + audio=audio_resampled + ) + spk_emb = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, batch.sig.lengths + ).squeeze(1) + infer_out, perf_stats = self.infer( + tokens=tokens, tokens_length=tokens_length, + emb={ + "spk": spk_emb + } + ) + wav, vocoder_stats = self.vocoder( + infer_out, spk_emb + ) + perf_stats.update(vocoder_stats) + length = infer_out.length + if wav.dim() > 2: + wav = wav.squeeze(1) + + self.save_samples(batch, wav, infer_out.length) + self.item_ids.extend(batch.uttid) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=batch.label_norm_eval, + wavs_ref=batch.sig.data, + length_ref=batch.sig.lengths, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, batch.uttid, details) + self.details[evaluator_key].extend(details) + + if self.hparams.eval_perf: + perf_stats.update(vocoder_stats) + perf_stats["total_flops"] = perf_stats["vocoder_flops"] + perf_stats["infer_flops"] + perf_stats["total_flops_per_step"] = perf_stats["total_flops"] / perf_stats["steps"] + self.write_perf_stats(batch.uttid, perf_stats) + + + def evaluate_bulk(self): + """Performs bulk evaluation""" + for evaluator_key, evaluator in self.bulk_evaluators.items(): + result = evaluator.evaluate_files( + file_names=self.sample_file_names, + text=self.sample_text, + file_names_ref=self.ref_file_names, + ) + self.details[evaluator_key].append(result.details) + details = 
undo_batch(result.details) + self.write_result(evaluator_key, self.item_ids, details) + + def write_result(self, evaluator_key, uttid, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + batch : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(uttid, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow( + ascii_only(flatten(report_details)) + ) + self.report_files[evaluator_key].flush() + + def save_samples(self, batch, wav, length): + """Saves the samples generated by the TTS system + + Arguments + --------- + batch : speechbrain.dataio.batch.PaddedBatch + the batch being evaluated + wav : torch.Tensor + the waveform + length: torch.Tensor + relative lengths + """ + wav_length_abs = (length * wav.size(1)).int() + for item_id, infer_wav, wav_length in zip( + batch.uttid, wav, wav_length_abs + ): + file_name = str( + self.samples_folder / f"{item_id}_pred.wav" + ) + infer_wav_cut = infer_wav[:wav_length.item()].cpu() + sb.dataio.dataio.write_audio( + file_name, infer_wav_cut, samplerate=self.hparams.model_sample_rate + ) + self.sample_file_names.append(file_name) + + def write_summary(self): + """Outputs summarized statistics""" + summary = self.compute_summary() + file_name = self.output_folder / "summary.json" + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def write_perf_stats(self, uttid, details): + self.perf_writer.writerow( + { + "uttid": " ".join(uttid), + **details + } + ) + self.perf_file.flush() + + + def compute_summary(self): + """Computes the summarized statistics""" + return { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key]["descriptive"] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], + key=metric_key, + ).items() + } + + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_PUNCTUATION = re.compile( + "|".join( + re.escape(char) for char in string.punctuation + ) +) + +RE_NON_ASCII = re.compile(r'[^\x00-\x7F]+') + + +def ascii_only(values): + return { + key: RE_NON_ASCII.sub('', value) if isinstance(value, str) + else value + for key, value in values.items() + } + + +@sb.utils.data_pipeline.takes("label_norm") +@sb.utils.data_pipeline.provides("label_norm_eval") +def label_norm_pipeline(label): + """Normalizes labels for ASR comparison, converting to uppercase and removing + punctuation + + Arguments + --------- + label : str + The unnormalized label + + Returns + ------- + result : str + The normalized label + """ + label = label.upper() + label = RE_PUNCTUATION.sub("", label) + return label + + +@sb.utils.data_pipeline.takes("wav") +@sb.utils.data_pipeline.provides("sig") +def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : str + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = 
sb.dataio.dataio.read_audio(wav) + return sig + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() + for stat_key, value in stats.items() + } + + +def select_subset(dataset, hparams): + """Selects a subset of the dataset provided, if specified. + The selection is controlled by a hyperparameter named + eval_subset, which is expected to list the IDs of the + data items on which evaluation will take place, one per line + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset + hparams : dict + A hyperparameters file + + Returns + ------- + subset : dataset + The dataset, filtered down if applicable + """ + eval_subset_path = hparams.get("eval_subset") + if eval_subset_path is not None: + eval_subset_path = Path(eval_subset_path) + if not eval_subset_path.exists(): + raise ValueError(f"eval_subset {eval_subset_path} does not exist") + with open(eval_subset_path) as eval_subset_file: + eval_subset_ids = [line.strip() for line in eval_subset_file] + subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) + else: + subset = dataset + return subset + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value + diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? 
+ \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml new file mode 100644 index 000000000..a4c8b6b59 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -0,0 +1,66 @@ +eval_dataset: valid +eval_suffix: "" +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech + whisper: openai/whisper-small +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: false + + +eval_asr: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encoder_decoder: !name:eval.EncoderDecoderASRSpeechEvaluator + source: !ref + sample_rate: !ref + overrides: + lm_weight: 0.0 + test_beam_size: !ref + whisper: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +eval_utmos: !name:eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml new file mode 100644 index 000000000..3626079ef --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -0,0 +1,488 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder_alignments: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +ssl_model_type: wavlm +representation_mode: discrete +vocoder_model_name: !ref unithifigan-dasb---ms +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current 
+progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large-960h-lv60-self +g2p_src: flexthink/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +token_model_kmeans_dataset: LibriSpeech-100-360-500 +ssl_model_layers: [1, 3, 7, 12, 18, 23] +token_model_layers: !ref +select_layers: null +token_offset: 1 +vocoder_src_discrete: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS + hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS + wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS +vocoder_src_continous: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS + hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS + wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS +vocoder_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: !ref + continuous: !ref +vocoder_available_layers: [1, 3, 7, 12, 18, 23] +vocoder_takes_spk_emb: True +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: flexthink/discrete_wavlm_spk_rec_ecapatdn_lite + hubert: flexthink/discrete_hubert_spk_rec_ecapatdn_lite + wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite +asr_src: speechbrain/asr-transformer-transformerlm-librispeech +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
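+# The gate settings above control end-of-audio detection: gate_threshold is the
+# sigmoid cutoff applied at inference time, while gate_loss_beta, gate_loss_gamma and
+# gate_loss_max_weight parameterize the distance_diff_loss_ramp used for gate_offset
+# and are passed on to the TokotronLoss (compute_cost) defined further below.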
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + +# Guides +guides_enabled: False +guides_start_epoch: 40 +guides_spk: False +guides_spk_discrete: True +guides_spk_loss_weight: 0.2 +guides_asr: True +guides_asr_loss_weight: 0.1 + + + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + + +token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL + ssl_model: !ref + kmeans_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + save_path: !ref + layers_num: !apply:benchmarks.DASB.utils.hparams.as_list + value: !ref + dtype: int + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class + source: !ref + savedir: !ref /ecapa- + pymodule_file: custom_interface.py + classname: DiscreteSpkEmb + overrides: + ssl_layer_num_selected: !ref + +asr_model: !name:benchmarks.DASB.model.Tokotron.TransformerASRGuide + source: !ref + savedir: !ref /asr-transformer + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !apply:benchmarks.DASB.utils.hparams.as_list + value: !ref + dtype: int + deduplicates: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers + layers: !ref + value: False + bpe_tokenizers: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers + layers: !ref + value: null + + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + token_model_kwargs: !ref + ssl_model: !ref + 
ssl_model_layers: !apply:benchmarks.DASB.utils.hparams.as_list + value: !ref + dtype: int + token_model_layers: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + data_folder_alignments: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +layerwise_renorm: True +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +enc_n_dim: 16 +dec_n_dim: 256 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +emb_dropout: 0.0 +activation: !name:torch.nn.GELU +audio_num_tokens: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA + +############################## models ################################ + +vocoder_layers: !apply:benchmarks.DASB.utils.hparams.as_list + value: !apply:speechbrain.utils.hparams.choice + value: !ref + default: !ref + choices: + null: !ref + dtype: int + +vocoder_discrete: !name:benchmarks.DASB.model.custom_model.GumbelUnitVocoderWrapper + model: !name:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams + source: !ref + savedir: !ref + available_layers: !ref + layers: !ref + num_units: !ref + offset: !ref + +vocoder_continuous: !name:benchmarks.DASB.model.custom_model.VocoderWrapper + model: !name:speechbrain.inference.vocoders.HIFIGAN.from_hparams + source: !ref + savedir: !ref + +vocoder: !apply:benchmarks.DASB.utils.hparams.choice + value: !ref + apply: True + choices: + discrete: !ref + continuous: !ref + +inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference + bos_index: !ref + eos_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + using_eos_threshold: False + length_normalization: True + audio_token_shift: !ref + +inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference + scale_factor: !ref + gate_threshold: !ref + eos_mode: !ref + representation_mode: !ref + +inference: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + search: !ref + forward: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: True + injection: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + enc_n_dim: !ref + dec_n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + emb_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + inference: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + representation_mode: !ref + use_position_shift: !ref + max_position_shift: !ref + position_shift_probability: !ref + position_shift_seed: !ref + emb: !ref + layerwise_renorm: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage 
training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + spk_weight: !ref + asr_weight: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +freezer: !new:benchmarks.DASB.utils.preparation.Freezer + save_path: !ref + archive_path: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport + logger: !ref + sample_rate: !ref + eos_threshold: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml new file mode 100644 index 000000000..c6875498c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -0,0 +1,330 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 
+bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# DAC-specific settings +model_type: 24khz +model_bitrate: 8kbps + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +dac: !new:speechbrain.lobes.models.discrete.dac.DAC + sample_rate: !ref + model_type: !ref + model_bitrate: !ref + load_pretrained: True + +token_model: !new:benchmarks.DASB.model.custom_model.DACFeatureExtractor + dac: !ref + n_quantizers: !ref + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +vocoder: !new:benchmarks.DASB.model.custom_model.DACVocoder + dac: !ref + + + +inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference + bos_index: !ref + eos_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + using_eos_threshold: False + length_normalization: True + audio_token_shift: !ref + 
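+# Forward (single-pass) inference alternative to the beam-search implementation above;
+# the inference entry below selects between the two based on inference_mode.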
+inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference + scale_factor: !ref + gate_threshold: !ref + eos_mode: !ref + +inference: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + search: !ref + forward: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + inference: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + emb: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +freezer: !new:benchmarks.DASB.utils.preparation.Freezer + save_path: !ref + archive_path: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport + logger: !ref + sample_rate: !ref + eos_threshold: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..c1c2f9f1c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -0,0 +1,396 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder_alignments: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +ssl_model_type: wavlm +representation_mode: discrete +vocoder_model_name: !ref unithifigan-dasb---ms +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: 
["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large-960h-lv60-self +g2p_src: flexthink/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +ssl_model_layers: [1, 3, 7, 12, 18, 23] +token_model_layers: !ref +select_layers: null +token_offset: 1 +vocoder_src_discrete: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS + hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS + wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS +vocoder_src_continous: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS + hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS + wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS +vocoder_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: !ref + continuous: !ref +vocoder_available_layers: [1, 3, 7, 12, 18, 23] +vocoder_takes_spk_emb: True +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: flexthink/discrete_wavlm_spk_rec_ecapatdn_lite + hubert: flexthink/discrete_hubert_spk_rec_ecapatdn_lite + wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite +asr_src: speechbrain/asr-transformer-transformerlm-librispeech +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
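+# lr and lr_warmup_steps feed the Noam scheduler (lr_annealing) defined at the end of
+# this file; the guided_attention_* and gate_loss_* values above are consumed by the
+# TokotronLoss (compute_cost) defined below.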
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 2000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + +# Guides +guides_enabled: False +guides_start_epoch: 40 +guides_spk: False +guides_spk_discrete: True +guides_spk_loss_weight: 0.2 +guides_asr: True +guides_asr_loss_weight: 0.1 + + + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class + source: !ref + savedir: !ref /ecapa- + pymodule_file: custom_interface.py + classname: DiscreteSpkEmb + overrides: + ssl_layer_num_selected: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + ssl_model: !ref + ssl_model_layers: !ref + token_model_layers: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + data_folder_alignments: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +layerwise_renorm: True +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA + +############################## models ################################ + +vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams + source: !ref + savedir: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: True + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml new file mode 100644 index 000000000..a82d82a2c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -0,0 +1,352 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] 
+ full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +token_model: !new:speechbrain.lobes.models.huggingface_transformers.Encodec + source: !ref + save_path: !ref + bandwidth: !ref + flat_embeddings: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + + 
+####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +enc_n_dim: 16 +dec_n_dim: 256 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +vocoder: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encodec: !new:benchmarks.DASB.model.custom_model.EncodecVocoder + encodec: !ref + vocos: !new:speechbrain.lobes.models.huggingface_transformers.vocos.Vocos + source: !ref + save_path: !ref + + +inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference + bos_index: !ref + eos_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + using_eos_threshold: False + length_normalization: True + audio_token_shift: !ref + +inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference + scale_factor: !ref + gate_threshold: !ref + eos_mode: !ref + +inference: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + search: !ref + forward: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + enc_n_dim: !ref + dec_n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + inference: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + emb: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +freezer: !new:benchmarks.DASB.utils.preparation.Freezer + save_path: !ref + archive_path: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +progress_logger: !new:benchmarks.DASB.utils.train_logger.ArchiveTrainLogger + current_path: !ref + archive_path: !ref + meta_path: !ref + epoch_counter: !ref + +progress_report: 
!new:benchmarks.DASB.utils.tts.TTSProgressReport + logger: !ref + sample_rate: !ref + eos_threshold: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml new file mode 100644 index 000000000..1711b10f4 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -0,0 +1,329 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_archive: !ref /progress.tar +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +freeze_token_model: True +token_model_src: "fnlp/SpeechTokenizer" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
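Since lr_annealing_mode is "step", the NoamScheduler configured in these files is invoked once per optimizer step (see fit_batch in the training script below) rather than once per epoch. A rough sketch of that interaction with a placeholder model, assuming the scheduler call updates the optimizer's learning rate in place:

import torch
from speechbrain.nnet.schedulers import NoamScheduler

# Placeholder model/optimizer; lr values mirror the hyperparameters above.
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
scheduler = NoamScheduler(lr_initial=0.0005, n_warmup_steps=10000)

for _ in range(3):
    optimizer.zero_grad()
    loss = model(torch.randn(4, 8)).sum()
    loss.backward()
    optimizer.step()
    # Same call as in fit_batch(): applies the Noam warmup/decay curve
    # to the optimizer's learning rate after every step.
    curr_lr, next_lr = scheduler(optimizer)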
+ +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +# Token model (pretrained) +token_model: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerInterface + source: !ref + save_path: !ref + shape: compat + codebooks: !ref + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +extract_features_opts: + dataloader_opts: + batch_size: !ref + num_workers: !ref + token_model: !ref + sample_rate: !ref + model_sample_rate: !ref + spk_emb_model: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 +dec_num_layers: 12 +d_ffn: 2048 +z_dim: 128 +hidden_dim: 2048 +n_dim: 16 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA + +############################## models ################################ + +vocoder: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerVocoder + tokenizer: !ref + +inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference + bos_index: !ref + eos_index: !ref + min_decode_ratio: !ref + max_decode_ratio: !ref + beam_size: !ref + using_eos_threshold: False + length_normalization: True + audio_token_shift: !ref + +inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference + scale_factor: !ref + gate_threshold: !ref + eos_mode: !ref + +inference: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + search: !ref + forward: !ref + +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + 
audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + z_dim: !ref + hidden_dim: !ref + n_dim: !ref + decoder_chunk_size: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + vocoder: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + inference: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + emb: !ref + +modules: + model: !ref + vocoder: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +freezer: !new:benchmarks.DASB.utils.preparation.Freezer + save_path: !ref + archive_path: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +progress_logger: !new:benchmarks.DASB.utils.train_logger.ArchiveTrainLogger + current_path: !ref + archive_path: !ref + meta_path: !ref + epoch_counter: !ref + +progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport + logger: !ref + sample_rate: !ref + eos_threshold: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py new file mode 100644 index 000000000..a09a4cc23 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -0,0 +1,1077 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import math +import torch +import sys +from functools import partial +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from Tokotron import ( + RepresentationMode, + get_silence_token, + use_silence_padding, + feature_pad_to, +) +from types import SimpleNamespace +from evaluate import TokotronEvaluator +import re +import string + + +logger = logging.getLogger(__name__) + +SPECIAL_TOKEN_COUNT = 1 + + +# Brain class for speech recognition training +class TokotronBrain(sb.Brain): + """Class that manages the training loop. 
See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluator = TokotronEvaluator( + hparams=hparams, + create_waveform_fn=self.create_waveform, + device=self.device, + ) + self.representation_mode = RepresentationMode(self.hparams.representation_mode) + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + raise NotImplementedError() + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + tokens, tokens_length = batch.tokens + features = self.prepare_features(batch) + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb + ) = features + + predictions = self.modules.model( + input_tokens=tokens, + input_length=tokens_length, + audio=audio_bos, + audio_length=audio_bos_length, + emb={ + "spk": spk_emb + } + ) + + return predictions, features + + def prepare_features(self, batch): + if self.hparams.spk_emb_shuffle: + wav, wav_length = batch.spk_emb_random_match + else: + wav, wav_length = batch.sig + spk_emb = self._compute_spk(wav, wav_length).squeeze(1) + + if self.representation_mode == RepresentationMode.DISCRETE: + audio_bos, audio_bos_length = batch.audio_bos + audio_tgt, audio_tgt_length = batch.audio_pad + else: + wav, audio_length = batch.sig + audio = self.modules.ssl_model(wav) + audio = audio[self.hparams.ssl_model_layers, :, :, :].permute( + 1, 2, 0, 3 + ) + batch_size, _, heads, dim = audio.shape + bos = torch.zeros_like( + audio[:, :1, :, :] + ).reshape(batch_size, self.hparams.bos_width, heads, dim) + audio_bos = torch.concatenate( + [bos, audio], + dim=1 + ) + audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) + audio_tgt = audio + audio_tgt_length = audio_length + + return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length, spk_emb + + def _compute_spk(self, wav, wav_length): + mel_spec = self.spk_emb_model.mel_spectogram( + wav.squeeze(1)) + spk_emb_pred = self.spk_emb_model.encode_mel_spectrogram_batch( + mel_spec, wav_length + ) + return spk_emb_pred + + def _get_selected_layer_idx(self): + selected_layers = None + if hasattr(self.hparams, "select_layers") and self.hparams.select_layers: + layers = self.hparams.select_layers + model_layers_map = { + layer: idx + for idx, layer in enumerate( + self.hparams.token_model_layers) + } + selected_layers = [model_layers_map[layer] for layer in layers] + return selected_layers + + # TODO: Move this elsewhere + def select_layers(self, audio_ssl): + """Applies layer squishing, if enabled + + Arguments + --------- + audio_ssl : torch.Tensor + SSL features + + Returns + ------- + audio_ssl : torch.Tensor + SSL features, squished if enabled + """ + if self.layer_idx: + audio_ssl = audio_ssl[:, :, self.layer_idx] + return audio_ssl + + def 
compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + predictions, features = predictions + ( + audio_bos, + audio_bos_length, + audio_tgt, + audio_tgt_length, + spk_emb + ) = features + + loss_details = self.hparams.compute_cost( + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + ) + self.loss_metric.append( + batch.uttid, + predictions=predictions, + audio=audio_tgt, + audio_length=audio_tgt_length, + input_tokens=batch.tokens.data, + input_length=batch.tokens.lengths, + reduction="batch", + ) + return loss_details.loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + if hasattr(self.modules.vocoder, "model"): + self.modules.vocoder.model.device = self.device + self.layer_idx = self._get_selected_layer_idx() + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.hparams.compute_cost, batch_eval=True, + ) + if ( + self.hparams.audio_emb_pretrained + and epoch == 1 + and stage == sb.Stage.TRAIN + ): + # TODO: Clean this up + if hasattr(self.hparams.token_model, "vocabulary"): + vocabulary = self.hparams.token_model.vocabulary + elif hasattr(self.hparams.token_model, "vocabularies"): + vocabulary = torch.stack( + [ + torch.from_numpy(voc) + for voc in self.hparams.token_model.vocabularies + ] + ) + self.modules.model.init_audio_emb(vocabulary) + # Load the compression model only if compression is enables + pretrained_run_opts = {"device": self.device} + self.spk_emb_model = self.hparams.spk_emb_model( + run_opts=pretrained_run_opts + ) + self.representation_mode = RepresentationMode(self.hparams.representation_mode) + # If speaker embedding shuffling is enabled, re-initialize them for the + # epoch + if self.hparams.spk_emb_shuffle: + stage_key = stage.name.lower() + self.resample_fn[stage_key](epoch=epoch) + + # Reset the learning rate - if supported. This is useful when fine-tuning + # a model pre-trained on another dataset + if ( + stage == sb.Stage.TRAIN + and self.hparams.reset_annealing_epoch is not None + and epoch is not None + and epoch == self.hparams.reset_annealing_epoch + ): + self.hparams.lr_annealing.n_steps = 0 + + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. 
If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + return epoch % self.hparams.eval_interval == 0 + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + self.evaluator.evaluate_batch(batch) + return loss.detach().cpu() + + def make_dataloader( + self, dataset, stage, ckpt_prefix="dataloader-", **loader_kwargs + ): + """A custom override of make_dataloader that will change the batch + size if guides are enabled to meet GPU memory constraints + + Arguments + --------- + dataset : Dataset + A set of data to use to create data loader. If the Dataset is a + DynamicItemDataset, PaddedBatch is used as the default collate_fn, + unless specified in loader_kwargs. + stage : Stage + The stage of the experiment: Stage.TRAIN, Stage.VALID, Stage.TEST + ckpt_prefix : str, None + Prefix to use for SaveableDataLoader Checkpoint name. The Stage + name is added to this to create the full key. Set to None to not + save the DataLoader. + **loader_kwargs : dict + Additional keyword arguments to the DataLoader. + E.g., batch_size, num_workers, pin_memory. 
+ + Returns + ------- + DataLoader for the input dataset + """ + if stage == sb.Stage.TRAIN and not getattr(self, "_ckpt_recovered", False): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + if self.guides_running(pre_epoch=True): + loader_kwargs["batch_size"] = self.hparams.batch_size_guided + return super().make_dataloader( + dataset=dataset, + stage=stage, + ckpt_prefix=ckpt_prefix, + **loader_kwargs + ) + + def guides_running(self, pre_epoch=False): + """Determines whether guides are currently running + + Arguments + --------- + pre_epoch : bool + If enabled, a correction will be applied to the current epoch + indicating that the current epoch has not yet started""" + epoch = self.hparams.epoch_counter.current + if pre_epoch: + epoch += 1 + return ( + self.hparams.guides_enabled + and epoch >= self.hparams.guides_start_epoch + ) + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + ) + + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams, guide_ctx=None): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + guide_ctx : SimpleNamespace, optional + The guide context with pretrained models + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. 
+ silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + representation_mode = RepresentationMode( + hparams.get("representation_mode", RepresentationMode.DISCRETE) + ) + + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def audio_ref_pipeline(wav): + """The audio loading pipeline for references + + Arguments + --------- + wav : strƒnum_ + The file path + + Returns + ------- + sig : torch.Tensor + The waveform + """ + sig = sb.dataio.dataio.read_audio(wav) + return sig + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label.upper() + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + @sb.utils.data_pipeline.takes("label_norm") + @sb.utils.data_pipeline.provides("asr_tokens") + def asr_tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return torch.tensor(guide_ctx.asr_model.encode(label)) + + use_silence_padding = hparams.get("use_silence_padding", True) + if "token_model_layers" in hparams: + audio_tokens_per_step = len(hparams["token_model_layers"]) + else: + audio_tokens_per_step = hparams["audio_tokens_per_step"] + if use_silence_padding: + silence_token, silence_emb = get_silence_token( + hparams["tokenizer"], + extract_emb=True, + model_kwargs=hparams.get("token_model_kwargs"), + ) + else: + silence_token = ( + torch.ones(audio_tokens_per_step, dtype=torch.int64) + * hparams["eos_index"] + ) + + silence_padding = silence_token if representation_mode == RepresentationMode.DISCRETE else silence_emb + silence_padding = silence_padding.cpu() + silence_padding_len = int(math.ceil(hparams["silence_padding"])) + bos_width = hparams.get("bos_width", 1) + audio_bos_prefix = ( + torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] + ) + if representation_mode == RepresentationMode.CONTINUOUS: + audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat(1, 1, hparams["audio_dim"]) + + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("uttid") + @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") + def audio_pipeline(id): + audio = tokens_loader.tokens_by_uttid(id, num_codebooks=audio_tokens_per_step) + audio_pad = feature_pad_to( + audio, len(audio) + silence_padding_len, silence_padding + ) + yield audio_pad + audio_bos = torch.cat([audio_bos_prefix, audio_pad], dim=0) + yield audio_bos + + def spk_emb_random_match(uttid, dataset, spk_sample): + # Sample a speaker-matched embedding + selected_idx = spk_sample[uttid] + + # Retrieve the embedding value from the dataset + with dataset.output_keys_as(["sig"]): + spk_emb = dataset[selected_idx]["sig"] + return spk_emb + + dynamic_items = [ + text_pipeline, + tokens_pipeline, + audio_ref_pipeline, 
+ audio_pipeline + ] + output_keys = [ + "uttid", + "tokens", + "audio_pad", + "audio_bos", + "sig", + "spk_emb_random_match", + ] + + init_sequence_encoder(hparams) + + resample_fn = {} + for dataset in data_info: + dataset_output_keys = output_keys if dataset == "train" else output_keys + ["label_norm_eval"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + if hparams["spk_emb_shuffle"]: + spk_idx, spk_samplers = group_by_speaker( + dynamic_dataset, + hparams + ) + spk_sample = {} + spk_emb_random_match_pipeline = partial( + spk_emb_random_match, + spk_sample=spk_sample, + dataset=dynamic_dataset.filtered_sorted(), + ) + dynamic_dataset.add_dynamic_item( + func=spk_emb_random_match_pipeline, + takes=["uttid"], + provides=["spk_emb_random_match"], + ) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers + ) + resample_fn[dataset](epoch=0) + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. + if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + + # Exclude samples without phonemes + if hparams["input"] == "phonemes": + for key in datasets: + datasets[key] = datasets[key].filtered_sorted( + key_test={ + "phn": lambda value: value + } + ) + datasets["sample"] = select_sample(hparams, datasets) + return datasets, silence_padding, resample_fn + + +def select_sample(hparams, datasets): + """Selects a sample of files for sample generation, freezing the sample if + requested to persist across multiple experiments + + Arguments + --------- + hparams : dict + experiment hyperparameters + datasets : dict + a dictionary of datasets + + Returns + ------- + dataset : speechbrain.dataio.dataset.FilteredSortedDynamicItemDataset + the sample dataset + """ + sample_path = hparams.get("sample_path") + dataset = None + if sample_path is not None: + sample_path = Path(sample_path) + if sample_path.exists(): + with open(sample_path, "r") as sample_file: + data_ids = [line.strip() for line in sample_file] + dataset = FilteredSortedDynamicItemDataset( + datasets["valid"], data_ids + ) + + if dataset is None: + dataset = ( + datasets["valid"] + .batch_shuffle(1) + .filtered_sorted(select_n=hparams["num_audio_samples"]) + ) + if sample_path is not None: + with open(sample_path, "w") as sample_file: + for data_id in dataset.data_ids: + print(data_id, file=sample_file) + return dataset + + +def group_by_speaker(dataset, hparams): + """Groups utterance IDs in a dataset by speaker, for selection. 
The selection + is stable based on the seed - calling this method multiple times will always + result in the same order + + Arguments + --------- + dataset : torch.Tensor + the dataset from which to select items + hparams : dict + hyperparameters + + Returns + ------- + spk_idx : dict + a str -> int dictionary with a list of utterance indexes + for every speaker + spk_samplers : dict + a reproducible sampler for every speaker + spk_samplers_it : dict + an iterator for each sampler + """ + spk_idx = {} + spk_samplers = {} + speakers = [] + generator = torch.Generator() + generator.manual_seed(hparams["seed"]) + + # Group by speaker + with dataset.output_keys_as(["spk_id"]): + for idx, item in enumerate(dataset): + spk_id = item["spk_id"] + if spk_id not in spk_idx: + spk_idx[spk_id] = [] + spk_idx[spk_id].append(idx) + speakers.append(spk_id) + + # Create a reproducible sampler + for spk_id in speakers: + sampler = hparams["spk_sampler"](data_source=spk_idx[spk_id]) + spk_samplers[spk_id] = sampler + + return spk_idx, spk_samplers + + +def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch): + """Selects new samples + + Arguments + --------- + spk_idx : dict + Data item indexes grouped by speaker + spk_samplers : dict + A sampler for each speaker + spk_samplers_it : dict + An iterator for each speaker + epoch : int + The epoch number + + Returns + ------- + sample : dict + a dictionary with uttids as keys and matching + indexes as values + """ + if epoch is None: + epoch = 0 + spk_samplers_it = {} + for spk_id, sampler in spk_samplers.items(): + sampler.set_epoch(epoch) + spk_samplers_it[spk_id] = iter(sampler) + with dataset.output_keys_as(["uttid", "spk_id"]): + for item in dataset: + spk_item_idx = next(spk_samplers_it[item["spk_id"]]) + dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx] + sample[item["uttid"]] = dataset_item_idx + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + return encoder + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + if not Path(file_name).exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, _, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def get_guide_ctx(hparams, run_opts): + """Initializes a context object for guides, + containing pretrained models only for guides that will be + used per hparams + + Arguments + --------- + hparams : dict + Hyperparameters + run_opts : dict + Run options + + Returns + ------- + ctx : SimpleNamespace + The resulting context""" + ctx = {} + if hparams["guides_enabled"]: + pretrained_run_opts = {"device": run_opts.get("device", "cpu")} + if hparams["guides_spk"]: + ctx["spk_emb_model"] = hparams["spk_emb_model"]( + run_opts=pretrained_run_opts + ) + if hparams["guides_asr"]: + ctx["asr_model"] = hparams["asr_model"]( + run_opts=pretrained_run_opts + ) + return SimpleNamespace(**ctx) + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + + +def run_experiment(brain_cls): + """Starts the experiement + + Arguments + --------- + brain_cls : type + The brain class to instantiate + """ + + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with 
open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from libritts_prepare import prepare_libritts + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "model_name": hparams["model"].__class__.__name__, + }, + ) + + # We can now directly create the datasets for training, valid, and test + guide_ctx = get_guide_ctx(hparams, run_opts) + ( + datasets, + silence_padding, + resample_fn + ) = dataio_prepare(hparams, guide_ctx) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_pad", "audio_bos"] + + # Trainer initialization + tts_brain = brain_cls( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + tts_brain.sample_data = datasets["sample"] + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
+ tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=use_silence_padding( + hparams["train_dataloader_opts"], silence_padding, audio_keys + ), + valid_loader_kwargs=use_silence_padding( + hparams["valid_dataloader_opts"], silence_padding, audio_keys + ), + ) + + # Load best checkpoint for evaluation + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=use_silence_padding( + hparams["test_dataloader_opts"], silence_padding, audio_keys + ), + ) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py new file mode 100644 index 000000000..9c8b243be --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py @@ -0,0 +1,47 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio +Continuous SSL verfsion + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronContinuousSSLBrain(TokotronBrain): + """Tokotron implementation for Encodec""" + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + wav = self.modules.vocoder(audio, emb) + wav = wav.squeeze(1) + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronContinuousSSLBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py new file mode 100644 index 000000000..78c584c45 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py @@ -0,0 +1,47 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio - DAC version + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronDACBrain(TokotronBrain): + """Tokotron implementation for Encodec""" + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + z, _, _ = self.modules.tokenizer.quantizer.from_codes( + audio.transpose(1, 2).int() + ) + wav = self.modules.tokenizer.decode(z).squeeze(1) + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronDACBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py new file 
mode 100644 index 000000000..3cc0e2644 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py @@ -0,0 +1,79 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio +Discrete SSL version + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + +import torch +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronDiscreteSSLBrain(TokotronBrain): + """Tokotron implementation for Encodec""" + + def on_stage_start(self, stage, epoch): + self.compute_offset() + return super().on_stage_start(stage, epoch) + + def compute_offset(self): + """Computes per-layer offsets""" + layers_set = set(self.hparams.token_model_layers) + available_layers_set = set(self.hparams.vocoder_available_layers) + if not layers_set.issubset(available_layers_set): + unavailable_layers = ",".join( + str(layer) for layer in (layers_set - available_layers_set) + ) + raise ValueError(f"Layers {unavailable_layers} are not supported") + self.num_units = self.hparams.vocab_size + _, layers_idx = torch.where( + torch.tensor( + self.hparams.vocoder_available_layers, device=self.device + ).unsqueeze(0) + == torch.tensor( + self.hparams.token_model_layers, device=self.device + ).unsqueeze(1) + ) + self.layer_offset = ( + torch.tensor(layers_idx, device=self.device) * self.num_units + )[None, None, :] + self.offset = self.hparams.token_offset + self.modules.vocoder.tokenize = False + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + units_with_offset = ( + audio + self.layer_offset.to(audio.device) + self.offset + ) + wav = self.modules.vocoder(units_with_offset) + wav = wav.squeeze(1) + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronDiscreteSSLBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py new file mode 100644 index 000000000..07edbbd8c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py @@ -0,0 +1,46 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronEncodecBrain(TokotronBrain): + """Tokotron implementation for Encodec""" + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : 
torch.Tensor + """ + wav = self.modules.token_model.decode(audio) + wav = wav.squeeze(1) + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronEncodecBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py new file mode 100644 index 000000000..fdbbb3ed7 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py @@ -0,0 +1,46 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio - SpeechTokenizer version + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + +from train import TokotronBrain, run_experiment +from speechbrain.dataio.dataio import clean_padding_ + + +class TokotronSTBrain(TokotronBrain): + """Tokotron implementation for SpeechTokenizer""" + + def create_waveform(self, audio, length, emb): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + length : torch.Tensor + A 1-D tensor + emb: dict + Embeddings (speaker, etc) + + Returns + ------- + wav : torch.Tensor + """ + wav = self.modules.token_model.decode(audio) + if length is not None: + clean_padding_(wav, length) + return wav + + +if __name__ == "__main__": + run_experiment(TokotronSTBrain) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py new file mode 100644 index 000000000..ad2f5bf0c --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -0,0 +1,90 @@ +#!/usr/bin/env/python3 +"""Recipe for extracting discrete audio tokens from LibriTTS.
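+
+The script merges the train/valid/test manifests produced by libritts_prepare
+into a single dataset and saves the extracted tokens (and, optionally, the
+pretrained embeddings) under the configured save_folder.
+
+A typical invocation (illustrative paths; any of the provided hparams files
+can be used) is:
+
+    python extract.py hparams/encodec.yaml --data_folder=/path/to/LibriTTS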
+ +Authors + * Jarod Duret 2024 +""" + +import os +import sys +import logging +import pathlib as pl +import speechbrain as sb +from speechbrain.dataio.dataset import DynamicItemDataset +from speechbrain.utils.distributed import run_on_main +from hyperpyyaml import load_hyperpyyaml + +base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +sys.path.append(base_dir) + +print(base_dir) + +logger = logging.getLogger(__name__) + + +if __name__ == "__main__": + # CLI: + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + with open(hparams_file) as fin: + hparams = load_hyperpyyaml(fin, overrides) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + # Dataset prep (parsing Librispeech + from libritts_prepare import prepare_libritts # noqa + + # multi-gpu (ddp) save data preparation + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "train_split": hparams["train_splits"], + "valid_split": hparams["dev_splits"], + "test_split": hparams["test_splits"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": hparams["test_json"], + "sample_rate": hparams["sample_rate"], + "skip_prep": hparams["skip_prep"], + }, + ) + + tokens_extractor = hparams["tokens_extractor"] + data_folder = hparams["data_folder"] + datasets = [] + for split in ["train", "valid", "test"]: + json_path = hparams[f"{split}_json"] + name = pl.Path(json_path).stem + dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=json_path, replacements={"data_root": data_folder}, + ) + datasets.append(dataset) + + merged_data = { + key: value + for dataset in datasets + for key, value in dataset.data.items() + } + merged_dataset = DynamicItemDataset(merged_data) + + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Extracting dataset tokens ...") + tokens_extractor.extract_tokens( + merged_dataset, + hparams["num_codebooks"], + (save_folder / "libritts").as_posix(), + ) + + if hparams["save_embedding"]: + save_folder = pl.Path(hparams["save_folder"]) + logger.info("Saving embeddings ...") + tokens_extractor.save_pretrained_embeddings( + (save_folder / "embeddings").as_posix(), + vocab_size=hparams["vocab_size"], + num_codebooks=hparams["num_codebooks"], + ) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml new file mode 100644 index 000000000..76870e279 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: DAC +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: 
wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +# DAC parameters +# model_type: [16khz, 24khz, 44khz, 44khz] +# vocab_size: [1024, 1024, 1024, 1024] +# model_bitrate: [8kbps, 8kbps, 8kbps, 16kbps] +# max_num_codebooks: [12, 32, 9, 18] +# embedding_dim: [1024, 1024, 1024, 128] +model_type: 24khz +vocab_size: 1024 +model_bitrate: 8kbps +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + load_pretrained: True + tag: latest + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml new file mode 100644 index 000000000..2b57a7edf --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml @@ -0,0 +1,101 @@ +# ############################################################################ +# Auido Tokenizer: WavLM +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavlm +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +### Configuration for discrete SSL model +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | + +# ssl_model_type: hubert, wavlm, wav2vec2 +# ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large +ssl_model_type: WavLM +ssl_hub: microsoft/wavlm-large +ssl_folder: !ref /ssl_checkpoint +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +freeze_ssl: True +freeze_feature_extractor: True +vocab_size: 1000 +save_embedding: False + +### Config for Tokenizer +# Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) +num_codebooks: [1, 3, 7, 12, 18, 23] +deduplicate: [False, False, False, False, False, False] +bpe_tokenizer_path: [null, null, null, null, null, null] +sample_rate: 16000 +encoder_dim: 
1024 + +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + WavLM: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + HuBERT: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + Wav2Vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + output_norm: False + freeze: !ref + freeze_feature_extractor: !ref + output_all_hiddens: True + save_path: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml new file mode 100644 index 000000000..31211ec75 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -0,0 +1,63 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +bandwidth: 24.0 +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml new file mode 100644 index 000000000..9a53ed27b --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -0,0 +1,53 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# 
############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +vocab_size: 1024 +num_codebooks: 8 +sample_rate: 16000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + +# EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py new file mode 120000 index 000000000..39f1a78c2 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/libritts_prepare.py @@ -0,0 +1 @@ +../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py new file mode 100644 index 000000000..6d0ca9f0a --- /dev/null +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -0,0 +1,331 @@ +""" +LibriTTS data preparation + +Authors + * Pradnya Kandarkar 2022 +""" + +import json +import os +import random + +import torch +import torchaudio +from tqdm import tqdm + +from speechbrain.inference.text import GraphemeToPhoneme +from speechbrain.utils.data_utils import get_all_files +from speechbrain.utils.logger import get_logger +from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations + +logger = get_logger(__name__) +LIBRITTS_URL_PREFIX = "https://www.openslr.org/resources/60/" + +DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def prepare_libritts( + data_folder, + save_json_train, + save_json_valid, + save_json_test, + sample_rate, + split_ratio=[80, 10, 10], + libritts_subsets=None, + train_split=None, + valid_split=None, + test_split=None, + seed=1234, + model_name=None, + max_valid_size=500, + skip_prep=False, +): + """ + Prepares the json files for the LibriTTS dataset. + Downloads the dataset if it is not found in the `data_folder` as expected. + + Arguments + --------- + data_folder : str + Path to the folder where the LibriTTS dataset is stored. + save_json_train : str + Path where the train data specification file will be saved. + save_json_valid : str + Path where the validation data specification file will be saved. + save_json_test : str + Path where the test data specification file will be saved. + sample_rate : int + The sample rate to be used for the dataset + split_ratio : list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. 
For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + libritts_subsets: list + List of librispeech subsets to use (e.g., dev-clean, train-clean-100, ...) for the experiment. + This parameter will be ignored if explicit data splits are provided. + Explicit data splits parameters: "train_split", "valid_split", "test_split" + train_split : list + List of librispeech subsets to use (e.g.,train-clean-100, train-clean-360) for the experiment training stage. + valid_split : list + List of librispeech subsets to use (e.g., dev-clean) for the experiment validation stage. + test_split : list + List of librispeech subsets to use (e.g., test-clean) for the experiment testing stage. + seed : int + Seed value + model_name : str + Model name (used to prepare additional model specific data) + skip_prep: Bool + If True, skip preparation. + + Returns + ------- + None + """ + + if skip_prep: + return + + # Setting the seed value + random.seed(seed) + + # Checks if this phase is already done (if so, skips it) + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed in previous run, skipping.") + return + + logger.info( + f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}" + ) + + # If specific splits are provided, creates data manifest files accordingly + if train_split: + wav_list = prepare_split(data_folder, train_split) + create_json(wav_list, save_json_train, sample_rate, model_name) + if valid_split: + wav_list = prepare_split(data_folder, valid_split) + # TODO add better way to speedup evaluation + if len(wav_list) > max_valid_size: + wav_list = random.sample(wav_list, max_valid_size) + create_json(wav_list, save_json_valid, sample_rate, model_name) + if test_split: + wav_list = prepare_split(data_folder, test_split) + create_json(wav_list, save_json_test, sample_rate, model_name) + + if skip(save_json_train, save_json_valid, save_json_test): + logger.info("Preparation completed.") + return + + # If specific splits are not provided, and a list of subsets if provided, creates train, valid, test splits + # Creates data manifest files according to the data splits + if libritts_subsets: + wav_list = prepare_split(data_folder, libritts_subsets) + # Random split the signal list into train, valid, and test sets. + data_split = split_sets(wav_list, split_ratio) + # Creating json files + create_json( + data_split["train"], save_json_train, sample_rate, model_name + ) + create_json( + data_split["valid"], save_json_valid, sample_rate, model_name + ) + create_json(data_split["test"], save_json_test, sample_rate, model_name) + + +def prepare_split(data_folder, split_list): + """ + Processes the provided list of LibriTTS subsets and creates a list of all the .wav files present in the subsets. + Downloads the LibriTTS subsets as required. + + Arguments + --------- + data_folder : str + Path to the folder where the LibriTTS dataset is stored + split_list : list + List of librispeech subsets to process (e.g., dev-clean, train-clean-100, ...) 
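+        For example (an illustrative call), prepare_split(data_folder, ["dev-clean"])
+        returns the paths of all .wav files found under data_folder/dev-clean.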
+ + Returns + ------- + wav_list : list + List of all .wav files to be processed + """ + extension = [".wav"] # The expected extension for audio files + wav_list = list() # Stores all audio file paths for the dataset + + # For every subset of the dataset, if it doesn't exist, downloads it + for subset_name in split_list: + subset_folder = os.path.join(data_folder, subset_name) + subset_archive = os.path.join(subset_folder, subset_name + ".tar.gz") + + if not check_folders(subset_folder): + logger.info( + f"No data found for {subset_name}. Checking for an archive file." + ) + if not os.path.isfile(subset_archive): + logger.info( + f"No archive file found for {subset_name}. Downloading and unpacking." + ) + quit() + # Collects all files matching the provided extension + wav_list.extend(get_all_files(subset_folder, match_and=extension)) + + return wav_list + + +def create_json(wav_list, json_file, sample_rate, model_name=None): + """ + Creates the json file given a list of wav files. + Arguments + --------- + wav_list : list of str + The list of wav files. + json_file : str + The path of the output json file + sample_rate : int + The sample rate to be used for the dataset + model_name : str + Model name (used to prepare additional model specific data) + """ + + # Downloads and initializes the G2P model to compute the phonemes if data is being prepared for Tacotron2 experiments + if model_name == "Tacotron2": + logger.info( + "Computing phonemes for labels using SpeechBrain G2P. This may take a while." + ) + g2p = GraphemeToPhoneme.from_hparams( + "speechbrain/soundchoice-g2p", run_opts={"device": DEVICE} + ) + else: + g2p = None + + json_dict = {} + + # Processes all the wav files in the list + for wav_file in tqdm(wav_list): + # Reads the signal + signal, sig_sr = torchaudio.load(wav_file) + duration = signal.shape[1] / sig_sr + + # TODO add better way to filter short utterances + if duration < 1.0: + continue + + # Manipulates path to get relative path and uttid + path_parts = wav_file.split(os.path.sep) + uttid, _ = os.path.splitext(path_parts[-1]) + # relative_path = os.path.join("{data_root}", *path_parts[-4:]) + + # Gets the path for the text files and extracts the input text + normalized_text_path = os.path.join( + "/", *path_parts[:-1], uttid + ".normalized.txt" + ) + try: + with open(normalized_text_path, encoding="utf-8") as f: + normalized_text = f.read() + if normalized_text.__contains__("{"): + normalized_text = normalized_text.replace("{", "") + if normalized_text.__contains__("}"): + normalized_text = normalized_text.replace("}", "") + except FileNotFoundError: + print(f"Warning: The file {normalized_text_path} does not exist.") + continue + + # Resamples the audio file if required + if sig_sr != sample_rate: + resampled_signal = torchaudio.functional.resample( + signal, sig_sr, sample_rate + ) + os.unlink(wav_file) + torchaudio.save(wav_file, resampled_signal, sample_rate=sample_rate) + + # Gets the speaker-id from the utterance-id + spk_id = uttid.split("_")[0] + + # Creates an entry for the utterance + json_dict[uttid] = { + "uttid": uttid, + "wav": wav_file, + "duration": duration, + "spk_id": spk_id, + "label": normalized_text, + "segment": True if "train" in json_file else False, + } + + # Characters are used for Tacotron2, phonemes may be needed for other models + if model_name not in ["Tacotron2", "HiFi-GAN"] and g2p is not None: + # Computes phoneme labels using SpeechBrain G2P and keeps the punctuations + phonemes = _g2p_keep_punctuations(g2p, normalized_text) + 
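+            # The phoneme sequence is stored under "label_phoneme", next to the grapheme label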
json_dict[uttid].update({"label_phoneme": phonemes}) + + # Writes the dictionary to the json file + with open(json_file, mode="w", encoding="utf-8") as json_f: + json.dump(json_dict, json_f, indent=2) + + logger.info(f"{json_file} successfully created!") + + +def skip(*filenames): + """ + Detects if the data preparation has been already done. + If the preparation has been done, we can skip it. + + Arguments + --------- + *filenames : tuple + Set of filenames to check for existence. + + Returns + ------- + bool + if True, the preparation phase can be skipped. + if False, it must be done. + """ + for filename in filenames: + if isinstance(filename, list): + if any(not os.path.isfile(item) for item in filename): + return False + else: + if not os.path.isfile(filename): + return False + return True + + +def split_sets(wav_list, split_ratio): + """Randomly splits the wav list into training, validation, and test lists. + + Arguments + --------- + wav_list : list + list of all the signals in the dataset + split_ratio: list + List composed of three integers that sets split ratios for train, valid, + and test sets, respectively. For instance split_ratio=[80, 10, 10] will + assign 80% of the sentences to training, 10% for validation, and 10% + for test. + + Returns + ------- + dictionary containing train, valid, and test splits. + """ + # Random shuffles the list + random.shuffle(wav_list) + tot_split = sum(split_ratio) + tot_snts = len(wav_list) + data_split = {} + splits = ["train", "valid"] + + for i, split in enumerate(splits): + n_snts = int(tot_snts * split_ratio[i] / tot_split) + data_split[split] = wav_list[0:n_snts] + del wav_list[0:n_snts] + data_split["test"] = wav_list + + return data_split + + +def check_folders(*folders): + """Returns False if any passed folder does not exist.""" + for folder in folders: + if not os.path.exists(folder): + return False + return True diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 804227d55..14aa38693 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -20,6 +20,8 @@ PositionalEncoding as TransformerPositionalEncoding, get_lookahead_mask, ) +from speechbrain.dataio.batch import PaddedBatch +from speechbrain.utils.data_utils import batch_pad_right from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear @@ -592,216 +594,6 @@ def forward(self, enc_out, length, emb=None): ) -class TokotronSearchWrapper(nn.Module): - """A wrapper class to facilitate seach-based inference. 
It takes care of re-interpreting - a multi-headed sequence as multiple samples, for compatibility, and for the retention - of attention tensors - - Arguments - --------- - decoder : TokotronTransformerDecoder - the Tokotron transformer decoder - """ - - def __init__(self, decoder): - super().__init__() - self.tokens_per_step = decoder.tokens_per_step - self.decoder = decoder - - def decode(self, memory, enc_states, enc_lens): - """Wraps the decode operation, will all the necessary - reshaping - - Arguments - --------- - memory : torch.Tensor - Characters predicted so far - enc_states : torch.Tensor - Encoder states - enc_lens : torch.Tensor - Encoder state lengths - """ - batch_size = enc_states.size(0) // self.tokens_per_step - _, mem_len = memory.shape - memory = memory.reshape( - self.tokens_per_step, batch_size, mem_len - ).permute(1, 2, 0) - dec_out, dec_self_attn, dec_attn = self.decoder.decode( - enc_out=enc_states[:batch_size], - src_length=enc_lens[:batch_size], - tgt=memory, - ) - self.dec_self_attn = dec_self_attn - self.dec_attn = dec_attn - return dec_out, dec_attn - - -class TokotronTransformerBeamSearcher(S2STransformerBeamSearcher): - """A slight modification of S2STransformerBeamSearcher that uses an - explicit number of tokens instead of trying to infer it from the - weights of the linear layer. This is needed because Tokotron is - multi-header and the final output layer outputs multiple output states - - Arguments - --------- - num_tokens : int - The number of audio tokens available - """ - - def __init__(self, num_tokens, *args, **kwargs): - super().__init__(*args, **kwargs) - self.num_tokens = num_tokens - - def set_n_out(self): - """Set the number of output tokens.""" - return self.num_tokens - - -class SearchLinearWrapper(nn.Module): - """A wrapper for the final linear layer of the Transformer. The goal is to - make it compatible with the SpeechBrain Beam Search implementation, which is - single-headed, by expanding multiple heads along the batch dimensions. 
- - Arguments - --------- - lin : torch.Tensor - A linear layer with an output feature dimensions of - (tokens_per_step x num_tokens) - tokens_per_step : int - the numer of tokens the model outputs for each - time step - """ - - def __init__(self, lin, tokens_per_step): - super().__init__() - self.lin = lin - self.tokens_per_step = tokens_per_step - - def forward(self, x): - """Performs a forward pass with all the required reshape operations - - Arguments - --------- - x : torch.Tensor - The decoder output - - Returns - ------- - result : torch.Tensor - The layer output, reshaped along the batch dimension - """ - x = self.lin(x) - batch_size, max_len, out_dim = x.shape - num_tokens = x.size(-1) // self.tokens_per_step - x = ( - # batch x tokens x length - x.transpose(2, 1) - # batch x heads x tokens x length - .view(batch_size, self.tokens_per_step, num_tokens, max_len) - # heads x batch x tokens x length - .transpose(0, 1) - # heads * batch x tokens x length - .reshape(self.tokens_per_step * batch_size, num_tokens, max_len) - # heads * batch x length x tokens - .transpose(1, 2) - ) - return x - - -class TokotronSearchInference(nn.Module): - """A beam search-based inference implementation - - All keyword arguments will be passed on to the underlying - beam search - """ - - def __init__(self, audio_token_shift=1, **kwargs): - super().__init__() - self.search_kwargs = kwargs - self.audio_token_shift = audio_token_shift - self.decoder, self.search, self.tokens_per_step = None, None, None - - def bind(self, model=None): - """Binds this inference implementation to a model - - Arguments - --------- - model : TokotronTransformerModel - The transformer model - """ - decoder = model.decoder - self.tokens_per_step = decoder.tokens_per_step - self.decoder = TokotronSearchWrapper(decoder) - self.search = TokotronTransformerBeamSearcher( - modules=[ - self.decoder, - SearchLinearWrapper(decoder.out_proj, self.tokens_per_step), - ], - num_tokens=decoder.num_tokens + self.audio_token_shift, - **self.search_kwargs, - ) - - def decode(self, enc_out, length): - """"Decodes the encoder representation using Beam Search - - Arguments - --------- - enc_out : torch.Tensor - Encoder output - length : torch.Tensor - Encoder output lengths - - Returns - ------- - output : TokotronDecoderInfernceOutput - The inference output - """ - with torch.no_grad(): - device = enc_out.device - # The search does not support multiple heads. "Trick" it by expanding encoded - # representations along the batch dimension so that the beam searcher - # treats it as if they were separate, independent samples. 
- batch_size, max_len, enc_dim = enc_out.shape - enc_out_search = ( - enc_out.unsqueeze(0) - .expand(self.tokens_per_step, batch_size, max_len, enc_dim) - .reshape(self.tokens_per_step * batch_size, max_len, enc_dim) - ) - length_search = ( - length.unsqueeze(0) - .expand(self.tokens_per_step, batch_size) - .reshape(self.tokens_per_step * batch_size) - ) - hyps, audio_length, scores, log_probs = self.search( - enc_out_search, length_search - ) - tokens_batch = PaddedBatch( - [ - {"hyps": torch.tensor(item, device=enc_out.device)} - for item in hyps - ] - ).to(device) - - audio_tokens, length = tokens_batch.hyps - _, audio_max_len = audio_tokens.shape - audio_tokens = audio_tokens.reshape( - self.tokens_per_step, batch_size, audio_max_len - ).permute(1, 2, 0) - length = ( - length.reshape(self.tokens_per_step, batch_size).min(dim=0) - ).values - audio_tokens = audio_tokens - self.audio_token_shift - - return TokotronDecoderInfernceOutput( - audio_tokens=audio_tokens, - length=length, - dec_self_attn=self.decoder.dec_self_attn, - dec_attn=self.decoder.dec_attn, - alignments=get_alignments(self.decoder.dec_attn), - p_eos=None, - ) - - class TokotronTransformerModel(nn.Module): """An end-to-end Tokotron model receiving characters or phonemes as inputs and outputting audio tokens @@ -2263,3 +2055,212 @@ def decode(self, codes): """ codes = codes.permute(2, 0, 1) return self.speech_tokenizer.decode(codes) + + +def get_silence_token( + model, + sample_length=100000, + extract_emb=True, + model_shape="BLH", + unsqueeze=False, + device=None, + model_kwargs=None, +): + """Attempts to find out the silence tokens for a given model, + if applicable + + Arguments + --------- + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + extract_emb : bool + Whether to extract embeddings + model_shape : str + The shape of tokens output by the model + BLH: Batch x Length x Heads (Discrete SSL, Encodec) + BHL: Batch x Heads x Length (DAC) + HBL: Heads x Batch x Length (SpeechTokenizer) + unsqueeze: bool + Whether to add an extra dimension to the audio (needed for DAC) + device : str | torch.Device + The device to use + model_kwargs : dict + Additional arguments to pass to the model + + Returns + ------- + silence_tokens : torch.Tensor + The token(s) corresponding to silence + + silece_emb : torch.Tensor + The embedding(s) corresponding to silence + + """ + if device is None: + device = next(model.parameters()).device + if model_kwargs is None: + model_kwargs = {} + + audio = torch.zeros(1, sample_length, device=device) + if unsqueeze: + audio = audio.unsqueeze(1) + length = torch.ones(1, device=device) + model_training = model.training + model.eval() + if hasattr(model, "encode"): + result = model.encode(audio, length, **model_kwargs) + else: + result = model(audio, length, **model_kwargs) + if model_training: + model.train() + tokens = result if torch.is_tensor(result) else result[0] + if model_shape == "HBL": + tokens = tokens.permute(1, 2, 0) + elif model_shape == "BHL": + tokens = tokens.transpose(-1, -2) + + tokens = tokens.squeeze(0) + if unsqueeze: + tokens = tokens.squeeze(0) + silence_tokens = tokens.mode(0).values + silence_emb = None + if extract_emb: + if hasattr(model, "embeddings"): + silence_emb = model.embeddings( + silence_tokens[None, None, :] + ).squeeze() + else: + heads = tokens.shape[-1] + embs = result[1] + mode_idx = [ + (tokens[:, head] == silence_tokens[head]).nonzero()[0].item() + for head in range(heads) + ] + 
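+            # For each head, take the embedding of one frame where that head emitted its silence token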
silence_emb = torch.stack( + [embs[0, idx, head] for head, idx in enumerate(mode_idx)] + ) + return silence_tokens, silence_emb + + +def feature_pad_to(tensor, length, padding=None): + """Pads feature dimensions to the specified length with the specified padding, + assuming a (Batch x Length x Features..) tensor + + Arguments + --------- + tensor : torch.Tensor + The tensor to be padded + + length : int + The length to which the tensor will be padded + + padding : torch.Tensor, optional + The padding tensor - if omitted, zero padding + will be used + + Returns + ------- + result : torch.Tensor + The padded tensor + """ + if padding is None: + padding = torch.zeros(tensor.shape[1:]) + padding = padding[None, ...].expand( + (length - tensor.size(0),) + tensor.shape[1:] + ) + return torch.cat([tensor, padding], dim=0) + + +def batch_feature_pad(tensors, padding=None): + """Similar to batch_pad_right but pads with the specified padding, whcih + can be a vector or a tensor + + Arguments + --------- + tensors : list + The list of tensors to be padded + padding : torch.Tensor + The padding tensor + + Returns + ------- + result : torch.Tensor + the padded tensor + """ + lengths_abs = torch.tensor( + [len(item) for item in tensors], device=tensors[0].device + ) + max_length = lengths_abs.max() + data = torch.stack( + [feature_pad_to(item, max_length, padding) for item in tensors] + ) + lengths = lengths_abs / max_length + return data, lengths + + +def token_collate_fn(examples, silence_token, token_keys): + """A customized collation function for audio tokens where + the specified silence token will be used as padding - instead of + zeros + + Arguments + --------- + examples : list + A list of examples + + silence_token : torch.Tensor + The token(s) representing silence + + token_keys : list + The list of keys to which special padding will be applied + + Returns + ------- + result : speechbrain.dataio.batch.PaddedBatch + A padded batch + """ + token_tensor_ids = {id(examples[0][key]) for key in token_keys} + return PaddedBatch( + examples, + padding_func=_silence_padding, + padding_kwargs={ + "silence_token": silence_token, + "token_tensor_ids": token_tensor_ids, + }, + ) + + +def _silence_padding(values, silence_token, token_tensor_ids): + return ( + batch_feature_pad(values, silence_token) + if id(values[0]) in token_tensor_ids + else batch_pad_right(values) + ) + + +def use_silence_padding(dataloader_opts, silence_token, token_keys): + """Overrides the collation function to add silence padding to + audio token features + + Arguments + --------- + dataloder_opts : dict + Dataloader options + silence_token : torch.Tensor + The tensor to be used as silence padding + token_keys : torch.Tensor + The keys to apply silence padding to + + Returns + ------- + dataloader_opts : dict + Updated data loader options + """ + return { + **dataloader_opts, + "collate_fn": partial( + token_collate_fn, silence_token=silence_token, token_keys=token_keys + ), + } diff --git a/benchmarks/DASB/utils/audio_tokens.py b/benchmarks/DASB/utils/audio_tokens.py deleted file mode 100644 index 9dcc922cd..000000000 --- a/benchmarks/DASB/utils/audio_tokens.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Utilities for discrete audio token models - - -Authors - * Artem Ploujnikov 2023 -""" -import torch -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.utils.data_utils import batch_pad_right -from functools import partial - - -def get_silence_token( - model, - sample_length=100000, - extract_emb=True, - model_shape="BLH", 
- unsqueeze=False, - device=None, - model_kwargs=None, -): - """Attempts to find out the silence tokens for a given model, - if applicable - - Arguments - --------- - model : nn.Module - A discrete token model, taking (wav, lengths) as arguments - sample_length : int - The length of the sample - extract_emb : bool - Whether to extract embeddings - model_shape : str - The shape of tokens output by the model - BLH: Batch x Length x Heads (Discrete SSL, Encodec) - BHL: Batch x Heads x Length (DAC) - HBL: Heads x Batch x Length (SpeechTokenizer) - unsqueeze: bool - Whether to add an extra dimension to the audio (needed for DAC) - device : str | torch.Device - The device to use - model_kwargs : dict - Additional arguments to pass to the model - - Returns - ------- - silence_tokens : torch.Tensor - The token(s) corresponding to silence - - silece_emb : torch.Tensor - The embedding(s) corresponding to silence - - """ - if device is None: - device = next(model.parameters()).device - if model_kwargs is None: - model_kwargs = {} - - audio = torch.zeros(1, sample_length, device=device) - if unsqueeze: - audio = audio.unsqueeze(1) - length = torch.ones(1, device=device) - model_training = model.training - model.eval() - result = model(audio, length, **model_kwargs) - if model_training: - model.train() - tokens = result if torch.is_tensor(result) else result[0] - if model_shape == "HBL": - tokens = tokens.permute(1, 2, 0) - elif model_shape == "BHL": - tokens = tokens.transpose(-1, -2) - - tokens = tokens.squeeze(0) - if unsqueeze: - tokens = tokens.squeeze(0) - silence_tokens = tokens.mode(0).values - silence_emb = None - if extract_emb: - if hasattr(model, "embeddings"): - silence_emb = model.embeddings( - silence_tokens[None, None, :] - ).squeeze() - else: - heads = tokens.shape[-1] - embs = result[1] - mode_idx = [ - (tokens[0, :, head] == silence_tokens[head]).nonzero()[0].item() - for head in range(heads) - ] - silence_emb = torch.stack( - [embs[0, idx, head] for head, idx in enumerate(mode_idx)] - ) - return silence_tokens, silence_emb - - -def feature_pad_to(tensor, length, padding=None): - """Pads feature dimensions to the specified length with the specified padding, - assuming a (Batch x Length x Features..) 
tensor - - Arguments - --------- - tensor : torch.Tensor - The tensor to be padded - - length : int - The length to which the tensor will be padded - - padding : torch.Tensor, optional - The padding tensor - if omitted, zero padding - will be used - - Returns - ------- - result : torch.Tensor - The padded tensor - """ - if padding is None: - padding = torch.zeros(tensor.shape[1:]) - padding = padding[None, ...].expand( - (length - tensor.size(0),) + tensor.shape[1:] - ) - return torch.cat([tensor, padding], dim=0) - - -def batch_feature_pad(tensors, padding=None): - """Similar to batch_pad_right but pads with the specified padding, whcih - can be a vector or a tensor - - Arguments - --------- - tensors : list - The list of tensors to be padded - padding : torch.Tensor - The padding tensor - - Returns - ------- - result : torch.Tensor - the padded tensor - """ - lengths_abs = torch.tensor( - [len(item) for item in tensors], device=tensors[0].device - ) - max_length = lengths_abs.max() - data = torch.stack( - [feature_pad_to(item, max_length, padding) for item in tensors] - ) - lengths = lengths_abs / max_length - return data, lengths - - -def token_collate_fn(examples, silence_token, token_keys): - """A customized collation function for audio tokens where - the specified silence token will be used as padding - instead of - zeros - - Arguments - --------- - examples : list - A list of examples - - silence_token : torch.Tensor - The token(s) representing silence - - token_keys : list - The list of keys to which special padding will be applied - - Returns - ------- - result : speechbrain.dataio.batch.PaddedBatch - A padded batch - """ - token_tensor_ids = {id(examples[0][key]) for key in token_keys} - return PaddedBatch( - examples, - padding_func=_silence_padding, - padding_kwargs={ - "silence_token": silence_token, - "token_tensor_ids": token_tensor_ids, - }, - ) - - -def _silence_padding(values, silence_token, token_tensor_ids): - return ( - batch_feature_pad(values, silence_token) - if id(values[0]) in token_tensor_ids - else batch_pad_right(values) - ) - - -def use_silence_padding(dataloader_opts, silence_token, token_keys): - """Overrides the collation function to add silence padding to - audio token features - - Arguments - --------- - dataloder_opts : dict - Dataloader options - silence_token : torch.Tensor - The tensor to be used as silence padding - token_keys : torch.Tensor - The keys to apply silence padding to - - Returns - ------- - dataloader_opts : dict - Updated data loader options - """ - return { - **dataloader_opts, - "collate_fn": partial( - token_collate_fn, silence_token=silence_token, token_keys=token_keys - ), - } diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index c0e14f867..1cf092a46 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -9,30 +9,49 @@ from speechbrain.inference.interfaces import Pretrained from speechbrain.inference.ASR import EncoderDecoderASR from speechbrain.lobes.models.huggingface_transformers import Whisper +from speechbrain.lobes.models.huggingface_transformers.wav2vec2 import Wav2Vec2 from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import length_to_mask from speechbrain.decoders.seq2seq import S2SWhisperGreedySearcher from speechbrain.dataio.batch import PaddedBatch from speechbrain.utils.metric_stats import ErrorRateStats from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.data_utils import pad_right_to 
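+# fetch is used below to download the pretrained UTMOS checkpoint from HuggingFace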
+from speechbrain.utils.fetching import fetch from collections import namedtuple from pathlib import Path -import os +from torch import nn import torch import torchaudio import re import string import logging -import shutil -import shlex -import subprocess + logger = logging.getLogger(__name__) + +has_transformers = False +try: + from transformers import AutoModelForAudioXVector + has_transformers = True +except ImportError: + logger.warning("transformers library not found - some evaluators may be disabled") + + RE_PUNCTUATION = re.compile( "|".join(re.escape(char) for char in string.punctuation) ) +SAMPLE_RATE = 16000 +DEFAULT_ENCODER_HUB = "chaanks/wav2vec2-small" +DEFAULT_MODEL_URL = "https://huggingface.co/chaanks/UTMOS/resolve/main" +DEFAULT_MODEL_NAME = "utmos.ckpt" +DEFAULT_SAVE_DIR = "./pretrained_models" +DEFAULT_JUDGE_ID = 288 +DEFAULT_DOMAIN_ID = 0 + SpeechEvaluationResult = namedtuple( "SpeechEvaluationResult", ["score", "details"] ) @@ -217,77 +236,6 @@ def __call__(self, wavs, length): return self.mods.model(wavs, length) -class RegressionModelSpeechEvaluator(SpeechEvaluator): - """A speech evaluator that uses a regression model - that produces a quality score (e.g. SSL fine-tuning) - for a sample of speech - - Arguments - --------- - source : str - The source model path or HuggingFace hub name - sample_rate : int - The audio sample rate this evaluator expects - """ - - def __init__(self, source, sample_rate=None, *args, **kwargs): - super().__init__(sample_rate=sample_rate) - self.model = SpeechEvaluationRegressionModel.from_hparams( - source, *args, **kwargs - ) - - def evaluate( - self, - wavs, - length, - text=None, - wavs_ref=None, - length_ref=None, - sample_rate=None, - sample_rate_ref=None, - ): - """Evaluates a batch of waveforms - - Arguments - --------- - Arguments - --------- - wavs: torch.Tensor - the waveforms to evaluate - - length: torch.Tensor - relative lengths (a 1-D tensor) - - text : list, optional - Ground truth text - - wavs_ref : torch.Tensor - the reference waveforms - - length_ref : torch.Tensor - the reference waveform lengths - - sample_rate : int, optional - The sample rate of the audio. 
If not provided, - the audio is assumed to be at the same sample - rate as the model - - sample_rate_ref : int, optional - The sample rate of the reference samples - - Returns - ------- - result : SpeechEvaluationResult - an aggregated speech evaluation result with a score - for each item - """ - wavs = self.resample(wavs, sample_rate) - scores = self.model(wavs, length) - while scores.dim() > 1 and scores.size(-1) == 1: - scores = scores.squeeze(-1) - return SpeechEvaluationResult(score=scores, details={"score": scores}) - - class ASRSpeechEvaluator(SpeechEvaluator): """A superclass for ASR speech evaluators""" @@ -743,171 +691,334 @@ def evaluate_files(self, file_names, text=None, file_names_ref=None): raise NotImplementedError() -UTMOS_REPO = "https://huggingface.co/spaces/sarulab-speech/UTMOS-demo" +class UTMOSModel(nn.Module): + """The UTMOS model wrapper + Arguments + --------- + source : str + The WavLM source + save_path : str | path-like + The path where the model will be saved + features_dim : int, optional + The features dimension + num_domains : int, optional + The number of domains + domain_dim : int, optional + The dimension of each domain + num_judges : int, optional + The number of "judges" + judge_dim : int, optional + The dimension of each judge + decoder_hidden_size : int, optional + The size of the decoder hidden state + multiplier : float, optional + The number that the raw model output is multiplied by + to compute the score + offset : float, optional + The number that (raw output * multiplier) will be added + to in order to get the score + """ + + def __init__( + self, + source, + save_path, + features_dim=768, + num_domains=3, + domain_dim=128, + num_judges=3000, + judge_dim=128, + decoder_hidden_size=512, + multiplier=2.0, + offset=3.0, + ): + super().__init__() + + self.ssl_encoder = Wav2Vec2( + source, + save_path, + freeze=True, + output_norm=False, + freeze_feature_extractor=True, + output_all_hiddens=False, + ) + + self.domain_embedding = nn.Embedding(num_domains, domain_dim) + self.judge_embedding = nn.Embedding(num_judges, judge_dim) + + self.decoder = nn.LSTM( + input_size=features_dim + domain_dim + judge_dim, + hidden_size=decoder_hidden_size, + num_layers=1, + batch_first=True, + bidirectional=True, + ) + + self.classifier = nn.Sequential( + nn.Linear(decoder_hidden_size * 2, 2048), + torch.nn.ReLU(), + nn.Dropout(0.3), + nn.Linear(2048, 1), + ) + self.multiplier = multiplier + self.offset = offset + + def forward(self, wav, domain_id=None, judge_id=None): + """Computes the forward pass + + Arguments + --------- + wav : torch.Tensor + The raw waveforms + domain_id : torch.Tensor + The domain identifiers + judge_id : torch.Tensor + The judge identifier + + Returns + ------- + result : torch.Tensor + The predicted rating(s) + """ + + if domain_id is None: + domain_id = torch.zeros( + len(wav), dtype=torch.int, device=wav.device + ) + if judge_id is None: + judge_id = ( + torch.ones(len(wav), dtype=torch.int, device=wav.device) + * DEFAULT_JUDGE_ID + ) -class UTMOSSpeechEvaluator(BulkSpeechEvaluator): - """An evaluation wrapper for UTMOS + ssl_features = self.ssl_encoder(wav) + domain_emb = self.domain_embedding(domain_id) + judge_emb = self.judge_embedding(judge_id) + + domain_emb = domain_emb.unsqueeze(1).expand( + -1, ssl_features.size(1), -1 + ) + judge_emb = judge_emb.unsqueeze(1).expand(-1, ssl_features.size(1), -1) + concatenated_feature = torch.cat( + [ssl_features, domain_emb, judge_emb], dim=2 + ) + + decoder_output, _ = 
self.decoder(concatenated_feature) + pred = self.classifier(decoder_output) + + return pred.mean(dim=1).squeeze(1) * self.multiplier + self.offset + + +class UTMOSSpeechEvaluator(SpeechEvaluator): + """The UTMOS speech evaluator wrapper Github: https://github.com/sarulab-speech/UTMOS22 HuggingFace: https://huggingface.co/spaces/sarulab-speech/UTMOS-demo + Arguments --------- - model_path : str | path-like - The path where the HuggingFace repository was extracted - output_folder : str | path-like - The folder where results will be output - ckpt_path : str | path-like - The path to the checkpoint to be used - script : str | path-like - The path to the evaluation script, defaults to the bundled - predict.py - python : str | path-like, optional - The path to the Python interpreter to be used, defaults to - "python". Depending on the environment, it might need to be - changed (e.g. to "python3" or an absolute path to the interpreter) - use_python : bool - Whether to launch the script using python. This flag will need to be - set to False in environments where running UTMOS requires a wrapper shell - script (e.g. to initialize a different Python virtual environment from - the one in which SpeechBrain is running) - tmp_folder : str | path-like, optional - The temporary folder where files will be copied for evaluation. If - omitted, it will be set to output_folder. This can be useful on - compute environments that provide fast local storage (e.g. certain - compute clusters) - repo : str - The repor + source : str, optional + The WavLM source + save_path : str | path-like, optional + The path where the model will be saved + model_name : str + The name of the model hub + model_url : str + The model URL (if applicable) + domain_id : int + The domain ID of the underlying model + judge_id : int + The judge ID to use (given UTMOS was trained as an ensemble + of judges) + run_opts: dict, optional + The run options + sample_rate : int + The sample rate of the underlying model """ def __init__( self, - model_path, - output_folder, - ckpt_path, - script="predict.py", - python="python", - use_python=True, - batch_size=8, - tmp_folder=None, - repo=UTMOS_REPO, + source=None, + save_path=None, + model_name=None, + model_url=None, + domain_id=None, + judge_id=None, + run_opts=None, + sample_rate=16000, ): - self.output_folder = Path(output_folder) - rand = torch.randint(1, 999999999, (1,)).item() - if tmp_folder is None: - tmp_folder = self.output_folder - else: - tmp_folder = Path(tmp_folder) - self.eval_path = (tmp_folder / f"eval_{rand}").absolute() - self.model_path = Path(model_path).absolute() - script = self.model_path / script - self.script = script - self.ckpt_path = Path(ckpt_path).absolute() - self.batch_size = batch_size - self.python = python - self.use_python = use_python - self.repo = repo - self.install() - - def install(self): - if self.model_path.exists(): - logger.info("UTMOS is already installed in %s", self.model_path) - return - logger.info( - "Attempting to install UTMOS from %s to %s", - self.repo, - self.model_path, - ) - cmd = shlex.join( - [ - "git", - "-C", - str(self.model_path.parent), - "clone", - self.repo, - str(self.model_path.name), - ] + super().__init__(sample_rate=sample_rate) + self.model = UTMOSModel( + source=source, + save_path=save_path, ) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - logger.info("Repository clone successful, performing an LFS fetch") - cwd = Path.cwd() - try: - 
os.chdir(self.model_path) - cmd = shlex.join(["git", "lfs", "fetch"]) - output, err, return_code = run_shell(cmd) - if return_code != 0: - raise CommandError(cmd, output, err, return_code) - finally: - os.chdir(cwd) - if not self.ckpt_path.exists(): - raise ValueError("ckpt_path {ckpt_path} does not exist") - - def evaluate_files(self, file_names, text, file_names_ref=None): - """Evaluates multiple files + if run_opts is not None: + device = run_opts.get("device") + if device: + self.model = self.model.to(device) + fetch(model_name, model_url, save_path) + model_path = Path(save_path) / model_name + state_dict = torch.load(model_path) + self.model.load_state_dict(state_dict) + self.model.eval() + + self.domain_id = domain_id + self.judge_id = judge_id + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + """Evaluates a batch of waveforms using UTMOS Arguments --------- - file_names : list - A list of files - - text : list - File transcripts (not required for all evaluators) - Not used in this evaluator - - file_names_ref : list, optional - A list of reference files / ground truths (if applicable) - Not used in this evaluator + wavs: torch.Tensor + the waveforms to evaluate + length: torch.Tensor + relative lengths (a 1-D tensor) + text : list, optional + Ground truth text. Ignored for UTMOS. + wavs_ref : torch.Tensor + the reference waveforms. Ignored for UTMOS. + length_ref : torch.Tensor + the reference waveform lengths. Ignored for UTMOS. + sample_rate : int, optional + The sample rate of the audio. If not provided, + the audio is assumed to be at the same sample + rate as the model + sample_rate_ref : int, optional + The sample rate of the reference samples. Ignored for UTMOS. 
Returns ------- result : SpeechEvaluationResult - a consolidated evaluation result + an aggregated speech evaluation result with a score + for each item """ - current_path = os.getcwd() - try: - self.eval_path.mkdir(parents=True, exist_ok=True) - logger.info("Copying the files to '%s'", self.eval_path) - for file_name in file_names: - target_file_name = self.eval_path / Path(file_name).name - shutil.copy(file_name, target_file_name) - - logger.info("Running evaluation") - result_path = self.eval_path / "result.txt" - os.chdir(self.model_path) - cmd = [ - str(self.script), - "--mode", - "predict_dir", - "--bs", - str(self.batch_size), - "--inp_dir", - str(self.eval_path), - "--out_path", - str(result_path), - "--ckpt_path", - str(self.ckpt_path), - ] - if self.use_python: - cmd = [self.python] + cmd - - output = subprocess.check_output(cmd) - logger.info("Evaluation finished, output: %s", output) - file_names = [path.name for path in self.eval_path.glob("*.wav")] - with open(result_path) as result_path: - scores = [float(line.strip()) for line in result_path] - score_map = dict(zip(file_names, scores)) - scores_ordered = [ - score_map[Path(file_name).name] for file_name in file_names - ] - return SpeechEvaluationResult( - scores_ordered, {"utmos": scores_ordered} + wavs = self.resample(wavs, sample_rate=sample_rate) + domain_id, judge_id = None, None + if self.domain_id is not None: + domain_id = ( + torch.ones(len(wavs), device=wavs.device) * self.domain_id + ) + if self.judge_id is not None: + judge_id = torch.ones(len(wavs), device=wavs.device) * self.judge_id + + scores = self.model(wav=wavs, domain_id=domain_id, judge_id=judge_id) + return SpeechEvaluationResult(score=scores, details={"utmos": scores}) + + +class SpkSimWavLM(SpeechEvaluator): + """A speaker similarity evaluator based on WavLM / XVector + + Arguments + --------- + source : str + The model hub to use + savedir : str + The path where the model will be saved + model_sample_rate : int, optional + The sample rate to which all samples will be resampled + before being processed + """ + def __init__( + self, + source, + savedir, + model_sample_rate=16000, + run_opts=None, + *args, + **kwargs + ): + if not has_transformers: + raise ValueError( + "Unable to use the SpkSimWavLM evaluator because the " + "transformers library is not enabled" + ) + if run_opts is None: + run_opts = {} + device = run_opts.get("device") + self.model = AutoModelForAudioXVector.from_pretrained( + source, cache_dir=savedir, + *args, + **kwargs + ) + if device is not None: + self.model = self.model.to(device) + + self.model.eval() + self.model_sample_rate = model_sample_rate + self.device = next(self.model.parameters()).device + + def evaluate( + self, + wavs, + length, + text=None, + wavs_ref=None, + length_ref=None, + sample_rate=None, + sample_rate_ref=None, + ): + # Resample + if sample_rate is not None: + wavs = torchaudio.functional.resample( + wavs, + orig_freq=sample_rate, + new_freq=self.model_sample_rate ) - finally: - os.chdir(current_path) - shutil.rmtree(self.eval_path) + if sample_rate_ref is not None: + wavs_ref = torchaudio.functional.resample( + wavs_ref, + orig_freq=sample_rate_ref, + new_freq=self.model_sample_rate + ) + + # Concatenate + batch_size, wavs_max_len = wavs.shape + _, wavs_ref_max_len = wavs_ref.shape + length_abs = length * wavs_max_len + length_ref_abs = length_ref * wavs_ref_max_len + max_len = max(wavs_max_len, wavs_ref_max_len) + wavs, _ = pad_right_to( + wavs, + (batch_size, max_len) + ) + wavs_ref, _ = pad_right_to( 
+ wavs_ref, + (batch_size, max_len) + ) + audio = torch.cat([wavs, wavs_ref]) + + length_cat_abs = torch.cat([length_abs, length_ref_abs]) + # Attention mask + attention_mask = length_to_mask( + length_cat_abs.int() + ).long() # 0 for masked tokens + # Forward + embs = self.model( + input_values=audio, + attention_mask=attention_mask, + output_attentions=False, + ).embeddings + hyp_embs, ref_embs = embs.split([len(wavs), len(wavs_ref)]) + scores = torch.nn.functional.cosine_similarity( + hyp_embs, ref_embs, dim=-1 + ) + + return SpeechEvaluationResult( + scores, + {"score": scores} + ) def vocoder_to_device(vocoder, device): From 252f1d738145ade2251f1bd6070595a6ae1c2c49 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 11 Jan 2025 23:12:38 -0500 Subject: [PATCH 052/270] add new tokenziers andadopt to SB main repo --- .../hparams/LSTM/speech_tokenizer.yaml | 2 +- .../hparams/contextnet/speech_tokenizer.yaml | 2 +- .../extraction/hparams/speech_tokenizer.yaml | 2 +- benchmarks/DASB/extra_requirements.txt | 1 + benchmarks/DASB/model/sq_codec.py | 1356 +++++++++++++++++ benchmarks/DASB/utils/tokenizer_interface.py | 287 +++- 6 files changed, 1634 insertions(+), 16 deletions(-) create mode 100644 benchmarks/DASB/model/sq_codec.py diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml index d0e9aae5b..9607dab79 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/LSTM/speech_tokenizer.yaml @@ -126,7 +126,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml index 7fdbf8d51..615777a99 100644 --- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/hparams/contextnet/speech_tokenizer.yaml @@ -120,7 +120,7 @@ prune_history: False ############################## models ################################ # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 5d897a782..161d4e870 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -42,7 +42,7 @@ freeze_embedding: False save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio 
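The SpkSimWavLM evaluator above reduces speaker similarity to a cosine between the x-vector embeddings of the generated and reference waveforms. A minimal sketch of that final scoring step, with random tensors standing in for the model embeddings (illustration only, not part of the patch; the embedding size is arbitrary):

import torch

hyp_embs = torch.randn(4, 512)   # embeddings of the generated utterances (size is a placeholder)
ref_embs = torch.randn(4, 512)   # embeddings of the reference utterances
scores = torch.nn.functional.cosine_similarity(hyp_embs, ref_embs, dim=-1)
print(scores.shape)  # torch.Size([4]): one similarity score per hypothesis/reference pair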
save_path: !ref diff --git a/benchmarks/DASB/extra_requirements.txt b/benchmarks/DASB/extra_requirements.txt index e97e16b28..dffb3cd07 100644 --- a/benchmarks/DASB/extra_requirements.txt +++ b/benchmarks/DASB/extra_requirements.txt @@ -2,6 +2,7 @@ beartype jsonlines kaldiio librosa>=0.9.2 +omegaconf onnxruntime>=1.16.3 orion orion[profet] diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py new file mode 100644 index 000000000..f04c094d4 --- /dev/null +++ b/benchmarks/DASB/model/sq_codec.py @@ -0,0 +1,1356 @@ +"""This lobe enables the integration of the speech codec model (SQ-Codec) with scalar quantization. + +SQ-Codec effectively maps the complex speech signal into a finite and compact latent space, named scalar latent space. + +Repository: https://github.com/yangdongchao/SimpleSpeech +Paper: https://arxiv.org/abs/2406.02328, https://arxiv.org/abs/2408.13893 + +Authors + * Pooneh Mousavi 2024 +""" + +import logging +import os + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio +from omegaconf import OmegaConf +from torch.autograd import Function +from torch.nn.utils import remove_weight_norm, weight_norm + + +class SQCodec(nn.Module): + """ + Speech codec model (SQ-Codec) with scalar quantization. It maps the complex speech signal into a finite and compact latent space. + The model consists of an encoder-decoder architecture with optional causal convolutions, downsampling, and upsampling layers. + It uses scalar quantization and various convolutional blocks for processing. + + Make sure that you download and extract SQ-Codec.zip into save_path from the following Hugging Face repo: + - HF repo: https://huggingface.co/Dongchao/UniAudio/blob/main/SQ-Codec.zip + + Repository: https://github.com/yangdongchao/SimpleSpeech + Paper: https://arxiv.org/abs/2406.02328, https://arxiv.org/abs/2408.13893 + + Arguments + --------- + save_path : str, optional + Directory where the model and configuration files are saved (default is None). + config : str, optional + Configuration filename for the model. It is extracted from the zip file (default is 'config.yaml'). + checkpoint : str, optional + Model checkpoint filename. It is extracted from the zip file (default is 'ckpt_00190000.pth'). + sample_rate : int, optional + Sample rate for input audio (default is 16000). + dim_codebook : int, optional + Dimension of each codebook (default is 19683). + n_codebook : int, optional + Number of codebooks used (default is 4). + bw : float, optional + Bandwidth parameter (default is 2). + clip_length : int, optional + Maximum clip length for processing (default is 450). + + Example + ------- + >>> save_path = "savedir" + >>> config = "config.yaml" + >>> checkpoint = "ckpt_00190000.pth" + >>> model = SQCodec(save_path, config, checkpoint) + >>> audio = torch.randn(3, 16000) + >>> tokens, emb = model.encode(audio) + >>> tokens.shape + torch.Size([3, 200]) + >>> emb.shape + torch.Size([3, 36, 50]) + >>> rec = model.decode(tokens) + >>> rec.shape + torch.Size([3, 1, 16000]) + """ + + def __init__( + self, + save_path, + config, + checkpoint, + sample_rate=16000, + dim_codebook=19683, + n_codebook=4, + bw=2, + clip_length=450, + ): + super(SQCodec, self).__init__() + self.config_path = os.path.join(save_path, config) + self.ckpt_path = os.path.join(save_path, checkpoint) + if not os.path.exists(self.config_path) or not os.path.exists( + self.ckpt_path + ): + err_msg = ( + "the files %s or %s do not exist."
+ "(make sure that you download and extract the SQ-codec.zip in save_path from following Huggingface repo:" + " https://huggingface.co/Dongchao/UniAudio/blob/main/SQ-Codec.zip)" + % (self.ckpt_path, self.config_path) + ) + raise FileNotFoundError(err_msg) + self.clip_length = clip_length + + logging.info( + f"Using config {self.config_path} and model {self.ckpt_path}" + ) + + self.scalar_codec = self.build_codec_model(self.config_path) + self.sr = sample_rate + self.dim_codebook = dim_codebook + self.n_codebook = n_codebook + self.bw = bw + self.mask_id = self.dim_codebook * self.n_codebook + + def build_codec_model(self, config): + """ + Loads and builds the scalar codec model from the given configuration. + + Parameters + ---------- + config : str + Path to the configuration file. + + Returns + ------- + ScalarModel + The built scalar codec model loaded with weights from the checkpoint. + """ + exp_model_config = OmegaConf.load(config) + scalar_codec = ScalarModel(**exp_model_config.generator.config) + parameter_dict = torch.load(self.ckpt_path) + scalar_codec.load_state_dict(parameter_dict["codec_model"]) + return scalar_codec + + def _flatten_codebooks(self, arr, offset_size=None): + """ + Flattens a 3D array (B, N, D) to a 1D array while applying an offset to each codebook if specified. + + Parameters + ---------- + arr : numpy.ndarray + A 3D array of shape (B, N, D). + offset_size : int or None, optional + The offset size to be applied to each codebook slice (default is None). + + Returns + ------- + numpy.ndarray + A 1D array representing the flattened codebooks. + """ + assert ( + len(arr.shape) == 3 + ), "Input array must have 3 dimensions [B, N, D]" + N, B, D = arr.shape + arr = arr.copy() + if offset_size is not None: + for n in range(N): + arr[n, :, :] += offset_size * n + flattened_arr = arr.transpose(1, 2, 0).reshape(B, N * D) + return flattened_arr + + def encode(self, inputs): + """ + Encodes the input audio tensor using the scalar codec and quantizes the output. + + Parameters + ---------- + inputs : torch.Tensor + Input audio tensor of shape (B, T) or (B, 1, T), where B is the batch size + and T is the length of the audio sequence. + + Returns + ------- + tuple + A tuple containing: + - torch.Tensor: The flattened and quantized encoded representation of the input. + - torch.Tensor: Quantized embedding. + """ + if inputs.dim() == 2: + inputs = inputs.unsqueeze(1) + compressed = self.scalar_codec.encode(inputs) + chunks = compressed.chunk(self.n_codebook, dim=1) + codec_ls = [] + for i, chunk in enumerate(chunks): + chunk = chunk.detach().cpu().numpy().astype(np.int32) + 1 + tmp_codec = ternary_matrix_to_decimal(chunk) + codec_ls.append(tmp_codec) + codec_ls = np.array(codec_ls) + flat_codec = self._flatten_codebooks(codec_ls, self.dim_codebook) + flat_codec = torch.from_numpy(flat_codec).to(torch.int32) + return flat_codec.to(inputs.device), compressed.to(inputs.device) + + def decode(self, codes): + """ + Decodes the quantized codes back into an audio tensor. + + Parameters + ---------- + codes : torch.Tensor + Quantized codes with shape (B, T). + + Returns + ------- + torch.Tensor + Reconstructed audio signal. 
+ """ + assert codes.dim() == 2 + B, T = codes.shape + assert ( + T % self.n_codebook == 0 + ), "Length T must be divisible by n_codebook" + codes = codes.view(B, -1, self.n_codebook).permute(2, 0, 1) + for i in range(self.n_codebook): + codes[i, :, :] -= i * self.dim_codebook + emb_quant = [] + for i in range(self.n_codebook): + tmp_list = decimal_to_ternary_matrix(codes[i, :, :], D=9) - 1 + emb_quant.append(tmp_list) + emb_quant = torch.cat(emb_quant, dim=1) + out = self.scalar_codec.decode(emb_quant.float().to(codes.device)) + return out.detach().cpu().squeeze(0) + + def reconstruct(self, wav_root): + """ + Processes a given waveform file by encoding and decoding it through the scalar codec. + + Parameters + ---------- + wav_root : str + Path to the waveform file. + + Returns + ------- + torch.Tensor or None + Processed waveform tensor or None if the file is empty. + """ + wav, sr = torchaudio.load(wav_root) + if wav.numel() == 0: + return None + if sr != self.sr: + wav = torchaudio.transforms.Resample(sr, self.sr)(wav) + wav = wav.unsqueeze(1) + emb, emb_quant, x = self.scalar_codec.inference(wav) + return x.detach().cpu().squeeze(0) + + @property + def is_discrete(self): + """Indicates whether the codec works with discrete values.""" + return True + + @property + def codebook_length(self): + """Returns the total length of the codebook.""" + return self.dim_codebook * self.n_codebook + 1 + + def find_length(self, x): + """ + Finds the length of the tokenized version of the input tensor. + + Parameters + ---------- + x : torch.Tensor + Input tensor. + + Returns + ------- + int + The length of the tokenized input. + """ + return self.tokenize(x).shape[0] // self.n_codebook + + +class ScalarModel(nn.Module): + """ + A custom neural network model for encoding and decoding audio signals. + + The model consists of an encoder-decoder architecture with optional + causal convolutions, downsampling, and upsampling layers. It uses + vector quantization and various convolutional blocks for processing. + + + Arguments + --------- + num_bands : int + Number of input bands (or channels). + sample_rate : int + Sample rate of the input signal. + causal : bool + If True, uses causal convolutions for processing. + num_samples : int + Number of samples to process for downsampling or upsampling. + downsample_factors : list of int + List of factors to downsample the input. + downsample_kernel_sizes : list of int + List of kernel sizes for downsampling layers. + upsample_factors : list of int + List of factors to upsample the input. + upsample_kernel_sizes : list of int + List of kernel sizes for upsampling layers. + latent_hidden_dim : int + Dimension of the latent representation. + default_kernel_size : int + Default kernel size for convolutional layers. + delay_kernel_size : int + Kernel size used for the delay convolutional layer. + init_channel : int + Number of initial channels for the encoder and decoder. + res_kernel_size : int + Kernel size used for the residual convolutional blocks. 
+ + Example + ------- + >>> model = ScalarModel(num_bands=1, sample_rate=16000,causal=True,num_samples=2,downsample_factors=[2,4,4,5],downsample_kernel_sizes=[4,8,8,10],upsample_factors=[5,4,4,2],upsample_kernel_sizes=[10,8,8,4],latent_hidden_dim=36,default_kernel_size=7,delay_kernel_size=5,init_channel=48,res_kernel_size=7) # doctest: +SKIP + >>> audio = torch.randn(3, 1, 16000) + >>> quant_emb = model.encode(audio) # doctest: +SKIP + >>> quant_emb.shape + torch.Size([3, 36, 50]) + >>> rec = model.decode(quant_emb) # doctest: +SKIP + >>> rec.shap) # doctest: +SKIP + torch.Size([3, 1, 16000]) + """ + + def __init__( + self, + num_bands, + sample_rate, + causal, + num_samples, + downsample_factors, + downsample_kernel_sizes, + upsample_factors, + upsample_kernel_sizes, + latent_hidden_dim, + default_kernel_size, + delay_kernel_size, + init_channel, + res_kernel_size, + ): + super(ScalarModel, self).__init__() + self.sample_rate = sample_rate + self.encoder = [] + self.decoder = [] + self.vq = lambda x: CustomRoundingFunction.apply(x, "binary") + + # Encoder layers + self.encoder.append( + weight_norm( + Conv1d( + num_bands, + init_channel, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + ) + if num_samples > 1: + # Downsampling layer + self.encoder.append( + PreProcessor( + init_channel, + init_channel, + num_samples, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + for i, down_factor in enumerate(downsample_factors): + self.encoder.append( + ResEncoderBlock( + init_channel * np.power(2, i), + init_channel * np.power(2, i + 1), + down_factor, + downsample_kernel_sizes[i], + res_kernel_size, + causal=causal, + ) + ) + self.encoder.append( + weight_norm( + Conv1d( + init_channel * np.power(2, len(downsample_factors)), + latent_hidden_dim, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + ) + + # Decoder layers + self.decoder.append( + weight_norm( + Conv1d( + latent_hidden_dim, + init_channel * np.power(2, len(upsample_factors)), + kernel_size=delay_kernel_size, + ) + ) + ) + for i, upsample_factor in enumerate(upsample_factors): + self.decoder.append( + ResDecoderBlock( + init_channel * np.power(2, len(upsample_factors) - i), + init_channel * np.power(2, len(upsample_factors) - i - 1), + upsample_factor, + upsample_kernel_sizes[i], + res_kernel_size, + causal=causal, + ) + ) + if num_samples > 1: + self.decoder.append( + PostProcessor( + init_channel, + init_channel, + num_samples, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + self.decoder.append( + weight_norm( + Conv1d( + init_channel, + num_bands, + kernel_size=default_kernel_size, + causal=causal, + ) + ) + ) + + self.encoder = nn.ModuleList(self.encoder) + self.decoder = nn.ModuleList(self.decoder) + + def forward(self, x): + """ + Performs a forward pass through the encoder and decoder. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, length). + + Returns + ------- + torch.Tensor + Reconstructed output tensor. + """ + for i, layer in enumerate(self.encoder): + if i != len(self.encoder) - 1: + x = layer(x) + else: + x = F.tanh(layer(x)) + x = self.vq(x) # Quantization step + for i, layer in enumerate(self.decoder): + x = layer(x) + return x + + def inference(self, x): + """ + Encodes input tensor `x` and decodes the quantized embeddings. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, length). 
+ + Returns + ------- + tuple + A tuple (emb, emb_quant, x), where `emb` is the latent embedding, + `emb_quant` is the quantized embedding, and `x` is the decoded output. + """ + for i, layer in enumerate(self.encoder): + if i != len(self.encoder) - 1: + x = layer(x) + else: + x = F.tanh(layer(x)) + emb = x + emb_quant = self.vq(emb) + x = emb_quant + for i, layer in enumerate(self.decoder): + x = layer(x) + return emb, emb_quant, x + + def encode(self, x): + """ + Encodes the input tensor `x` into a quantized embedding. + + Parameters + ---------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, length). + + Returns + ------- + torch.Tensor + Quantized embedding. + """ + for i, layer in enumerate(self.encoder): + if i != len(self.encoder) - 1: + x = layer(x) + else: + x = F.tanh(layer(x)) + emb = x + emb_quant = self.vq(emb) + return emb_quant + + def decode(self, emb_quant): + """ + Decodes the quantized embeddings back into a tensor. + + Parameters + ---------- + emb_quant : torch.Tensor + Quantized embedding tensor. + + Returns + ------- + torch.Tensor + Reconstructed output tensor. + """ + x = emb_quant + for i, layer in enumerate(self.decoder): + x = layer(x) + return x + + +class CustomRoundingFunction(Function): + """ + A customizable rounding function for various rounding operations, including: + - Rounding to the nearest multiple of a specified divisor. + - Rounding to the nearest integer. + - Applying the Heaviside step function. + + Arguments + --------- + mode : str + The mode of the operation. Can be 'round', 'binary', or 'heaviside'. + divisor : float, optional + The divisor for rounding. Only used in 'round' mode. + """ + + @staticmethod + def forward(ctx, input, mode="round", divisor=1.0): + """ + Forward pass for the custom rounding function. + + Arguments + --------- + ctx : context object + Context object used to store information for the backward computation. + input : torch.Tensor + The input tensor to be processed. + mode : str + The mode of the operation ('round', 'binary', 'heaviside'). + divisor : float + The divisor for rounding. Only used in 'round' mode. + + Returns + ------- + torch.Tensor + The processed tensor after applying the operation. + """ + ctx.mode = mode + ctx.divisor = divisor + + if mode == "round": + return torch.round(divisor * input) / divisor + elif mode == "binary": + return torch.round(input) + elif mode == "heaviside": + values = torch.tensor([0.0]).type_as(input) + return torch.heaviside(input, values) + else: + raise ValueError( + f"Invalid mode '{mode}'. Supported modes: 'round', 'binary', 'heaviside'." + ) + + @staticmethod + def backward(ctx, grad_output): + """ + Backward pass for the custom rounding function. + + Arguments + --------- + ctx : context object + Context object containing information saved during the forward pass. + grad_output : torch.Tensor + The gradient of the output with respect to the loss. + + Returns + ------- + torch.Tensor + The gradient of the input with respect to the loss. + """ + # For all modes, the gradient is propagated unchanged. + return grad_output.clone(), None, None + + +class PreProcessor(nn.Module): + """ + A module for preprocessing input data through convolution and pooling operations. + It is used as an initial step before the encoder blocks in the ScalarModel, particularly when the kernel_size for average pooling operation exceeds 1. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. 
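Note that CustomRoundingFunction above acts as a straight-through estimator: the forward pass quantizes, while the backward pass treats the operation as the identity so gradients can flow through the scalar quantizer. A small sketch of that behaviour, assuming the class exactly as defined above:

import torch

x = torch.tensor([0.2, 0.7, -0.4], requires_grad=True)
y = CustomRoundingFunction.apply(x, "binary")  # forward: torch.round(x)
y.sum().backward()
print(y)       # quantized values, e.g. tensor([ 0.,  1., -0.])
print(x.grad)  # tensor([1., 1., 1.]): the gradient of round() is bypassed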
+ num_samples : int + Number of samples for pooling. + kernel_size : int, optional + Size of the convolutional kernel (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__(self, n_in, n_out, num_samples, kernel_size=7, causal=False): + super(PreProcessor, self).__init__() + self.pooling = torch.nn.AvgPool1d(kernel_size=num_samples) + self.conv = Conv1d(n_in, n_out, kernel_size=kernel_size, causal=causal) + self.activation = nn.PReLU() + + def forward(self, x): + """ + Applies convolution, activation, and pooling to the input data. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + output = self.activation(self.conv(x)) + output = self.pooling(output) + return output + + +class PostProcessor(nn.Module): + """ + A module for postprocessing data through convolution and reshaping. + It is used as an initial step after the decoder blocks in the ScalarModel, particularly when the kernel_size for average pooling operation exceeds 1. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + num_samples : int + Number of samples for repetition. + kernel_size : int, optional + Size of the convolutional kernel (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__(self, n_in, n_out, num_samples, kernel_size=7, causal=False): + super(PostProcessor, self).__init__() + self.num_samples = num_samples + self.conv = Conv1d(n_in, n_out, kernel_size=kernel_size, causal=causal) + self.activation = nn.PReLU() + + def forward(self, x): + """ + Applies reshaping, repetition, and convolution to the input data. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = torch.transpose(x, 1, 2) + B, T, C = x.size() + x = x.repeat(1, 1, self.num_samples).view(B, -1, C) + x = torch.transpose(x, 1, 2) + output = self.activation(self.conv(x)) + return output + + +class DownsampleLayer(nn.Module): + """ + A downsampling layer that applies convolution, optional pooling, and activation. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the convolution (default is 1). + causal : bool, optional + If True, applies causal convolution (default is False). + activation : nn.Module, optional + Activation function (default is PReLU). + use_weight_norm : bool, optional + If True, applies weight normalization to the convolution (default is True). + pooling : bool, optional + If True, applies an average pooling operation (default is False). 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + causal: bool = False, + activation=nn.PReLU(), + use_weight_norm: bool = True, + pooling: bool = False, + ): + super(DownsampleLayer, self).__init__() + self.pooling = pooling + self.stride = stride + self.activation = activation + self.use_weight_norm = use_weight_norm + if pooling: + self.layer = Conv1d( + in_channels, out_channels, kernel_size, causal=causal + ) + self.pooling = nn.AvgPool1d(kernel_size=stride) + else: + self.layer = Conv1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + causal=causal, + ) + if use_weight_norm: + self.layer = weight_norm(self.layer) + + def forward(self, x): + """ + Applies convolution, optional pooling, and activation to the input data. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = self.layer(x) + x = self.activation(x) if self.activation is not None else x + if self.pooling: + x = self.pooling(x) + return x + + def remove_weight_norm(self): + """ + Removes weight normalization from the convolutional layer. + """ + if self.use_weight_norm: + remove_weight_norm(self.layer) + + +class UpsampleLayer(nn.Module): + """ + An upsampling layer that applies transposed convolution or repetition, with activation. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the transposed convolution (default is 1). + causal : bool, optional + If True, applies causal convolution (default is False). + activation : nn.Module, optional + Activation function (default is PReLU). + use_weight_norm : bool, optional + If True, applies weight normalization to the convolution (default is True). + repeat : bool, optional + If True, applies repetition instead of transposed convolution (default is False). + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + causal: bool = False, + activation=nn.PReLU(), + use_weight_norm: bool = True, + repeat: bool = False, + ): + super(UpsampleLayer, self).__init__() + self.repeat = repeat + self.stride = stride + self.activation = activation + self.use_weight_norm = use_weight_norm + if repeat: + self.layer = Conv1d( + in_channels, out_channels, kernel_size, causal=causal + ) + else: + self.layer = ConvTranspose1d( + in_channels, + out_channels, + kernel_size, + stride=stride, + causal=causal, + ) + if use_weight_norm: + self.layer = weight_norm(self.layer) + + def forward(self, x): + """ + Applies upsampling through transposed convolution or repetition, followed by activation. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = self.layer(x) + x = self.activation(x) if self.activation is not None else x + if self.repeat: + x = torch.transpose(x, 1, 2) + B, T, C = x.size() + x = x.repeat(1, 1, self.stride).view(B, -1, C) + x = torch.transpose(x, 1, 2) + return x + + def remove_weight_norm(self): + """ + Removes weight normalization from the convolutional layer. + """ + if self.use_weight_norm: + remove_weight_norm(self.layer) + + +class ResidualUnit(nn.Module): + """ + A residual unit with two convolutional layers and activation functions. 
+ This module is commonly used in the encoder and decoder blocks of the ScalarModel + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + dilation : int + Dilation factor for the first convolutional layer. + res_kernel_size : int, optional + Size of the convolutional kernel for residual connections (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__(self, n_in, n_out, dilation, res_kernel_size=7, causal=False): + super(ResidualUnit, self).__init__() + self.conv1 = weight_norm( + Conv1d( + n_in, + n_out, + kernel_size=res_kernel_size, + dilation=dilation, + causal=causal, + ) + ) + self.conv2 = weight_norm( + Conv1d(n_in, n_out, kernel_size=1, causal=causal) + ) + self.activation1 = nn.PReLU() + self.activation2 = nn.PReLU() + + def forward(self, x): + """ + Applies two convolutional layers with activations and adds the input for a residual connection. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Output tensor with residual connection applied. + """ + output = self.activation1(self.conv1(x)) + output = self.activation2(self.conv2(output)) + return output + x + + +class ResEncoderBlock(nn.Module): + """ + A residual encoder block with multiple residual units and a downsampling layer. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + stride : int + Stride for the downsampling layer. + down_kernel_size : int + Kernel size for the downsampling layer. + res_kernel_size : int, optional + Size of the convolutional kernel for residual connections (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). + """ + + def __init__( + self, + n_in, + n_out, + stride, + down_kernel_size, + res_kernel_size=7, + causal=False, + ): + super(ResEncoderBlock, self).__init__() + self.convs = nn.ModuleList( + [ + ResidualUnit( + n_in, + n_out // 2, + dilation=1, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=3, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=5, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=7, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out // 2, + n_out // 2, + dilation=9, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ] + ) + self.down_conv = DownsampleLayer( + n_in, n_out, down_kernel_size, stride=stride, causal=causal + ) + + def forward(self, x): + """ + Applies a series of residual units and a downsampling layer. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + for conv in self.convs: + x = conv(x) + x = self.down_conv(x) + return x + + +class ResDecoderBlock(nn.Module): + """ + A residual decoder block with upsampling and multiple residual units. + + Arguments + --------- + n_in : int + Number of input channels. + n_out : int + Number of output channels. + stride : int + Stride for the upsampling layer. + up_kernel_size : int + Kernel size for the upsampling layer. + res_kernel_size : int, optional + Size of the convolutional kernel for residual connections (default is 7). + causal : bool, optional + If True, applies causal convolution (default is False). 
+ """ + + def __init__( + self, + n_in, + n_out, + stride, + up_kernel_size, + res_kernel_size=7, + causal=False, + ): + super(ResDecoderBlock, self).__init__() + self.up_conv = UpsampleLayer( + n_in, + n_out, + kernel_size=up_kernel_size, + stride=stride, + causal=causal, + activation=None, + ) + self.convs = nn.ModuleList( + [ + ResidualUnit( + n_out, + n_out, + dilation=1, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=3, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=5, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=7, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ResidualUnit( + n_out, + n_out, + dilation=9, + res_kernel_size=res_kernel_size, + causal=causal, + ), + ] + ) + + def forward(self, x): + """ + Applies upsampling followed by a series of residual units. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Processed output tensor. + """ + x = self.up_conv(x) + for conv in self.convs: + x = conv(x) + return x + + +class Conv1d(nn.Conv1d): + """ + Custom 1D convolution layer with an optional causal mode. + + This class extends PyTorch's `nn.Conv1d` and allows for causal convolutions + by automatically applying the correct amount of padding to ensure that the output + does not depend on future inputs, which is useful for sequential data processing. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the convolution (default is 1). + dilation : int, optional + Dilation factor for the convolution (default is 1). + groups : int, optional + Number of blocked connections from input channels to output channels (default is 1). + padding_mode : str, optional + Padding mode to use ('zeros', 'reflect', 'replicate', or 'circular') (default is 'zeros'). + bias : bool, optional + If True, adds a learnable bias to the output (default is True). + padding : int, optional + Explicit padding value. If not provided, it will be computed automatically. + causal : bool, optional + If True, applies causal convolution where the output depends only on the past and current inputs (default is False). + w_init_gain : str, optional + Gain value used for Xavier initialization (e.g., 'relu', 'tanh', etc.). If provided, applies Xavier uniform initialization to the convolutional weights. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + groups: int = 1, + padding_mode: str = "zeros", + bias: bool = True, + padding=None, + causal: bool = False, + w_init_gain=None, + ): + self.causal = causal + if padding is None: + if causal: + padding = 0 + self.left_padding = dilation * (kernel_size - 1) + else: + padding = get_padding(kernel_size, dilation) + super(Conv1d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + bias=bias, + ) + if w_init_gain is not None: + torch.nn.init.xavier_uniform_( + self.weight, gain=torch.nn.init.calculate_gain(w_init_gain) + ) + + def forward(self, x): + """ + Applies the forward pass of the convolutional layer. 
+ + Arguments + --------- + x : torch.Tensor + Input tensor of shape (batch_size, channels, sequence_length). + + Returns + ------- + torch.Tensor + The output tensor after applying the convolution operation. + If `causal` is True, the input tensor is padded to ensure that + the output at each timestep only depends on the current and previous inputs. + """ + if self.causal: + x = F.pad(x.unsqueeze(2), (self.left_padding, 0, 0, 0)).squeeze(2) + + return super(Conv1d, self).forward(x) + + +class ConvTranspose1d(nn.ConvTranspose1d): + """ + Custom transposed 1D convolution layer with causal option. + + Arguments + --------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int + Size of the convolutional kernel. + stride : int, optional + Stride of the convolution (default is 1). + output_padding : int, optional + Additional size added to one side of the output (default is 0). + groups : int, optional + Number of blocked connections (default is 1). + bias : bool, optional + If True, adds a learnable bias (default is True). + dilation : int, optional + Dilation factor (default is 1). + padding : int, optional + Explicit padding value (default is None). + padding_mode : str, optional + Padding mode (default is 'zeros'). + causal : bool, optional + If True, applies causal convolution. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + output_padding: int = 0, + groups: int = 1, + bias: bool = True, + dilation: int = 1, + padding=None, + padding_mode: str = "zeros", + causal: bool = False, + ): + if padding is None: + padding = 0 if causal else (kernel_size - stride) // 2 + if causal: + assert ( + padding == 0 + ), "padding is not allowed in causal ConvTranspose1d." + assert ( + kernel_size == 2 * stride + ), "kernel_size must be equal to 2*stride is not allowed in causal ConvTranspose1d." + super(ConvTranspose1d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + bias=bias, + dilation=dilation, + padding_mode=padding_mode, + ) + self.causal = causal + self.stride = stride + + def forward(self, x): + """ + Applies the transposed convolution operation. + + Arguments + --------- + x : torch.Tensor + Input tensor. + + Returns + ------- + torch.Tensor + Transposed convolved output tensor. + """ + x = super(ConvTranspose1d, self).forward(x) + if self.causal: + x = x[:, :, : -self.stride] + return x + + +def decimal_to_ternary_matrix(decimals, D): + """ + Convert a tensor of decimal numbers to a D*T ternary matrix for each batch. + + Arguments + --------- + decimals : torch.Tensor + A 2D tensor of decimal numbers with shape (B, T), where B is the batch size + and T is the number of elements in each batch. + D : int + Number of ternary digits to represent each number (depth). + + Returns + ------- + torch.Tensor + A 3D tensor of shape (B, D, T) where each slice along the first dimension + corresponds to a batch, and each column is represented as a ternary number. + """ + B, T = decimals.shape + ternary_matrix = torch.zeros((B, D, T), dtype=torch.long) + for pos in range(D): + ternary_matrix[:, pos, :] = decimals % 3 # Modulo operation + decimals //= 3 # Floor division for next ternary digit + + return ternary_matrix + + +def ternary_matrix_to_decimal(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. 
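As a concrete illustration of decimal_to_ternary_matrix() above and ternary_matrix_to_decimal() below (and of why dim_codebook defaults to 19683, i.e. 3 ** 9): each token index packs nine ternary digits, and the conversion round-trips. Values here are illustrative only:

import torch

decimals = torch.tensor([[42, 0, 19682]])                   # [B=1, T=3]
ternary = decimal_to_ternary_matrix(decimals.clone(), D=9)  # [1, 9, 3]; clone() because the input is modified in place
assert ternary[0, :, 0].tolist() == [0, 2, 1, 1, 0, 0, 0, 0, 0]  # 42 = 2*3 + 1*9 + 1*27, least-significant digit first
assert ternary_matrix_to_decimal(ternary.numpy()).tolist() == [[42, 0, 19682]]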
+ + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. + """ + B, D, N = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** np.arange(D) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, np.newaxis] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = np.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + +def get_padding(kernel_size, dilation=1): + """ + Computes the padding size for a given kernel size and dilation. + + Arguments + --------- + kernel_size : int + Size of the convolutional kernel. + dilation : int, optional + Dilation factor for convolution (default is 1). + + Returns + ------- + int + Calculated padding size. + """ + return int((kernel_size * dilation - dilation) / 2) diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index ff1194968..c8e81eb7a 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -7,7 +7,8 @@ --------- * Pooneh Mousavi, 2024 """ - +import sys +import os import torch from abc import ABC, abstractmethod from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec @@ -15,9 +16,16 @@ DiscreteSSL, ) from speechbrain.lobes.models.discrete.dac import DAC -from speechbrain.lobes.models.discrete.speechtokenizer_interface import ( - SpeechTokenizer_interface, -) +from speechbrain.lobes.models.discrete.speechtokenizer import SpeechTokenizer +from speechbrain.lobes.models.discrete.wavtokenizer import WavTokenizer +from speechbrain.lobes.models.huggingface_transformers.mimi import Mimi + +base_dir = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) # noqa: E402 +sys.path.append(base_dir) # noqa: E402 + +from model.sq_codec import SQCodec # noqa: E402 class BaseTokenizer(ABC): @@ -52,7 +60,7 @@ def __init__(self): @abstractmethod @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): """ Encode a signal into discrete tokens. @@ -114,18 +122,40 @@ def get_pretrained_embeddings(self, vocab_size, num_codebooks, **kwargs): Returns ------- embeddings : torch.Tensor - Pretrained embedding weights with shape [K, C, H], where H is the embedding dimension. + Pretrained embedding weights with shape [K * C, H], where H is the embedding dimension. """ pass class EncodecTokenizer(Encodec, BaseTokenizer): + """This is a wrapper for the Encodec implemented in the SpeechBrain main repository. 
+ + Source paper: + https://arxiv.org/abs/2210.13438 + Example + ------- + >>> model_hub = "facebook/encodec_24khz" + >>> save_path = "savedir" + >>> model = EncodecTokenizer(model_hub, save_path) + >>> emb=model.get_pretrained_embeddings() + >>> emb.shape + torch.Size([2048, 128]) + >>> audio = torch.randn(4, 1000) + >>> length = torch.tensor([1.0, .5, .75, 1.0]) + >>> tokens= model.sig_to_tokens(audio, length) + >>> tokens.shape + torch.Size([4, 4, 2]) + >>> rec = model.tokens_to_sig(tokens, length=length) + >>> rec.shape + torch.Size([4, 1280]) + """ + def __init__(self, *args, **kwargs): Encodec.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens, _ = self.encode(signal, lengths) if num_codebooks: @@ -151,12 +181,31 @@ def get_pretrained_embeddings( class DACTokenizer(DAC, BaseTokenizer): + """This is a wrapper for the DAC implemented in the SpeechBrain main repository. + + Source paper: + http://arxiv.org/abs/2306.06546 + Example + ------- + >>> model = DACTokenizer(load_pretrained=True, model_type="24KHz", model_bitrate="8kbps", tag="latest") + >>> audio = torch.randn(4, 16000) + >>> emb=model.get_pretrained_embeddings(vocab_size=1024, num_codebooks=8) + >>> emb.shape + torch.Size([8192, 1024]) + >>> tokens= model.sig_to_tokens(audio) + >>> tokens.shape + torch.Size([4, 50, 32]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([4, 15992]) + """ + def __init__(self, *args, **kwargs): DAC.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) + @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens, _ = self(signal[:, None], n_quantizers=num_codebooks) return tokens.movedim(-1, -2) @@ -185,14 +234,35 @@ def get_pretrained_embeddings( return torch.cat(z_qs)[:, :, 0] -class SpeechTokenizer(SpeechTokenizer_interface, BaseTokenizer): +class SpeechTokenizerWrapper(SpeechTokenizer, BaseTokenizer): + """This is a wrapper for the SpeechTokenizer implemented in the SpeechBrain main repository. + + Source paper: + https://arxiv.org/abs/2308.16692 + Example + ------- + >>> audio = torch.rand([10, 600]) + >>> model_hub = "fnlp/SpeechTokenizer" + >>> save_path = "savedir" + >>> model = SpeechTokenizerWrapper(model_hub, save_path) + >>> emb=model.get_pretrained_embeddings(vocab_size=1024, num_codebooks=8) + >>> emb.shape + torch.Size([8192, 1024]) + >>> tokens= model.sig_to_tokens(audio) + >>> tokens.shape + torch.Size([10, 2, 8]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([10, 640]) + """ + def __init__(self, *args, **kwargs): - SpeechTokenizer_interface.__init__(self, *args, **kwargs) + SpeechTokenizer.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) self.sample_rate = 16000 @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens = self(signal) if num_codebooks: @@ -223,12 +293,41 @@ def get_pretrained_embeddings( class DiscreteSSLTokenizer(DiscreteSSL, BaseTokenizer): + """This is a wrapper for the DiscreteSSL implemented in the SpeechBrain main repository.
+ + Source paper: + https://arxiv.org/abs/2210.13438 + Example + ------- + >>> from speechbrain.lobes.models.huggingface_transformers.wavlm import (WavLM) + >>> inputs = torch.rand([3, 2000]) + >>> model_hub = "microsoft/wavlm-large" + >>> save_path = "savedir" + >>> ssl_layer_num = [7,23] + >>> deduplicate =[False, True] + >>> bpe_tokenizers=[None, None] + >>> vocoder_repo_id = "speechbrain/hifigan-wavlm-k1000-LibriTTS" + >>> kmeans_dataset = "LibriSpeech" + >>> num_clusters = 1000 + >>> ssl_model = WavLM(model_hub, save_path,output_all_hiddens=True) + >>> model = DiscreteSSLTokenizer(save_path, ssl_model, vocoder_repo_id=vocoder_repo_id, kmeans_dataset=kmeans_dataset,num_clusters=num_clusters) + >>> emb=model.get_pretrained_embeddings(num_codebooks=ssl_layer_num) + >>> emb.shape + torch.Size([2000, 1024]) + >>> tokens= model.sig_to_tokens(inputs,num_codebooks=ssl_layer_num, deduplicates=deduplicate, bpe_tokenizers=bpe_tokenizers) + >>> tokens.shape + torch.Size([3, 6, 2]) + >>> sig = model.tokens_to_sig(tokens, SSL_layers=ssl_layer_num) + >>> sig.shape + torch.Size([3, 1920]) + """ + def __init__(self, *args, **kwargs): DiscreteSSL.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) @torch.no_grad() - def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens, _, _ = self.encode( signal, lengths, SSL_layers=num_codebooks, **kwargs @@ -238,7 +337,7 @@ def sig_to_tokens(self, signal, lengths, num_codebooks=None, **kwargs): @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): self.eval() - return self.decode(tokens, **kwargs) + return self.decode(tokens, **kwargs).squeeze(1) @torch.no_grad() def get_pretrained_embeddings( @@ -253,3 +352,165 @@ def get_pretrained_embeddings( embs.append(torch.as_tensor(vocabulary, dtype=torch.float32)) embs = torch.cat(embs) return embs + + +class MimiTokenizer(Mimi, BaseTokenizer): + """This is a wrapper for the Mimi implemented in the SpeechBrain main repository. 
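A note on get_pretrained_embeddings(): the wrappers return one flattened [K * C, H] table with the per-codebook vocabularies stacked along the first axis, so a downstream model is expected to offset token indices by codebook before lookup. A hedged sketch of that usage (shapes and offsets here are illustrative assumptions, not code from this patch):

import torch

K, C, H = 2, 1024, 128                     # codebooks, vocab size per codebook, embedding dim
emb_table = torch.randn(K * C, H)          # stands in for get_pretrained_embeddings() output
tokens = torch.randint(0, C, (4, 10, K))   # [B, N, K] tokens as returned by sig_to_tokens()
offsets = torch.arange(K) * C              # codebook k occupies rows [k*C, (k+1)*C)
embs = torch.nn.functional.embedding(tokens + offsets, emb_table)
print(embs.shape)  # torch.Size([4, 10, 2, 128])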
+ + Source paper: + https://kyutai.org/Moshi.pdf + Example + ------- + >>> model_hub = "kyutai/mimi" + >>> save_path = "savedir" + >>> model = MimiTokenizer(model_hub, save_path) + >>> emb=model.get_pretrained_embeddings() + >>> emb.shape + torch.Size([16384, 256]) + >>> audio = torch.randn(4, 48000) + >>> length = torch.tensor([1.0, .5, .75, 1.0]) + >>> tokens = model.sig_to_tokens(audio, length) + >>> tokens.shape + torch.Size([4, 25, 8]) + >>> rec = model.tokens_to_sig(tokens, length=length) + >>> rec.shape + torch.Size([4, 48000]) + """ + + def __init__(self, *args, **kwargs): + Mimi.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal, lengths) + if num_codebooks: + if tokens.shape[-1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[:, :num_codebooks, :] + return tokens.movedim(-1, -2) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens.movedim(-1, -2), **kwargs)[:, 0] + return signal + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + return self.embeddings.view(-1, self.embeddings.size(-1)) + + +class WavTokenizerWrapper(WavTokenizer, BaseTokenizer): + """This is a wrapper for the WavTokenizer implemented in the SpeechBrain main repository. + + Source paper: + https://arxiv.org/abs/2408.16532 + + Example + ------- + >>> model_hub = "novateur/WavTokenizer" + >>> save_path = "savedir" + >>> config="wavtokenizer_smalldata_frame40_3s_nq1_code4096_dim512_kmeans200_attn.yaml" + >>> checkpoint="WavTokenizer_small_600_24k_4096.ckpt" + >>> model = WavTokenizerWrapper(model_hub, save_path,config=config,checkpoint=checkpoint) + >>> emb=model.get_pretrained_embeddings() + >>> emb.shape + torch.Size([4096, 512]) + >>> audio = torch.randn(4, 48000) + >>> length = torch.tensor([1.0, .5, .75, 1.0]) + >>> tokens= model.sig_to_tokens(audio, length) + >>> tokens.shape + torch.Size([4, 80, 1]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([4, 48000]) + """ + + def __init__(self, *args, **kwargs): + WavTokenizer.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal) + if num_codebooks: + if tokens.shape[1] < num_codebooks: + raise ValueError( + f"Model only outputs {tokens.shape[1]} codebooks, but {num_codebooks} requested" + ) + tokens = tokens[:, :num_codebooks, :] + + return tokens.movedim(-2, -1) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens.movedim(-1, -2)) + return signal + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + return self.embeddings + + +class SQCodecTokenizer(SQCodec, BaseTokenizer): + """This is a wrapper for the SQCoced implemented in the model folder. 
+ + Source paper: + https://arxiv.org/abs/2406.02328, https://arxiv.org/abs/2408.13893 + + + Make sure that you download and extract the SQ-codec.zip in save_path from following Huggingface repo: + - HF repo: https://huggingface.co/Dongchao/UniAudio/blob/main/SQ-Codec.zip + + Example + ------- + >>> save_path = "savedir" + >>> config = "config.yaml" + >>> checkpoint = "ckpt_00190000.pth" + >>> model = SQCodecTokenizer(save_path, config, checkpoint) + >>> audio = torch.randn(3, 48000) + >>> tokens = model.sig_to_tokens(audio) + >>> tokens.shape + torch.Size([3, 150, 4]) + >>> rec = model.tokens_to_sig(tokens) + >>> rec.shape + torch.Size([3, 48000] + """ + + def __init__(self, *args, **kwargs): + SQCodec.__init__(self, *args, **kwargs) + BaseTokenizer.__init__(self) + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.eval() + tokens, _ = self.encode(signal) + return tokens.view(tokens.shape[0], -1, self.n_codebook) + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.eval() + signal = self.decode(tokens.view(tokens.shape[0], -1), **kwargs) + return signal.squeeze(1) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + """ + This method is not implemented for SQCodec, as it uses scalar quantization + and does not have any trainable quantizer or embedding. + """ + raise ValueError( + "SQCodec does not have any trainable quantizer or embedding since it uses scalar quantization." + ) From f534bdfdd9c630e4a467cded91b529e0ce9f1225 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Sat, 11 Jan 2025 23:17:31 -0500 Subject: [PATCH 053/270] fix precommit --- benchmarks/DASB/model/sq_codec.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index f04c094d4..6057a5f73 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1323,7 +1323,11 @@ def ternary_matrix_to_decimal(matrix): A 2D numpy array of shape (B, N), where each value represents the decimal equivalent of the corresponding ternary number in the input matrix. 
""" - B, D, N = ( + ( + B, + D, + N, + ) = ( matrix.shape ) # B is the batch size, D is the number of digits, N is the number of ternary numbers powers_of_three = 3 ** np.arange(D) # [3^0, 3^1, ..., 3^(D-1)] From 54dab6782b7da1adc7eb41eab11f77b9db5da326 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 13 Jan 2025 00:25:02 -0500 Subject: [PATCH 054/270] Tokotron: Fixes --- .../DASB/LJSpeech/TTS/tokotron/evaluate.py | 42 +------- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 52 ++------- .../TTS/tokotron/hparams/train_encodec.yaml | 102 ++++-------------- .../DASB/LibriTTS/TTS/tokotron/train.py | 2 +- .../LibriTTS/TTS/tokotron/train_encodec.py | 2 +- benchmarks/DASB/model/Tokotron.py | 7 +- 6 files changed, 38 insertions(+), 169 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py index bcb2670a6..e40e9bb31 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py @@ -51,17 +51,7 @@ def __init__(self, hparams, create_waveform_fn, device): else: self.evaluators = {} - bulk_evaluators = getattr(self.hparams, "bulk_evaluators", {}) - if bulk_evaluators: - self.bulk_evaluators = { - key: evaluator_f() - for key, evaluator_f in bulk_evaluators.items() - if key in self.enabled_evaluators - } - else: - self.bulk_evaluators = {} - - if not self.evaluators and not self.bulk_evaluators: + if not self.evaluators: logger.warn( "No evaluators were defined - this run will produce samples only" ) @@ -98,9 +88,7 @@ def on_evaluate_start(self, stage, epoch): self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] - details_keys = list(self.evaluators.keys()) + list( - self.bulk_evaluators.keys() - ) + details_keys = list(self.evaluators.keys()) self.details = {evaluator_key: [] for evaluator_key in details_keys} self.sample_text = [] self.sample_file_names = [] @@ -141,7 +129,6 @@ def on_evaluate_end(self): dataset : speechbrain.dataio.dataset.DynamicItemDataset a dataset """ - self.evaluate_bulk() self.write_summary() logger.info("Evaluation done") @@ -182,19 +169,6 @@ def get_report_columns(self, evaluator_key): wavs_ref=bogus_wavs, length_ref=bogus_length, ) - else: - bogus_file_name = self.output_folder / "bogus.wav" - evaluator = self.bulk_evaluators[evaluator_key] - sb.dataio.dataio.write_audio( - str(bogus_file_name), - bogus_wavs[0].cpu(), - samplerate=self.hparams.model_sample_rate, - ) - result = evaluator.evaluate_files( - file_names=[bogus_file_name], - text=["BOGUS"], - file_names_ref=[bogus_file_name], - ) return ["uttid"] + list(result.details.keys()) @@ -228,18 +202,6 @@ def evaluate_batch(self, batch): self.write_result(evaluator_key, batch.uttid, details) self.details[evaluator_key].extend(details) - def evaluate_bulk(self): - """Runs all configured bulk evaluators, which evaluate a directory - of files - rather than one file at a time""" - for evaluator_key, evaluator in self.bulk_evaluators.items(): - result = evaluator.evaluate_files( - file_names=self.sample_file_names, - text=self.sample_text, - file_names_ref=self.ref_file_names, - ) - self.details[evaluator_key].append(result.details) - details = undo_batch(result.details) - self.write_result(evaluator_key, self.item_ids, details) def write_result(self, evaluator_key, uttid, details): """Outputs the result details to the report for the specified evaluator diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py 
b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index d72df92aa..3d5320fdb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -62,17 +62,7 @@ def __init__(self, hparams, create_waveform_fn, device): else: self.evaluators = {} - bulk_evaluators = getattr(self.hparams, "bulk_evaluators", {}) - if bulk_evaluators: - self.bulk_evaluators = { - key: evaluator_f() - for key, evaluator_f in bulk_evaluators.items() - if key in self.enabled_evaluators - } - else: - self.bulk_evaluators = {} - - if not self.evaluators and not self.bulk_evaluators: + if not self.evaluators: logger.warning("No evaluators were defined - this run will produce samples only") self.attention = [] @@ -101,9 +91,7 @@ def on_evaluate_start(self, stage, epoch): self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] - details_keys = list(self.evaluators.keys()) + list( - self.bulk_evaluators.keys() - ) + details_keys = list(self.evaluators.keys()) self.details = {evaluator_key: [] for evaluator_key in details_keys} self.sample_text = [] self.sample_file_names = [] @@ -157,7 +145,7 @@ def evaluate(self, dataset): self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] - details_keys = list(self.evaluators.keys()) + list(self.bulk_evaluators.keys()) + details_keys = list(self.evaluators.keys()) self.details = { evaluator_key: [] for evaluator_key in details_keys @@ -170,7 +158,6 @@ def evaluate(self, dataset): batch_count = math.ceil(len(dataset) / self.hparams.batch_size) for batch in tqdm(loader_it, desc="Evaluation", total=batch_count): self.evaluate_batch(batch) - self.evaluate_bulk() self.write_summary() logger.info("Evaluation done") @@ -285,19 +272,6 @@ def get_report_columns(self, evaluator_key): wavs_ref=bogus_wavs, length_ref=bogus_length, ) - else: - bogus_file_name = self.output_folder / "bogus.wav" - evaluator = self.bulk_evaluators[evaluator_key] - sb.dataio.dataio.write_audio( - str(bogus_file_name), - bogus_wavs[0].cpu(), - samplerate=self.hparams.model_sample_rate, - ) - result = evaluator.evaluate_files( - file_names=[bogus_file_name], - text=["BOGUS"], - file_names_ref=[bogus_file_name], - ) return ["uttid"] + list(result.details.keys()) @@ -311,9 +285,10 @@ def evaluate_batch(self, batch): with torch.no_grad(): batch = batch.to(self.device) tokens, tokens_length = batch.tokens - vocoder_to_device(self.modules.vocoder, self.device) - if hasattr(self.modules.vocoder, "device"): - self.modules.vocoder.device = self.device + if hasattr(self.modules, "vocoder"): + vocoder_to_device(self.modules.vocoder, self.device) + if hasattr(self.modules.vocoder, "device"): + self.modules.vocoder.device = self.device audio_resampled = torchaudio.functional.resample( batch.sig.data, self.hparams.sample_rate, @@ -361,19 +336,6 @@ def evaluate_batch(self, batch): perf_stats["total_flops_per_step"] = perf_stats["total_flops"] / perf_stats["steps"] self.write_perf_stats(batch.uttid, perf_stats) - - def evaluate_bulk(self): - """Performs bulk evaluation""" - for evaluator_key, evaluator in self.bulk_evaluators.items(): - result = evaluator.evaluate_files( - file_names=self.sample_file_names, - text=self.sample_text, - file_names_ref=self.ref_file_names, - ) - self.details[evaluator_key].append(result.details) - details = undo_batch(result.details) - self.write_result(evaluator_key, self.item_ids, details) - def write_result(self, evaluator_key, uttid, details): """Outputs the result 
details to the report for the specified evaluator diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index a82d82a2c..0fed45124 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -40,6 +40,11 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + freeze_token_model: True token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p @@ -125,7 +130,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp +gate_offset: !apply:Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -137,14 +142,6 @@ guides_enabled: False silence_padding: !ref use_silence_padding: True - -# Token model (pretrained) -token_model: !new:speechbrain.lobes.models.huggingface_transformers.Encodec - source: !ref - save_path: !ref - bandwidth: !ref - flat_embeddings: True - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa @@ -180,15 +177,6 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - num_workers: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - ####################### Model parameters ########################### # Transformer @@ -197,11 +185,6 @@ nhead: 4 enc_num_layers: 6 dec_num_layers: 12 d_ffn: 2048 -z_dim: 128 -hidden_dim: 2048 -enc_n_dim: 16 -dec_n_dim: 256 -decoder_chunk_size: -1 transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU @@ -221,38 +204,6 @@ bandwidth: 1.5 attention_type: regularMHA ############################## models ################################ - -vocoder: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encodec: !new:benchmarks.DASB.model.custom_model.EncodecVocoder - encodec: !ref - vocos: !new:speechbrain.lobes.models.huggingface_transformers.vocos.Vocos - source: !ref - save_path: !ref - - -inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference - bos_index: !ref - eos_index: !ref - min_decode_ratio: !ref - max_decode_ratio: !ref - beam_size: !ref - using_eos_threshold: False - length_normalization: True - audio_token_shift: !ref - -inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference - scale_factor: !ref - gate_threshold: !ref - eos_mode: !ref - -inference: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - search: !ref - forward: !ref - emb: spk: kind: "pretrained" @@ -260,17 +211,12 @@ emb: vocoder: !ref injection: !ref -model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref d_ffn: !ref - z_dim: !ref - hidden_dim: !ref - enc_n_dim: !ref - dec_n_dim: !ref - decoder_chunk_size: !ref nhead: !ref enc_num_layers: !ref dec_num_layers: !ref @@ -278,13 +224,11 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d target_dropout: !ref 
activation: !ref attention_type: !ref - vocoder: !ref gate_threshold: !ref gate_offset: !ref audio_emb_size: !ref audio_emb_freeze: !ref max_audio_length: !ref - inference: !ref eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref @@ -292,16 +236,27 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d scale_factor: !ref emb: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + modules: model: !ref - vocoder: !ref compute_cost: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss +compute_cost: !new:Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -327,26 +282,11 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:benchmarks.DASB.utils.preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref -progress_logger: !new:benchmarks.DASB.utils.train_logger.ArchiveTrainLogger - current_path: !ref - archive_path: !ref - meta_path: !ref - epoch_counter: !ref - -progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport - logger: !ref - sample_rate: !ref - eos_threshold: !ref - spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref \ No newline at end of file + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index a09a4cc23..e19cf3eba 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -246,7 +246,7 @@ def on_stage_start(self, stage, epoch): The currently-starting epoch. This is passed `None` during the test stage. 
""" - if hasattr(self.modules.vocoder, "model"): + if hasattr(self.modules, "vocoder") and hasattr(self.modules.vocoder, "model"): self.modules.vocoder.model.device = self.device self.layer_idx = self._get_selected_layer_idx() self.loss_metric = sb.utils.metric_stats.MultiMetricStats( diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py index 07edbbd8c..98f1b27cc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py @@ -36,7 +36,7 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ - wav = self.modules.token_model.decode(audio) + wav = self.modules.tokenizer.decode(audio) wav = wav.squeeze(1) clean_padding_(wav, length) return wav diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 14aa38693..266090be4 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -12,6 +12,7 @@ import math import torch +import inspect from torch import nn from torch.nn import functional as F from speechbrain.lobes.models.transformer.Transformer import ( @@ -2110,7 +2111,11 @@ def get_silence_token( model_training = model.training model.eval() if hasattr(model, "encode"): - result = model.encode(audio, length, **model_kwargs) + spec = inspect.getfullargspec(model.encode) + if "length" in spec.args: + result = model.encode(audio, length, **model_kwargs) + else: + result = model.encode(audio, **model_kwargs) else: result = model(audio, length, **model_kwargs) if model_training: From 2552b06264c93686001c90ea3d07bec04b3dc1b4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 14 Jan 2025 11:42:09 -0500 Subject: [PATCH 055/270] DASB: Tokotron: Cosmetic changes --- .../DASB/LJSpeech/TTS/tokotron/evaluate.py | 1 - .../TTS/tokotron/hparams/train_dac.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 43 +++---- benchmarks/DASB/LJSpeech/ljspeech_prepare.py | 1 - .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 110 ++++++++--------- .../hparams/train_continuous_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_dac.yaml | 4 +- .../tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../hparams/train_speech_tokenizer.yaml | 4 +- .../DASB/LibriTTS/TTS/tokotron/train.py | 114 ++++++++++-------- .../DASB/LibriTTS/extraction/extract.py | 2 +- benchmarks/DASB/utils/eval.py | 38 ++---- 13 files changed, 151 insertions(+), 174 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py index e40e9bb31..52b7e1817 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/evaluate.py @@ -202,7 +202,6 @@ def evaluate_batch(self, batch): self.write_result(evaluator_key, batch.uttid, details) self.details[evaluator_key].extend(details) - def write_result(self, evaluator_key, uttid, details): """Outputs the result details to the report for the specified evaluator diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 4f50c7ed2..240b57a7d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -199,7 +199,7 @@ tokenizer: !new:utils.tokenizer_interface.DACTokenizer n_codebooks: !ref load_pretrained: True tag: latest - + modules: model: !ref diff --git 
a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index deb8a3236..8da11247b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -57,8 +57,9 @@ def __init__( create_waveform_fn=self.create_waveform, device=self.device, ) - self.representation_mode = RepresentationMode(self.hparams.representation_mode) - + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) def compute_forward(self, batch, stage): """Runs all the computation of the Tokotron TTS @@ -97,7 +98,7 @@ def prepare_features(self, batch): """Prepares features, depending on the configuration Arguments - --------- + --------- batch : PaddedBatch This batch object contains all the relevant tensors for computation @@ -122,13 +123,10 @@ def prepare_features(self, batch): 1, 2, 0, 3 ) batch_size, _, heads, dim = audio.shape - bos = torch.zeros_like( - audio[:, :1, :, :] - ).reshape(batch_size, self.hparams.bos_width, heads, dim) - audio_bos = torch.concatenate( - [bos, audio], - dim=1 + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim ) + audio_bos = torch.concatenate([bos, audio], dim=1) audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) audio_tgt = audio audio_tgt_length = audio_length @@ -469,13 +467,16 @@ def audio_ref_pipeline(wav): if layers_key in hparams else hparams["audio_tokens_per_step"] ) - if use_silence_padding and representation_mode == RepresentationMode.DISCRETE: + if ( + use_silence_padding + and representation_mode == RepresentationMode.DISCRETE + ): silence_token, _ = get_silence_token( hparams[model_key], model_kwargs=hparams.get("token_model_kwargs"), extract_emb=False, model_shape=hparams.get("model_shape", "BLH"), - unsqueeze=hparams.get("model_needs_channel", False) + unsqueeze=hparams.get("model_needs_channel", False), ) else: silence_token = ( @@ -495,7 +496,9 @@ def audio_ref_pipeline(wav): @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") def audio_pipeline(id): - audio = tokens_loader.tokens_by_uttid(id, num_codebooks=audio_tokens_per_step) + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=audio_tokens_per_step + ) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -737,9 +740,7 @@ def run_experiment(brain_cls): "extract_phonemes": hparams["input"] == "phonemes", "model_name": "tokotron", "g2p_src": hparams["g2p_src"], - "skip_ignore_folders": hparams[ - "prepare_skip_ignore_folders" - ], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], "frozen_split_path": hparams.get("frozen_split_path"), "device": run_opts.get("device", "cpu"), }, @@ -767,21 +768,18 @@ def run_experiment(brain_cls): # stopped at any point, and will be resumed on next call. 
dataloader_opts = [ - hparams[f"{key}_dataloader_opts"] - for key in ["train", "valid", "test"] + hparams[f"{key}_dataloader_opts"] for key in ["train", "valid", "test"] ] representation_mode = RepresentationMode(hparams["representation_mode"]) if representation_mode == RepresentationMode.DISCRETE: dataloader_opts = [ - use_silence_padding( - opts, silence_padding, audio_keys - ) + use_silence_padding(opts, silence_padding, audio_keys) for opts in dataloader_opts ] ( train_dataloader_opts, valid_dataloader_opts, - test_dataloader_opts + test_dataloader_opts, ) = dataloader_opts tts_brain.fit( @@ -794,8 +792,7 @@ def run_experiment(brain_cls): # Load best checkpoint for evaluation tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=test_dataloader_opts, + test_set=datasets["test"], test_loader_kwargs=test_dataloader_opts, ) # Save final checkpoint (fixed name) diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index bfd1b3743..06292fd34 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -797,4 +797,3 @@ def custom_clean(text, model_name): for regex, replacement in _abbreviations: text = re.sub(regex, replacement, text) return text - diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 3d5320fdb..0a75a3482 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -4,7 +4,7 @@ * Artem Ploujnikov 2024 """ -#TODO: There are too many evaluation scripts. Refactor to extract common +# TODO: There are too many evaluation scripts. Refactor to extract common # features import speechbrain as sb @@ -33,7 +33,7 @@ class TokotronEvaluator: """An evaluator class for the TTS model - + Arguments --------- hparams: dict @@ -41,6 +41,7 @@ class TokotronEvaluator: device : str | torch.device the device """ + def __init__(self, hparams, create_waveform_fn, device): self.hparams = SimpleNamespace(**hparams) self.create_waveform_fn = create_waveform_fn @@ -63,7 +64,9 @@ def __init__(self, hparams, create_waveform_fn, device): self.evaluators = {} if not self.evaluators: - logger.warning("No evaluators were defined - this run will produce samples only") + logger.warning( + "No evaluators were defined - this run will produce samples only" + ) self.attention = [] @@ -91,7 +94,7 @@ def on_evaluate_start(self, stage, epoch): self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] - details_keys = list(self.evaluators.keys()) + details_keys = list(self.evaluators.keys()) self.details = {evaluator_key: [] for evaluator_key in details_keys} self.sample_text = [] self.sample_file_names = [] @@ -124,7 +127,6 @@ def get_output_folder(self, stage, epoch): output_folder.mkdir(parents=True, exist_ok=True) return output_folder - def evaluate(self, dataset): """Runs evaluation on a dataset @@ -139,17 +141,18 @@ def evaluate(self, dataset): raise ValueError("Unable to recover the checkpoint") self.modules.model.eval() if self.hparams.eval_samples is not None: - dataset = dataset.filtered_sorted(select_n=self.hparams.eval_samples) - loader = sb.dataio.dataloader.make_dataloader(dataset, batch_size=self.hparams.batch_size) + dataset = dataset.filtered_sorted( + select_n=self.hparams.eval_samples + ) + loader = sb.dataio.dataloader.make_dataloader( + dataset, batch_size=self.hparams.batch_size + ) loader_it = iter(loader) 
self.create_reports() self.modules.model.show_inference_progress = False self.item_ids = [] details_keys = list(self.evaluators.keys()) - self.details = { - evaluator_key: [] - for evaluator_key in details_keys - } + self.details = {evaluator_key: [] for evaluator_key in details_keys} self.read_reports() self.sample_text = [] self.sample_file_names = [] @@ -187,7 +190,7 @@ def create_reports(self): "vocoder_flops", "total_flops", "total_flops_per_step", - ] + ], ) self.perf_writer.writeheader() @@ -221,18 +224,14 @@ def vocoder(self, infer_out, emb): with flop_counter: wav = self.create_waveform_fn( - infer_out.audio, - length=infer_out.length, - emb=emb + infer_out.audio, length=infer_out.length, emb=emb ) if wav.dim() > 2: wav = wav.squeeze(1) if self.hparams.eval_perf: flops = flop_counter.get_total_flops() - stats = { - "vocoder_flops": flops - } + stats = {"vocoder_flops": flops} return wav, stats def read_reports(self): @@ -245,7 +244,10 @@ def read_reports(self): reader = csv.DictReader(report_file) for row in reader: del row["uttid"] - row = {key : handle_number(value) for key, value in row.items()} + row = { + key: handle_number(value) + for key, value in row.items() + } self.details[evaluator_key].append(row) def get_report_columns(self, evaluator_key): @@ -262,7 +264,7 @@ def get_report_columns(self, evaluator_key): a list of column headers """ bogus_wavs = torch.randn(2, 10000, device=self.device) - bogus_length = torch.tensor([1., 1.], device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) if evaluator_key in self.evaluators: evaluator = self.evaluators[evaluator_key] result = evaluator.evaluate( @@ -294,21 +296,14 @@ def evaluate_batch(self, batch): self.hparams.sample_rate, self.hparams.model_sample_rate, ) - mel_spec = self.spk_emb_model.mel_spectogram( - audio=audio_resampled - ) + mel_spec = self.spk_emb_model.mel_spectogram(audio=audio_resampled) spk_emb = self.spk_emb_model.encode_mel_spectrogram_batch( mel_spec, batch.sig.lengths ).squeeze(1) infer_out, perf_stats = self.infer( - tokens=tokens, tokens_length=tokens_length, - emb={ - "spk": spk_emb - } - ) - wav, vocoder_stats = self.vocoder( - infer_out, spk_emb + tokens=tokens, tokens_length=tokens_length, emb={"spk": spk_emb} ) + wav, vocoder_stats = self.vocoder(infer_out, spk_emb) perf_stats.update(vocoder_stats) length = infer_out.length if wav.dim() > 2: @@ -324,7 +319,7 @@ def evaluate_batch(self, batch): wavs_ref=batch.sig.data, length_ref=batch.sig.lengths, sample_rate_ref=self.hparams.sample_rate, - sample_rate=self.hparams.model_sample_rate + sample_rate=self.hparams.model_sample_rate, ) details = undo_batch(result.details) self.write_result(evaluator_key, batch.uttid, details) @@ -332,8 +327,12 @@ def evaluate_batch(self, batch): if self.hparams.eval_perf: perf_stats.update(vocoder_stats) - perf_stats["total_flops"] = perf_stats["vocoder_flops"] + perf_stats["infer_flops"] - perf_stats["total_flops_per_step"] = perf_stats["total_flops"] / perf_stats["steps"] + perf_stats["total_flops"] = ( + perf_stats["vocoder_flops"] + perf_stats["infer_flops"] + ) + perf_stats["total_flops_per_step"] = ( + perf_stats["total_flops"] / perf_stats["steps"] + ) self.write_perf_stats(batch.uttid, perf_stats) def write_result(self, evaluator_key, uttid, details): @@ -354,9 +353,7 @@ def write_result(self, evaluator_key, uttid, details): "uttid": uttid, **details_item, } - writer.writerow( - ascii_only(flatten(report_details)) - ) + writer.writerow(ascii_only(flatten(report_details))) 
self.report_files[evaluator_key].flush() def save_samples(self, batch, wav, length): @@ -375,12 +372,12 @@ def save_samples(self, batch, wav, length): for item_id, infer_wav, wav_length in zip( batch.uttid, wav, wav_length_abs ): - file_name = str( - self.samples_folder / f"{item_id}_pred.wav" - ) - infer_wav_cut = infer_wav[:wav_length.item()].cpu() + file_name = str(self.samples_folder / f"{item_id}_pred.wav") + infer_wav_cut = infer_wav[: wav_length.item()].cpu() sb.dataio.dataio.write_audio( - file_name, infer_wav_cut, samplerate=self.hparams.model_sample_rate + file_name, + infer_wav_cut, + samplerate=self.hparams.model_sample_rate, ) self.sample_file_names.append(file_name) @@ -392,28 +389,22 @@ def write_summary(self): json.dump(summary, output_file, indent=4) def write_perf_stats(self, uttid, details): - self.perf_writer.writerow( - { - "uttid": " ".join(uttid), - **details - } - ) + self.perf_writer.writerow({"uttid": " ".join(uttid), **details}) self.perf_file.flush() - def compute_summary(self): """Computes the summarized statistics""" return { f"{evaluator_key}_{stat_key}": value for evaluator_key in self.enabled_evaluators if evaluator_key in self.details - for metric_key in self.hparams.eval_summary[evaluator_key]["descriptive"] + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] for stat_key, value in descriptive_statistics( - items=self.details[evaluator_key], - key=metric_key, + items=self.details[evaluator_key], key=metric_key, ).items() } - def flatten(value): @@ -436,18 +427,15 @@ def flatten(value): RE_PUNCTUATION = re.compile( - "|".join( - re.escape(char) for char in string.punctuation - ) + "|".join(re.escape(char) for char in string.punctuation) ) -RE_NON_ASCII = re.compile(r'[^\x00-\x7F]+') +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") def ascii_only(values): return { - key: RE_NON_ASCII.sub('', value) if isinstance(value, str) - else value + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value for key, value in values.items() } @@ -494,7 +482,7 @@ def audio_ref_pipeline(wav): def descriptive_statistics(items, key): """Computes descriptive statistics for the summary - + Arguments --------- items : list @@ -515,8 +503,7 @@ def descriptive_statistics(items, key): "iqr": q3 - q1, } return { - f"{key}_{stat_key}": value.item() - for stat_key, value in stats.items() + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() } @@ -562,4 +549,3 @@ def handle_number(value): elif RE_FLOAT.match(value): value = float(value) return value - diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml index 3626079ef..2cbca90fb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -233,7 +233,7 @@ spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class asr_model: !name:benchmarks.DASB.model.Tokotron.TransformerASRGuide source: !ref savedir: !ref /asr-transformer - + # Dataloader options train_dataloader_opts: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index c6875498c..2d91a521e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -143,7 +143,7 @@ token_model: 
!new:benchmarks.DASB.model.custom_model.DACFeatureExtractor spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - + # Dataloader options train_dataloader_opts: @@ -257,7 +257,7 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d z_dim: !ref hidden_dim: !ref n_dim: !ref - decoder_chunk_size: !ref + decoder_chunk_size: !ref nhead: !ref enc_num_layers: !ref dec_num_layers: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index c1c2f9f1c..2ecb72a84 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -224,7 +224,7 @@ spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class pymodule_file: custom_interface.py classname: DiscreteSpkEmb overrides: - ssl_layer_num_selected: !ref + ssl_layer_num_selected: !ref # Dataloader options train_dataloader_opts: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 0fed45124..1f3764ceb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -145,7 +145,7 @@ use_silence_padding: True spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - + # Dataloader options train_dataloader_opts: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 1711b10f4..2de6e121e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -139,7 +139,7 @@ token_model: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerInterface spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - + # Dataloader options train_dataloader_opts: @@ -250,7 +250,7 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d z_dim: !ref hidden_dim: !ref n_dim: !ref - decoder_chunk_size: !ref + decoder_chunk_size: !ref nhead: !ref enc_num_layers: !ref dec_num_layers: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index e19cf3eba..3df858844 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -28,7 +28,7 @@ get_silence_token, use_silence_padding, feature_pad_to, -) +) from types import SimpleNamespace from evaluate import TokotronEvaluator import re @@ -60,7 +60,9 @@ def __init__( create_waveform_fn=self.create_waveform, device=self.device, ) - self.representation_mode = RepresentationMode(self.hparams.representation_mode) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) def create_waveform(self, audio, length, emb): """Creates a waveform from a discrete or continuous audio @@ -104,7 +106,7 @@ def compute_forward(self, batch, stage): audio_bos_length, audio_tgt, audio_tgt_length, - spk_emb + spk_emb, ) = features predictions = self.modules.model( @@ -112,9 +114,7 @@ def compute_forward(self, batch, stage): input_length=tokens_length, 
audio=audio_bos, audio_length=audio_bos_length, - emb={ - "spk": spk_emb - } + emb={"spk": spk_emb}, ) return predictions, features @@ -136,13 +136,10 @@ def prepare_features(self, batch): 1, 2, 0, 3 ) batch_size, _, heads, dim = audio.shape - bos = torch.zeros_like( - audio[:, :1, :, :] - ).reshape(batch_size, self.hparams.bos_width, heads, dim) - audio_bos = torch.concatenate( - [bos, audio], - dim=1 + bos = torch.zeros_like(audio[:, :1, :, :]).reshape( + batch_size, self.hparams.bos_width, heads, dim ) + audio_bos = torch.concatenate([bos, audio], dim=1) audio_bos_length = audio_length * audio.size(1) / audio_bos.size(1) audio_tgt = audio audio_tgt_length = audio_length @@ -150,8 +147,7 @@ def prepare_features(self, batch): return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length, spk_emb def _compute_spk(self, wav, wav_length): - mel_spec = self.spk_emb_model.mel_spectogram( - wav.squeeze(1)) + mel_spec = self.spk_emb_model.mel_spectogram(wav.squeeze(1)) spk_emb_pred = self.spk_emb_model.encode_mel_spectrogram_batch( mel_spec, wav_length ) @@ -159,12 +155,14 @@ def _compute_spk(self, wav, wav_length): def _get_selected_layer_idx(self): selected_layers = None - if hasattr(self.hparams, "select_layers") and self.hparams.select_layers: + if ( + hasattr(self.hparams, "select_layers") + and self.hparams.select_layers + ): layers = self.hparams.select_layers model_layers_map = { layer: idx - for idx, layer in enumerate( - self.hparams.token_model_layers) + for idx, layer in enumerate(self.hparams.token_model_layers) } selected_layers = [model_layers_map[layer] for layer in layers] return selected_layers @@ -214,7 +212,7 @@ def compute_objectives(self, predictions, batch, stage): audio_bos_length, audio_tgt, audio_tgt_length, - spk_emb + spk_emb, ) = features loss_details = self.hparams.compute_cost( @@ -246,7 +244,9 @@ def on_stage_start(self, stage, epoch): The currently-starting epoch. This is passed `None` during the test stage. 
""" - if hasattr(self.modules, "vocoder") and hasattr(self.modules.vocoder, "model"): + if hasattr(self.modules, "vocoder") and hasattr( + self.modules.vocoder, "model" + ): self.modules.vocoder.model.device = self.device self.layer_idx = self._get_selected_layer_idx() self.loss_metric = sb.utils.metric_stats.MultiMetricStats( @@ -273,7 +273,9 @@ def on_stage_start(self, stage, epoch): self.spk_emb_model = self.hparams.spk_emb_model( run_opts=pretrained_run_opts ) - self.representation_mode = RepresentationMode(self.hparams.representation_mode) + self.representation_mode = RepresentationMode( + self.hparams.representation_mode + ) # If speaker embedding shuffling is enabled, re-initialize them for the # epoch if self.hparams.spk_emb_shuffle: @@ -370,7 +372,7 @@ def evaluate_batch(self, batch, stage): loss = self.compute_objectives(out, batch, stage=stage) if self.is_evaluating: self.evaluator.evaluate_batch(batch) - return loss.detach().cpu() + return loss.detach().cpu() def make_dataloader( self, dataset, stage, ckpt_prefix="dataloader-", **loader_kwargs @@ -398,7 +400,9 @@ def make_dataloader( ------- DataLoader for the input dataset """ - if stage == sb.Stage.TRAIN and not getattr(self, "_ckpt_recovered", False): + if stage == sb.Stage.TRAIN and not getattr( + self, "_ckpt_recovered", False + ): self.checkpointer.recover_if_possible() self._ckpt_recovered = True if self.guides_running(pre_epoch=True): @@ -407,7 +411,7 @@ def make_dataloader( dataset=dataset, stage=stage, ckpt_prefix=ckpt_prefix, - **loader_kwargs + **loader_kwargs, ) def guides_running(self, pre_epoch=False): @@ -477,7 +481,6 @@ def fit_batch(self, batch): return loss - INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} @@ -491,7 +494,7 @@ def dataio_prepare(hparams, guide_ctx=None): hparams : dict This dictionary is loaded from the `train.yaml` file, and it includes all the hyperparameters needed for dataset construction and loading. 
- + guide_ctx : SimpleNamespace, optional The guide context with pretrained models @@ -546,19 +549,18 @@ def text_pipeline(label): yield label.upper() label_norm_eval = RE_PUNCTUATION.sub("", label_norm) yield label_norm_eval - @sb.utils.data_pipeline.takes(input_feature) @sb.utils.data_pipeline.provides("tokens") def tokens_pipeline(label): """Processes the transcriptions to generate proper labels""" return label_encoder.encode_sequence_torch(label) - + @sb.utils.data_pipeline.takes("label_norm") @sb.utils.data_pipeline.provides("asr_tokens") def asr_tokens_pipeline(label): """Processes the transcriptions to generate proper labels""" - return torch.tensor(guide_ctx.asr_model.encode(label)) + return torch.tensor(guide_ctx.asr_model.encode(label)) use_silence_padding = hparams.get("use_silence_padding", True) if "token_model_layers" in hparams: @@ -577,7 +579,11 @@ def asr_tokens_pipeline(label): * hparams["eos_index"] ) - silence_padding = silence_token if representation_mode == RepresentationMode.DISCRETE else silence_emb + silence_padding = ( + silence_token + if representation_mode == RepresentationMode.DISCRETE + else silence_emb + ) silence_padding = silence_padding.cpu() silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) @@ -585,14 +591,18 @@ def asr_tokens_pipeline(label): torch.ones(bos_width, audio_tokens_per_step) * hparams["bos_index"] ) if representation_mode == RepresentationMode.CONTINUOUS: - audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat(1, 1, hparams["audio_dim"]) + audio_bos_prefix = audio_bos_prefix.unsqueeze(-1).repeat( + 1, 1, hparams["audio_dim"] + ) tokens_loader = hparams.get("tokens_loader") @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") def audio_pipeline(id): - audio = tokens_loader.tokens_by_uttid(id, num_codebooks=audio_tokens_per_step) + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=audio_tokens_per_step + ) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -613,7 +623,7 @@ def spk_emb_random_match(uttid, dataset, spk_sample): text_pipeline, tokens_pipeline, audio_ref_pipeline, - audio_pipeline + audio_pipeline, ] output_keys = [ "uttid", @@ -628,7 +638,11 @@ def spk_emb_random_match(uttid, dataset, spk_sample): resample_fn = {} for dataset in data_info: - dataset_output_keys = output_keys if dataset == "train" else output_keys + ["label_norm_eval"] + dataset_output_keys = ( + output_keys + if dataset == "train" + else output_keys + ["label_norm_eval"] + ) dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( json_path=data_info[dataset], replacements={"data_root": data_folder}, @@ -639,10 +653,7 @@ def spk_emb_random_match(uttid, dataset, spk_sample): datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False if hparams["spk_emb_shuffle"]: - spk_idx, spk_samplers = group_by_speaker( - dynamic_dataset, - hparams - ) + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) spk_sample = {} spk_emb_random_match_pipeline = partial( spk_emb_random_match, @@ -659,7 +670,7 @@ def spk_emb_random_match(uttid, dataset, spk_sample): spk_idx=spk_idx, sample=spk_sample, dataset=dynamic_dataset, - spk_samplers=spk_samplers + spk_samplers=spk_samplers, ) resample_fn[dataset](epoch=0) @@ -689,9 +700,7 @@ def spk_emb_random_match(uttid, dataset, spk_sample): if hparams["input"] == "phonemes": for key in datasets: datasets[key] = 
datasets[key].filtered_sorted( - key_test={ - "phn": lambda value: value - } + key_test={"phn": lambda value: value} ) datasets["sample"] = select_sample(hparams, datasets) return datasets, silence_padding, resample_fn @@ -748,7 +757,7 @@ def group_by_speaker(dataset, hparams): the dataset from which to select items hparams : dict hyperparameters - + Returns ------- spk_idx : dict @@ -929,14 +938,14 @@ def get_guide_ctx(hparams, run_opts): """Initializes a context object for guides, containing pretrained models only for guides that will be used per hparams - + Arguments --------- - hparams : dict + hparams : dict Hyperparameters run_opts : dict Run options - + Returns ------- ctx : SimpleNamespace @@ -960,7 +969,6 @@ def get_guide_ctx(hparams, run_opts): ) - def run_experiment(brain_cls): """Starts the experiement @@ -1014,14 +1022,16 @@ def run_experiment(brain_cls): "save_json_train": hparams["train_json"], "save_json_valid": hparams["valid_json"], "save_json_test": ( - hparams["test_json"] if "test" in hparams["splits"] + hparams["test_json"] + if "test" in hparams["splits"] else None ), "sample_rate": hparams["sample_rate"], "train_split": hparams["train_split"], "valid_split": hparams["valid_split"], "test_split": ( - hparams["test_split"] if "test" in hparams["splits"] + hparams["test_split"] + if "test" in hparams["splits"] else None ), "seed": hparams["seed"], @@ -1031,11 +1041,9 @@ def run_experiment(brain_cls): # We can now directly create the datasets for training, valid, and test guide_ctx = get_guide_ctx(hparams, run_opts) - ( - datasets, - silence_padding, - resample_fn - ) = dataio_prepare(hparams, guide_ctx) + (datasets, silence_padding, resample_fn) = dataio_prepare( + hparams, guide_ctx + ) # Apply overfit test settings datasets = apply_overfit_test(hparams, datasets) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py index ad2f5bf0c..87de6f84b 100644 --- a/benchmarks/DASB/LibriTTS/extraction/extract.py +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -35,7 +35,7 @@ overrides=overrides, ) - # Dataset prep (parsing Librispeech + # Dataset prep (parsing Librispeech from libritts_prepare import prepare_libritts # noqa # multi-gpu (ddp) save data preparation diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 1cf092a46..ecc5a7e34 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -34,9 +34,12 @@ has_transformers = False try: from transformers import AutoModelForAudioXVector + has_transformers = True except ImportError: - logger.warning("transformers library not found - some evaluators may be disabled") + logger.warning( + "transformers library not found - some evaluators may be disabled" + ) RE_PUNCTUATION = re.compile( @@ -850,10 +853,7 @@ def __init__( sample_rate=16000, ): super().__init__(sample_rate=sample_rate) - self.model = UTMOSModel( - source=source, - save_path=save_path, - ) + self.model = UTMOSModel(source=source, save_path=save_path,) if run_opts is not None: device = run_opts.get("device") if device: @@ -930,6 +930,7 @@ class SpkSimWavLM(SpeechEvaluator): The sample rate to which all samples will be resampled before being processed """ + def __init__( self, source, @@ -937,7 +938,7 @@ def __init__( model_sample_rate=16000, run_opts=None, *args, - **kwargs + **kwargs, ): if not has_transformers: raise ValueError( @@ -948,9 +949,7 @@ def __init__( run_opts = {} device = run_opts.get("device") self.model = 
AutoModelForAudioXVector.from_pretrained( - source, cache_dir=savedir, - *args, - **kwargs + source, cache_dir=savedir, *args, **kwargs ) if device is not None: self.model = self.model.to(device) @@ -972,15 +971,13 @@ def evaluate( # Resample if sample_rate is not None: wavs = torchaudio.functional.resample( - wavs, - orig_freq=sample_rate, - new_freq=self.model_sample_rate + wavs, orig_freq=sample_rate, new_freq=self.model_sample_rate ) if sample_rate_ref is not None: wavs_ref = torchaudio.functional.resample( wavs_ref, orig_freq=sample_rate_ref, - new_freq=self.model_sample_rate + new_freq=self.model_sample_rate, ) # Concatenate @@ -989,14 +986,8 @@ def evaluate( length_abs = length * wavs_max_len length_ref_abs = length_ref * wavs_ref_max_len max_len = max(wavs_max_len, wavs_ref_max_len) - wavs, _ = pad_right_to( - wavs, - (batch_size, max_len) - ) - wavs_ref, _ = pad_right_to( - wavs_ref, - (batch_size, max_len) - ) + wavs, _ = pad_right_to(wavs, (batch_size, max_len)) + wavs_ref, _ = pad_right_to(wavs_ref, (batch_size, max_len)) audio = torch.cat([wavs, wavs_ref]) length_cat_abs = torch.cat([length_abs, length_ref_abs]) @@ -1015,10 +1006,7 @@ def evaluate( hyp_embs, ref_embs, dim=-1 ) - return SpeechEvaluationResult( - scores, - {"score": scores} - ) + return SpeechEvaluationResult(scores, {"score": scores}) def vocoder_to_device(vocoder, device): From f982325bff464753f376094e95c7285a99590f69 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 15 Jan 2025 14:37:50 -0500 Subject: [PATCH 056/270] DASB: More cosmetic changes from linters --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 15 ++- .../hparams/train_continuous_ssl.yaml | 10 +- .../TTS/tokotron/hparams/train_dac.yaml | 13 +-- .../tokotron/hparams/train_discrete_ssl.yaml | 55 ++------- .../TTS/tokotron/hparams/train_encodec.yaml | 16 +-- .../hparams/train_speech_tokenizer.yaml | 6 +- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 33 ------ .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 15 ++- .../hparams/train_continuous_ssl.yaml | 105 ++++-------------- .../TTS/tokotron/hparams/train_dac.yaml | 20 ++-- .../tokotron/hparams/train_discrete_ssl.yaml | 53 ++++----- .../TTS/tokotron/hparams/train_encodec.yaml | 36 +++--- .../hparams/train_speech_tokenizer.yaml | 18 +-- .../LibriTTS/extraction/hparams/encodec.yaml | 1 - benchmarks/DASB/model/Tokotron.py | 2 - benchmarks/DASB/utils/eval.py | 1 - 16 files changed, 127 insertions(+), 272 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index bad9ce7c1..98b2bb00d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -15,17 +15,16 @@ eval_utmos_model_name: utmos.ckpt eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main eval_utmos_domain_id: null eval_utmos_judge_id: null -eval_perf: false +eval_perf: False eval_utmos: !name:eval.UTMOSSpeechEvaluator - source: !ref - save_path: !ref - model_name: !ref - model_url: !ref - domain_id: !ref - judge_id: !ref - + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref eval_asr: !apply:speechbrain.utils.hparams.choice value: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml index 087eb6cf9..9c0b98d3b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -310,8 +310,8 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 240b57a7d..4c4f03689 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -41,7 +41,7 @@ samples_interval: 5 tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref token_model_kwargs: n_quantizers: !ref @@ -194,11 +194,11 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line infer_max_audio_length: !ref tokenizer: !new:utils.tokenizer_interface.DACTokenizer - model_type: !ref - model_bitrate: !ref - n_codebooks: !ref - load_pretrained: True - tag: latest + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest modules: @@ -238,4 +238,3 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index e3b549549..e14c1ce9d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -4,23 +4,18 @@ # ############################################################################ experiment_name: tokotron/discrete_ssl - # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] - # Model Type ssl_model_type: wavlm representation_mode: discrete - output_folder: !ref results/tokotron/// save_folder: !ref /save train_log: !ref /train_log.txt - - # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/discrete- +data_folder: !PLACEHOLDER +prepare_save_folder: !ref /prepared/discrete- # e.g., /path/to/LibriSpeech pretrained_model_save_folder: !ref vocoder_model_name: !ref unithifigan-dasb--discrete vocoder_model_path: !ref / @@ -37,12 +32,9 @@ progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 - -tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
- +tokens_folder: !PLACEHOLDER tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref - + data_path: !ref freeze_token_model: True token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref @@ -50,7 +42,6 @@ token_model_src: !apply:speechbrain.utils.hparams.choice wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k wav2vec2: facebook/wav2vec2-large-960h-lv60-self - g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint @@ -67,15 +58,10 @@ vocoder_src: !apply:speechbrain.utils.hparams.choice wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False - vocoder_available_layers: [1, 3, 7, 12, 18, 23] - splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - ckpt_interval_minutes: 30 # save checkpoint every N min - # Training parameters input: text number_of_epochs: 50 @@ -88,8 +74,6 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - - # index pad_index: 0 bos_index: 0 @@ -97,7 +81,6 @@ bos_width: 1 eos_index: 0 eos_width: 1 audio_token_shift: 0 - # stages related parameters lr: 0.0005 lr_warmup_steps: 10000 @@ -109,25 +92,20 @@ gate_threshold: 0.5 gate_loss_beta: 0.2 gate_loss_gamma: 0.01 gate_loss_max_weight: 1. - # Inference parameters eos_mode: gate decoder_mode: autoregressive scale_factor: 4 - # Beam Search-specific parameters min_decode_ratio: 1.0 max_decode_ratio: 10.0 beam_size: 5 - - # Feature parameters sample_rate: 22050 model_sample_rate: 16000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 - # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder token_list_file_text: ./hparams/char_en.txt @@ -137,17 +115,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref - # Gate offset gate_offset: !apply:Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref - silence_padding: !ref use_silence_padding: True - - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -167,12 +141,9 @@ ssl_model: !apply:speechbrain.utils.hparams.choice save_path: !ref freeze: !ref output_all_hiddens: True - - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -181,32 +152,26 @@ train_dataloader_opts: collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - valid_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - test_dataloader_opts: batch_size: 1 num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - sample_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - token_model_kwargs: SSL_layers: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 @@ -264,7 +229,6 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- decoder_mode: !ref scale_factor: !ref representation_mode: discrete - modules: model: !ref vocoder: !ref @@ -272,6 +236,7 @@ modules: ssl_model: !ref # define two optimizers here for two-stage training + opt_class: !name:torch.optim.Adam 
lr: !ref @@ -314,8 +279,8 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 0082e20db..7ccd9d716 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -41,7 +41,7 @@ samples_interval: 5 tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] @@ -220,10 +220,10 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ec6de9bb2..39c394d71 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -41,7 +41,7 @@ samples_interval: 5 tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref splits: ["train", "valid", "test"] @@ -240,5 +240,5 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 0a75a3482..439869651 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -11,14 +11,12 @@ import json import logging import math -import sys import csv import torch import torchaudio import string import re from pathlib import Path -from hyperpyyaml import load_hyperpyyaml from types import SimpleNamespace from torch.nn import ModuleDict from tqdm.auto import tqdm @@ -507,37 +505,6 @@ def descriptive_statistics(items, key): } -def select_subset(dataset, hparams): - """Selects a subset of the dataset provided, if specified. 
- The selection is controlled by a hyperparameter named - eval_subset, which is expected to list the IDs of the - data items on which evaluation will take place, one per line - - Arguments - --------- - dataset : speechbrain.dataio.dataset.DynamicItemDataset - A dataset - hparams : dict - A hyperparameters file - - Returns - ------- - subset : dataset - The dataset, filtered down if applicable - """ - eval_subset_path = hparams.get("eval_subset") - if eval_subset_path is not None: - eval_subset_path = Path(eval_subset_path) - if not eval_subset_path.exists(): - raise ValueError(f"eval_subset {eval_subset_path} does not exist") - with open(eval_subset_path) as eval_subset_file: - eval_subset_ids = [line.strip() for line in eval_subset_file] - subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) - else: - subset = dataset - return subset - - RE_INTEGER = re.compile(r"^-?\d+$") RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index a4c8b6b59..94fb319c8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -21,7 +21,7 @@ eval_utmos_model_name: utmos.ckpt eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main eval_utmos_domain_id: null eval_utmos_judge_id: null -eval_perf: false +eval_perf: False eval_asr: !apply:speechbrain.utils.hparams.choice @@ -39,12 +39,12 @@ eval_asr: !apply:speechbrain.utils.hparams.choice savedir: !ref eval_utmos: !name:eval.UTMOSSpeechEvaluator - source: !ref - save_path: !ref - model_name: !ref - model_url: !ref - domain_id: !ref - judge_id: !ref + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref eval_spk_sim: !name:eval.SpkSimWavLM source: !ref @@ -63,4 +63,3 @@ eval_summary: descriptive: ["utmos"] spk_sim: descriptive: ["score"] - diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml index 2cbca90fb..08ddc0984 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml @@ -9,10 +9,9 @@ __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt - # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -data_folder_alignments: null +data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref ssl_model_type: wavlm @@ -26,11 +25,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -41,15 +40,12 @@ progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 - # Position shift use_position_shift: True max_position_shift: 1000 
position_shift_seed: 42 position_shift_probability: 1.0 - freeze_token_model: True - token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -91,13 +87,9 @@ spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite asr_src: speechbrain/asr-transformer-transformerlm-librispeech spk_emb_shuffle: True - splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - ckpt_interval_minutes: 30 # save checkpoint every N min - # Training parameters input: text number_of_epochs: 1000 @@ -113,8 +105,6 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - - # index pad_index: 0 bos_index: 0 @@ -122,7 +112,6 @@ bos_width: 1 eos_index: 0 eos_width: 1 audio_token_shift: 0 - # stages related parameters lr: 0.0005 lr_warmup_steps: 10000 @@ -134,29 +123,23 @@ gate_threshold: 0.5 gate_loss_beta: 0.2 gate_loss_gamma: 0.01 gate_loss_max_weight: 1. - # Inference parameters inference_mode: autoregressive eos_mode: gate decoder_mode: autoregressive scale_factor: 4 - # Embedding Injection spk_emb_injection: null - # Beam Search-specific parameters min_decode_ratio: 1.0 max_decode_ratio: 10.0 beam_size: 5 - - # Feature parameters sample_rate: 24000 model_sample_rate: 16000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 - # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder token_list_file_text: ./hparams/char_en.txt @@ -166,16 +149,13 @@ token_list_file: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref - # Gate offset gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref - silence_padding: !ref use_silence_padding: True - # Guides guides_enabled: False guides_start_epoch: 40 @@ -184,9 +164,6 @@ guides_spk_discrete: True guides_spk_loss_weight: 0.2 guides_asr: True guides_asr_loss_weight: 0.1 - - - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -206,8 +183,6 @@ ssl_model: !apply:speechbrain.utils.hparams.choice save_path: !ref freeze: !ref output_all_hiddens: True - - token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL ssl_model: !ref kmeans_repo_id: !ref @@ -217,11 +192,9 @@ token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl layers_num: !apply:benchmarks.DASB.utils.hparams.as_list value: !ref dtype: int - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa - spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class source: !ref savedir: !ref /ecapa- @@ -229,13 +202,12 @@ spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class classname: DiscreteSpkEmb overrides: ssl_layer_num_selected: !ref - asr_model: !name:benchmarks.DASB.model.Tokotron.TransformerASRGuide source: !ref - savedir: !ref /asr-transformer + savedir: !ref /asr-transformer +# Dataloader options -# Dataloader options train_dataloader_opts: batch_size: !ref shuffle: True @@ -243,28 +215,24 @@ train_dataloader_opts: collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - valid_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - test_dataloader_opts: batch_size: 1 num_workers: !ref collate_fn: 
!name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - sample_dataloader_opts: batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref - token_model_kwargs: SSL_layers: !apply:benchmarks.DASB.utils.hparams.as_list value: !ref @@ -275,8 +243,6 @@ token_model_kwargs: bpe_tokenizers: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers layers: !ref value: null - - extract_features_opts: dataloader_opts: batch_size: !ref @@ -292,8 +258,6 @@ extract_features_opts: model_sample_rate: !ref spk_emb_model: !ref data_folder_alignments: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 @@ -331,9 +295,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 6 attention_type: regularMHA - ############################## models ################################ - vocoder_layers: !apply:benchmarks.DASB.utils.hparams.as_list value: !apply:speechbrain.utils.hparams.choice value: !ref @@ -341,28 +303,24 @@ vocoder_layers: !apply:benchmarks.DASB.utils.hparams.as_list choices: null: !ref dtype: int - vocoder_discrete: !name:benchmarks.DASB.model.custom_model.GumbelUnitVocoderWrapper - model: !name:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref - available_layers: !ref - layers: !ref - num_units: !ref - offset: !ref - + model: !name:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams + source: !ref + savedir: !ref + available_layers: !ref + layers: !ref + num_units: !ref + offset: !ref vocoder_continuous: !name:benchmarks.DASB.model.custom_model.VocoderWrapper model: !name:speechbrain.inference.vocoders.HIFIGAN.from_hparams source: !ref savedir: !ref - vocoder: !apply:benchmarks.DASB.utils.hparams.choice value: !ref apply: True choices: discrete: !ref continuous: !ref - inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference bos_index: !ref eos_index: !ref @@ -372,28 +330,24 @@ inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference using_eos_threshold: False length_normalization: True audio_token_shift: !ref - inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference scale_factor: !ref gate_threshold: !ref eos_mode: !ref representation_mode: !ref - inference: !apply:speechbrain.utils.hparams.choice value: !ref choices: search: !ref forward: !ref - emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: True - injection: !ref - -model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: True + injection: !ref +model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref @@ -430,16 +384,13 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d position_shift_seed: !ref emb: !ref layerwise_renorm: !ref - modules: model: !ref vocoder: !ref compute_cost: !ref - # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref - compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref @@ -456,33 +407,25 @@ compute_cost: !new:benchmarks.DASB.model.Tokotron.TokotronLoss spk_weight: !ref asr_weight: !ref representation_mode: !ref - - lr_annealing: 
!new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref - checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref - freezer: !new:benchmarks.DASB.utils.preparation.Freezer save_path: !ref archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref - train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport logger: !ref sample_rate: !ref eos_threshold: !ref - spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref \ No newline at end of file + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 2d91a521e..3333cfceb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -22,11 +22,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -219,8 +219,6 @@ attention_type: regularMHA vocoder: !new:benchmarks.DASB.model.custom_model.DACVocoder dac: !ref - - inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference bos_index: !ref eos_index: !ref @@ -243,10 +241,10 @@ inference: !apply:speechbrain.utils.hparams.choice forward: !ref emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 2ecb72a84..3ba568d94 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -9,10 +9,9 @@ __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt - # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -data_folder_alignments: null +data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref ssl_model_type: wavlm @@ -26,11 +25,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -42,10 +41,10 @@ progress_meta: 
!ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref # Position shift use_position_shift: True @@ -192,7 +191,6 @@ guides_asr: True guides_asr_loss_weight: 0.1 - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -270,8 +268,6 @@ extract_features_opts: model_sample_rate: !ref spk_emb_model: !ref data_folder_alignments: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 @@ -303,22 +299,18 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 6 attention_type: regularMHA - ############################## models ################################ - vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams source: !ref savedir: !ref - emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: True - injection: !ref - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: True + injection: !ref +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length audio_num_tokens: !ref audio_tokens_per_step: !ref d_model: !ref @@ -342,16 +334,13 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- scale_factor: !ref representation_mode: !ref emb: !ref - modules: model: !ref vocoder: !ref compute_cost: !ref - # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref - compute_cost: !new:Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref @@ -389,8 +378,8 @@ spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 1f3764ceb..e766267e7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -24,11 +24,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -43,7 +43,7 @@ samples_interval: 5 tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
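# Hypothetical example (the path is an assumption; tokens are expected to be
# produced beforehand by the extraction recipes under
# benchmarks/DASB/LibriTTS/extraction):
# tokens_folder: /path/to/LibriTTS/extracted_tokens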
tokens_loader: !new:utils.tokens.TokensLoader - data_path: !ref + data_path: !ref freeze_token_model: True token_model_src: "facebook/encodec_24khz" @@ -205,11 +205,11 @@ attention_type: regularMHA ############################## models ################################ emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: !ref - injection: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref @@ -238,13 +238,13 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False modules: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 2de6e121e..97ab94275 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -24,11 +24,11 @@ train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] frozen_split_path: null @@ -236,10 +236,10 @@ inference: !apply:speechbrain.utils.hparams.choice forward: !ref emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: !ref + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml index 31211ec75..d3cd83c3e 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -22,7 +22,6 @@ valid_json: !ref /dev-clean.json test_json: !ref /test.json - batch_size: 8 num_workers: 8 src_key: wav diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 266090be4..949840380 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -28,8 +28,6 @@ from speechbrain.nnet.linear import Linear from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss from speechbrain.dataio.dataio import length_to_mask -from speechbrain.dataio.batch import PaddedBatch -from speechbrain.decoders.seq2seq import S2STransformerBeamSearcher from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index ecc5a7e34..9d5e8642f 100644 --- a/benchmarks/DASB/utils/eval.py +++ 
b/benchmarks/DASB/utils/eval.py @@ -15,7 +15,6 @@ from speechbrain.decoders.seq2seq import S2SWhisperGreedySearcher from speechbrain.dataio.batch import PaddedBatch from speechbrain.utils.metric_stats import ErrorRateStats -from speechbrain.utils.superpowers import run_shell from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.fetching import fetch from collections import namedtuple From 1357ff146a27cf83b34ab4f47b10e41d117f8376 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 16 Jan 2025 11:10:24 -0500 Subject: [PATCH 057/270] DASB: Tokotron: Relative paths --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 3 +++ benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 8da11247b..1b1dd4795 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -31,6 +31,9 @@ from Tokotron import RepresentationMode from evaluate import TokotronEvaluator +base_dir = str(Path(__file__).parent.parent.parent.parent) +sys.path.append(base_dir) + logger = logging.getLogger(__name__) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 3df858844..943727635 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -34,6 +34,8 @@ import re import string +base_dir = str(Path(__file__).parent.parent.parent.parent) +sys.path.append(base_dir) logger = logging.getLogger(__name__) From 958ee870bc83de36ec0d2db07f8abf76113c8b65 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 16 Jan 2025 11:30:44 -0500 Subject: [PATCH 058/270] DASB: Tokotron: Add choices for the model type --- .../LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 8 +++++++- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 7 ++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index e14c1ce9d..f96681a3a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -46,7 +46,13 @@ g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech -vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref token_offset: 1 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 3ba568d94..5b1a06b46 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -63,7 +63,12 @@ token_model_src: !apply:speechbrain.utils.hparams.choice g2p_src: flexthink/soundchoice-g2p kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech -vocoder_repo_id: speechbrain/hifigan-wavlm-k1000-LibriTTS +vocoder_repo_id: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref select_layers: null From 043eb9ca15d4c175a873747c0c8f0c04d4d6c546 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 16 Jan 2025 16:59:25 -0500 Subject: [PATCH 059/270] DASB: Tokotron: more clean-up --- .../DASB/LJSpeech/TTS/tokotron/Tokotron.py | 1 - .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 4 + .../hparams/train_continuous_ssl.yaml | 317 ------------- .../TTS/tokotron/hparams/train_dac.yaml | 12 +- .../tokotron/hparams/train_discrete_ssl.yaml | 70 ++- .../TTS/tokotron/hparams/train_encodec.yaml | 32 +- .../hparams/train_speech_tokenizer.yaml | 38 +- .../DASB/LJSpeech/TTS/tokotron/preparation.py | 1 - .../DASB/LJSpeech/TTS/tokotron/train.py | 46 +- .../TTS/tokotron/train_continuous_ssl.py | 45 -- .../DASB/LJSpeech/TTS/tokotron/train_dac.py | 45 -- .../TTS/tokotron/train_discrete_ssl.py | 77 ---- .../LJSpeech/TTS/tokotron/train_encodec.py | 44 -- .../TTS/tokotron/train_speech_tokenizer.py | 44 -- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 40 +- .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 5 + .../hparams/train_continuous_ssl.yaml | 431 ------------------ .../TTS/tokotron/hparams/train_dac.yaml | 47 +- .../tokotron/hparams/train_discrete_ssl.yaml | 57 +-- .../hparams/train_speech_tokenizer.yaml | 47 +- .../DASB/LibriTTS/TTS/tokotron/train.py | 92 ++-- .../TTS/tokotron/train_continuous_ssl.py | 47 -- .../DASB/LibriTTS/TTS/tokotron/train_dac.py | 47 -- .../TTS/tokotron/train_discrete_ssl.py | 79 ---- .../LibriTTS/TTS/tokotron/train_encodec.py | 46 -- .../TTS/tokotron/train_speech_tokenizer.py | 46 -- 26 files changed, 154 insertions(+), 1606 deletions(-) delete mode 120000 benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml delete mode 120000 benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py delete mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py delete mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py deleted file mode 120000 index 097a6d488..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/Tokotron.py +++ /dev/null @@ -1 +0,0 @@ -../../../model/Tokotron.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index 98b2bb00d..8ca3fb8dd 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -48,3 +48,7 @@ eval_summary: descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] utmos: descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml deleted file mode 100644 index 9c0b98d3b..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ /dev/null @@ -1,317 +0,0 @@ -# ############################################################################ -# Model: Tokenized TTS (WhisperSpeech-inspired) -# Authors: Artem Ploujnikov -# ############################################################################ - -experiment_name: tokotron/continuous_ssl - -# Seed needs to be set at top of yaml, before objects with parameters are made -seed: 74443 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/// -save_folder: !ref /save -train_log: !ref /train_log.txt - -# Model type -ssl_model_type: wavlm -representation_mode: continuous - -# Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/continuous- -pretrained_model_save_folder: !ref -vocoder_model_name: !ref unithifigan-dasb--continuous -vocoder_model_path: !ref / -prepare_archive_path: null -prepare_skip_ignore_folders: False -train_json: !ref /train.json -valid_json: !ref /valid.json -test_json: !ref /test.json -frozen_split_path: null -sample_path: null -progress_folder: !ref /progress -progress_archive: !ref /progress.tar -progress_current: !ref /current -progress_meta: !ref /meta.yaml -num_audio_samples: 32 -samples_interval: 5 - -freeze_ssl_model: True -ssl_model_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: microsoft/wavlm-large - hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self - -g2p_src: speechbrain/soundchoice-g2p -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS -spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec -use_spk_emb: False - -vocoder_available_layers: [1, 3, 7, 12, 18, 23] - -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] - - -ckpt_interval_minutes: 30 # save checkpoint every N min - -# Training parameters -input: text -number_of_epochs: 50 -batch_size: 16 -grad_accumulation_factor: 1 -max_grad_norm: 0.01 -sorting: random -num_workers: 4 -skip_prep: False -overfit_test: False -overfit_test_sample_count: !ref -overfit_test_epoch_data_count: 1000 - - -# index -pad_index: 0 -bos_index: 0 -bos_width: 1 -eos_index: 0 -eos_width: 1 -audio_token_shift: 0 - -# stages related parameters -lr: 0.0005 -lr_warmup_steps: 10000 -lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. 
- -# Inference parameters -inference_mode: autoregressive -eos_mode: gate -decoder_mode: autoregressive -scale_factor: 4 - -# Beam Search-specific parameters -min_decode_ratio: 1.0 -max_decode_ratio: 10.0 -beam_size: 5 - - -# Feature parameters -sample_rate: 22050 -model_sample_rate: 16000 -max_audio_length: 1000 -infer_max_audio_length: !ref -debug_infer_max_audio_length: 10 - -# Label encoder -label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt -token_list_file: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref - phonemes: !ref - -# Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp - beta: !ref - gamma: !ref - max_weight: !ref - -silence_padding: !ref -use_silence_padding: True - - -# Token model (pretrained) -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - -spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams - source: !ref - savedir: !ref /ecapa - -# Dataloader options -train_dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref - -valid_dataloader_opts: - batch_size: !ref - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref - -test_dataloader_opts: - batch_size: 1 - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref - -sample_dataloader_opts: - batch_size: !ref - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref - -extract_features_opts: - dataloader_opts: - batch_size: !ref - ssl_model: !ref - ssl_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - - -####################### Model parameters ########################### -# Transformer -d_model: 512 -nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 -d_ffn: 2048 -transformer_dropout: 0.2 -target_dropout: 0.2 -activation: !name:torch.nn.GELU -audio_num_tokens: 1000 -audio_dim: 1024 -audio_emb_size: 128 -audio_emb_freeze: False -audio_emb_pretrained: False -audio_emb_lr: 0.00001 -audio_emb_weight_decay: 0.001 -text_num_tokens: 39 -phn_num_tokens: 52 -input_num_tokens: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref - phonemes: !ref -audio_tokens_per_step: 6 -attention_type: regularMHA - -############################## models ################################ - -vocoder: !apply:speechbrain.inference.vocoders.HIFIGAN.from_hparams - source: !ref - savedir: !ref - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref - audio_num_tokens: !ref - audio_tokens_per_step: !ref - d_model: !ref - d_ffn: !ref - nhead: !ref - enc_num_layers: !ref - dec_num_layers: !ref - dropout: !ref - target_dropout: !ref - activation: !ref - attention_type: !ref - gate_threshold: !ref - gate_offset: !ref - 
audio_emb_size: !ref - audio_emb_freeze: !ref - max_audio_length: !ref - eos_mode: !ref - infer_max_audio_length: !ref - audio_token_shift: !ref - decoder_mode: !ref - scale_factor: !ref - audio_dim: !ref - representation_mode: continuous - - -modules: - model: !ref - vocoder: !ref - compute_cost: !ref - ssl_model: !ref - -# define two optimizers here for two-stage training -opt_class: !name:torch.optim.Adam - lr: !ref - -compute_cost: !new:Tokotron.TokotronLoss - guided_attention_weight: !ref - guided_attention_sigma: !ref - gate_weight: !ref - gate_beta: !ref - gate_gamma: !ref - gate_max_weight: !ref - silence_padding: !ref - eos_mode: !ref - eos_index: !ref - eos_width: !ref - audio_tokens_per_step: !ref - audio_token_shift: !ref - representation_mode: continuous - - -lr_annealing: !new:Tokotron.TargetedNoamScheduler - lr_initial: [!ref , !ref ] - n_warmup_steps: !ref - param_group: 0 - -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - lr_scheduler: !ref - counter: !ref - -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref - -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref - -tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 4c4f03689..be20bfa63 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -14,8 +14,6 @@ train_log: !ref /train_log.txt token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" # Model type representation_mode: discrete @@ -105,7 +103,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -173,7 +171,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -209,7 +207,7 @@ modules: opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.Tokotron.oss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -229,10 +227,6 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index f96681a3a..555878c24 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -7,12 +7,14 @@ experiment_name: tokotron/discrete_ssl # 
Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] + # Model Type ssl_model_type: wavlm representation_mode: discrete output_folder: !ref results/tokotron/// save_folder: !ref /save train_log: !ref /train_log.txt + # Data files data_folder: !PLACEHOLDER prepare_save_folder: !ref /prepared/discrete- # e.g., /path/to/LibriSpeech @@ -47,27 +49,21 @@ token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech vocoder_repo_id: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - hubert: speechbrain/hifigan-hubert-k1000-LibriTTS - wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS - wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref token_offset: 1 -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False -vocoder_available_layers: [1, 3, 7, 12, 18, 23] splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] ckpt_interval_minutes: 30 # save checkpoint every N min + # Training parameters input: text number_of_epochs: 50 @@ -80,6 +76,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 + # index pad_index: 0 bos_index: 0 @@ -87,6 +84,7 @@ bos_width: 1 eos_index: 0 eos_width: 1 audio_token_shift: 0 + # stages related parameters lr: 0.0005 lr_warmup_steps: 10000 @@ -98,20 +96,19 @@ gate_threshold: 0.5 gate_loss_beta: 0.2 gate_loss_gamma: 0.01 gate_loss_max_weight: 1. 
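# Descriptive note on the stop mechanism: the gate head predicts end-of-audio,
# and with eos_mode: gate decoding stops once the gate output exceeds
# gate_threshold. The beta/gamma/max_weight values above are assumed to
# parameterize the distance_diff_loss_ramp used to weight the gate loss.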
+ # Inference parameters eos_mode: gate decoder_mode: autoregressive scale_factor: 4 -# Beam Search-specific parameters -min_decode_ratio: 1.0 -max_decode_ratio: 10.0 -beam_size: 5 + # Feature parameters sample_rate: 22050 model_sample_rate: 16000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 + # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder token_list_file_text: ./hparams/char_en.txt @@ -121,13 +118,15 @@ token_list_file: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref + # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref silence_padding: !ref use_silence_padding: True + # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -150,6 +149,7 @@ ssl_model: !apply:speechbrain.utils.hparams.choice spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa + # Dataloader options train_dataloader_opts: batch_size: !ref @@ -178,6 +178,7 @@ sample_dataloader_opts: value: !ref token_model_kwargs: SSL_layers: !ref + ####################### Model parameters ########################### # Transformer d_model: 512 @@ -206,12 +207,7 @@ attention_type: regularMHA ############################## models ################################ -vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref - - -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -235,18 +231,23 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- decoder_mode: !ref scale_factor: !ref representation_mode: discrete + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref - ssl_model: !ref - -# define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -262,7 +263,7 @@ compute_cost: !new:Tokotron.TokotronLoss representation_mode: discrete -lr_annealing: !new:Tokotron.TargetedNoamScheduler +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler lr_initial: [!ref , !ref ] n_warmup_steps: !ref param_group: 0 @@ -274,19 +275,8 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - -tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 7ccd9d716..3ab6eb770 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -14,9 +14,6 @@ train_log: !ref /train_log.txt token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" - # Model type representation_mode: discrete @@ -97,7 +94,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -162,7 +159,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -182,14 +179,24 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + modules: model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -209,21 +216,8 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - -tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref - sample_rate: !ref - bandwidth: !ref - flat_embeddings: False - freeze: True - renorm_embeddings: False diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 39c394d71..568f8c13e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -14,8 +14,6 @@ train_log: !ref /train_log.txt token_model_src: "fnlp/SpeechTokenizer" g2p_src: flexthink/soundchoice-g2p -vocoder_type: encodec -vocoder_src: "charactr/vocos-encodec-24khz" # Model type representation_mode: discrete @@ -99,7 +97,7 @@ token_list_file: !apply:speechbrain.utils.hparams.choice phonemes: !ref # Gate offset -gate_offset: !apply:Tokotron.distance_diff_loss_ramp +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp beta: !ref gamma: !ref max_weight: !ref @@ -107,14 +105,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref # Token model (pretrained) -speech_tokenizer: !new:speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - source: !ref - save_path: !ref - -token_model: !new:Tokotron.SpeechTokenizerFeatureExtractor - speech_tokenizer: !ref - codebooks: !ref - # Dataloader options train_dataloader_opts: batch_size: !ref @@ 
-145,14 +135,6 @@ sample_dataloader_opts: padding_kwargs: value: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - token_model: !ref - sample_rate: !ref - model_sample_rate: !ref - - ####################### Model parameters ########################### # Transformer d_model: 512 @@ -181,7 +163,7 @@ attention_type: regularMHA ############################## models ################################ -model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length input_num_tokens: !ref audio_num_tokens: !ref audio_tokens_per_step: !ref @@ -201,15 +183,19 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line max_audio_length: !ref infer_max_audio_length: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + modules: model: !ref - token_model: !ref + tokenizer: !ref # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:Tokotron.TokotronLoss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref @@ -229,16 +215,8 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer - source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py deleted file mode 120000 index 08621a288..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/preparation.py +++ /dev/null @@ -1 +0,0 @@ -../../../utils/preparation.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 1b1dd4795..8c571babd 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -22,18 +22,19 @@ import string from pathlib import Path from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import clean_padding_ from speechbrain.utils.distributed import run_on_main -from Tokotron import ( - get_silence_token, - use_silence_padding, - feature_pad_to, -) -from Tokotron import RepresentationMode -from evaluate import TokotronEvaluator -base_dir = str(Path(__file__).parent.parent.parent.parent) +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) +from model.Tokotron import ( + get_silence_token, + use_silence_padding, + feature_pad_to, + RepresentationMode, +) # noqa: E402 +from evaluate import TokotronEvaluator # noqa: E402 logger = logging.getLogger(__name__) @@ -268,6 +269,16 @@ def on_stage_end(self, stage, stage_loss, epoch): stage_stats = {"loss": stage_loss, **loss_stats} if stage == sb.Stage.TRAIN: self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for 
key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) # Perform end-of-iteration things, like annealing, logging, etc. if stage == sb.Stage.VALID: @@ -292,9 +303,6 @@ def on_stage_end(self, stage, stage_loss, epoch): meta={"loss": stage_stats["loss"]}, min_keys=["loss"], ) - if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): - self.evaluator.on_evaluate_end() - def fit_batch(self, batch): """Fit one batch, override to do multiple updates. @@ -363,7 +371,9 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - raise NotImplementedError() + wav = self.modules.tokenizer.tokens_to_sig(audio) + clean_padding_(wav, length) + return wav def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed @@ -686,15 +696,7 @@ def apply_overfit_test(hparams, dataset): ) -def run_experiment(brain_cls): - """Starts the experiement - - Arguments - --------- - brain_cls : type - The brain class to instantiate - """ - +if __name__ == "__main__": # Reading command line arguments hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) @@ -757,7 +759,7 @@ def run_experiment(brain_cls): audio_keys = ["audio_pad", "audio_bos"] # Trainer initialization - tts_brain = brain_cls( + tts_brain = TokotronBrain( modules=hparams["modules"], opt_class=hparams["opt_class"], hparams=hparams, diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py deleted file mode 100644 index f3495eaca..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_continuous_ssl.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Continuous SSL verfsion - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronContinuousSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.vocoder(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronContinuousSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py deleted file mode 100644 index 83b9ff538..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_dac.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - DAC version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDACBrain(TokotronBrain): - 
"""Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - z, _, _ = self.modules.tokenizer.quantizer.from_codes( - audio.transpose(1, 2).int() - ) - wav = self.modules.tokenizer.decode(z).squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDACBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py deleted file mode 100644 index aa2c57681..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_discrete_ssl.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Discrete SSL version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -import torch -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDiscreteSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def on_stage_start(self, stage, epoch): - self.compute_offset() - return super().on_stage_start(stage, epoch) - - def compute_offset(self): - """Computes per-layer offsets""" - layers_set = set(self.hparams.token_model_layers) - available_layers_set = set(self.hparams.vocoder_available_layers) - if not layers_set.issubset(available_layers_set): - unavailable_layers = ",".join( - str(layer) for layer in (layers_set - available_layers_set) - ) - raise ValueError(f"Layers {unavailable_layers} are not supported") - self.num_units = self.hparams.vocab_size - _, layers_idx = torch.where( - torch.tensor( - self.hparams.vocoder_available_layers, device=self.device - ).unsqueeze(0) - == torch.tensor( - self.hparams.token_model_layers, device=self.device - ).unsqueeze(1) - ) - self.layer_offset = ( - torch.tensor(layers_idx, device=self.device) * self.num_units - )[None, None, :] - self.offset = self.hparams.token_offset - self.modules.vocoder.tokenize = False - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - units_with_offset = ( - audio + self.layer_offset.to(audio.device) + self.offset - ) - wav = self.modules.vocoder(units_with_offset) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDiscreteSSLBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py deleted file mode 100644 index 2168f970d..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_encodec.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - 
-However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronEncodecBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronEncodecBrain) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py deleted file mode 100644 index bc51db78c..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train_speech_tokenizer.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronSTBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - if length is not None: - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronSTBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 439869651..99d547cc5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -124,41 +124,19 @@ def get_output_folder(self, stage, epoch): output_folder = output_folder / str(epoch) output_folder.mkdir(parents=True, exist_ok=True) return output_folder - - def evaluate(self, dataset): - """Runs evaluation on a dataset + + def on_evaluate_end(self): + """Invoked when evaluation starts Arguments --------- - dataset : speechbrain.dataio.dataset.DynamicItemDataset - a dataset + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
""" - logger.info("Recovering the checkpoint") - ckpt = self.hparams.checkpointer.recover_if_possible() - if not ckpt: - raise ValueError("Unable to recover the checkpoint") - self.modules.model.eval() - if self.hparams.eval_samples is not None: - dataset = dataset.filtered_sorted( - select_n=self.hparams.eval_samples - ) - loader = sb.dataio.dataloader.make_dataloader( - dataset, batch_size=self.hparams.batch_size - ) - loader_it = iter(loader) - self.create_reports() - self.modules.model.show_inference_progress = False - self.item_ids = [] - details_keys = list(self.evaluators.keys()) - self.details = {evaluator_key: [] for evaluator_key in details_keys} - self.read_reports() - self.sample_text = [] - self.sample_file_names = [] - self.ref_file_names = [] - logger.info("Starting evaluation") - batch_count = math.ceil(len(dataset) / self.hparams.batch_size) - for batch in tqdm(loader_it, desc="Evaluation", total=batch_count): - self.evaluate_batch(batch) self.write_summary() logger.info("Evaluation done") diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index 94fb319c8..bafd769cc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -63,3 +63,8 @@ eval_summary: descriptive: ["utmos"] spk_sim: descriptive: ["score"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + spk_sim: spk_sim_score_mean \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml deleted file mode 100644 index 08ddc0984..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_continuous_ssl.yaml +++ /dev/null @@ -1,431 +0,0 @@ -# ############################################################################ -# Model: Tokenized TTS (WhisperSpeech-inspired) -# Authors: Artem Ploujnikov -# ############################################################################ -# Seed needs to be set at top of yaml, before objects with parameters are made - -seed: 74443 -__set_seed: !apply:torch.manual_seed [!ref ] -output_folder: !ref results/transformer/ -save_folder: !ref /save -train_log: !ref /train_log.txt -# Data files -data_folder: !PLACEHOLDER -data_folder_alignments: null # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared -pretrained_model_save_folder: !ref -ssl_model_type: wavlm -representation_mode: discrete -vocoder_model_name: !ref unithifigan-dasb---ms -vocoder_model_path: !ref / -prepare_archive_path: null -prepare_skip_ignore_folders: False -data_mode: lite -train_json: !ref /train.json -valid_json: !ref /valid.json -test_json: !ref /test.json -train_split: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - lite: ["train-clean-100"] - clean: ["train-clean-100", "train-clean-360"] - full: ["train-clean-100", "train-clean-360", "train-other-500"] -valid_split: ["dev-clean"] -test_split: ["test-clean"] -frozen_split_path: null -sample_path: null -progress_folder: !ref /progress -progress_archive: !ref /progress.tar -progress_current: !ref /current -progress_meta: !ref /meta.yaml -num_audio_samples: 32 -samples_interval: 5 -# Position shift -use_position_shift: True -max_position_shift: 1000 -position_shift_seed: 42 -position_shift_probability: 1.0 -freeze_token_model: True -token_model_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: 
microsoft/wavlm-large - hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self -g2p_src: flexthink/soundchoice-g2p -token_model_kmeans_src: poonehmousavi/SSL_Quantization -token_model_kmeans_dataset: LibriSpeech-100-360-500 -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_model_layers: !ref -select_layers: null -token_offset: 1 -vocoder_src_discrete: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS -vocoder_src_continous: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - discrete: !ref - continuous: !ref -vocoder_available_layers: [1, 3, 7, 12, 18, 23] -vocoder_takes_spk_emb: True -spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec -spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: flexthink/discrete_wavlm_spk_rec_ecapatdn_lite - hubert: flexthink/discrete_hubert_spk_rec_ecapatdn_lite - wav2vec2: flexthink/discrete_wav2vec2_spk_rec_ecapatdn_lite -asr_src: speechbrain/asr-transformer-transformerlm-librispeech -spk_emb_shuffle: True -splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] -ckpt_interval_minutes: 30 # save checkpoint every N min -# Training parameters -input: text -number_of_epochs: 1000 -reset_annealing_epoch: null -batch_size: 16 -batch_size_guided: 2 -extract_features_batch_size: 32 -grad_accumulation_factor: 1 -max_grad_norm: 0.01 -sorting: random -num_workers: 4 -skip_prep: False -overfit_test: False -overfit_test_sample_count: !ref -overfit_test_epoch_data_count: 1000 -# index -pad_index: 0 -bos_index: 0 -bos_width: 1 -eos_index: 0 -eos_width: 1 -audio_token_shift: 0 -# stages related parameters -lr: 0.0005 -lr_warmup_steps: 10000 -lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. 
-# Inference parameters -inference_mode: autoregressive -eos_mode: gate -decoder_mode: autoregressive -scale_factor: 4 -# Embedding Injection -spk_emb_injection: null -# Beam Search-specific parameters -min_decode_ratio: 1.0 -max_decode_ratio: 10.0 -beam_size: 5 -# Feature parameters -sample_rate: 24000 -model_sample_rate: 16000 -max_audio_length: 1000 -infer_max_audio_length: !ref -debug_infer_max_audio_length: 10 -# Label encoder -label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt -token_list_file: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref - phonemes: !ref -# Gate offset -gate_offset: !apply:benchmarks.DASB.model.Tokotron.distance_diff_loss_ramp - beta: !ref - gamma: !ref - max_weight: !ref -silence_padding: !ref -use_silence_padding: True -# Guides -guides_enabled: False -guides_start_epoch: 40 -guides_spk: False -guides_spk_discrete: True -guides_spk_loss_weight: 0.2 -guides_asr: True -guides_asr_loss_weight: 0.1 -# Token model (pretrained) -ssl_model: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True - wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 - source: !ref - save_path: !ref - freeze: !ref - output_all_hiddens: True -token_model: !new:speechbrain.lobes.models.huggingface_transformers.discrete_ssl.DiscreteSSL - ssl_model: !ref - kmeans_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref - save_path: !ref - layers_num: !apply:benchmarks.DASB.utils.hparams.as_list - value: !ref - dtype: int -spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams - source: !ref - savedir: !ref /ecapa -spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class - source: !ref - savedir: !ref /ecapa- - pymodule_file: custom_interface.py - classname: DiscreteSpkEmb - overrides: - ssl_layer_num_selected: !ref -asr_model: !name:benchmarks.DASB.model.Tokotron.TransformerASRGuide - source: !ref - savedir: !ref /asr-transformer -# Dataloader options - - -train_dataloader_opts: - batch_size: !ref - shuffle: True - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref -valid_dataloader_opts: - batch_size: !ref - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref -test_dataloader_opts: - batch_size: 1 - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref -sample_dataloader_opts: - batch_size: !ref - num_workers: !ref - collate_fn: !name:speechbrain.dataio.batch.PaddedBatch - padding_kwargs: - value: !ref -token_model_kwargs: - SSL_layers: !apply:benchmarks.DASB.utils.hparams.as_list - value: !ref - dtype: int - deduplicates: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers - layers: !ref - value: False - bpe_tokenizers: !apply:benchmarks.DASB.utils.hparams.repeat_for_layers - layers: !ref - value: null -extract_features_opts: - dataloader_opts: - batch_size: !ref - num_workers: !ref - token_model: !ref - token_model_kwargs: !ref - ssl_model: !ref - ssl_model_layers: !apply:benchmarks.DASB.utils.hparams.as_list 
- value: !ref - dtype: int - token_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - data_folder_alignments: !ref -####################### Model parameters ########################### -# Transformer -d_model: 512 -nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 -layerwise_renorm: True -d_ffn: 2048 -z_dim: 128 -hidden_dim: 2048 -enc_n_dim: 16 -dec_n_dim: 256 -decoder_chunk_size: -1 -transformer_dropout: 0.2 -target_dropout: 0.2 -emb_dropout: 0.0 -activation: !name:torch.nn.GELU -audio_num_tokens: 1000 -audio_dim: 1024 -audio_emb_size: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - discrete: 1024 - continuous: 128 -audio_emb_freeze: False -audio_emb_lr: 0.00001 -audio_emb_weight_decay: 0.001 -audio_emb_pretrained: False -text_num_tokens: 39 -phn_num_tokens: 52 -input_num_tokens: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref - phonemes: !ref -audio_tokens_per_step: 6 -attention_type: regularMHA -############################## models ################################ -vocoder_layers: !apply:benchmarks.DASB.utils.hparams.as_list - value: !apply:speechbrain.utils.hparams.choice - value: !ref - default: !ref - choices: - null: !ref - dtype: int -vocoder_discrete: !name:benchmarks.DASB.model.custom_model.GumbelUnitVocoderWrapper - model: !name:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref - available_layers: !ref - layers: !ref - num_units: !ref - offset: !ref -vocoder_continuous: !name:benchmarks.DASB.model.custom_model.VocoderWrapper - model: !name:speechbrain.inference.vocoders.HIFIGAN.from_hparams - source: !ref - savedir: !ref -vocoder: !apply:benchmarks.DASB.utils.hparams.choice - value: !ref - apply: True - choices: - discrete: !ref - continuous: !ref -inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference - bos_index: !ref - eos_index: !ref - min_decode_ratio: !ref - max_decode_ratio: !ref - beam_size: !ref - using_eos_threshold: False - length_normalization: True - audio_token_shift: !ref -inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference - scale_factor: !ref - gate_threshold: !ref - eos_mode: !ref - representation_mode: !ref -inference: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - search: !ref - forward: !ref -emb: - spk: - kind: "pretrained" - dim: 192 - vocoder: True - injection: !ref -model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel - input_num_tokens: !ref # yamllint disable-line rule:line-length - audio_num_tokens: !ref - audio_tokens_per_step: !ref - d_model: !ref - d_ffn: !ref - z_dim: !ref - hidden_dim: !ref - enc_n_dim: !ref - dec_n_dim: !ref - decoder_chunk_size: !ref - nhead: !ref - enc_num_layers: !ref - dec_num_layers: !ref - dropout: !ref - target_dropout: !ref - emb_dropout: !ref - activation: !ref - attention_type: !ref - vocoder: !ref - gate_threshold: !ref - gate_offset: !ref - audio_emb_size: !ref - audio_emb_freeze: !ref - max_audio_length: !ref - inference: !ref - eos_mode: !ref - infer_max_audio_length: !ref - audio_token_shift: !ref - decoder_mode: !ref - scale_factor: !ref - representation_mode: !ref - use_position_shift: !ref - max_position_shift: !ref - position_shift_probability: !ref - position_shift_seed: !ref - emb: !ref - layerwise_renorm: !ref -modules: - model: !ref - vocoder: !ref - compute_cost: !ref -# define two optimizers here for two-stage training -opt_class: !name:torch.optim.Adam - lr: !ref -compute_cost: 
!new:benchmarks.DASB.model.Tokotron.TokotronLoss - guided_attention_weight: !ref - guided_attention_sigma: !ref - gate_weight: !ref - gate_beta: !ref - gate_gamma: !ref - gate_max_weight: !ref - silence_padding: !ref - eos_mode: !ref - eos_index: !ref - eos_width: !ref - audio_tokens_per_step: !ref - audio_token_shift: !ref - spk_weight: !ref - asr_weight: !ref - representation_mode: !ref -lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler - lr_initial: !ref - n_warmup_steps: !ref -checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer - checkpoints_dir: !ref - recoverables: - model: !ref - lr_scheduler: !ref - counter: !ref -freezer: !new:benchmarks.DASB.utils.preparation.Freezer - save_path: !ref - archive_path: !ref -epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter - limit: !ref -train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger - save_file: !ref -progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport - logger: !ref - sample_rate: !ref - eos_threshold: !ref -spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 3333cfceb..ba05d6f2c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -88,9 +88,7 @@ gate_loss_gamma: 0.01 gate_loss_max_weight: 1. # Inference parameters -inference_mode: autoregressive eos_mode: gate -decoder_mode: autoregressive scale_factor: 4 # Beam Search-specific parameters @@ -216,30 +214,6 @@ attention_type: regularMHA ############################## models ################################ -vocoder: !new:benchmarks.DASB.model.custom_model.DACVocoder - dac: !ref - -inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference - bos_index: !ref - eos_index: !ref - min_decode_ratio: !ref - max_decode_ratio: !ref - beam_size: !ref - using_eos_threshold: False - length_normalization: True - audio_token_shift: !ref - -inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference - scale_factor: !ref - gate_threshold: !ref - eos_mode: !ref - -inference: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - search: !ref - forward: !ref - emb: spk: kind: "pretrained" @@ -269,17 +243,22 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d audio_emb_size: !ref audio_emb_freeze: !ref max_audio_length: !ref - inference: !ref eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref emb: !ref +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref # define two optimizers here for two-stage training @@ -312,17 +291,11 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:benchmarks.DASB.utils.preparation.Freezer - save_path: !ref - archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref -progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport - logger: !ref - sample_rate: !ref - eos_threshold: !ref +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref 
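Note (editor's illustration, not part of the patch): the train_dac.yaml change above drops the codec-specific `vocoder` and inference wrappers in favour of a single `tokenizer` module built on the unified tokenizer interface, which the recipe then consumes through `tokens_to_sig` plus `clean_padding_` (see the train.py diff in this same series). A minimal, hedged sketch of that usage is below; the constructor keys mirror the YAML keys added in the diff, but the concrete values and the exact `sig_to_tokens` signature are assumptions to be checked against utils/tokenizer_interface.py.

    import torch
    from speechbrain.dataio.dataio import clean_padding_
    from utils.tokenizer_interface import DACTokenizer

    # Constructor keys follow the YAML above; concrete values are assumed examples.
    tokenizer = DACTokenizer(
        model_type="24khz",      # assumed value
        model_bitrate="8kbps",
        n_codebooks=2,           # assumed value
        load_pretrained=True,
        tag="latest",
    )

    sig = torch.randn(2, 48000)                   # [B, T] dummy waveforms
    lens = torch.ones(2)                          # relative lengths
    tokens = tokenizer.sig_to_tokens(sig, lens)   # [B, N, K] discrete codes
    wav = tokenizer.tokens_to_sig(tokens)         # [B, T] reconstructed audio
    clean_padding_(wav, lens)                     # zero out the padded tail
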
diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 5b1a06b46..b335425e6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -16,8 +16,6 @@ prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref ssl_model_type: wavlm representation_mode: discrete -vocoder_model_name: !ref unithifigan-dasb---ms -vocoder_model_path: !ref / prepare_archive_path: null prepare_skip_ignore_folders: False data_mode: lite @@ -64,34 +62,15 @@ g2p_src: flexthink/soundchoice-g2p kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech vocoder_repo_id: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - hubert: speechbrain/hifigan-hubert-k1000-LibriTTS - wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS - wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref select_layers: null token_offset: 1 -vocoder_src_discrete: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-k1000-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-k1000-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-k1000-LibriTTS -vocoder_src_continous: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - wavlm: chaanks/hifigan-wavlm-l1-3-7-12-18-23-LibriTTS - hubert: chaanks/hifigan-hubert-l1-3-7-12-18-23-LibriTTS - wav2vec2: chaanks/hifigan-wav2vec-l1-3-7-12-18-23-LibriTTS -vocoder_src: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - discrete: !ref - continuous: !ref -vocoder_available_layers: [1, 3, 7, 12, 18, 23] -vocoder_takes_spk_emb: True spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec spk_emb_discrete_src: !apply:speechbrain.utils.hparams.choice value: !ref @@ -146,9 +125,7 @@ gate_loss_gamma: 0.01 gate_loss_max_weight: 1. 
# Inference parameters -inference_mode: autoregressive eos_mode: gate -decoder_mode: autoregressive scale_factor: 4 # Embedding Injection @@ -305,15 +282,12 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice audio_tokens_per_step: 6 attention_type: regularMHA ############################## models ################################ -vocoder: !apply:speechbrain.inference.vocoders.UnitHIFIGAN.from_hparams - source: !ref - savedir: !ref emb: spk: kind: "pretrained" dim: 192 - vocoder: True injection: !ref + model: !new:Tokotron.TokotronTransformerModel input_num_tokens: !ref # yamllint disable-line rule:line-length audio_num_tokens: !ref @@ -335,14 +309,22 @@ model: !new:Tokotron.TokotronTransformerModel eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref representation_mode: !ref emb: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref + # define two optimizers here for two-stage training opt_class: !name:torch.optim.Adam lr: !ref @@ -381,10 +363,3 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref - -tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref - ssl_model: !ref - vocoder_repo_id: !ref - kmeans_dataset: !ref - num_clusters: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 97ab94275..ecc1a754c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -210,31 +210,6 @@ audio_tokens_per_step: 6 attention_type: regularMHA ############################## models ################################ - -vocoder: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerVocoder - tokenizer: !ref - -inference_search: !new:benchmarks.DASB.model.Tokotron.TokotronSearchInference - bos_index: !ref - eos_index: !ref - min_decode_ratio: !ref - max_decode_ratio: !ref - beam_size: !ref - using_eos_threshold: False - length_normalization: True - audio_token_shift: !ref - -inference_forward: !new:benchmarks.DASB.model.Tokotron.TokotronForwardInference - scale_factor: !ref - gate_threshold: !ref - eos_mode: !ref - -inference: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - search: !ref - forward: !ref - emb: spk: kind: "pretrained" @@ -264,7 +239,6 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d audio_emb_size: !ref audio_emb_freeze: !ref max_audio_length: !ref - inference: !ref eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref @@ -272,9 +246,13 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d scale_factor: !ref emb: !ref +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer + source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio + save_path: !ref + modules: model: !ref - vocoder: !ref + tokenizer: !ref compute_cost: !ref # define two optimizers here for two-stage training @@ -307,23 +285,8 @@ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer lr_scheduler: !ref counter: !ref -freezer: !new:benchmarks.DASB.utils.preparation.Freezer - save_path: !ref - 
archive_path: !ref - epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref - -progress_logger: !new:benchmarks.DASB.utils.train_logger.ArchiveTrainLogger - current_path: !ref - archive_path: !ref - meta_path: !ref - epoch_counter: !ref - -progress_report: !new:benchmarks.DASB.utils.tts.TTSProgressReport - logger: !ref - sample_rate: !ref - eos_threshold: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 943727635..198b35fec 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -22,21 +22,22 @@ from pathlib import Path from hyperpyyaml import load_hyperpyyaml from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset +from speechbrain.dataio.dataio import clean_padding_ from speechbrain.utils.distributed import run_on_main -from Tokotron import ( - RepresentationMode, - get_silence_token, - use_silence_padding, - feature_pad_to, -) -from types import SimpleNamespace -from evaluate import TokotronEvaluator import re import string -base_dir = str(Path(__file__).parent.parent.parent.parent) +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) +from model.Tokotron import ( + RepresentationMode, + get_silence_token, + use_silence_padding, + feature_pad_to, +) # noqa: E402 +from evaluate import TokotronEvaluator # noqa: E402 + logger = logging.getLogger(__name__) SPECIAL_TOKEN_COUNT = 1 @@ -83,7 +84,9 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ - raise NotImplementedError() + wav = self.modules.tokenizer.tokens_to_sig(audio) + clean_padding_(wav, length) + return wav def compute_forward(self, batch, stage): """Runs all the computation of the Tokotron TTS @@ -451,6 +454,16 @@ def on_stage_end(self, stage, stage_loss, epoch): stage_stats = {"loss": stage_loss, **loss_stats} if stage == sb.Stage.TRAIN: self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_end() + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) # Perform end-of-iteration things, like annealing, logging, etc. if stage == sb.Stage.VALID: @@ -473,9 +486,6 @@ def on_stage_end(self, stage, stage_loss, epoch): meta={"loss": stage_stats["loss"]}, min_keys=["loss"], ) - if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): - self.evaluator.on_evaluate_end() - def fit_batch(self, batch): loss = super().fit_batch(batch) if self.hparams.lr_annealing_mode == "step": @@ -486,7 +496,7 @@ def fit_batch(self, batch): INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} -def dataio_prepare(hparams, guide_ctx=None): +def dataio_prepare(hparams): """This function prepares the datasets to be used in the brain class. It also defines the data processing pipeline through user-defined functions. @@ -497,9 +507,6 @@ def dataio_prepare(hparams, guide_ctx=None): This dictionary is loaded from the `train.yaml` file, and it includes all the hyperparameters needed for dataset construction and loading. 
- guide_ctx : SimpleNamespace, optional - The guide context with pretrained models - Returns ------- datasets : dict @@ -558,12 +565,6 @@ def tokens_pipeline(label): """Processes the transcriptions to generate proper labels""" return label_encoder.encode_sequence_torch(label) - @sb.utils.data_pipeline.takes("label_norm") - @sb.utils.data_pipeline.provides("asr_tokens") - def asr_tokens_pipeline(label): - """Processes the transcriptions to generate proper labels""" - return torch.tensor(guide_ctx.asr_model.encode(label)) - use_silence_padding = hparams.get("use_silence_padding", True) if "token_model_layers" in hparams: audio_tokens_per_step = len(hparams["token_model_layers"]) @@ -936,50 +937,12 @@ def apply_overfit_test(hparams, dataset): return result -def get_guide_ctx(hparams, run_opts): - """Initializes a context object for guides, - containing pretrained models only for guides that will be - used per hparams - - Arguments - --------- - hparams : dict - Hyperparameters - run_opts : dict - Run options - - Returns - ------- - ctx : SimpleNamespace - The resulting context""" - ctx = {} - if hparams["guides_enabled"]: - pretrained_run_opts = {"device": run_opts.get("device", "cpu")} - if hparams["guides_spk"]: - ctx["spk_emb_model"] = hparams["spk_emb_model"]( - run_opts=pretrained_run_opts - ) - if hparams["guides_asr"]: - ctx["asr_model"] = hparams["asr_model"]( - run_opts=pretrained_run_opts - ) - return SimpleNamespace(**ctx) - - RE_PUNCTUATION = re.compile( "|".join(re.escape(char) for char in string.punctuation) ) -def run_experiment(brain_cls): - """Starts the experiement - - Arguments - --------- - brain_cls : type - The brain class to instantiate - """ - +if __name__ == "__main__": # Reading command line arguments hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) @@ -1042,9 +1005,8 @@ def run_experiment(brain_cls): ) # We can now directly create the datasets for training, valid, and test - guide_ctx = get_guide_ctx(hparams, run_opts) (datasets, silence_padding, resample_fn) = dataio_prepare( - hparams, guide_ctx + hparams ) # Apply overfit test settings @@ -1052,7 +1014,7 @@ def run_experiment(brain_cls): audio_keys = ["audio_pad", "audio_bos"] # Trainer initialization - tts_brain = brain_cls( + tts_brain = TokotronBrain( modules=hparams["modules"], opt_class=hparams["opt_class"], hparams=hparams, diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py deleted file mode 100644 index 9c8b243be..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_continuous_ssl.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Continuous SSL verfsion - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronContinuousSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - 
Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.vocoder(audio, emb) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronContinuousSSLBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py deleted file mode 100644 index 78c584c45..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_dac.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - DAC version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDACBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - z, _, _ = self.modules.tokenizer.quantizer.from_codes( - audio.transpose(1, 2).int() - ) - wav = self.modules.tokenizer.decode(z).squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDACBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py deleted file mode 100644 index 3cc0e2644..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_discrete_ssl.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio -Discrete SSL version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -import torch -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronDiscreteSSLBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def on_stage_start(self, stage, epoch): - self.compute_offset() - return super().on_stage_start(stage, epoch) - - def compute_offset(self): - """Computes per-layer offsets""" - layers_set = set(self.hparams.token_model_layers) - available_layers_set = set(self.hparams.vocoder_available_layers) - if not layers_set.issubset(available_layers_set): - unavailable_layers = ",".join( - str(layer) for layer in (layers_set - available_layers_set) - ) - raise ValueError(f"Layers {unavailable_layers} are not supported") - self.num_units = self.hparams.vocab_size - _, layers_idx = torch.where( - torch.tensor( - self.hparams.vocoder_available_layers, device=self.device - ).unsqueeze(0) - == torch.tensor( - self.hparams.token_model_layers, device=self.device - ).unsqueeze(1) - ) - self.layer_offset = ( - torch.tensor(layers_idx, device=self.device) * self.num_units - )[None, None, :] - self.offset = self.hparams.token_offset - self.modules.vocoder.tokenize = False - - 
def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - units_with_offset = ( - audio + self.layer_offset.to(audio.device) + self.offset - ) - wav = self.modules.vocoder(units_with_offset) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronDiscreteSSLBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py deleted file mode 100644 index 98f1b27cc..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_encodec.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronEncodecBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.tokenizer.decode(audio) - wav = wav.squeeze(1) - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronEncodecBrain) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py deleted file mode 100644 index fdbbb3ed7..000000000 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train_speech_tokenizer.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio - Encodec version - -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model - - -Authors - * Artem Ploujnikov 2024 -""" - -from train import TokotronBrain, run_experiment -from speechbrain.dataio.dataio import clean_padding_ - - -class TokotronSTBrain(TokotronBrain): - """Tokotron implementation for Encodec""" - - def create_waveform(self, audio, length, emb): - """Creates a waveform from a discrete or continuous audio - representation - - Arguments - --------- - audio : torch.Tensor - An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) - lengths : torch.Tensor - A 1-D tensor - emb: dict - Embeddings (speaker, etc) - - Returns - ------- - wav : torch.Tensor - """ - wav = self.modules.token_model.decode(audio) - if length is not None: - clean_padding_(wav, length) - return wav - - -if __name__ == "__main__": - run_experiment(TokotronSTBrain) From 900481d4d1c323a9b78d9b89a569535d58f06499 Mon Sep 17 00:00:00 2001 
From: flexthink Date: Fri, 17 Jan 2025 01:00:19 -0500 Subject: [PATCH 060/270] DASB: Tokotron: Updates for hyperparameter fitting --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 2 +- .../TTS/tokotron/hparams/train_dac.yaml | 18 +++++++++++------- .../tokotron/hparams/train_discrete_ssl.yaml | 16 ++++++++++------ .../TTS/tokotron/hparams/train_encodec.yaml | 17 ++++++++++------- .../hparams/train_speech_tokenizer.yaml | 16 ++++++++++------ .../DASB/LJSpeech/TTS/tokotron/train.py | 16 +++++++++++----- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 2 +- .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 2 +- .../TTS/tokotron/hparams/train_dac.yaml | 9 +++++---- .../tokotron/hparams/train_discrete_ssl.yaml | 9 ++++++--- .../TTS/tokotron/hparams/train_encodec.yaml | 8 +++++--- .../hparams/train_speech_tokenizer.yaml | 8 +++++--- .../DASB/LibriTTS/TTS/tokotron/train.py | 19 +++++++++---------- 13 files changed, 85 insertions(+), 57 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index 8ca3fb8dd..e7ffe2576 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -51,4 +51,4 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean - dwer: asr_dwer_median \ No newline at end of file + dwer: asr_dwer_median diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index be20bfa63..8e74cbedb 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -8,9 +8,12 @@ experiment_name: tokotron/dac # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p @@ -19,8 +22,9 @@ g2p_src: flexthink/soundchoice-g2p representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/dac +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -70,7 +74,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -94,8 +98,8 @@ model_bitrate: 8kbps # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -146,8 +150,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 555878c24..2205ecf94 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -7,6 +7,7 @@ experiment_name: tokotron/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER # Model Type ssl_model_type: wavlm @@ -14,10 +15,13 @@ representation_mode: discrete output_folder: !ref results/tokotron/// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ # Data files data_folder: !PLACEHOLDER -prepare_save_folder: !ref /prepared/discrete- # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref vocoder_model_name: !ref unithifigan-dasb--discrete vocoder_model_path: !ref / @@ -86,7 +90,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -111,8 +115,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -183,8 +187,8 @@ token_model_kwargs: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 3ab6eb770..d166f29ed 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -8,9 +8,11 @@ experiment_name: tokotron/encodec # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p @@ -18,8 +20,9 @@ g2p_src: flexthink/soundchoice-g2p representation_mode: discrete # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/encodec +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -65,7 +68,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -85,8 +88,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -136,8 +139,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 568f8c13e..38927d216 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -8,9 +8,12 @@ experiment_name: tokotron/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results// save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ token_model_src: "fnlp/SpeechTokenizer" g2p_src: flexthink/soundchoice-g2p @@ -20,7 +23,8 @@ representation_mode: discrete # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared/st +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False @@ -68,7 +72,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -88,8 +92,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -139,8 +143,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 8c571babd..cf7918e3a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -269,7 +269,7 @@ def on_stage_end(self, stage, stage_loss, epoch): stage_stats = {"loss": stage_loss, **loss_stats} if stage == sb.Stage.TRAIN: self.train_stats = stage_stats - + # End evaluation and report stats if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): self.evaluator.on_evaluate_end() @@ -623,7 +623,10 @@ def read_token_list(file_name): result: list a list of tokens """ - if not Path(file_name).exists(): + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): raise ValueError(f"Token file {file_name} not found") with open(file_name) as token_file: return [line.strip("\r\n") for line in token_file if line] @@ -709,6 +712,8 @@ def apply_overfit_test(hparams, dataset): # Load evaluation hyperparameters eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" if eval_hparams_file.exists(): logger.info( "Using evaluation hyperparameters from %s", eval_hparams_file @@ -796,9 +801,10 @@ def apply_overfit_test(hparams, dataset): ) # Load best checkpoint for evaluation - tts_brain.evaluate( - test_set=datasets["test"], test_loader_kwargs=test_dataloader_opts, - ) + if hparams["testing"]: + tts_brain.evaluate( + test_set=datasets["test"], test_loader_kwargs=test_dataloader_opts, + ) # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 99d547cc5..377b5955c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -124,7 +124,7 @@ def get_output_folder(self, stage, epoch): output_folder = output_folder / str(epoch) output_folder.mkdir(parents=True, exist_ok=True) return output_folder - + def on_evaluate_end(self): """Invoked 
when evaluation starts diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index bafd769cc..18e39ba42 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -67,4 +67,4 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean dwer: asr_dwer_median - spk_sim: spk_sim_score_mean \ No newline at end of file + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index ba05d6f2c..0bc9099a8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -9,6 +9,7 @@ __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -55,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 @orion_step1: --enc_num_layers~"uniform(2, 32,discrete=True) extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -76,7 +77,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -187,8 +188,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index b335425e6..e4dfe96d1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -6,9 +6,12 @@ seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ # Data files data_folder: !PLACEHOLDER data_folder_alignments: null # e.g., /path/to/LibriSpeech @@ -113,7 +116,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -254,8 +257,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" layerwise_renorm: True d_ffn: 2048 transformer_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index e766267e7..a454802c3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -6,9 +6,11 @@ seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -86,7 +88,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -182,8 +184,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ecc1a754c..470503e4e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -6,9 +6,11 @@ seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER output_folder: !ref results/transformer/ save_folder: !ref /save train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
# Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech @@ -78,7 +80,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -185,8 +187,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 -dec_num_layers: 12 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 198b35fec..3d7d9cd55 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -454,7 +454,7 @@ def on_stage_end(self, stage, stage_loss, epoch): stage_stats = {"loss": stage_loss, **loss_stats} if stage == sb.Stage.TRAIN: self.train_stats = stage_stats - + # End evaluation and report stats if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): self.evaluator.on_evaluate_end() @@ -1005,9 +1005,7 @@ def apply_overfit_test(hparams, dataset): ) # We can now directly create the datasets for training, valid, and test - (datasets, silence_padding, resample_fn) = dataio_prepare( - hparams - ) + (datasets, silence_padding, resample_fn) = dataio_prepare(hparams) # Apply overfit test settings datasets = apply_overfit_test(hparams, datasets) @@ -1041,9 +1039,10 @@ def apply_overfit_test(hparams, dataset): ) # Load best checkpoint for evaluation - tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=use_silence_padding( - hparams["test_dataloader_opts"], silence_padding, audio_keys - ), - ) + if hparams["testing"]: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=use_silence_padding( + hparams["test_dataloader_opts"], silence_padding, audio_keys + ), + ) From 4dcd1d3ea005a2952cabe596bd5d64f495cf1f2b Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 17 Jan 2025 02:12:27 -0500 Subject: [PATCH 061/270] DASB: Batch size updates, device fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml | 2 +- .../LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 4 ++++ benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml | 2 +- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 4 ++++ 10 files changed, 16 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 8e74cbedb..75cbae717 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -57,7 +57,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git 
a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 2205ecf94..68e54fa83 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -71,7 +71,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index d166f29ed..0e923ffc9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -52,7 +52,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 38927d216..76e3c72e3 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -55,7 +55,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index cf7918e3a..7c1b00083 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -371,6 +371,10 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.codec_vocoder.device = self.device wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 0bc9099a8..8c05d2499 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 @orion_step1: --enc_num_layers~"uniform(2, 32,discrete=True) +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index e4dfe96d1..5c8db0bc4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -94,7 +94,7 @@ ckpt_interval_minutes: 30 # 
save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" batch_size_guided: 2 extract_features_batch_size: 32 grad_accumulation_factor: 1 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index a454802c3..30d2cbfe4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -67,7 +67,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 470503e4e..e3f34fdf8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -59,7 +59,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 3d7d9cd55..f0153271d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -84,6 +84,10 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.codec_vocoder.device = self.device wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav From fc08f58d5962ac8ce3ec35bfd607508cfb81f5f1 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 17 Jan 2025 02:15:47 -0500 Subject: [PATCH 062/270] DASB: Tokotron: Fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 2 +- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 7c1b00083..8945607f9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -374,7 +374,7 @@ def create_waveform(self, audio, length): self.modules.tokenizer.device = self.device if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) - self.codec_vocoder.device = self.device + self.modules.tokenizer.codec_vocoder.device = self.device wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index f0153271d..da228d6ae 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -87,7 +87,7 @@ def create_waveform(self, audio, length, emb): self.modules.tokenizer.device = self.device if 
hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) - self.codec_vocoder.device = self.device + self.modules.tokenizer.codec_vocoder.device = self.device wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav From fcb37c74b091188ad231af24776dee1138d2e6ee Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 17 Jan 2025 14:59:31 -0500 Subject: [PATCH 063/270] DASB: Ensure UTMOS is maximized rather than minimized! --- benchmarks/DASB/utils/aggregate_results.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/utils/aggregate_results.py b/benchmarks/DASB/utils/aggregate_results.py index 0df315b7e..e11046ade 100644 --- a/benchmarks/DASB/utils/aggregate_results.py +++ b/benchmarks/DASB/utils/aggregate_results.py @@ -144,6 +144,8 @@ def aggregate_metrics(prototype, metrics): # Report final metric to Orion # Remember: orion expects metrics to be minimized! - if eval_metric == "acc" or eval_metric == "f1": + if eval_metric in ["acc", "f1"]: final_metric = 1 - final_metric + elif eval_metric == "utmos": + final_metric = -final_metric report_objective(final_metric) From 9563cd54e766594b84a2c8c47399a97809d00dec Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 14:46:43 -0500 Subject: [PATCH 064/270] DASB: Tokotron: Fixes --- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index e3f34fdf8..c307ed0bf 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -36,7 +36,6 @@ test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 @@ -248,7 +247,7 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d scale_factor: !ref emb: !ref -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref From 4442b4473c1e572cac62c2f4c98f904a9a33f3ac Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 14:53:52 -0500 Subject: [PATCH 065/270] DASB: Tokotron: Fixes --- .../LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 76e3c72e3..ce4e6edaa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -34,7 +34,6 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 @@ -187,7 +186,7 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul max_audio_length: !ref infer_max_audio_length: !ref -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: 
!new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref From d31ad9ce97234f995e4f6312b09b50a15c2557de Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Mon, 20 Jan 2025 15:10:56 -0500 Subject: [PATCH 066/270] Update tokenizer_interface.py add sampling rate for mimi and wavtokenizer --- benchmarks/DASB/utils/tokenizer_interface.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index c8e81eb7a..a6103de4c 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -380,6 +380,7 @@ class MimiTokenizer(Mimi, BaseTokenizer): def __init__(self, *args, **kwargs): Mimi.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) + self.sample_rate= self.sampling_rate @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): @@ -435,6 +436,7 @@ class WavTokenizerWrapper(WavTokenizer, BaseTokenizer): def __init__(self, *args, **kwargs): WavTokenizer.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) + self.sample_rate = 24000 @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): From fbebd2e0fecad130d72a4f70aeabf08670b4f765 Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Mon, 20 Jan 2025 15:12:26 -0500 Subject: [PATCH 067/270] Update sq_codec.py dix sampling rate name for SQCodec --- benchmarks/DASB/model/sq_codec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 6057a5f73..4ac4b74ad 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -101,7 +101,7 @@ def __init__( ) self.scalar_codec = self.build_codec_model(self.config_path) - self.sr = sample_rate + self.sample_rate = sample_rate self.dim_codebook = dim_codebook self.n_codebook = n_codebook self.bw = bw @@ -232,8 +232,8 @@ def reconstruct(self, wav_root): wav, sr = torchaudio.load(wav_root) if wav.numel() == 0: return None - if sr != self.sr: - wav = torchaudio.transforms.Resample(sr, self.sr)(wav) + if sr != self.sample_rate: + wav = torchaudio.transforms.Resample(sr, self.sample_rate)(wav) wav = wav.unsqueeze(1) emb, emb_quant, x = self.scalar_codec.inference(wav) return x.detach().cpu().squeeze(0) From 9f64966ca6f0467dc140b7047ac2d908038fefc1 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Mon, 20 Jan 2025 20:25:46 +0000 Subject: [PATCH 068/270] add sq-codec, mimi and wavtokenizer for librispeech --- .../DASB/LibriSpeech/extraction/extract.py | 2 +- .../LibriSpeech/extraction/hparams/mimi.yaml | 58 ++++++++++++++++++ .../extraction/hparams/sqcodec.yaml | 57 ++++++++++++++++++ .../extraction/hparams/wavtokenizer.yaml | 60 +++++++++++++++++++ 4 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml create mode 100644 benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 3979ba731..5a54f72df 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -52,7 +52,7 @@ "skip_prep": hparams["skip_prep"], }, ) - + tokens_extractor = 
hparams["tokens_extractor"] data_folder = hparams["data_folder"] datasets = [] diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..e2dad7f95 --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml @@ -0,0 +1,58 @@ +# ############################################################################ +# Auido Tokenizer: Mimi +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/dac +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..fe202c90d --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: SQCodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks : 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + 
sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..bc1b56ddb --- /dev/null +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,60 @@ +# ############################################################################ +# Auido Tokenizer: wavtokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv +test_csv: + - !ref /test-clean.csv + - !ref /test-other.csv + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks : 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref From 1e18ead6eec8ddfb295c174ecd5f0147a5fd2786 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 16:41:10 -0500 Subject: [PATCH 069/270] DASB: VALL-E: Initial import --- .../DASB/LJSpeech/TTS/valle/evaluation.py | 360 +++++++ .../LJSpeech/TTS/valle/hparams/arpabet.txt | 50 + .../LJSpeech/TTS/valle/hparams/char_en.txt | 38 + .../DASB/LJSpeech/TTS/valle/hparams/eval.yaml | 54 + .../TTS/valle/hparams/train_discrete_ssl.yaml | 268 +++++ .../LJSpeech/TTS/valle/ljspeech_prepare.py | 1 + benchmarks/DASB/LJSpeech/TTS/valle/train.py | 854 ++++++++++++++++ benchmarks/DASB/model/valle.py | 924 ++++++++++++++++++ 8 files changed, 2549 insertions(+) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml create mode 120000 benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/train.py create mode 100644 benchmarks/DASB/model/valle.py diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py new file mode 100644 index 000000000..152db4c87 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -0,0 +1,360 @@ +import json +import torch +import logging +import re 
+import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.evaluators + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + if not self.evaluators: + logger.warn( + "No evaluators were defined - this run will produce samples only" + ) + + self.attention = [] + + def on_evaluation_start(self, output_folder="eval"): + """Invoked at the beginning of the evaluation cycle. + + Arguments + --------- + output_folder : str | path-like + The folder to which results will be output + + """ + logger.info("Starting evaluation") + output_folder = Path(output_folder) + self.output_folder = ( + output_folder + if output_folder.is_absolute() + else self.hparams.output_folder / output_folder + ) + self.output_folder.mkdir(parents=True, exist_ok=True) + + self.files = [] + details_keys = list(self.evaluators.keys()) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.read_reports() + self.create_reports() + self.item_ids = [] + + def on_evaluation_end(self): + """Invoked at the beginning of the evaluation cycle. 
The default + implementation is a no-op + """ + logger.info("Ending evaluation") + self.write_summary() + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + self.files.append(file_name) + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_tracker_file_name(self): + """Determines the file name of the tracker file""" + suffix = ( + f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else "" + ) + file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt" + return self.output_folder / file_name + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def append(self, ids, wav, length, text, wav_ref, length_ref): + """Appends the result of a single item + + Arguments + --------- + ids : str + Utterance IDs + wav : torch.Tensor + Synthesized waveforms + length : torch.Tensor + Relative lengths of the synthesized waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def 
write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], + key=metric_key, + ).items() + } + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : the standard deviation + _min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. 
+ + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? + \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..b80347c82 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml @@ -0,0 +1,54 @@ +eval_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_asr_type: whisper +eval_asr_source: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech + whisper: openai/whisper-small +evaluations: utmos,asr +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_asr: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + encoder_decoder: !name:utils.eval.EncoderDecoderASRSpeechEvaluator + source: !ref + sample_rate: !ref + overrides: + lm_weight: 0.0 + whisper: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +evaluators: + utmos: !ref + asr: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..f0127973c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,268 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/discrete_ssl +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +ssl_model_type: wavlm +output_folder: !ref results/tokotron/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the 
test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +vocoder_model_name: !ref unithifigan-dasb--discrete +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large-960h-lv60-self +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +ssl_model_layers: [1, 3, 7, 12, 18, 23] +flip_layers: True +token_model_layers: !ref +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Inference parameters +eos_mode: gate +decoder_mode: autoregressive +scale_factor: 4 + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1000 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 6 + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + 
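+# Illustrative note (added for clarity, not part of the original recipe): with
+# the hyperparameters above and text input, the combined vocabulary works out as
+#   model_vocab_size = text_num_tokens + (vocab_size * audio_tokens_per_step) + special_num_tokens
+#                    = 39 + (1000 * 6) + 4 = 6043
+# and audio_token_shift = text_num_tokens + special_num_tokens = 39 + 4 = 43,
+# so the codes of audio track k (k = 0..5) occupy ids 43 + k*1000 .. 43 + (k+1)*1000 - 1,
+# matching the per-track offsets and inference masks applied in train.py.
+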
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py new file mode 120000 index 000000000..2f703273c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/ljspeech_prepare.py @@ -0,0 +1 @@ +../../ljspeech_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py new file mode 100644 index 000000000..e0ae084a3 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -0,0 +1,854 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio +from speechbrain.dataio.dataio import write_audio +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats + +logger = logging.getLogger(__name__) + +SPECIAL_TOKEN_COUNT = 1 + + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() + wav = self.modules.tokenizer.tokens_to_sig(audio) + clean_padding_(wav, length) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. 
+ + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + prompt, prompt_length = batch.prompt + batch_size, prompt_max_len, num_tracks = prompt.shape + nar_track = torch.randint( + 1, num_tracks, (batch_size,), + device=self.device + ) + logits_ar, logits_nar = self.modules.model( + dec_seq=batch.prompt.data, + dec_seq_lengths=batch.prompt.lengths, + prefix_len=batch.prefix_length / prompt_max_len, + nar_level_idx=nar_track + ) + return logits_ar, logits_nar, nar_track + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + logits_ar_sm = self.hparams.log_softmax(logits_ar) + logits_nar_sm = self.hparams.log_softmax(logits_nar) + batch_size, max_len, _ = prompt.shape + targets_ar = prompt[:, 1:, 0] + batch_idx = torch.arange(batch_size, device=prompt.device) + targets_nar = prompt[batch_idx, 1:, nar_track] + prompt_max_len = prompt.size(1) + length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len) + prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_ar = self.hparams.compute_cost( + log_probabilities=logits_ar_sm, + targets=targets_ar, + mask=mask + ) + self.loss_metric_ar.append( + ids=batch.uttid, + log_probabilities=logits_ar_sm, + targets=targets_ar, + mask=mask, + reduction="batch", + ) + loss_nar = self.hparams.compute_cost( + log_probabilities=logits_nar_sm, + targets=targets_nar, + mask=mask, + ) + self.loss_metric_nar.append( + ids=batch.uttid, + log_probabilities=logits_nar_sm, + targets=targets_nar, + mask=mask, + reduction="batch", + ) + loss = loss_ar + loss_nar + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. 
+ """ + self.offsets = get_offsets( + self.hparams.vocab_size, + self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.hparams.compute_cost, batch_eval=True, + ) + self.loss_metric_ar = sb.utils.metric_stats.MetricStats( + metric=self.hparams.compute_cost, + batch_eval=True, + ) + self.loss_metric_nar = sb.utils.metric_stats.MetricStats( + metric=self.hparams.compute_cost, + batch_eval=True, + ) + + # TOOO: Reestablish evaluation + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + return epoch % self.hparams.eval_interval == 0 + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. + self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, + wav=wav, + length=audio_length, + stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. 
+ + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. + self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference_results = [ + self.modules.model.inference( + prefix=prefix_item.unsqueeze(0), + opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[None, :] + tracks = torch.arange(self.hparams.audio_tokens_per_step, device=self.device)[:, None] + track_start = ( + self.hparams.text_num_tokens + + self.hparams.special_num_tokens + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + return self.hparams.inference_opts( + masks={ + self.hparams.bos_index: mask + }, + device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio(file_name, sample, self.hparams.model_sample_rate) + + def save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, 
sb.Stage.VALID, or sb.Stage.TEST. + """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. + silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], + hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + @sb.utils.data_pipeline.takes("uttid", "tokens") + @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") + def prompt_pipeline(id, tokens): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=hparams["audio_tokens_per_step"] + ) + + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = 
sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [text_pipeline, tokens_pipeline, prompt_pipeline] + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length" + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_dynamic_items.append(sig_pipeline) + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. + if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + return datasets + + +def get_offsets(vocab_size, tracks): + """Adds offsets to each track to treat the tokens as distinct + + Arguments + --------- + vocab_size : int + The vocabulary size, for each track + tracks : int + The number of tracks + """ + return torch.arange(tracks) * vocab_size + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + return encoder + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. + + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, _, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. 
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from ljspeech_prepare import prepare_ljspeech + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + run_on_main( + prepare_ljspeech, + kwargs={ + "data_folder": hparams["data_folder"], + "save_folder": hparams["prepare_save_folder"], + "splits": hparams["splits"], + "split_ratio": hparams["split_ratio"], + "seed": hparams["seed"], + "extract_phonemes": hparams["input"] == "phonemes", + "model_name": "tokotron", + "g2p_src": hparams["g2p_src"], + "skip_ignore_folders": hparams["prepare_skip_ignore_folders"], + "frozen_split_path": hparams.get("frozen_split_path"), + "device": run_opts.get("device", "cpu"), + }, + ) + + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. 
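+    # Illustrative usage (an assumption, not stated in the recipe itself): like
+    # other SpeechBrain recipes, the script is launched with the hparams file
+    # plus overrides for the !PLACEHOLDER entries, for example:
+    #   python train.py hparams/train_discrete_ssl.yaml \
+    #       --data_folder /path/to/LJSpeech-1.1 \
+    #       --cached_data_folder /path/to/cache \
+    #       --tokens_folder /path/to/extracted_tokens \
+    #       --run_name my_run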
+ tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py new file mode 100644 index 000000000..11311d9d8 --- /dev/null +++ b/benchmarks/DASB/model/valle.py @@ -0,0 +1,924 @@ +"""An adaptation of ESPNET VALL-E +Originally by Jinchuan Tian + +https://github.com/espnet/espnet + +Authors + * Artem Ploujnikov 2024 (adaptation only) +""" + +# Copyright 2024 Jinchuan Tian +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Implementation of Vall-E: https://arxiv.org/abs/2301.02111 + +import logging +import torch +from typing import Dict, Tuple, Optional +from speechbrain.dataio.dataio import length_to_mask + +from torch import Tensor +from torch import nn +from torch.nn import functional as F +from dataclasses import dataclass + +from speechbrain.nnet.losses import reduce_loss +from speechbrain.nnet.losses import truncate + + +@dataclass +class SpeechLMInferenceOptions: + """Inference options + """ + + device: str = None + search_algo: str = "topk_sampling" + nbest: int = 1 + sampling_temperature: float = 1.0 + top_k: int = 20 + maxlenratio: float = 0.0 + minlenratio: float = 0.0 + eos: int = 5 + start: int = 1 + masks: torch.Tensor = None + nq: int = None + allow_invalid: bool = True + + +class ValleLM(nn.Module): + """The Vall-E TTS model (decoder-only transformer), adopted from + ESPNET2 + + Arguments + --------- + vocab_size : int + Dimention of vocabulary. + nq : int + Number of codes for each token / frame, usually for speech codec. + share_emb : bool + If true, share the embedding and lm_head weight. + qk_norm : bool + If true, apply LayerNorm to q and k in atention. + dropout : float + dropout rate for attention layers. + att_unit: int + Dimention of Transformer attention. + head : int + Number of heads in Transformer attention. + ar_layer : int + Number of layers in AR Transformer. + nar_layer : int + Number of layers in NAR Transformer. + n_ctx : int + maximum context length of AR & NAR Transformer. + """ + + def __init__( + self, + vocab_size: int, + nq: int, + pad_id: int = 0, + share_emb: bool = True, + qk_norm: bool = False, + dropout: float = 0.0, + att_unit: int = 256, + head: int = 2, + ar_layer: int = 4, + nar_layer: int = 4, + n_ctx: int = 3000, + ): + super().__init__() + + self.emb = torch.nn.Embedding(vocab_size, att_unit) + self.lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) + if share_emb: + self.lm_head.weight = self.emb.weight + + self.ar_decoder = TransformerDecoder( + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=ar_layer, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.nar_decoder = ValleNARDecoder( + n_level=nq - 1, + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=nar_layer, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.nq = nq + self.n_ctx = n_ctx + self.pad_id = pad_id + self._initialize() + + def forward( + self, + dec_seq: torch.Tensor, + dec_seq_lengths: torch.Tensor = None, + prefix_len: torch.Tensor = None, + conti_feats: Tuple = None, + nar_level_idx=1, + ) -> Tuple[torch.Tensor, torch.Tensor, Dict]: + """Vall-E forward for training + + Args: + dec_seq (LongTensor): Batch of decoder sequences (B, T, nq). 
+ dec_seq_lengths (LongTensor): Lengths of batched decoder sequences (B,). + enc_seq (LongTensor): Batch of encoder sequences (B, T, nq), keep + the interface, may not be used. + enc_seq_lengths (LongTensor): Lengths of batched encoder sequences (B,), + keep the interface, may not be used. + prefix_len (LongTensor): Lengths of condition part in dec_seq (B,). + compute_loss (bool): whether to compute loss or just logits. + """ + + assert dec_seq.dim() == 3 + + dec_seq_emb = self.emb(dec_seq) # [B, T, nq, D] + dec_seq_emb, _ = install_continuous_features( + dec_seq_emb, None, conti_feats + ) + + # Auto-Regressive part + input_ar_emb = self.prepare_input(dec_seq_emb, prefix_len, 1)[ + :, :-1 + ] # [B, T, D] + h_ar = self.ar_decoder(input_ar_emb) + + # Non-Auto-Regressive part + input_nar_emb = self.prepare_input( + dec_seq_emb, prefix_len, nar_level_idx + )[ + :, 1: + ] # [B, T, V] + max_len = dec_seq.size(1) + mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) + + logits_ar = self.lm_head(h_ar) + logits_nar = self.lm_head(h_nar) + + return logits_ar, logits_nar + + def prepare_input(self, dec_seq_emb, prefix_len, level): + # NOTE(Jinchuan): have to use "expand" here but maybe lead to extra memory usage. + # This is because both prefix_mask and level_mask are broadcastable and will + # trigger user warning. + + # (1) level mask, [B, 1, nq, 1], True is to include + if isinstance(level, int): + level = torch.ones_like(dec_seq_emb[:, 0, 0, 0]) * level + level_mask = length_to_mask(level, self.nq).bool() + level_mask = ( + level_mask.unsqueeze(1).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (2) prefix mask, [B, T, 1, 1], True is the prefix + prefix_mask = length_to_mask( + prefix_len * dec_seq_emb.size(1), dec_seq_emb.size(1) + ).bool() + prefix_mask = ( + prefix_mask.unsqueeze(2).unsqueeze(3).expand(dec_seq_emb.size()) + ) + + # (3) mask and then sum in nq-axis. + mask = torch.logical_or(level_mask, prefix_mask) + return dec_seq_emb.masked_fill(~mask, 0.0).sum(2) + + @torch.no_grad() + def inference( + self, + prefix: torch.Tensor, + opts: SpeechLMInferenceOptions, + enc_seq: torch.Tensor = None, + suffix: torch.Tensor = None, + ): + """Vall-E Inference. + + Args: + prefix (LongTensor): Prefix part of dec_seq (B, T, nq). + opts (SpeechLMInferenceOptions): inference options. + enc_seq (LongTensor): Encoder token sequence (B, T, nq). + suffix (LongTensor): suffix part of dec_seq (B, T, nq), + usually the target sequence for teacher-forcing. 
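+
+        Returns:
+            gen_tokens_list (List[LongTensor]): generated token sequences,
+                one entry per valid hypothesis, each of shape (T, nq).
+            gen_scores_list (List[Tensor]): log-probability scores of the
+                generated tokens, with matching shapes.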
+ """ + + # (1) initialization + cache = self.ar_decoder.init() + + # (2) auto-regressive prefix forward on first code layer + prefix = prefix.expand(opts.nbest, -1, -1) + if opts.search_algo == "teacher_force": + suffix = suffix.expand(opts.nbest, -1, -1) + prefix_emb = self.emb(prefix).sum(dim=2) # [B, T, D] + _ = self.ar_decoder(prefix_emb, kv_cache=cache) + + # (3) auto-regressive loop on first code layer + # (3.1) AR initialization + minlen = ( + int(prefix.size(1) * opts.minlenratio) + if opts.minlenratio > 0 + else 0 + ) + maxlen = int(prefix.size(1) * opts.maxlenratio) + if opts.search_algo == "teacher_force": + assert suffix is not None + minlen = suffix.size(1) + maxlen = suffix.size(1) + if maxlen + prefix.size(1) > self.n_ctx: + maxlen = self.n_ctx - prefix.size(1) + logging.info(f"maxlen={maxlen}, minlen={minlen}") + + generated = {"token": [], "score": []} + finish_idx = ( + torch.Tensor([-1]).expand(opts.nbest).long().to(opts.device) + ) + prev_tok = ( + torch.Tensor([opts.start]) + .tile(opts.nbest, 1) + .long() + .to(opts.device) + ) + modality_index = prev_tok.flatten() + mask = modality_index_to_mask(modality_index, opts) + mask_cache = [] + + for step in range(maxlen): + # (3.2) AR loop + prev_emb = self.emb(prev_tok) # [B, 1, D] + h_ar = self.ar_decoder(prev_emb, kv_cache=cache) + logits = self.lm_head(h_ar) # [B, 1, V] + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + mask, + allow_eos=step >= minlen, + nq_level=0, + ) + # [B, 1, 1] -> [B, 1] + gen_tok, gen_score = gen_tok.squeeze(2), gen_tok.squeeze(2) + + generated["token"].append(gen_tok) + generated["score"].append(gen_score) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, step : step + 1, 0] + else: + prev_tok = gen_tok # [B, 1] + + # (3.3) detect modality swtich + mask_cache.append(mask.clone()) + modality_change_mask = torch.logical_and( + prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, + ) + if torch.any(modality_change_mask): + modality_index = torch.where( + modality_change_mask, prev_tok[:, 0], modality_index, + ) + mask = modality_index_to_mask(modality_index, opts) + logging.warning( + f"Step {step}: change modality index {modality_index}" + ) + + # (3.4) detect ended hypotheses. + finish_idx = torch.where( + torch.logical_and(prev_tok[:, 0] == opts.eos, finish_idx == -1), + step, + finish_idx, + ) + + if torch.all(torch.ge(finish_idx, 0)): + break + + if step == maxlen - 1: + logging.warning( + f"Some examples cannot finish in {maxlen} steps: {finish_idx}" + f"Consider increasing the maxlenratio" + ) + + logging.info(f"Terminate at steps: {finish_idx.cpu().tolist()}") + + # (3.4) finalize auto-regressive + if opts.allow_invalid: + valid_idx = torch.arange(len(finish_idx), device=finish_idx.device) + finish_idx = torch.where(finish_idx == -1, step, finish_idx) + else: + valid_idx = finish_idx.ne(-1).nonzero(as_tuple=True)[0] + if len(valid_idx) == 0: + self.ar_decoder.reset() + logging.warning(f"No valid examples. 
Return None") + return [], [] + elif len(valid_idx) < prefix.size(0): + logging.info(f"Only {len(valid_idx)} of {prefix.size(0)} are valid") + + finish_idx = finish_idx[valid_idx] + prefix_emb = prefix_emb[valid_idx] + if opts.search_algo == "teacher_force": + suffix = suffix[valid_idx] + gen_tokens_ar = torch.cat(generated["token"], dim=1)[ + valid_idx + ].unsqueeze( + 2 + ) # [B, T, 1] + gen_scores_ar = torch.cat(generated["score"], dim=1)[ + valid_idx + ].unsqueeze(2) + gen_tokens_ar = gen_tokens_ar[:, : finish_idx.max() + 1] # idx -> count + gen_scores_ar = gen_scores_ar[:, : finish_idx.max() + 1] + + self.ar_decoder.reset() + + # (4) non-auto-regressive loop on the remained code layers + # (4.1) NAR initialization + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, 0] + else: + prev_tok = gen_tokens_ar[:, :, 0] + start_emb = self.emb.weight[opts.start].tile( + len(valid_idx), 1, 1 + ) # [B, 1, D] + prev_emb = torch.cat( + [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 + ) # [B, T, D] + + ones = torch.ones_like(valid_idx) + mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) + generated = {"token": [], "score": []} + + mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache + vocab_mask = torch.cat(mask_cache, dim=1) + + # (4.2) NAR loop + for step in range(1, opts.nq): + h_nar = self.nar_decoder( + prev_emb, ones * step - 1, mask=mask + ) # [B, T, D] + logits = self.lm_head(h_nar) + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + vocab_mask, + search_algo="greedy_search", + allow_eos=False, + nq_level=step, + ) + gen_tok, gen_score = ( + gen_tok.squeeze(2), + gen_score.squeeze(2), + ) # [B, T] + + generated["token"].append(gen_tok[:, prefix.size(1) :]) + generated["score"].append(gen_score[:, prefix.size(1) :]) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, step] + else: + prev_tok = generated["token"][-1] + prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] + prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb + + # (5) combine AR and NAR results + gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] + gen_scores_nar = torch.stack(generated["score"], dim=2) + + gen_tokens = torch.cat( + [gen_tokens_ar, gen_tokens_nar], dim=2 + ) # [B, T, nq] + gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + + gen_tokens_list, gen_scores_list = [], [] + for b in range(len(valid_idx)): + gen_tokens_list.append(gen_tokens[b][: finish_idx[b]]) + gen_scores_list.append(gen_scores[b][: finish_idx[b]]) + + return gen_tokens_list, gen_scores_list + + def _initialize(self): + for m in self.modules(): + if isinstance(m, torch.nn.Linear): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + if m.bias is not None: + torch.nn.init.zeros_(m.bias) + elif isinstance(m, torch.nn.Embedding): + torch.nn.init.normal_(m.weight, mean=0.0, std=0.02) + + +class ResidualAttentionBlock(nn.Module): + def __init__( + self, + n_state: int, + n_head: int, + cross_attention: bool = False, + causal: bool = False, + qk_norm: bool = False, + dropout: float = 0.0, + ): + super().__init__() + + self.attn = MultiHeadAttention( + n_state, n_head, causal=causal, qk_norm=qk_norm, dropout=dropout, + ) + self.attn_ln = LayerNorm(n_state) + self.attn_dropout = nn.Dropout(p=dropout) + + self.cross_attn = ( + MultiHeadAttention( + n_state, n_head, causal=False, qk_norm=qk_norm, dropout=dropout, + ) + if cross_attention + else None + ) + self.cross_attn_ln = 
LayerNorm(n_state) if cross_attention else None + self.cross_attn_dropout = ( + nn.Dropout(p=dropout) if cross_attention else None + ) + + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state) + ) + self.mlp_ln = LayerNorm(n_state) + self.mlp_dropout = nn.Dropout(p=dropout) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + x = x + self.attn_dropout( + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x))) + return x + + +class TransformerDecoder(nn.Module): + def __init__( + self, + n_ctx: int, + n_state: int, + n_head: int, + n_layer: int, + causal: bool = True, + qk_norm: bool = False, + dropout: float = 0.0, + layer_class=ResidualAttentionBlock, + ): + super().__init__() + + self.pos_emb = nn.Embedding(n_ctx, n_state) + + self.blocks = nn.ModuleList( + [ + layer_class( + n_state=n_state, + n_head=n_head, + cross_attention=False, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + for _ in range(n_layer) + ] + ) + self.ln = LayerNorm(n_state) + + self.causal = causal + self.kv_cache = None + + def forward( + self, + x: Tensor, + mask: torch.Tensor = None, + kv_cache: Optional[dict] = None, + ): + if self.causal and mask is not None: + raise ValueError("Causal Transformer dones't allow mask") + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + + for block in self.blocks: + x = block(x, mask=mask, kv_cache=kv_cache) + + x = self.ln(x) + return x + + def init(self): + self.kv_cache, self.hooks = install_kv_cache_hook(self, self.kv_cache) + return self.kv_cache + + def reset(self,): + for hook in self.hooks: + hook.remove() + self.kv_cache = None + + +class LayerNorm(nn.LayerNorm): + def forward(self, x: Tensor) -> Tensor: + return super().forward(x.float()).type(x.dtype) + + +class Linear(nn.Linear): + def forward(self, x: Tensor) -> Tensor: + return F.linear( + x, + self.weight.to(x.dtype), + None if self.bias is None else self.bias.to(x.dtype), + ) + + +class ResidualAttentionBlockAdaLN(ResidualAttentionBlock): + def __init__( + self, + n_state: int, + n_head: int, + cross_attention: bool = False, + causal: bool = False, + qk_norm: bool = False, + dropout: float = 0.0, + ): + super(ResidualAttentionBlockAdaLN, self).__init__( + n_state=n_state, + n_head=n_head, + cross_attention=cross_attention, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + ) + + self.attn_ln = AdaLN(n_state) + self.mlp_ln = AdaLN(n_state) + + def forward( + self, + x: Tensor, + level: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + x = x + self.attn_dropout( + self.attn(self.attn_ln(x, level), mask=mask, kv_cache=kv_cache) + ) + if self.cross_attn: + x = x + self.cross_attn_dropout( + self.cross_attn( + self.cross_attn_ln(x, level), xa, kv_cache=kv_cache + ) + ) + x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x, level))) + return x + + +class ValleNARDecoder(TransformerDecoder): + def __init__( + self, + n_level: int, + n_ctx: int, + n_state: int, + n_head: int, + n_layer: int, + causal: bool = False, + qk_norm: bool = False, + dropout: float = 0.0, + layer_class=ResidualAttentionBlockAdaLN, + ): + + super().__init__( 
+ n_ctx=n_ctx, + n_state=n_state, + n_head=n_head, + n_layer=n_layer, + causal=causal, + qk_norm=qk_norm, + dropout=dropout, + layer_class=layer_class, + ) + + self.level_emb = nn.Embedding(n_level, n_state) + self.ln = AdaLN(n_state) + + def forward( + self, + x: Tensor, + level: Tensor, + mask: Tensor = None, + kv_cache: Optional[dict] = None, + ): + if self.causal and mask is not None: + raise ValueError("mask is not allowed when causal") + + level = self.level_emb(level) + + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + + for block in self.blocks: + x = block(x, level=level, mask=mask, kv_cache=kv_cache) + + x = self.ln(x, level) + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + n_state: int, + n_head: int, + causal: bool = False, + qk_norm: bool = False, + dropout: float = 0.0, + ): + super().__init__() + assert n_state % n_head == 0 + self.n_head = n_head + self.query = Linear(n_state, n_state) + self.key = Linear(n_state, n_state, bias=False) + self.value = Linear(n_state, n_state) + self.out = Linear(n_state, n_state) + self.causal = causal + self.dropout = dropout + + self.qk_norm = qk_norm + if qk_norm: + self.q_norm = LayerNorm(n_state // n_head) + self.k_norm = LayerNorm(n_state // n_head) + + if not hasattr(F, "scaled_dot_product_attention"): + raise ValueError("Install torch 2.0.1+ to support Flash Attention") + + try: + from flash_attn import flash_attn_func + + self.flash_attn_func = flash_attn_func + except ImportError: + self.flash_attn_func = None + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. + k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. 
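+            # NOTE: the cache is keyed by the projection modules themselves
+            # (see install_kv_cache_hook), so the previously computed key and
+            # value projections can be looked up directly.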
+ k = kv_cache[self.key] + v = kv_cache[self.value] + + wv = self.qkv_attention(q, k, v, mask) + + return self.out(wv) + + def qkv_attention( + self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None + ): + if self.causal and mask is not None: + raise ValueError("mask is not allowed when the attention is causal") + + if self.causal and q.size(1) == k.size(1): + causal = True + else: + causal = False + + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + + if self.qk_norm: + q = self.q_norm(q) + k = self.k_norm(k) + + if self.flash_attn_func is not None and mask is None and self.training: + wv = self.flash_attn_func( + q.transpose(1, 2), + k.transpose(1, 2), + v.transpose(1, 2), + dropout_p=self.dropout, + causal=causal, + ).flatten(start_dim=2) + else: + wv = ( + F.scaled_dot_product_attention( + q, k, v, mask, is_causal=causal, dropout_p=self.dropout + ) + .permute(0, 2, 1, 3) + .flatten(start_dim=2) + ) + + return wv + + +class AdaLN(nn.Module): + def __init__(self, n_state, eps=1e-5): + super().__init__() + self.weight = nn.Linear(n_state, n_state, bias=False) + self.bias = nn.Linear(n_state, n_state, bias=False) + nn.init.constant_(self.weight.weight, 1.0) + nn.init.constant_(self.bias.weight, 0.0) + + self.n_state = n_state + self.eps = eps + + def forward(self, x: Tensor, level_emb: Tensor): + w = self.weight(level_emb).unsqueeze(1) + b = self.bias(level_emb).unsqueeze(1) + x = nn.functional.layer_norm(x, (self.n_state,), eps=self.eps) + x = w * x + b + return x + + +def install_kv_cache_hook(model, cache): + cache = {**cache} if cache is not None else {} + hooks = [] + + def save_to_cache(module, _, output): + if module not in cache: + # save as-is, for the first token or cross attention + cache[module] = output + else: + cache[module] = torch.cat([cache[module], output], dim=1).detach() + return cache[module] + + def install_hooks(layer: torch.nn.Module): + if isinstance(layer, MultiHeadAttention): + hooks.append(layer.key.register_forward_hook(save_to_cache)) + hooks.append(layer.value.register_forward_hook(save_to_cache)) + + model.apply(install_hooks) + return cache, hooks + + +def logits_to_tokens( + logits: torch.Tensor, + opts: SpeechLMInferenceOptions, + mask: torch.Tensor, + search_algo: str = None, + allow_eos: bool = True, + nq_level: int = None, +): + """ + Select the generated tokens and their scores based on logits prediction. + + logits (torch.Tensor), predicted logits, of size [B, T, nq, V] + opts (SpeechLMInferenceOptions): search options + mask (torch.Tensor): mask to specify valid tokens, of size [B, 1, nq, V] + search_algo (str): search algorithm + allow_eos (bool): whether to allow end-of-sentence prediction + nq_level (int or None): if not None, only conpute the specified codec level nq. 
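+
+    Returns:
+        gen_token_idx (torch.Tensor): selected token indices, of size [B, T, nq]
+            (matching the first three dimensions of the input logits)
+        gen_token_score (torch.Tensor): log-probabilities of the selected
+            tokens, of the same size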
+ + """ + + assert logits.dim() == 4 + search_algo = search_algo if search_algo is not None else opts.search_algo + neg_inf = torch.finfo(logits.dtype).min + + # (1) Apply mask + if nq_level is not None: + mask = mask[:, :, nq_level : nq_level + 1] + + if allow_eos: + mask = mask.clone() + mask[:, :, 0, opts.eos] = False + + logits.masked_fill_(mask, neg_inf) + + # (2) token selection + if search_algo in ["topk_sampling"]: + topk_values, topk_indices = torch.topk(logits, opts.top_k, dim=-1) + probs = torch.softmax(topk_values / opts.sampling_temperature, dim=-1) + inner_indices = torch.multinomial( + probs.flatten(end_dim=-2), num_samples=1 + ).view(probs[..., :1].size()) + gen_token_idx = torch.gather(topk_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["topp_sampling"]: + probs = torch.softmax(logits / opts.sampling_temperature, dim=-1) + sorted_probs, sorted_indices = torch.sort(probs, descending=True) + accum_probs = torch.cumsum(sorted_probs, dim=-1) + clip_probs = torch.where(accum_probs <= opts.top_p, sorted_probs, 0.0) + # always keep at least one candidate no matter what value it is + if torch.any(clip_probs[..., 0] == 0.0): + clip_probs[..., 0] = sorted_probs[..., 0] + clip_probs = clip_probs / clip_probs.sum(dim=-1, keepdim=True) + inner_indices = torch.multinomial( + clip_probs.flatten(end_dim=-2), num_samples=1 + ).view(clip_probs[..., :1].size()) + gen_token_idx = torch.gather(sorted_indices, -1, inner_indices).squeeze( + -1 + ) + gen_token_score = ( + torch.gather(clip_probs, -1, inner_indices).squeeze(-1).log() + ) + + elif search_algo in ["greedy_search", "teacher_force"]: + probs = logits.softmax(dim=-1) + topk_values, topk_indices = torch.topk(logits, 1, dim=-1) + gen_token_idx = topk_indices[:, :, :, 0] + gen_token_score = topk_values[:, :, :, 0].log() + + else: + raise NotImplementedError(f"opts.search_algo={opts.search_algo}") + + return gen_token_idx, gen_token_score + + +@torch.no_grad() +def install_continuous_features( + dec_emb: torch.Tensor, + enc_emb: Optional[torch.Tensor] = None, + conti_feats: Tuple = None, +): + if conti_feats is None: + return dec_emb, enc_emb + + assert dec_emb.size(0) == len(conti_feats) + if enc_emb is not None: + assert enc_emb.size(0) == len(conti_feats) + + for b, conti_feat in enumerate(conti_feats): + for conti_emb, start, end, part in conti_feat: + if part == "dec": + assert conti_emb.size(1) == dec_emb.size(2) + dec_emb[b, start:end] = conti_emb + else: + assert conti_emb.size(1) == enc_emb.size(2) + enc_emb[b, start:end] = conti_emb + + return dec_emb, enc_emb + + +def modality_index_to_mask( + modality_index: torch.Tensor, inference_opts: SpeechLMInferenceOptions, +): + assert modality_index.dim() == 1 + modality_index = modality_index.cpu().tolist() + mask = torch.stack( + [inference_opts.masks[idx] for idx in modality_index], dim=0 + ).unsqueeze( + 1 + ) # [B, 1, nq, V] + + return mask + + +def masked_nll_loss( + log_probabilities, targets, mask, allowed_len_diff=3, reduction="mean" +): + """Similar to the standard nll_loss from SpeechBrain + but applies a custom mask + + Arguments + --------- + log_probabilities : torch.Tensor + The probabilities after log has been applied. + Format is [batch, log_p] or [batch, frames, log_p]. + targets : torch.Tensor + The targets, of shape [batch] or [batch, frames]. 
+ mask : torch.Tensor + The mask for loss calculation + allowed_len_diff : int + Length difference that will be tolerated before raising an exception. + reduction : str + Options are 'mean', 'batch', 'batchmean', 'sum'. + See pytorch for 'mean', 'sum'. The 'batch' option returns + one loss per item in the batch, 'batchmean' returns sum / batch size. + """ + log_probabilities, targets = truncate( + log_probabilities, targets, allowed_len_diff + ) + log_probabilities = log_probabilities.transpose(1, -1) + loss = torch.nn.functional.nll_loss( + input=log_probabilities, target=targets.long(), reduction="none" + ) + loss *= mask + loss = reduce_loss(loss, mask, reduction, 0.0, log_probabilities, targets) + return loss From 3ce4d4daaa4bb284f113185399af30ff76fe68f1 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 17:30:43 -0500 Subject: [PATCH 070/270] DASB: Fixes --- .../LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ce4e6edaa..9027a945b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -151,7 +151,7 @@ activation: !name:torch.nn.GELU audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False -audio_emb_pretrained: True +audio_emb_pretrained: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice From 57d68cf97c9eb5e3c8113b6542eb6dd5877c6366 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 20 Jan 2025 18:15:24 -0500 Subject: [PATCH 071/270] DASB: Fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 75cbae717..e88469b3b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -34,7 +34,6 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 @@ -211,7 +210,7 @@ modules: opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:model.Tokotron.Tokotron.oss +compute_cost: !new:model.Tokotron.TokotronLoss guided_attention_weight: !ref guided_attention_sigma: !ref gate_weight: !ref From c1c3b52fcf05667970c8a7ea9fbd3e43ddbaf94f Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 21 Jan 2025 14:00:27 -0500 Subject: [PATCH 072/270] DASB: Add a "brokenness check" to ensure that tokens runs that produce no samples at all (non-intelligble speech) are not picked by the hyperparameter optimizer --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 6 +++ .../DASB/LJSpeech/TTS/tokotron/train.py | 53 ++++++++++++++++--- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index e7ffe2576..717532724 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -52,3 +52,9 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean 
dwer: asr_dwer_median + +eval_threshold: + dwer_max: 90.0 + +eval_threshold_set: + utmos: 0.0 \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 8945607f9..05f81b341 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -273,11 +273,7 @@ def on_stage_end(self, stage, stage_loss, epoch): # End evaluation and report stats if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): self.evaluator.on_evaluate_end() - eval_summary = self.evaluator.compute_summary() - eval_summary_stats = { - key: eval_summary.get(value) - for key, value in self.hparams.eval_summary_log.items() - } + eval_summary_stats = self.get_summary_stats() stage_stats.update(eval_summary_stats) # Perform end-of-iteration things, like annealing, logging, etc. @@ -303,6 +299,51 @@ def on_stage_end(self, stage, stage_loss, epoch): meta={"loss": stage_stats["loss"]}, min_keys=["loss"], ) + def get_summary_stats(self): + """Retrieves the stats that needs to be reported on every trial + in the train log, as indicated in eval_summary_log in eval.yaml + + Returns + ------- + eval_summary_stats : dict + A dict with stats""" + eval_summary = self.evaluator.compute_summary() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + self._check_threshold(eval_summary_stats) + return eval_summary_stats + + def _check_threshold(self, eval_summary_stats): + """Checks threshold values for the defined stats and terminates + the trials if the parameters are not met. This is necessary because + some metrics produce bogus high values when the speech samples + do not contain any speech at all (e.g. UTMOS can be above 3 for + silence). + + Classic usage: dWER > 0.9 - treat the whole run as "garbage", set + UTMOS to 0 + + Arguments + --------- + eval_summary_stats : dict + Summary statistics + """ + for key, threshold_value in self.hparams.eval_threshold.items(): + key, threshold_type = key.split("_") + value = eval_summary_stats[key] + if threshold_type == "min": + meets = value >= threshold_value + elif threshold_type == "max": + meets = value <= threshold_value + else: + raise ValueError(f"Invalid threshold definition: {key}, check eval_threshold") + if not meets: + eval_summary_stats["broken"] = True + for key, value in self.hparams.eval_threshold_set.items(): + eval_summary_stats[key] = value + def fit_batch(self, batch): """Fit one batch, override to do multiple updates. 
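For reference, the brokenness check introduced above can be reproduced in isolation. The sketch below is not part of the patch; it plugs the `eval_threshold` and `eval_threshold_set` entries from eval.yaml into the same key-parsing logic as `_check_threshold`, with made-up metric values and the error branch omitted:

    # Standalone illustration of the threshold check (hypothetical stats values)
    eval_threshold = {"dwer_max": 90.0}        # keys follow "<metric>_<min|max>"
    eval_threshold_set = {"utmos": 0.0}        # values forced when a run is flagged
    eval_summary_stats = {"utmos": 3.2, "dwer": 95.0}

    for key, threshold_value in eval_threshold.items():
        metric, threshold_type = key.split("_")
        value = eval_summary_stats[metric]
        meets = (
            value >= threshold_value
            if threshold_type == "min"
            else value <= threshold_value
        )
        if not meets:
            eval_summary_stats["broken"] = True
            eval_summary_stats.update(eval_threshold_set)

    # eval_summary_stats -> {"utmos": 0.0, "dwer": 95.0, "broken": True}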
@@ -371,7 +412,7 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device + self.modules.tokenizer.device = self.device if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device From 123e12454d4466343da74075bb62daf90a8cabd4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 21 Jan 2025 14:48:09 -0500 Subject: [PATCH 073/270] DASB: Tokotron/VALL-E: Work in progress --- .../TTS/tokotron/hparams/train_encodec.yaml | 1 - .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 2 - .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 2 +- .../TTS/tokotron/hparams/train_dac.yaml | 11 - .../tokotron/hparams/train_discrete_ssl.yaml | 1 - .../TTS/tokotron/hparams/train_encodec.yaml | 1 - .../hparams/train_speech_tokenizer.yaml | 7 - .../DASB/LibriTTS/TTS/tokotron/train.py | 58 +- .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 0 .../TTS/valle/hparams/train_discrete_ssl.yaml | 280 ++++++ .../TTS/valle/hparams/train_encodec.yaml | 225 +++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 866 ++++++++++++++++++ benchmarks/DASB/model/valle.py | 103 ++- 13 files changed, 1434 insertions(+), 123 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/train.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 0e923ffc9..39b28b437 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -32,7 +32,6 @@ test_json: !ref /test.json frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 377b5955c..aa7ee2c4b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -10,7 +10,6 @@ import speechbrain as sb import json import logging -import math import csv import torch import torchaudio @@ -19,7 +18,6 @@ from pathlib import Path from types import SimpleNamespace from torch.nn import ModuleDict -from tqdm.auto import tqdm from data import undo_batch from eval import vocoder_to_device from torch.utils.flop_counter import FlopCounterMode diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index 18e39ba42..bafd769cc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -67,4 +67,4 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean dwer: asr_dwer_median - spk_sim: spk_sim_score_mean + spk_sim: spk_sim_score_mean \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 8c05d2499..5670aa208 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -33,7 +33,6 
@@ test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 @@ -129,16 +128,6 @@ use_silence_padding: True # Token model (pretrained) -dac: !new:speechbrain.lobes.models.discrete.dac.DAC - sample_rate: !ref - model_type: !ref - model_bitrate: !ref - load_pretrained: True - -token_model: !new:benchmarks.DASB.model.custom_model.DACFeatureExtractor - dac: !ref - n_quantizers: !ref - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 5c8db0bc4..88bc91aef 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -36,7 +36,6 @@ test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 30d2cbfe4..be49b69f6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -36,7 +36,6 @@ test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress -progress_archive: !ref /progress.tar progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index c307ed0bf..aac74070a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -130,13 +130,6 @@ use_silence_padding: True # Token model (pretrained) -# Token model (pretrained) -token_model: !new:benchmarks.DASB.model.custom_model.SpeechTokenizerInterface - source: !ref - save_path: !ref - shape: compat - codebooks: !ref - spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams source: !ref savedir: !ref /ecapa diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index da228d6ae..3ab03d0a3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -84,7 +84,7 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device + self.modules.tokenizer.device = self.device if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device @@ -383,62 +383,6 @@ def evaluate_batch(self, batch, stage): self.evaluator.evaluate_batch(batch) return loss.detach().cpu() - def make_dataloader( - self, dataset, stage, ckpt_prefix="dataloader-", **loader_kwargs - ): - """A custom override of make_dataloader that will change the batch - size if guides are enabled to meet GPU memory constraints - - Arguments - --------- - dataset : Dataset - A set of 
data to use to create data loader. If the Dataset is a - DynamicItemDataset, PaddedBatch is used as the default collate_fn, - unless specified in loader_kwargs. - stage : Stage - The stage of the experiment: Stage.TRAIN, Stage.VALID, Stage.TEST - ckpt_prefix : str, None - Prefix to use for SaveableDataLoader Checkpoint name. The Stage - name is added to this to create the full key. Set to None to not - save the DataLoader. - **loader_kwargs : dict - Additional keyword arguments to the DataLoader. - E.g., batch_size, num_workers, pin_memory. - - Returns - ------- - DataLoader for the input dataset - """ - if stage == sb.Stage.TRAIN and not getattr( - self, "_ckpt_recovered", False - ): - self.checkpointer.recover_if_possible() - self._ckpt_recovered = True - if self.guides_running(pre_epoch=True): - loader_kwargs["batch_size"] = self.hparams.batch_size_guided - return super().make_dataloader( - dataset=dataset, - stage=stage, - ckpt_prefix=ckpt_prefix, - **loader_kwargs, - ) - - def guides_running(self, pre_epoch=False): - """Determines whether guides are currently running - - Arguments - --------- - pre_epoch : bool - If enabled, a correction will be applied to the current epoch - indicating that the current epoch has not yet started""" - epoch = self.hparams.epoch_counter.current - if pre_epoch: - epoch += 1 - return ( - self.hparams.guides_enabled - and epoch >= self.hparams.guides_start_epoch - ) - def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml new file mode 100644 index 000000000..b5eb30f62 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -0,0 +1,280 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/discrete_ssl +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +ssl_model_type: wavlm +representation_mode: discrete +output_folder: !ref results/tokotron/// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +vocoder_model_name: !ref unithifigan-dasb--discrete +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +token_model_src: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: microsoft/wavlm-large + hubert: facebook/hubert-large-ll60k + wav2vec2: facebook/wav2vec2-large-960h-lv60-self +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +vocoder_repo_id: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + hubert: speechbrain/hifigan-hubert-k1000-LibriTTS + wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS + wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS + +ssl_model_layers: [1, 3, 7, 12, 18, 23] +token_model_layers: !ref +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref +silence_padding: !ref +use_silence_padding: True + +# Token model (pretrained) +ssl_model: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + wavlm: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + hubert: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True + wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2 + source: !ref + save_path: !ref + freeze: !ref + output_all_hiddens: True +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + decoder_mode: !ref + scale_factor: !ref + representation_mode: discrete + +tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer + save_path: !ref + ssl_model: !ref + vocoder_repo_id: !ref + kmeans_dataset: !ref + num_clusters: 
!ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: discrete + + +lr_annealing: !new:model.Tokotron.TargetedNoamScheduler + lr_initial: [!ref , !ref ] + n_warmup_steps: !ref + param_group: 0 + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..39b28b437 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,225 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git 
a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py new file mode 100644 index 000000000..ebcc78015 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -0,0 +1,866 @@ +#!/usr/bin/env/python3 +"""Recipe for training a Text-to-Speech system based on tokenized audio + +Inspired by WhisperSpeech +https://github.com/collabora/WhisperSpeech + +However, this is not an implementation of WhisperSpeech, but rather +a radical simplification of it that uses only an acoustic model + + +Authors + * Artem Ploujnikov 2024 +""" + + +import logging +import speechbrain as sb +import torch +import sys +import shutil +from pathlib import Path +from hyperpyyaml import load_hyperpyyaml +from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio +from speechbrain.dataio.dataio import write_audio +from speechbrain.utils.distributed import run_on_main +from speechbrain.utils.data_utils import batch_pad_right +import re +import string + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats + +logger = logging.getLogger(__name__) + +SPECIAL_TOKEN_COUNT = 1 + + +# Brain class for speech recognition training +class VALLEBrain(sb.Brain): + """Class that manages the training loop. See speechbrain.core.Brain.""" + + def __init__( + self, + modules=None, + opt_class=None, + hparams=None, + run_opts=None, + checkpointer=None, + ): + super().__init__( + modules, opt_class, hparams, run_opts, checkpointer, + ) + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + self.modules.tokenizer.device = self.device + if hasattr(self.modules.tokenizer, "codec_vocoder"): + self.modules.tokenizer.codec_vocoder.to(self.device) + self.modules.tokenizer.codec_vocoder.device = self.device + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() + wav = self.modules.tokenizer.tokens_to_sig(audio) + clean_padding_(wav, length) + return wav + + def compute_forward(self, batch, stage): + """Runs all the computation of the Tokotron TTS + + Arguments + --------- + batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + predictions : dict + TTS predictions + """ + batch = batch.to(self.device) + prompt, prompt_length = batch.prompt + batch_size, prompt_max_len, num_tracks = prompt.shape + nar_track = torch.randint( + 1, num_tracks, (batch_size,), + device=self.device + ) + logits_ar, logits_nar = self.modules.model( + dec_seq=batch.prompt.data, + dec_seq_lengths=batch.prompt.lengths, + prefix_len=batch.prefix_length / prompt_max_len, + nar_level_idx=nar_track + ) + return logits_ar, logits_nar, nar_track + + def compute_objectives(self, predictions, batch, stage): + """Computes the loss given the predicted and targeted outputs. We here + do multi-task learning and the loss is a weighted sum of the ctc + seq2seq + costs. + + Arguments + --------- + predictions : dict + The output dict from `compute_forward`. 
+ batch : PaddedBatch + This batch object contains all the relevant tensors for computation. + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + + Returns + ------- + loss : torch.Tensor + A one-element tensor used for backpropagating the gradient. + """ + batch = batch.to(self.device) + + logits_ar, logits_nar, nar_track = predictions + prompt, prompt_length = batch.prompt + prefix_length = batch.prefix_length + + logits_ar_sm = self.hparams.log_softmax(logits_ar) + logits_nar_sm = self.hparams.log_softmax(logits_nar) + batch_size, max_len, _ = prompt.shape + targets_ar = prompt[:, 1:, 0] + batch_idx = torch.arange(batch_size, device=prompt.device) + targets_nar = prompt[batch_idx, 1:, nar_track] + prompt_max_len = prompt.size(1) + length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len) + prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not() + mask = (length_mask * prefix_mask)[:, 1:] + + loss_ar = self.hparams.compute_cost( + log_probabilities=logits_ar_sm, + targets=targets_ar, + mask=mask + ) + self.loss_metric_ar.append( + ids=batch.uttid, + log_probabilities=logits_ar_sm, + targets=targets_ar, + mask=mask, + reduction="batch", + ) + loss_nar = self.hparams.compute_cost( + log_probabilities=logits_nar_sm, + targets=targets_nar, + mask=mask, + ) + self.loss_metric_nar.append( + ids=batch.uttid, + log_probabilities=logits_nar_sm, + targets=targets_nar, + mask=mask, + reduction="batch", + ) + loss = loss_ar + loss_nar + return loss + + def on_stage_start(self, stage, epoch): + """Gets called at the beginning of each epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + self.offsets = get_offsets( + self.hparams.vocab_size, + self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( + metric=self.hparams.compute_cost, batch_eval=True, + ) + self.loss_metric_ar = sb.utils.metric_stats.MetricStats( + metric=self.hparams.compute_cost, + batch_eval=True, + ) + self.loss_metric_nar = sb.utils.metric_stats.MetricStats( + metric=self.hparams.compute_cost, + batch_eval=True, + ) + + # TOOO: Reestablish evaluation + self.is_evaluating = False + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: + self.evaluation_metric.on_evaluation_start() + self.is_evaluating = True + + def is_eval_epoch(self, epoch): + """Determines whether or not evaluation should be performed + in the specieied epoch + + Arguments + --------- + epoch : int + The epoch number. If omitted, the epoch number from the + epoch counter will be used + + Returns + ------- + eval_epoch : bool + True if evaluation should be run in this epoch, false + otherwise""" + if epoch is None: + epoch = self.hparams.epoch_counter.current + return epoch % self.hparams.eval_interval == 0 + + def on_fit_start(self): + """Gets called at the beginning of ``fit()``, on multiple processes + if ``distributed_count > 0`` and backend is ddp. + + Default implementation compiles the jit modules, initializes + optimizers, and loads the latest checkpoint to resume training. + """ + # Run this *after* starting all processes since jit/compiled modules + # cannot be pickled. 
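Stepping back briefly to the objective above: the AR and NAR cross-entropies are only evaluated on audio positions, i.e. steps that fall inside the valid (non-padded) length but past the text prefix. A minimal sketch of that mask in plain PyTorch, assuming relative lengths in [0, 1] as produced by PaddedBatch (illustrative only; the recipe itself relies on speechbrain's length_to_mask):

import torch

def prompt_loss_mask(rel_lengths, prefix_lengths, max_len):
    # rel_lengths: [B] relative lengths in [0, 1] (PaddedBatch convention)
    # prefix_lengths: [B] absolute text-prefix lengths, in tokens
    # max_len: padded prompt length
    positions = torch.arange(max_len)[None, :]                     # [1, T]
    abs_lengths = (rel_lengths * max_len).round().long()[:, None]  # [B, 1]
    inside_length = positions < abs_lengths                        # valid, non-padded steps
    after_prefix = positions >= prefix_lengths[:, None]            # drop the text-prompt positions
    # The one-step shift mirrors targets = prompt[:, 1:] in compute_objectives
    return (inside_length & after_prefix)[:, 1:]

# One 10-step prompt with a 4-token text prefix and no padding
mask = prompt_loss_mask(torch.tensor([1.0]), torch.tensor([4]), 10)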
+ self._compile() + + # Wrap modules with parallel backend after jit + self._wrap_distributed() + + # Initialize optimizers after parameters are configured + self.init_optimizers() + + # Load latest checkpoint to resume training if interrupted + if self.checkpointer is not None and not getattr( + self, "_ckpt_recovered", False + ): + self.checkpointer.recover_if_possible() + self._ckpt_recovered = True + + @torch.no_grad() + def evaluate_batch(self, batch, stage): + """Evaluate one batch, override for different procedure than train. + + The default implementation depends on two methods being defined + with a particular behavior: + + * ``compute_forward()`` + * ``compute_objectives()`` + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for evaluation. Default implementation assumes + this batch has two elements: inputs and targets. + stage : Stage + The stage of the experiment: Stage.VALID, Stage.TEST + + Returns + ------- + detached loss + """ + out = self.compute_forward(batch, stage=stage) + loss = self.compute_objectives(out, batch, stage=stage) + if self.is_evaluating: + with torch.no_grad(): + audio_tokens, audio_length = self.inference(batch) + if self.hparams.flip_layers: + audio_tokens = audio_tokens.flip(2) + wav = self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.save_samples( + batch=batch, + wav=wav, + length=audio_length, + stage=stage + ) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + return loss.detach().cpu() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of an epoch. + + Arguments + --------- + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, sb.Stage.TEST + stage_loss : float + The average loss for all of the data processed in this stage. + epoch : int + The currently-starting epoch. This is passed + `None` during the test stage. + """ + + # Store the train loss until the validation stage. + loss_stats = self.loss_metric.summarize(flat=True) + stage_stats = {"loss": stage_loss, **loss_stats} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # End evaluation and report stats + if stage != sb.Stage.TRAIN and self.is_evaluating: + self.evaluation_metric.on_evaluation_end() + self.save_eval(stage) + eval_summary = self.evaluation_metric.summarize() + eval_summary_stats = { + key: eval_summary.get(value) + for key, value in self.hparams.eval_summary_log.items() + } + stage_stats.update(eval_summary_stats) + + # Perform end-of-iteration things, like annealing, logging, etc. + if stage == sb.Stage.VALID: + + if self.hparams.lr_annealing_mode == "epoch": + _, new_lr = self.hparams.lr_annealing(stage_loss) + sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr) + + lr = self.optimizer.param_groups[0]["lr"] + + # The train_logger writes a summary to stdout and to the logfile. + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + + # Save the current checkpoint and delete previous checkpoints. 
+ self.checkpointer.save_and_keep_only( + meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + ) + + def inference(self, batch): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference_results = [ + self.modules.model.inference( + prefix=prefix_item.unsqueeze(0), + opts=self._get_inference_opts() + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + def _get_inference_opts(self): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[None, :] + tracks = torch.arange(self.hparams.audio_tokens_per_step, device=self.device)[:, None] + track_start = ( + self.hparams.text_num_tokens + + self.hparams.special_num_tokens + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + return self.hparams.inference_opts( + masks={ + self.hparams.bos_index: mask + }, + device=self.device, + ) + + def save_samples(self, batch, wav, length, stage): + output_folder = self._get_eval_output_folder(stage) + samples = undo_padding_tensor(wav, length) + for uttid, sample in zip(batch.uttid, samples): + file_name = output_folder / f"pred_{uttid}.wav" + write_audio(file_name, sample, self.hparams.model_sample_rate) + + def save_eval(self, stage): + """Saves evaluation results + + stage : sb.Stage + One of sb.Stage.TRAIN, sb.Stage.VALID, or sb.Stage.TEST. + """ + output_folder = self._get_eval_output_folder(stage) + for src_file_name in self.evaluation_metric.files: + dest_file_name = output_folder / src_file_name.name + shutil.copyfile(src_file_name, dest_file_name) + self.evaluation_metric.clear() + + def _get_eval_output_folder(self, stage): + epoch = self.hparams.epoch_counter.current + output_folder = ( + Path(self.hparams.output_folder) / "eval" / stage.name.lower() + ) + if epoch is not None: + output_folder = output_folder / str(epoch) + output_folder.mkdir(exist_ok=True, parents=True) + return output_folder + + def fit_batch(self, batch): + loss = super().fit_batch(batch) + if self.hparams.lr_annealing_mode == "step": + self.hparams.lr_annealing(self.optimizer) + return loss + + +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} + + +def dataio_prepare(hparams): + """This function prepares the datasets to be used in the brain class. + It also defines the data processing pipeline through user-defined functions. + + + Arguments + --------- + hparams : dict + This dictionary is loaded from the `train.yaml` file, and it includes + all the hyperparameters needed for dataset construction and loading. + + Returns + ------- + datasets : dict + Dictionary containing "train", "valid", and "test" keys that correspond + to the DynamicItemDataset objects. 
+ silence_token : dict + the token used for silence + """ + + # Define datasets from json data manifest file + # Define datasets sorted by ascending lengths for efficiency + datasets = {} + data_folder = hparams["data_folder"] + data_info = { + "train": hparams["train_json"], + "valid": hparams["valid_json"], + "test": hparams["test_json"], + } + label_encoder = hparams["label_encoder"] + input_feature = INPUT_FEATURE_MAP[hparams["input"]] + offsets = get_offsets( + hparams["vocab_size"], + hparams["audio_tokens_per_step"] + ).unsqueeze(0) + if hparams["flip_layers"]: + offsets = offsets.flip(-1) + + tokens_loader = hparams.get("tokens_loader") + + @sb.utils.data_pipeline.takes("label") + @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") + def text_pipeline(label): + """Processes the transcriptions to generate proper labels""" + label_norm = label.upper() + yield label_norm + label_norm_eval = RE_PUNCTUATION.sub("", label_norm) + yield label_norm_eval + + @sb.utils.data_pipeline.takes(input_feature) + @sb.utils.data_pipeline.provides("tokens") + def tokens_pipeline(label): + """Processes the transcriptions to generate proper labels""" + return label_encoder.encode_sequence_torch(label) + + @sb.utils.data_pipeline.takes("uttid", "tokens") + @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") + def prompt_pipeline(id, tokens): + audio = tokens_loader.tokens_by_uttid( + id, num_codebooks=hparams["audio_tokens_per_step"] + ) + + if hparams["flip_layers"]: + audio = audio.flip(-1) + yield audio + num_tracks = audio.size(1) + prefix = torch.cat( + [ + torch.ones(1, num_tracks) * hparams["bos_index"], + tokens.unsqueeze(-1).expand(len(tokens), num_tracks), + torch.ones(1, num_tracks) * hparams["eot_index"], + ] + ) + yield prefix + prompt = torch.cat( + [ + prefix, + torch.ones(1, num_tracks) * hparams["bos_index"], + audio + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eos_index"], + ] + ).int() + yield prompt + yield len(prefix) + yield len(prompt) + + @sb.utils.data_pipeline.takes("wav") + @sb.utils.data_pipeline.provides("sig") + def sig_pipeline(wav): + sig = sb.dataio.dataio.read_audio(wav) + return sig + + dynamic_items = [text_pipeline, tokens_pipeline, prompt_pipeline] + + init_sequence_encoder(hparams) + use_spk_emb = hparams.get("use_spk_emb", False) + prepared_features = ["audio_tokens"] + output_keys = [ + "uttid", + "tokens", + "label_norm", + "audio", + "prompt", + "prefix_length", + "length" + ] + if use_spk_emb: + prepared_features.append("spk_emb") + output_keys.append("spk_emb") + + for dataset in data_info: + dataset_dynamic_items = list(dynamic_items) + dataset_output_keys = list(output_keys) + if dataset != "train": + dataset_dynamic_items.append(sig_pipeline) + dataset_output_keys += ["sig", "label_norm_eval", "prefix"] + dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( + json_path=data_info[dataset], + replacements={"data_root": data_folder}, + dynamic_items=dataset_dynamic_items, + output_keys=dataset_output_keys, + ) + + datasets[dataset] = dynamic_dataset + hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + + # Sorting training data with ascending order makes the code much + # faster because we minimize zero-padding. In most of the cases, this + # does not harm the performance. 
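Before the sorting logic below, it is worth spelling out the prompt layout built by prompt_pipeline above: text tokens, the BOS/EOT/EOS markers and the codec tokens all share a single vocabulary, and each codec track k is shifted by audio_token_shift + k * vocab_size so the tracks occupy disjoint ranges. A small sketch with made-up sizes (audio_token_shift and vocab_size come from the hparams file in the actual recipe):

import torch

# Made-up sizes for illustration; the real values come from the hparams file
text_num_tokens, special_num_tokens, vocab_size, tracks = 39, 3, 1024, 2
audio_token_shift = text_num_tokens + special_num_tokens

offsets = torch.arange(tracks) * vocab_size       # same idea as get_offsets(vocab_size, tracks)

# Raw codec tokens: 4 frames x 2 tracks, values in [0, vocab_size)
audio = torch.randint(0, vocab_size, (4, tracks))
shifted = audio + audio_token_shift + offsets     # track 0 -> [42, 1066), track 1 -> [1066, 2090)

# Undoing the shift recovers the codec indices, as create_waveform() does before vocoding
recovered = (shifted - audio_token_shift - offsets).clip(min=0)
assert torch.equal(recovered, audio)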
+ if hparams["sorting"] == "ascending": + datasets["train"] = datasets["train"].filtered_sorted(sort_key="length") + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "descending": + datasets["train"] = datasets["train"].filtered_sorted( + sort_key="length", reverse=True + ) + hparams["train_dataloader_opts"]["shuffle"] = False + + elif hparams["sorting"] == "random": + hparams["train_dataloader_opts"]["shuffle"] = True + pass + + else: + raise NotImplementedError( + "sorting must be random, ascending or descending" + ) + return datasets + + +def get_offsets(vocab_size, tracks): + """Adds offsets to each track to treat the tokens as distinct + + Arguments + --------- + vocab_size : int + The vocabulary size, for each track + tracks : int + The number of tracks + """ + return torch.arange(tracks) * vocab_size + + +def init_sequence_encoder(hparams): + """Initialize a sequence encoder + + Arguments + --------- + hparams: dict + parsed hyperparameters + prefix: str + the prefix to be prepended to hyperparameter keys, per the naming + convention + + {prefix}_label_encoder: the hparams key for the label encoder + {prefix}_list_file: the hparams key for the list file + + Returns + ------- + encoder: speechbrain.dataio.encoder.TextEncoder + an encoder instance""" + encoder = hparams["label_encoder"] + token_list_file_name = hparams["token_list_file"] + tokens = read_token_list(token_list_file_name) + encoder.add_unk() + encoder.update_from_iterable(tokens, sequence_input=False) + encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + return encoder + + +def read_token_list(file_name): + """Reads a simple text file with tokens (e.g. characters or phonemes) listed + one per line + + Arguments + --------- + file_name: str + the file name + + Returns + ------- + result: list + a list of tokens + """ + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): + raise ValueError(f"Token file {file_name} not found") + with open(file_name) as token_file: + return [line.strip("\r\n") for line in token_file if line] + + +def apply_overfit_test(hparams, dataset): + """Helper for applying an overfit test conditionally based + on hyperparameters: + + `overfit_test`: whether or not to apply an overfit test + `overfit_test_sample_count`: the number of samples to use from the + original dataset + `overfit_test_epoch_data_count`: the number of samples per epoch + + The function will accept datasets, (train, valid, test) tuples + or dictionaries of the form: + {"train": dataset1, "valid": dataset2, "test": dataset3} + + If a tuple or dictionary is used, the training dataset will be of length + overfit_test_epoch_data_count wheres the evaluation dataset will be of + length overfit_test_sample_count. 
+ + Arguments + --------- + hparams: dict + parsed hyperparameters + dataset: DynamicItemDataset|tuple|dict + One of the following + a dataset + a dictionary ({"train": dataset1, "valid": dataset2, "test": dataset3}) + a (train, valid, test) tuple of datasets + + Returns + ------- + result: DynamicItemDataset|tuple|dict + a dataset or collection of datasets suitable for + an overfitting test - in the same format as the + dataset argument (single dataset, dictionary and tuple) + """ + if hparams["overfit_test"]: + if isinstance(dataset, tuple): + dataset_train, _, _ = dataset + dataset_train = apply_overfit_test(hparams, dataset_train) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = dataset_train, dataset_eval, dataset_eval + elif isinstance(dataset, dict): + dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval = dataset_train.filtered_sorted( + select_n=hparams["overfit_test_sample_count"] + ) + result = { + "train": dataset_train, + "valid": dataset_eval, + "test": dataset_eval, + "sample": dataset_eval, + } + else: + result = dataset.overfit_test( + hparams["overfit_test_sample_count"], + hparams["overfit_test_epoch_data_count"], + ) + else: + result = dataset + return result + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. + + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Initialize ddp (useful only for multi-GPU DDP training) + sb.utils.distributed.ddp_init_group(run_opts) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml = "\n".join([yaml, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml, overrides, overrides_must_match=True) + + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=hparams["output_folder"], + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + from ljspeech_prepare import prepare_ljspeech + + # Data preparation, to be run on only one process. + if not hparams["skip_prep"]: + from libritts_prepare import prepare_libritts + + # Data preparation, to be run on only one process. 
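One detail worth illustrating before the data-preparation call below: because the ESPNET-style VALL-E model does not support batched inference, the per-utterance outputs are re-batched with batch_pad_right and later unpadded again with the undo_padding_tensor helper defined above. The lengths are relative, so the round trip looks roughly like this (shapes are arbitrary):

import torch
from speechbrain.utils.data_utils import batch_pad_right

# Two variable-length token sequences, e.g. [T, n_tracks]
seqs = [torch.ones(5, 2), torch.ones(8, 2)]
padded, rel_lengths = batch_pad_right(seqs)           # padded: [2, 8, 2]; rel_lengths are fractions of the max length
restored = undo_padding_tensor(padded, rel_lengths)   # helper defined earlier in this file
assert [s.shape[0] for s in restored] == [5, 8]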
+ if not hparams["skip_prep"]: + run_on_main( + prepare_libritts, + kwargs={ + "data_folder": hparams["data_folder"], + "save_json_train": hparams["train_json"], + "save_json_valid": hparams["valid_json"], + "save_json_test": ( + hparams["test_json"] + if "test" in hparams["splits"] + else None + ), + "sample_rate": hparams["sample_rate"], + "train_split": hparams["train_split"], + "valid_split": hparams["valid_split"], + "test_split": ( + hparams["test_split"] + if "test" in hparams["splits"] + else None + ), + "seed": hparams["seed"], + "model_name": hparams["model"].__class__.__name__, + }, + ) + + + # We can now directly create the datasets for training, valid, and test + datasets = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) + + # Load best checkpoint for evaluation + if hparams["testing"]: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 11311d9d8..245ac0fd9 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -75,17 +75,17 @@ class ValleLM(nn.Module): def __init__( self, - vocab_size: int, - nq: int, - pad_id: int = 0, - share_emb: bool = True, - qk_norm: bool = False, - dropout: float = 0.0, - att_unit: int = 256, - head: int = 2, - ar_layer: int = 4, - nar_layer: int = 4, - n_ctx: int = 3000, + vocab_size, + nq, + pad_id=0, + share_emb=True, + qk_norm=False, + dropout=0.0, + att_unit=256, + head=2, + ar_layer=4, + nar_layer=4, + n_ctx=3000, ): super().__init__() @@ -120,23 +120,30 @@ def __init__( def forward( self, - dec_seq: torch.Tensor, - dec_seq_lengths: torch.Tensor = None, - prefix_len: torch.Tensor = None, - conti_feats: Tuple = None, + dec_seq, + dec_seq_lengths=None, + prefix_len=None, + conti_feats=None, nar_level_idx=1, - ) -> Tuple[torch.Tensor, torch.Tensor, Dict]: + ): """Vall-E forward for training - Args: - dec_seq (LongTensor): Batch of decoder sequences (B, T, nq). - dec_seq_lengths (LongTensor): Lengths of batched decoder sequences (B,). - enc_seq (LongTensor): Batch of encoder sequences (B, T, nq), keep - the interface, may not be used. - enc_seq_lengths (LongTensor): Lengths of batched encoder sequences (B,), - keep the interface, may not be used. - prefix_len (LongTensor): Lengths of condition part in dec_seq (B,). - compute_loss (bool): whether to compute loss or just logits. + Arguments + --------- + dec_seq : torch.Tensor + Batch of decoder sequences (B, T, nq). + dec_seq_lengths : torch.Tensor + Lengths of batched decoder sequences (B,). + enc_seq : torch.Tensor + Batch of encoder sequences (B, T, nq), keep + the interface, may not be used. 
+ enc_seq_lengths : torch.Tensor + Lengths of batched encoder sequences (B,), + keep the interface, may not be used. + prefix_len : torch.Tensor + Lengths of condition part in dec_seq (B,). + nar_level_idx : int + the index of the non-autoregressive level to train """ assert dec_seq.dim() == 3 @@ -196,19 +203,24 @@ def prepare_input(self, dec_seq_emb, prefix_len, level): @torch.no_grad() def inference( self, - prefix: torch.Tensor, - opts: SpeechLMInferenceOptions, - enc_seq: torch.Tensor = None, - suffix: torch.Tensor = None, + prefix, + opts, + enc_seq=None, + suffix=None, ): """Vall-E Inference. - Args: - prefix (LongTensor): Prefix part of dec_seq (B, T, nq). - opts (SpeechLMInferenceOptions): inference options. - enc_seq (LongTensor): Encoder token sequence (B, T, nq). - suffix (LongTensor): suffix part of dec_seq (B, T, nq), - usually the target sequence for teacher-forcing. + Arguments + --------- + prefix : torch.Tensor + Prefix part of dec_seq (B, T, nq). + opts : SpeechLMInferenceOptions + inference options. + enc_seq : torch.Tensor + Encoder token sequence (B, T, nq). + suffix : torch.Tensor + suffix part of dec_seq (B, T, nq), + usually the target sequence for teacher-forcing. """ # (1) initialization @@ -783,13 +795,20 @@ def logits_to_tokens( """ Select the generated tokens and their scores based on logits prediction. - logits (torch.Tensor), predicted logits, of size [B, T, nq, V] - opts (SpeechLMInferenceOptions): search options - mask (torch.Tensor): mask to specify valid tokens, of size [B, 1, nq, V] - search_algo (str): search algorithm - allow_eos (bool): whether to allow end-of-sentence prediction - nq_level (int or None): if not None, only conpute the specified codec level nq. - + Arguments + --------- + logits : torch.Tensor + predicted logits, of size [B, T, nq, V] + opts : SpeechLMInferenceOptions + search options + mask : torch.Tensor + mask to specify valid tokens, of size [B, 1, nq, V] + search_algo : str + search algorithm + allow_eos : bool + whether to allow end-of-sentence prediction + nq_level : int, optional + if not None, only conpute the specified codec level nq. 
""" assert logits.dim() == 4 From b1ca7adfdd5360905c41f4a6a9d74401a5e32d1f Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 21 Jan 2025 18:08:59 -0500 Subject: [PATCH 074/270] DASB: Tokotron: Implement SQCodec, Mimi and WavTokenizer (single-speaker) --- .../TTS/tokotron/hparams/train_dac.yaml | 3 +- .../tokotron/hparams/train_discrete_ssl.yaml | 1 + .../TTS/tokotron/hparams/train_encodec.yaml | 1 + .../TTS/tokotron/hparams/train_mimi.yaml | 225 +++++++++++++++++ .../hparams/train_speech_tokenizer.yaml | 2 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 228 +++++++++++++++++ .../tokotron/hparams/train_wavtokenizer.yaml | 229 ++++++++++++++++++ .../DASB/LJSpeech/TTS/tokotron/train.py | 39 ++- .../DASB/LJSpeech/extraction/hparams/dac.yaml | 2 +- .../extraction/hparams/discrete_ssl.yaml | 2 +- .../LJSpeech/extraction/hparams/encodec.yaml | 2 +- .../LJSpeech/extraction/hparams/mimi.yaml | 56 +++++ .../extraction/hparams/speech_tokenizer.yaml | 2 +- .../LJSpeech/extraction/hparams/sqcodec.yaml | 55 +++++ .../extraction/hparams/wavtokenizer.yaml | 58 +++++ .../DASB/LibriTTS/TTS/tokotron/train.py | 22 +- benchmarks/DASB/model/Tokotron.py | 66 ++--- benchmarks/DASB/model/sq_codec.py | 3 +- 18 files changed, 930 insertions(+), 66 deletions(-) create mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml create mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml create mode 100644 benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index e88469b3b..946c6d8c1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -159,6 +159,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -168,8 +169,6 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 2 bandwidth: 1.5 -model_shape: BHL -model_needs_channel: True attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 68e54fa83..827da9a25 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -197,6 +197,7 @@ vocab_size: 1000 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False audio_emb_lr: 0.00001 audio_emb_weight_decay: 0.001 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 0e923ffc9..3bbe8468f 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -149,6 +149,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False 
+audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..ebcb2d17f --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,225 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
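The lr and lr_warmup_steps values above are consumed by the NoamScheduler defined later in this file. As a rough reminder of the usual Noam-style schedule (a sketch, not necessarily SpeechBrain's exact implementation), the learning rate warms up roughly linearly for lr_warmup_steps and then decays with the inverse square root of the step:

def noam_lr(step, lr_initial=0.001, n_warmup_steps=10000):
    # Approximate Noam schedule: linear warmup, then ~1/sqrt(step) decay
    step = max(step, 1)
    scale = n_warmup_steps ** 0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)
    return lr_initial * scale

# The peak learning rate is reached around step == n_warmup_steps
rates = [noam_lr(s) for s in (1, 1000, 10000, 100000)]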
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 2048 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 9027a945b..ff51f8e32 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -152,6 +152,7 @@ audio_num_tokens: 1024 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -161,7 +162,6 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice phonemes: !ref audio_tokens_per_step: 2 bandwidth: 1.5 -model_shape: HBL attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..7ec88ba7d --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -0,0 +1,228 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/sqcodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 19683 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: True +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 1 +bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..79fed90fe --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,229 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: flexthink/soundchoice-g2p + +# Model type +representation_mode: discrete + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 150 +batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
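The guided_attention_weight and guided_attention_sigma values above control a guided-attention regularizer that pushes the decoder-encoder attention towards a diagonal alignment. A sketch of the classic formulation (Tachibana et al., 2017) is shown below; the actual TokotronLoss implementation may differ in detail:

import torch

def guided_attention_penalty(attn, sigma=0.5):
    # attn: [B, T_dec, T_enc] attention weights; returns a scalar that is
    # small when the alignment is roughly diagonal
    _, t_dec, t_enc = attn.shape
    n = torch.arange(t_dec, dtype=torch.float32)[:, None] / t_dec
    t = torch.arange(t_enc, dtype=torch.float32)[None, :] / t_enc
    soft_mask = 1.0 - torch.exp(-((n - t) ** 2) / (2 * sigma ** 2))  # [T_dec, T_enc]
    return (attn * soft_mask[None]).mean()

penalty = guided_attention_penalty(torch.softmax(torch.randn(2, 50, 30), dim=-1))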
+ +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref + +# Token model (pretrained) +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 4096 +audio_emb_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +audio_token_offsets: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 1 +attention_type: regularMHA + +############################## models ################################ + +model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + infer_max_audio_length: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 05f81b341..44c9804ec 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -22,7 +22,7 @@ import string from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataio import clean_padding_ +from speechbrain.dataio.dataio import clean_padding, clean_padding_ from speechbrain.utils.distributed import run_on_main base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) @@ -120,6 +120,17 @@ def prepare_features(self, batch): if self.representation_mode == RepresentationMode.DISCRETE: audio_bos, audio_bos_length = batch.audio_bos audio_tgt, audio_tgt_length = batch.audio_pad + if self.audio_token_offsets is not None: + audio_bos = torch.cat( + [ + audio_bos[:, :self.hparams.bos_width], + audio_bos[:, self.hparams.bos_width:] - self.audio_token_offsets, + ], + dim=1 + ) + clean_padding_(audio_bos, audio_bos_length) + audio_tgt = audio_tgt - self.audio_token_offsets + clean_padding_(audio_tgt, audio_tgt_length) else: wav, audio_length = batch.sig audio = self.modules.ssl_model(wav) @@ -136,6 +147,16 @@ def prepare_features(self, batch): audio_tgt_length = audio_length return audio_bos, audio_bos_length, audio_tgt, audio_tgt_length + def get_token_offsets(self): + """Computes token offsets for tokenizers that require them""" + token_offsets = None + if self.hparams.audio_token_offsets: + token_offsets = (torch.arange( + self.hparams.audio_tokens_per_step, + device=self.device + ) * self.hparams.audio_num_tokens)[None, None, :] + return token_offsets + @torch.no_grad() def evaluate_batch(self, batch, stage): """Evaluate one batch, override for different procedure than train. @@ -249,6 +270,8 @@ def on_stage_start(self, stage, epoch): elif stage == sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True + + self.audio_token_offsets = self.get_token_offsets() def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. 
@@ -416,8 +439,11 @@ def create_waveform(self, audio, length): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) - clean_padding_(wav, length) + with torch.no_grad(): + if self.audio_token_offsets is not None: + audio = clean_padding(audio + self.audio_token_offsets, length) + wav = self.modules.tokenizer.tokens_to_sig(audio) + wav = clean_padding(wav, length) return wav def is_eval_epoch(self, epoch): @@ -529,13 +555,12 @@ def audio_ref_pipeline(wav): use_silence_padding and representation_mode == RepresentationMode.DISCRETE ): - silence_token, _ = get_silence_token( + silence_token = get_silence_token( hparams[model_key], model_kwargs=hparams.get("token_model_kwargs"), - extract_emb=False, - model_shape=hparams.get("model_shape", "BLH"), - unsqueeze=hparams.get("model_needs_channel", False), ) + if silence_token.dim() == 2: + silence_token = silence_token.squeeze(-1) else: silence_token = ( torch.ones(hparams["audio_tokens_per_step"], dtype=torch.int64) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml index ebf155bb2..b90054db6 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/dac.yaml @@ -12,7 +12,7 @@ save_folder: !ref /save train_log: !ref /extraction_log.txt # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] frozen_split_path: null diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml index c4c01f527..d50cb85ef 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/discrete_ssl.yaml @@ -12,7 +12,7 @@ save_folder: !ref /save train_log: !ref /extraction_log.txt # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] frozen_split_path: null diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml index 0b07a6b1f..6de95de73 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml @@ -12,7 +12,7 @@ save_folder: !ref /save train_log: !ref /extraction_log.txt # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] frozen_split_path: null diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..22e15ef75 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml @@ -0,0 +1,56 @@ +# ############################################################################ +# Auido Tokenizer: Mimi +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/mimi +save_folder: 
!ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 1 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 23 +sample_rate: 24000 +# Feature parameters +encoder_dim: 1024 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml index 54da4f210..f91d34908 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -12,7 +12,7 @@ save_folder: !ref /save train_log: !ref /extraction_log.txt # Data files -data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] frozen_split_path: null diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..4f633cee4 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -0,0 +1,55 @@ +# ############################################################################ +# Auido Tokenizer: SQCodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks : 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..d23c25f96 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,58 @@ +# 
############################################################################ +# Auido Tokenizer: wavtokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/wavtokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks : 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index da228d6ae..600ee5f39 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -32,6 +32,7 @@ from model.Tokotron import ( RepresentationMode, + get_silence_repr, get_silence_token, use_silence_padding, feature_pad_to, @@ -575,22 +576,21 @@ def tokens_pipeline(label): else: audio_tokens_per_step = hparams["audio_tokens_per_step"] if use_silence_padding: - silence_token, silence_emb = get_silence_token( - hparams["tokenizer"], - extract_emb=True, - model_kwargs=hparams.get("token_model_kwargs"), - ) + if representation_mode == RepresentationMode.DISCRETE: + silence_padding = get_silence_token( + hparams["tokenizer"], + model_kwargs=hparams.get("token_model_kwargs"), + ) + else: + silence_padding = get_silence_repr( + hparams["ssl_model"], + ) else: - silence_token = ( + silence_padding = ( torch.ones(audio_tokens_per_step, dtype=torch.int64) * hparams["eos_index"] ) - silence_padding = ( - silence_token - if representation_mode == RepresentationMode.DISCRETE - else silence_emb - ) silence_padding = silence_padding.cpu() silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 949840380..1c76a2440 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -2059,8 +2059,6 @@ def decode(self, codes): def get_silence_token( model, sample_length=100000, - extract_emb=True, - model_shape="BLH", unsqueeze=False, device=None, model_kwargs=None, @@ -2074,13 +2072,6 @@ def get_silence_token( A discrete token model, taking (wav, lengths) as arguments sample_length : int The length of the sample - extract_emb : bool - Whether to extract embeddings - model_shape : str - The shape of tokens output by the model - BLH: Batch x Length x Heads 
(Discrete SSL, Encodec) - BHL: Batch x Heads x Length (DAC) - HBL: Heads x Batch x Length (SpeechTokenizer) unsqueeze: bool Whether to add an extra dimension to the audio (needed for DAC) device : str | torch.Device @@ -2108,43 +2099,38 @@ def get_silence_token( length = torch.ones(1, device=device) model_training = model.training model.eval() - if hasattr(model, "encode"): - spec = inspect.getfullargspec(model.encode) - if "length" in spec.args: - result = model.encode(audio, length, **model_kwargs) - else: - result = model.encode(audio, **model_kwargs) - else: - result = model(audio, length, **model_kwargs) + tokens = model.sig_to_tokens(audio, length) if model_training: model.train() - tokens = result if torch.is_tensor(result) else result[0] - if model_shape == "HBL": - tokens = tokens.permute(1, 2, 0) - elif model_shape == "BHL": - tokens = tokens.transpose(-1, -2) - tokens = tokens.squeeze(0) if unsqueeze: tokens = tokens.squeeze(0) silence_tokens = tokens.mode(0).values - silence_emb = None - if extract_emb: - if hasattr(model, "embeddings"): - silence_emb = model.embeddings( - silence_tokens[None, None, :] - ).squeeze() - else: - heads = tokens.shape[-1] - embs = result[1] - mode_idx = [ - (tokens[:, head] == silence_tokens[head]).nonzero()[0].item() - for head in range(heads) - ] - silence_emb = torch.stack( - [embs[0, idx, head] for head, idx in enumerate(mode_idx)] - ) - return silence_tokens, silence_emb + return silence_tokens + + +def get_silence_repr(model, sample_length=100000, device=None): + """Gets continuous silence + + Arguments + --------- + model : nn.Module + A discrete token model, taking (wav, lengths) as arguments + sample_length : int + The length of the sample + device : str | torch.Device + The device to use + + Returns + ------- + silence : torch.Tensor + A silecnce tensor + """ + audio = torch.zeros(1, sample_length, device=device) + length = torch.ones(1, device=device) + audio_repr = model(audio, length) + silence = audio_repr.mean(dim=1)[0] + return silence def feature_pad_to(tensor, length, padding=None): diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 4ac4b74ad..99a38c9bd 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -123,7 +123,8 @@ def build_codec_model(self, config): """ exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) - parameter_dict = torch.load(self.ckpt_path) + device = next(iter(scalar_codec.parameters())).device + parameter_dict = torch.load(self.ckpt_path, map_location=device) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec From 2daeaa5192ee113fbf144132d52cee87713c395c Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 21 Jan 2025 18:22:00 -0500 Subject: [PATCH 075/270] DASB: Cosmetic changes (pre-commit hooks) --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 2 +- .../TTS/tokotron/hparams/train_mimi.yaml | 6 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 6 +- .../tokotron/hparams/train_wavtokenizer.yaml | 10 +-- .../DASB/LJSpeech/TTS/tokotron/train.py | 30 ++++--- .../DASB/LJSpeech/TTS/valle/evaluation.py | 3 +- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 87 ++++++++++--------- .../LJSpeech/extraction/hparams/sqcodec.yaml | 4 +- .../extraction/hparams/wavtokenizer.yaml | 4 +- .../DASB/LibriSpeech/extraction/extract.py | 2 +- .../extraction/hparams/sqcodec.yaml | 4 +- .../extraction/hparams/wavtokenizer.yaml | 4 +- .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 2 - 
.../DASB/LibriTTS/TTS/tokotron/train.py | 6 +- .../LibriTTS/extraction/hparams/mimi.yaml | 0 benchmarks/DASB/model/Tokotron.py | 3 +- 16 files changed, 87 insertions(+), 86 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index 717532724..f805e23f6 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -57,4 +57,4 @@ eval_threshold: dwer_max: 90.0 eval_threshold_set: - utmos: 0.0 \ No newline at end of file + utmos: 0.0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index ebcb2d17f..0d08747cc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -186,9 +186,9 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul infer_max_audio_length: !ref tokenizer: !new:utils.tokenizer_interface.MimiTokenizer - source: !ref - save_path: !ref - num_codebooks: !ref + source: !ref + save_path: !ref + num_codebooks: !ref modules: model: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 7ec88ba7d..0143ef65b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -189,9 +189,9 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul infer_max_audio_length: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer - save_path: !ref - checkpoint: !ref - config: !ref + save_path: !ref + checkpoint: !ref + config: !ref modules: model: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index 79fed90fe..df0a82050 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -188,11 +188,11 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul infer_max_audio_length: !ref tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper - source: !ref - save_path: !ref - checkpoint: !ref - config: !ref - freeze: True + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True modules: model: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 44c9804ec..e35635b2b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -123,10 +123,11 @@ def prepare_features(self, batch): if self.audio_token_offsets is not None: audio_bos = torch.cat( [ - audio_bos[:, :self.hparams.bos_width], - audio_bos[:, self.hparams.bos_width:] - self.audio_token_offsets, + audio_bos[:, : self.hparams.bos_width], + audio_bos[:, self.hparams.bos_width :] + - self.audio_token_offsets, ], - dim=1 + dim=1, ) clean_padding_(audio_bos, audio_bos_length) audio_tgt = audio_tgt - self.audio_token_offsets @@ -151,10 +152,12 @@ def get_token_offsets(self): """Computes token offsets for tokenizers that require them""" token_offsets = None if self.hparams.audio_token_offsets: - token_offsets = 
(torch.arange( - self.hparams.audio_tokens_per_step, - device=self.device - ) * self.hparams.audio_num_tokens)[None, None, :] + token_offsets = ( + torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + ) + * self.hparams.audio_num_tokens + )[None, None, :] return token_offsets @torch.no_grad() @@ -270,7 +273,7 @@ def on_stage_start(self, stage, epoch): elif stage == sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True - + self.audio_token_offsets = self.get_token_offsets() def on_stage_end(self, stage, stage_loss, epoch): @@ -325,7 +328,7 @@ def on_stage_end(self, stage, stage_loss, epoch): def get_summary_stats(self): """Retrieves the stats that needs to be reported on every trial in the train log, as indicated in eval_summary_log in eval.yaml - + Returns ------- eval_summary_stats : dict @@ -337,7 +340,7 @@ def get_summary_stats(self): } self._check_threshold(eval_summary_stats) return eval_summary_stats - + def _check_threshold(self, eval_summary_stats): """Checks threshold values for the defined stats and terminates the trials if the parameters are not met. This is necessary because @@ -361,7 +364,9 @@ def _check_threshold(self, eval_summary_stats): elif threshold_type == "max": meets = value <= threshold_value else: - raise ValueError(f"Invalid threshold definition: {key}, check eval_threshold") + raise ValueError( + f"Invalid threshold definition: {key}, check eval_threshold" + ) if not meets: eval_summary_stats["broken"] = True for key, value in self.hparams.eval_threshold_set.items(): @@ -556,8 +561,7 @@ def audio_ref_pipeline(wav): and representation_mode == RepresentationMode.DISCRETE ): silence_token = get_silence_token( - hparams[model_key], - model_kwargs=hparams.get("token_model_kwargs"), + hparams[model_key], model_kwargs=hparams.get("token_model_kwargs"), ) if silence_token.dim() == 2: silence_token = silence_token.squeeze(-1) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py index 152db4c87..6c2dd1c8d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -240,8 +240,7 @@ def summarize(self, field=None): "descriptive" ] for stat_key, value in descriptive_statistics( - items=self.details[evaluator_key], - key=metric_key, + items=self.details[evaluator_key], key=metric_key, ).items() } if field is not None: diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index e0ae084a3..92ea570da 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -20,8 +20,11 @@ import shutil from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio -from speechbrain.dataio.dataio import write_audio +from speechbrain.dataio.dataio import ( + clean_padding_, + length_to_mask, + write_audio, +) from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right import re @@ -30,7 +33,7 @@ base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) -from evaluation import SpeechEvaluationMetricStats +from evaluation import SpeechEvaluationMetricStats # noqa: E402 logger = logging.getLogger(__name__) @@ -55,7 +58,7 @@ def __init__( self.evaluation_metric = SpeechEvaluationMetricStats( self.hparams, self.device ) - + def create_waveform(self, audio, 
length): """Creates a waveform from a discrete or continuous audio representation @@ -75,7 +78,11 @@ def create_waveform(self, audio, length): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() + audio = ( + (audio - hparams["audio_token_shift"] - self.offsets) + .clip(min=0.0) + .int() + ) wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav @@ -99,14 +106,13 @@ def compute_forward(self, batch, stage): prompt, prompt_length = batch.prompt batch_size, prompt_max_len, num_tracks = prompt.shape nar_track = torch.randint( - 1, num_tracks, (batch_size,), - device=self.device + 1, num_tracks, (batch_size,), device=self.device ) logits_ar, logits_nar = self.modules.model( dec_seq=batch.prompt.data, dec_seq_lengths=batch.prompt.lengths, prefix_len=batch.prefix_length / prompt_max_len, - nar_level_idx=nar_track + nar_level_idx=nar_track, ) return logits_ar, logits_nar, nar_track @@ -142,14 +148,16 @@ def compute_objectives(self, predictions, batch, stage): batch_idx = torch.arange(batch_size, device=prompt.device) targets_nar = prompt[batch_idx, 1:, nar_track] prompt_max_len = prompt.size(1) - length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len) - prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not() + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, - targets=targets_ar, - mask=mask + log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask ) self.loss_metric_ar.append( ids=batch.uttid, @@ -159,9 +167,7 @@ def compute_objectives(self, predictions, batch, stage): reduction="batch", ) loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, - targets=targets_nar, - mask=mask, + log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, ) self.loss_metric_nar.append( ids=batch.uttid, @@ -185,20 +191,17 @@ def on_stage_start(self, stage, epoch): `None` during the test stage. 
""" self.offsets = get_offsets( - self.hparams.vocab_size, - self.hparams.audio_tokens_per_step, + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.hparams.compute_cost, batch_eval=True, ) self.loss_metric_ar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, - batch_eval=True, + metric=self.hparams.compute_cost, batch_eval=True, ) self.loss_metric_nar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, - batch_eval=True, + metric=self.hparams.compute_cost, batch_eval=True, ) # TOOO: Reestablish evaluation @@ -288,10 +291,7 @@ def evaluate_batch(self, batch, stage): wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) self.save_samples( - batch=batch, - wav=wav, - length=audio_length, - stage=stage + batch=batch, wav=wav, length=audio_length, stage=stage ) self.evaluation_metric.append( ids=batch.uttid, @@ -375,13 +375,14 @@ def inference(self, batch): prefix_items = undo_padding_tensor(prefix.int(), prefix_length) inference_results = [ self.modules.model.inference( - prefix=prefix_item.unsqueeze(0), - opts=self._get_inference_opts() - ) + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) for prefix_item in prefix_items ] inferred_tokens = [ - result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step) + result[0][0] + if result[0] + else torch.zeros(1000, self.hparams.audio_tokens_per_step) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) @@ -389,8 +390,12 @@ def inference(self, batch): return audio, audio_length def _get_inference_opts(self): - idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[None, :] - tracks = torch.arange(self.hparams.audio_tokens_per_step, device=self.device)[:, None] + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] track_start = ( self.hparams.text_num_tokens + self.hparams.special_num_tokens @@ -404,10 +409,7 @@ def _get_inference_opts(self): | (idx == self.hparams.bos_index) ).logical_not() return self.hparams.inference_opts( - masks={ - self.hparams.bos_index: mask - }, - device=self.device, + masks={self.hparams.bos_index: mask}, device=self.device, ) def save_samples(self, batch, wav, length, stage): @@ -435,7 +437,7 @@ def _get_eval_output_folder(self, stage): Path(self.hparams.output_folder) / "eval" / stage.name.lower() ) if epoch is not None: - output_folder = output_folder / str(epoch) + output_folder = output_folder / str(epoch) output_folder.mkdir(exist_ok=True, parents=True) return output_folder @@ -481,12 +483,11 @@ def dataio_prepare(hparams): label_encoder = hparams["label_encoder"] input_feature = INPUT_FEATURE_MAP[hparams["input"]] offsets = get_offsets( - hparams["vocab_size"], - hparams["audio_tokens_per_step"] + hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) if hparams["flip_layers"]: offsets = offsets.flip(-1) - + tokens_loader = hparams.get("tokens_loader") @sb.utils.data_pipeline.takes("label") @@ -505,7 +506,9 @@ def tokens_pipeline(label): return label_encoder.encode_sequence_torch(label) @sb.utils.data_pipeline.takes("uttid", "tokens") - @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", 
"prefix_length", "length" + ) def prompt_pipeline(id, tokens): audio = tokens_loader.tokens_by_uttid( id, num_codebooks=hparams["audio_tokens_per_step"] @@ -553,7 +556,7 @@ def sig_pipeline(wav): "audio", "prompt", "prefix_length", - "length" + "length", ] if use_spk_emb: prepared_features.append("spk_emb") diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml index 4f633cee4..0117d9afe 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -37,10 +37,10 @@ config: config.yaml checkpoint: ckpt_00190000.pth sample_rate: 16000 save_embedding: False -num_codebooks : 4 +num_codebooks: 4 save_path: /home/ubuntu/sq-codec/SQ-Codec -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref checkpoint: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml index d23c25f96..5fe91bbce 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -38,10 +38,10 @@ config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmean checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt sample_rate: 24000 save_embedding: False -num_codebooks : 1 +num_codebooks: 1 vocab_size: 4096 -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref save_path: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 5a54f72df..3979ba731 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -52,7 +52,7 @@ "skip_prep": hparams["skip_prep"], }, ) - + tokens_extractor = hparams["tokens_extractor"] data_folder = hparams["data_folder"] datasets = [] diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml index fe202c90d..44b4388c2 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml @@ -39,10 +39,10 @@ config: config.yaml checkpoint: ckpt_00190000.pth sample_rate: 16000 save_embedding: False -num_codebooks : 4 +num_codebooks: 4 save_path: /home/ubuntu/sq-codec/SQ-Codec -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref checkpoint: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml index bc1b56ddb..d1bb576a7 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -40,10 +40,10 @@ config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmean checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt sample_rate: 24000 save_embedding: False -num_codebooks : 1 +num_codebooks: 1 vocab_size: 4096 -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref save_path: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index 377b5955c..aa7ee2c4b 100644 --- 
a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -10,7 +10,6 @@ import speechbrain as sb import json import logging -import math import csv import torch import torchaudio @@ -19,7 +18,6 @@ from pathlib import Path from types import SimpleNamespace from torch.nn import ModuleDict -from tqdm.auto import tqdm from data import undo_batch from eval import vocoder_to_device from torch.utils.flop_counter import FlopCounterMode diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 600ee5f39..31f8ae33a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -85,7 +85,7 @@ def create_waveform(self, audio, length, emb): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device + self.modules.tokenizer.device = self.device if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device @@ -582,9 +582,7 @@ def tokens_pipeline(label): model_kwargs=hparams.get("token_model_kwargs"), ) else: - silence_padding = get_silence_repr( - hparams["ssl_model"], - ) + silence_padding = get_silence_repr(hparams["ssl_model"],) else: silence_padding = ( torch.ones(audio_tokens_per_step, dtype=torch.int64) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 1c76a2440..010f3b26b 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -12,7 +12,6 @@ import math import torch -import inspect from torch import nn from torch.nn import functional as F from speechbrain.lobes.models.transformer.Transformer import ( @@ -2127,7 +2126,7 @@ def get_silence_repr(model, sample_length=100000, device=None): A silecnce tensor """ audio = torch.zeros(1, sample_length, device=device) - length = torch.ones(1, device=device) + length = torch.ones(1, device=device) audio_repr = model(audio, length) silence = audio_repr.mean(dim=1)[0] return silence From 99395f838f3157afe14b605c7f56f598f094613e Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 22 Jan 2025 01:36:40 -0500 Subject: [PATCH 076/270] DASB: Update sample rates --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 0d08747cc..04adb7926 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -84,7 +84,7 @@ gate_loss_max_weight: 1. # Feature parameters sample_rate: 22050 -model_sample_rate: 16000 +model_sample_rate: 24000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index df0a82050..a2b90e83a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -86,7 +86,7 @@ gate_loss_max_weight: 1. 
# Feature parameters sample_rate: 22050 -model_sample_rate: 16000 +model_sample_rate: 24000 max_audio_length: 1000 infer_max_audio_length: !ref debug_infer_max_audio_length: 10 From 0971b8e50203b89d2b72841643c8b53674eb3548 Mon Sep 17 00:00:00 2001 From: poonehmousavi Date: Thu, 23 Jan 2025 00:36:16 +0000 Subject: [PATCH 077/270] fix bug and update LibriSPeech recepie --- .../DASB/LibriSpeech/extraction/extract.py | 4 ++-- .../LibriSpeech/extraction/hparams/dac.yaml | 11 +++++---- .../extraction/hparams/discrete_ssl.yaml | 24 ++++++++++--------- .../extraction/hparams/encodec.yaml | 11 +++++---- .../LibriSpeech/extraction/hparams/mimi.yaml | 10 ++++---- .../extraction/hparams/speech_tokenizer.yaml | 12 ++++++---- .../extraction/hparams/sqcodec.yaml | 20 +++++++++------- .../extraction/hparams/wavtokenizer.yaml | 16 +++++++------ benchmarks/DASB/model/sq_codec.py | 13 +++++----- benchmarks/DASB/utils/tokenizer_interface.py | 7 ++---- speechbrain | 2 +- 11 files changed, 70 insertions(+), 60 deletions(-) diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py index 5a54f72df..814d252be 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/extract.py +++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py @@ -46,13 +46,13 @@ "tr_splits": hparams["train_splits"], "dev_splits": hparams["dev_splits"], "te_splits": hparams["test_splits"], - "save_folder": hparams["output_folder"], + "save_folder": hparams["cached_data_folder"], "merge_lst": hparams["train_splits"], "merge_name": "train.csv", "skip_prep": hparams["skip_prep"], }, ) - + tokens_extractor = hparams["tokens_extractor"] data_folder = hparams["data_folder"] datasets = [] diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml index d2d935ed0..349597c55 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/dac.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml index 7d4938625..cd8ae126e 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/discrete_ssl.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 
+batch_size: 1 num_workers: 8 src_key: wav id_key: id @@ -35,13 +36,14 @@ dataloader_opts: num_workers: !ref ### Configuration for discrete SSL model -# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | -# |------------|----------------------------------------|-----------------|--------------|----------------------|------------------------------------------| -# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | -# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | -# | Wav2Vec2 | facebook/wav2vec2-large-960h-lv60-self | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | WIP | +# | SSL Model | HF Encoder | K-Means Dataset | K-Means Size | SSL Layers | Vocoder Model | +# |------------|----------------------------------------|-----------------|--------------|----------------------|---------------------------------------------| +# | WavLM | microsoft/wavlm-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wavlm-k1000-LibriTTS | +# | HuBERT | facebook/hubert-large-ll60k | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-hubert-k1000-LibriTTS | +# | Wav2Vec2 | facebook/wav2vec2-large | LibriSpeech960 | 1000 | 1, 3, 7, 12, 18, 23 | speechbrain/hifigan-wav2vec2-k1000-LibriTTS | -# ssl_model_type: hubert, wavlm, wav2vec2 + +# ssl_model_type: HuBERT, WavLM, Wav2Vec2 # ssl_hub: facebook/hubert-large-ll60k, microsoft/wavlm-large, facebook/wav2vec2-large ssl_model_type: WavLM ssl_hub: microsoft/wavlm-large diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml index ee0a7e910..9f6c8b4ed 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/encodec.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml index e2dad7f95..f9720b170 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml @@ -13,15 +13,16 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv batch_size: 1 num_workers: 8 @@ -48,6 +49,7 @@ tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref save_path: !ref 
num_codebooks: !ref + sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 161d4e870..3090e9f79 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id @@ -45,6 +46,7 @@ save_embedding: False tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref + sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml index fe202c90d..9d5a6c24e 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/sqcodec.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id @@ -39,14 +40,15 @@ config: config.yaml checkpoint: ckpt_00190000.pth sample_rate: 16000 save_embedding: False -num_codebooks : 4 -save_path: /home/ubuntu/sq-codec/SQ-Codec +num_codebooks: 4 +tokenizer_save_path: !PLACEHOLDER -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref + sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml index bc1b56ddb..976614a3d 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -13,17 +13,18 @@ train_log: !ref /extraction_log.txt # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +cached_data_folder: !ref # e.g., path/to/cache train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" dev_splits: ["dev-clean"] test_splits: ["dev-clean", "test-clean", "test-other"] skip_prep: False -train_csv: !ref /train.csv -valid_csv: !ref /dev-clean.csv +train_csv: !ref /train.csv +valid_csv: !ref /dev-clean.csv 
test_csv: - - !ref /test-clean.csv - - !ref /test-other.csv + - !ref /test-clean.csv + - !ref /test-other.csv -batch_size: 8 +batch_size: 1 num_workers: 8 src_key: wav id_key: id @@ -40,15 +41,16 @@ config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmean checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt sample_rate: 24000 save_embedding: False -num_codebooks : 1 +num_codebooks: 1 vocab_size: 4096 -# wavtokenizer model +# wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref save_path: !ref checkpoint: !ref config: !ref + sample_rate: !ref freeze: True diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 4ac4b74ad..0e1ffe3f8 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -123,7 +123,8 @@ def build_codec_model(self, config): """ exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) - parameter_dict = torch.load(self.ckpt_path) + device = next(iter(scalar_codec.parameters())).device + parameter_dict = torch.load(self.ckpt_path, map_location=device) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec @@ -148,9 +149,9 @@ def _flatten_codebooks(self, arr, offset_size=None): ), "Input array must have 3 dimensions [B, N, D]" N, B, D = arr.shape arr = arr.copy() - if offset_size is not None: - for n in range(N): - arr[n, :, :] += offset_size * n + # if offset_size is not None: + # for n in range(N): + # arr[n, :, :] += offset_size * n flattened_arr = arr.transpose(1, 2, 0).reshape(B, N * D) return flattened_arr @@ -205,8 +206,8 @@ def decode(self, codes): T % self.n_codebook == 0 ), "Length T must be divisible by n_codebook" codes = codes.view(B, -1, self.n_codebook).permute(2, 0, 1) - for i in range(self.n_codebook): - codes[i, :, :] -= i * self.dim_codebook + # for i in range(self.n_codebook): + # codes[i, :, :] -= i * self.dim_codebook emb_quant = [] for i in range(self.n_codebook): tmp_list = decimal_to_ternary_matrix(codes[i, :, :], D=9) - 1 diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index a6103de4c..be73fda74 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -259,7 +259,6 @@ class SpeechTokenizerWrapper(SpeechTokenizer, BaseTokenizer): def __init__(self, *args, **kwargs): SpeechTokenizer.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) - self.sample_rate = 16000 @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): @@ -380,16 +379,15 @@ class MimiTokenizer(Mimi, BaseTokenizer): def __init__(self, *args, **kwargs): Mimi.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) - self.sample_rate= self.sampling_rate @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): self.eval() tokens, _ = self.encode(signal, lengths) if num_codebooks: - if tokens.shape[-1] < num_codebooks: + if tokens.shape[1] < num_codebooks: raise ValueError( - f"Model only outputs {tokens.shape[-1]} codebooks, but {num_codebooks} requested" + f"Model only outputs {tokens.shape[1]} codebooks, but {num_codebooks} requested" ) tokens = tokens[:, :num_codebooks, :] return tokens.movedim(-1, -2) @@ -436,7 +434,6 @@ class WavTokenizerWrapper(WavTokenizer, BaseTokenizer): def __init__(self, *args, **kwargs): WavTokenizer.__init__(self, *args, **kwargs) BaseTokenizer.__init__(self) - 
self.sample_rate = 24000 @torch.no_grad() def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): diff --git a/speechbrain b/speechbrain index e602161f4..f07cfc76b 160000 --- a/speechbrain +++ b/speechbrain @@ -1 +1 @@ -Subproject commit e602161f4d305e13a26fc71b7dbe4a4cfeaa8847 +Subproject commit f07cfc76bd4b864c598a9ed5948caa3fe3176516 From 3d3e04ca233785633e4c0f0ecc5cc21206408345 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 22 Jan 2025 21:02:58 -0500 Subject: [PATCH 078/270] DASB: Tokotron: Add validation batch size customization (to avoid OOM) --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 04adb7926..3b7d1d5e8 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -55,6 +55,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 150 batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -118,7 +119,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: From 85353929e67da96a028a5d634ac3745736234e9a Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Thu, 23 Jan 2025 00:03:03 -0500 Subject: [PATCH 079/270] Update README.md --- benchmarks/DASB/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/DASB/README.md b/benchmarks/DASB/README.md index 0ad632979..1c3f78818 100644 --- a/benchmarks/DASB/README.md +++ b/benchmarks/DASB/README.md @@ -158,8 +158,6 @@ bash run_experiments.sh --hparams benchmarks/DASB/LibriSpeech/ASR/hparams/LSTM/t This workflow ensures flexibility, efficiency, and reproducibility for both training scenarios. Adapt the recipes as needed for your specific requirements! 
-Here's a polished and formatted version for clarity, suitable for a README or documentation: - # 🎛️ Hyperparameter Tuning From 16912d56a3ca4a49ef88ab10dbc407df48aa3b85 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 23 Jan 2025 09:54:38 -0500 Subject: [PATCH 080/270] DASB: Tokotron: Minor fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index e35635b2b..1880b3049 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -449,6 +449,7 @@ def create_waveform(self, audio, length): audio = clean_padding(audio + self.audio_token_offsets, length) wav = self.modules.tokenizer.tokens_to_sig(audio) wav = clean_padding(wav, length) + wav = wav.to(self.device) return wav def is_eval_epoch(self, epoch): @@ -742,14 +743,17 @@ def apply_overfit_test(hparams, dataset): """ if hparams["overfit_test"]: if isinstance(dataset, tuple): - dataset_train, _, _ = dataset + dataset_train, dataset_valid, _ = dataset dataset_train = apply_overfit_test(hparams, dataset_train) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys(list(dataset_valid.pipeline.output_mapping.keys())) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) @@ -757,6 +761,7 @@ def apply_overfit_test(hparams, dataset): "train": dataset_train, "valid": dataset_eval, "test": dataset_eval, + "sample": dataset_eval, } else: result = dataset.overfit_test( From ec47b0de911d372c76b000f5619715ad6d7c2b4b Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 23 Jan 2025 11:55:39 -0500 Subject: [PATCH 081/270] DASB: Fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 1880b3049..1ac700742 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -752,11 +752,10 @@ def apply_overfit_test(hparams, dataset): result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) - dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) - dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) result = { "train": dataset_train, "valid": dataset_eval, From 5dba59d717c7495cdb1c102b1ae8af44c28366f3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 23 Jan 2025 21:42:34 -0500 Subject: [PATCH 082/270] DASB: Tokotron: Update priors --- .../TTS/tokotron/hparams/train_dac.yaml | 8 +- .../tokotron/hparams/train_discrete_ssl.yaml | 6 +- .../TTS/tokotron/hparams/train_encodec.yaml | 8 +- .../TTS/tokotron/hparams/train_mimi.yaml | 8 +- .../hparams/train_speech_tokenizer.yaml | 8 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 8 +- .../tokotron/hparams/train_wavtokenizer.yaml | 8 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- 
.../TTS/tokotron/hparams/train_dac.yaml | 4 +- .../tokotron/hparams/train_discrete_ssl.yaml | 4 +- .../TTS/tokotron/hparams/train_encodec.yaml | 4 +- .../hparams/train_speech_tokenizer.yaml | 4 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 124 ++++++++---------- .../TTS/valle/hparams/train_encodec.yaml | 8 +- 14 files changed, 93 insertions(+), 111 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 946c6d8c1..3cdaf3c84 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -73,7 +73,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -149,8 +149,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 827da9a25..40890f6a2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -71,7 +71,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random @@ -187,8 +187,8 @@ token_model_kwargs: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index a34a5b2eb..ccd736e9b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -51,7 +51,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -67,7 +67,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)"
lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -138,8 +138,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 3b7d1d5e8..0c9ae43f8 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -54,7 +54,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 5.0 @@ -72,7 +72,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -143,8 +143,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ff51f8e32..f6f2d756a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -54,7 +54,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -71,7 +71,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -142,8 +142,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 0143ef65b..014e0d707 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -56,7 +56,7 @@ 
ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -73,7 +73,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -144,8 +144,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index a2b90e83a..e02457ae8 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -73,7 +73,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -144,8 +144,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index f0127973c..3dc005074 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -70,7 +70,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 5670aa208..46076fe1f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -177,8 +177,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: 
--enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 88bc91aef..83f4fd6e7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -256,8 +256,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" layerwise_renorm: True d_ffn: 2048 transformer_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index be49b69f6..b5696d7a6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -183,8 +183,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index aac74070a..0ba67441b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -179,8 +179,8 @@ extract_features_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index b5eb30f62..6a0d31fe8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/discrete_ssl +experiment_name: valle/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] @@ -70,7 +70,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random @@ -82,28 +82,23 @@ overfit_test_epoch_data_count: 1000 # index pad_index: 0 -bos_index: 
0 -bos_width: 1 -eos_index: 0 -eos_width: 1 -audio_token_shift: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 # stages related parameters lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" lr_warmup_steps: 10000 lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. # Feature parameters sample_rate: 22050 model_sample_rate: 16000 -max_audio_length: 1000 +max_audio_length: 2000 +text_max_length: 500 +n_ctx: !ref + infer_max_audio_length: !ref debug_infer_max_audio_length: 10 @@ -117,14 +112,6 @@ token_list_file: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref -# Gate offset -gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp - beta: !ref - gamma: !ref - max_weight: !ref -silence_padding: !ref -use_silence_padding: True - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice value: !ref @@ -179,14 +166,13 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 512 -nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" -d_ffn: 2048 -transformer_dropout: 0.2 -target_dropout: 0.2 -activation: !name:torch.nn.GELU +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 vocab_size: 1000 audio_emb_size: 1024 audio_emb_freeze: False @@ -200,35 +186,41 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + audio_tokens_per_step: 6 -attention_type: regularMHA ############################## models ################################ -model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref - audio_num_tokens: !ref - audio_tokens_per_step: !ref - d_model: !ref - d_ffn: !ref - nhead: !ref - enc_num_layers: !ref - dec_num_layers: !ref - dropout: !ref - target_dropout: !ref - activation: !ref - attention_type: !ref - gate_threshold: !ref - gate_offset: !ref - audio_emb_size: !ref - audio_emb_freeze: !ref - max_audio_length: !ref - eos_mode: !ref - infer_max_audio_length: !ref - audio_token_shift: !ref - decoder_mode: !ref - scale_factor: !ref - representation_mode: discrete +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer save_path: !ref @@ -240,25 +232,15 @@ tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer modules: model: !ref tokenizer: !ref - compute_cost: !ref opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:model.Tokotron.TokotronLoss - guided_attention_weight: !ref - guided_attention_sigma: !ref - gate_weight: !ref - gate_beta: !ref - gate_gamma: !ref - 
gate_max_weight: !ref - silence_padding: !ref - eos_mode: !ref - eos_index: !ref - eos_width: !ref - audio_tokens_per_step: !ref - audio_token_shift: !ref - representation_mode: discrete + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True lr_annealing: !new:model.Tokotron.TargetedNoamScheduler diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 39b28b437..4db913d00 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -51,7 +51,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -67,7 +67,7 @@ bos_index: 0 bos_width: 1 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -138,8 +138,8 @@ sample_dataloader_opts: # Transformer d_model: 512 nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"uniform(2, 12,discrete=True)" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"uniform(2, 12,discrete=True)" +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 From f7116a8cd26eeeb0fe414113c5a87b95a14fcad6 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 23 Jan 2025 22:13:17 -0500 Subject: [PATCH 083/270] DASB: Fixes --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml | 2 +- .../LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 4 ++-- .../DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml | 2 +- .../LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml | 2 +- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml | 6 +++--- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 6 +++--- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 6 +++--- .../TTS/tokotron/hparams/train_speech_tokenizer.yaml | 6 +++--- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 +- 14 files changed, 23 insertions(+), 23 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 3cdaf3c84..1ae232aca 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -150,7 +150,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 40890f6a2..83c2017fc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -90,7 +90,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -188,7 +188,7 @@ token_model_kwargs: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index ccd736e9b..3c7284821 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -139,7 +139,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 0c9ae43f8..eac124447 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -144,7 +144,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index f6f2d756a..6b8888153 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -143,7 +143,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 014e0d707..e7af427ad 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -145,7 +145,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 
target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index e02457ae8..07e63e45b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -145,7 +145,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 3dc005074..c6ec91445 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -89,7 +89,7 @@ special_tokens: ["", "", ""] special_num_tokens: 4 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 46076fe1f..805384b8d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -55,7 +55,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -76,7 +76,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -178,7 +178,7 @@ extract_features_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 83f4fd6e7..11b7e5af6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -93,7 +93,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 batch_size_guided: 2 extract_features_batch_size: 32 grad_accumulation_factor: 1 @@ -115,7 +115,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -257,7 +257,7 @@ extract_features_opts: d_model: 512 nhead: 4 
enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" layerwise_renorm: True d_ffn: 2048 transformer_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index b5696d7a6..9a5838923 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -66,7 +66,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -87,7 +87,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -184,7 +184,7 @@ sample_dataloader_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 transformer_dropout: 0.2 target_dropout: 0.2 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 0ba67441b..703878092 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -58,7 +58,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 # @orion_step1: --batch_size~"uniform(2, 16,discrete=True)" +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -79,7 +79,7 @@ eos_width: 1 audio_token_shift: 0 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step guided_attention_weight: 50.0 @@ -180,7 +180,7 @@ extract_features_opts: d_model: 512 nhead: 4 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" d_ffn: 2048 z_dim: 128 hidden_dim: 2048 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 6a0d31fe8..31ed1cf23 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -89,7 +89,7 @@ special_tokens: ["", "", ""] special_num_tokens: 4 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.5)" +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 
4db913d00..e541f4ae0 100644
--- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml
+++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml
@@ -139,7 +139,7 @@ sample_dataloader_opts:
 d_model: 512
 nhead: 4
 enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])"
-dec_num_layers: 12 # @orion_step1: --dec_num_layers~"chocies([3, 6, 12])"
+dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])"
 d_ffn: 2048
 transformer_dropout: 0.2
 target_dropout: 0.2

From 199a37ca608dac0064272033029453e4192cd439 Mon Sep 17 00:00:00 2001
From: flexthink
Date: Mon, 27 Jan 2025 00:02:58 -0500
Subject: [PATCH 084/270] DASB: Tokotron: Fixes

---
 benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
index 1ac700742..881d973c4 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
@@ -546,17 +546,11 @@ def audio_ref_pipeline(wav):
     use_silence_padding = hparams.get("use_silence_padding", True)
     if representation_mode == RepresentationMode.DISCRETE:
-        layers_key = "token_model_layers"
         model_key = "tokenizer"
     else:
-        layers_key = "ssl_model_layers"
         model_key = "ssl_model"
-    audio_tokens_per_step = (
-        len(hparams[layers_key])
-        if layers_key in hparams
-        else hparams["audio_tokens_per_step"]
-    )
+    audio_tokens_per_step = hparams["audio_tokens_per_step"]
     if (
         use_silence_padding
         and representation_mode == RepresentationMode.DISCRETE

From dd7f3d3cff19cbfef7fabaeaaba55c980caa9e7b Mon Sep 17 00:00:00 2001
From: flexthink
Date: Mon, 27 Jan 2025 11:44:57 -0500
Subject: [PATCH 085/270] DASB: Tokotron: Fix layer selection for Discrete SSL

---
 .../tokotron/hparams/train_discrete_ssl.yaml |  6 ++--
 .../DASB/LJSpeech/TTS/tokotron/train.py      | 32 +++++++++++++++++--
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml
index 83c2017fc..a1be07c07 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml
@@ -59,8 +59,8 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice
         wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS
         wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS

-ssl_model_layers: [1, 3, 7, 12, 18, 23]
-token_model_layers: !ref
+available_speech_model_layers: [1, 3, 7, 12, 18, 23]
+speech_model_layers: !ref
 token_offset: 1
 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec
 use_spk_emb: False
@@ -181,7 +181,7 @@ sample_dataloader_opts:
     padding_kwargs:
         value: !ref
 token_model_kwargs:
-    SSL_layers: !ref
+    SSL_layers: !ref

 ####################### Model parameters ###########################
 # Transformer
diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
index 881d973c4..05d8805c6 100644
--- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
+++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py
@@ -275,6 +275,7 @@ def on_stage_start(self, stage, epoch):
             self.is_evaluating = True
         self.audio_token_offsets = self.get_token_offsets()
+        self.token_model_kwargs = getattr(self.hparams, "token_model_kwargs", {})

     def on_stage_end(self, stage, stage_loss, epoch):
         """Gets called at the end of an epoch.
@@ -447,7 +448,7 @@ def create_waveform(self, audio, length):
         with torch.no_grad():
             if self.audio_token_offsets is not None:
                 audio = clean_padding(audio + self.audio_token_offsets, length)
-            wav = self.modules.tokenizer.tokens_to_sig(audio)
+            wav = self.modules.tokenizer.tokens_to_sig(audio, **self.token_model_kwargs)
             wav = clean_padding(wav, length)
             wav = wav.to(self.device)
             return wav
@@ -574,12 +575,20 @@ def audio_ref_pipeline(wav):
     )

     tokens_loader = hparams.get("tokens_loader")
+    if "speech_model_layers" in hparams:
+        tokens_loader_kwargs = {
+            "num_codebooks": get_selected_layer_indexes(hparams)
+        }
+    else:
+        tokens_loader_kwargs = {
+            "num_codebooks": audio_tokens_per_step
+        }

     @sb.utils.data_pipeline.takes("uttid")
     @sb.utils.data_pipeline.provides("audio_pad", "audio_bos")
     def audio_pipeline(id):
         audio = tokens_loader.tokens_by_uttid(
-            id, num_codebooks=audio_tokens_per_step
+            id, **tokens_loader_kwargs
         )
         audio_pad = feature_pad_to(
             audio, len(audio) + silence_padding_len, silence_padding
@@ -678,6 +687,25 @@ def init_sequence_encoder(hparams):
     return encoder


+def get_selected_layer_indexes(hparams):
+    """Finds the layers of selected layers
+
+    Arguments
+    ---------
+    hparams : dict
+        Hyperparameters
+    """
+    selected_layers = hparams.get("speech_model_layers")
+    available_layers = hparams.get("available_speech_model_layers")
+    if not (selected_layers and available_layers):
+        return None
+    layer_idx = [
+        available_layers.index(layer)
+        for layer in selected_layers
+    ]
+    return layer_idx
+
+
 def read_token_list(file_name):
     """Reads a simple text file with tokens (e.g. characters or phonemes)
     listed one per line

From 46c8ba4434693ad89f71655a1cea04f6460a459a Mon Sep 17 00:00:00 2001
From: flexthink
Date: Mon, 27 Jan 2025 22:31:03 -0500
Subject: [PATCH 086/270] DASB: VALL-E: Add LibriTTS

---
 .../LibriTTS/TTS/tokotron/hparams/eval.yaml   |  23 +-
 .../DASB/LibriTTS/TTS/valle/evaluation.py     | 357 ++++++++++++++++++
 .../LibriTTS/TTS/valle/hparams/arpabet.txt    |  50 +++
 .../LibriTTS/TTS/valle/hparams/char_en.txt    |  38 ++
 .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml |  57 +++
 .../TTS/valle/hparams/train_discrete_ssl.yaml |  19 +-
 .../TTS/valle/hparams/train_encodec.yaml      | 136 +++----
 benchmarks/DASB/LibriTTS/TTS/valle/train.py   |  10 +-
 benchmarks/DASB/model/valle.py                |  13 +-
 9 files changed, 609 insertions(+), 94 deletions(-)
 create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py
 create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt
 create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt

diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml
index bafd769cc..9e9d91dc3 100644
--- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml
+++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml
@@ -7,11 +7,7 @@ eval_interval: 1
 eval_subset: null
 eval_asr_beam_size: 66
 eval_asr_type: encoder_decoder
-eval_asr_source: !apply:speechbrain.utils.hparams.choice
-    value: !ref
-    choices:
-        encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech
-        whisper: openai/whisper-small
+eval_asr_source: openai/whisper-small
 eval_spk_sim_source: microsoft/wavlm-base-sv
 evaluations: utmos,asr,spk_sim
 tmp_folder: null
@@ -24,19 +20,10 @@ eval_utmos_judge_id: null
 eval_perf: False

-eval_asr: !apply:speechbrain.utils.hparams.choice
-    value: !ref
-    choices:
-        encoder_decoder: !name:eval.EncoderDecoderASRSpeechEvaluator
-            source: !ref
-            sample_rate: !ref
-            overrides:
-                lm_weight: 0.0
-
test_beam_size: !ref - whisper: !name:eval.WhisperASRSpeechEvaluator - source: !ref - sample_rate: !ref - savedir: !ref +eval_asr: !name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref eval_utmos: !name:eval.UTMOSSpeechEvaluator source: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py new file mode 100644 index 000000000..9fd6da808 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -0,0 +1,357 @@ +import json +import torch +import logging +import re +import csv +from speechbrain.utils.metric_stats import MetricStats +from types import SimpleNamespace +from pathlib import Path +from utils.data import undo_batch +from torch import nn + + +logger = logging.getLogger(__name__) + + +class SpeechEvaluationMetricStats(MetricStats): + """An aggregate metric combining multiple speech evaluators + + Arguments + --------- + hparams : dict | SimpleNamespace | object + Raw hyperparameters for evaluation + + device : str + The device on which evaluation will be performed + + """ + + def __init__(self, hparams, device="cpu"): + if isinstance(hparams, dict): + hparams = SimpleNamespace(**hparams) + self.hparams = hparams + self.device = device + modules = self.hparams.modules + self.modules = nn.ModuleDict(modules).to(self.device) + self.enabled_evaluators = set(self.hparams.evaluations.split(",")) + evaluators = hparams.evaluators + if evaluators: + self.evaluators = { + key: evaluator_f(run_opts={"device": device}) + for key, evaluator_f in evaluators.items() + if key in self.enabled_evaluators + } + else: + self.evaluators = {} + + if not self.evaluators: + logger.warn( + "No evaluators were defined - this run will produce samples only" + ) + + def on_evaluation_start(self, output_folder="eval"): + """Invoked at the beginning of the evaluation cycle. + + Arguments + --------- + output_folder : str | path-like + The folder to which results will be output + + """ + logger.info("Starting evaluation") + output_folder = Path(output_folder) + self.output_folder = ( + output_folder + if output_folder.is_absolute() + else self.hparams.output_folder / output_folder + ) + self.output_folder.mkdir(parents=True, exist_ok=True) + + self.files = [] + details_keys = list(self.evaluators.keys()) + self.details = {evaluator_key: [] for evaluator_key in details_keys} + self.read_reports() + self.create_reports() + self.item_ids = [] + + def on_evaluation_end(self): + """Invoked at the beginning of the evaluation cycle. 
The default + implementation is a no-op + """ + logger.info("Ending evaluation") + self.write_summary() + + def create_reports(self): + """Creates report files and report writers""" + self.report_files = {} + self.report_writers = {} + for evaluator_key in self.enabled_evaluators: + columns = self.get_report_columns(evaluator_key) + file_name = self.output_folder / f"{evaluator_key}.csv" + self.files.append(file_name) + resume = file_name.exists() and file_name.stat().st_size > 0 + report_file = open(file_name, "a+") + self.report_files[evaluator_key] = report_file + writer = csv.DictWriter(report_file, columns) + if not resume: + writer.writeheader() + self.report_writers[evaluator_key] = writer + + def read_reports(self): + """Invoked when resuming""" + for evaluator_key in self.enabled_evaluators: + file_name = self.output_folder / f"{evaluator_key}.csv" + if file_name.exists(): + logger.info("%s exists, reading") + with open(file_name) as report_file: + reader = csv.DictReader(report_file) + for row in reader: + del row["uttid"] + row = { + key: handle_number(value) + for key, value in row.items() + } + self.details[evaluator_key].append(row) + + def get_tracker_file_name(self): + """Determines the file name of the tracker file""" + suffix = ( + f"_{self.hparams.eval_suffix}" if self.hparams.eval_suffix else "" + ) + file_name = f"tracker_{self.hparams.eval_dataset}{suffix}.txt" + return self.output_folder / file_name + + def get_report_columns(self, evaluator_key): + """Returns the columns for the specified evaluator + + Arguments + --------- + evaluator_key : str + the identifier of the evaluator + + Returns + ------- + columns : list[str] + a list of column headers + """ + bogus_wavs = torch.randn(2, 10000, device=self.device) + bogus_length = torch.tensor([1.0, 1.0], device=self.device) + evaluator = self.evaluators[evaluator_key] + result = evaluator.evaluate( + wavs=bogus_wavs, + length=bogus_length, + text=["BOGUS"] * len(bogus_wavs), + wavs_ref=bogus_wavs, + length_ref=bogus_length, + ) + + return ["uttid"] + list(result.details.keys()) + + def append(self, ids, wav, length, text, wav_ref, length_ref): + """Appends the result of a single item + + Arguments + --------- + ids : str + Utterance IDs + wav : torch.Tensor + Synthesized waveforms + length : torch.Tensor + Relative lengths of the synthesized waveforms + text : list + Ground truth text + wav_ref : torch.Tensor + Reference (ground truth) waveforms + length_ref : torch.Tensor + Reference lengths + """ + with torch.no_grad(): + self.item_ids.extend(ids) + for evaluator_key, evaluator in self.evaluators.items(): + result = evaluator.evaluate( + wavs=wav, + length=length, + text=text, + wavs_ref=wav_ref, + length_ref=length_ref, + sample_rate_ref=self.hparams.sample_rate, + sample_rate=self.hparams.model_sample_rate, + ) + details = undo_batch(result.details) + self.write_result(evaluator_key, ids, details) + self.details[evaluator_key].extend(details) + + def write_result(self, evaluator_key, ids, details): + """Outputs the result details to the report for the specified evaluator + + Arguments + --------- + evaluator_key : str + The evaluator key + ids : list + The list of IDs + details : list + a list of evaluation details, one dictionary per item + """ + writer = self.report_writers[evaluator_key] + for uttid, details_item in zip(ids, details): + report_details = { + "uttid": uttid, + **details_item, + } + writer.writerow(ascii_only(flatten(report_details))) + self.report_files[evaluator_key].flush() + + def 
write_summary(self, file_name=None): + """Outputs summarized statistics + + Arguments + --------- + file_name : str | path-like + An alternative path to save the file + """ + summary = self.summarize() + if file_name is None: + file_name = self.output_folder / "summary.json" + self.files.append(file_name) + with open(file_name, "w") as output_file: + json.dump(summary, output_file, indent=4) + + def summarize(self, field=None): + """Computes the summarized statistics + + Arguments + --------- + field : str, optional + If specified, it will return a specific field + + Returns + ------- + result : dict | float + The summary - or the specified field from the sum + """ + result = { + f"{evaluator_key}_{stat_key}": value + for evaluator_key in self.enabled_evaluators + if evaluator_key in self.details + for metric_key in self.hparams.eval_summary[evaluator_key][ + "descriptive" + ] + for stat_key, value in descriptive_statistics( + items=self.details[evaluator_key], key=metric_key, + ).items() + } + if field is not None: + result = result[field] + return result + + def clear(self): + """Deletes all the files that have been created""" + for file_name in self.files: + file_name.unlink() + + +RE_NON_ASCII = re.compile(r"[^\x00-\x7F]+") + + +def ascii_only(values): + """Removes any non-ASCII characters from a dictionary + + Arguments + --------- + values : dict + A dictionary of values + + Returns + ------- + result : dict + The same dictionary - but with non-ASCII strings removed""" + return { + key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value + for key, value in values.items() + } + + +def descriptive_statistics(items, key): + """Computes descriptive statistics for the summary + + Arguments + --------- + items : list + a list of dictionaries with metric values for each item + key : str + The key of the metric for which the statistics will be computed + + Returns + ------- + statistics : dict + The desccriptive statistics computed + _mean : the arithmetic mean + _std : the standard deviation + _min : the minimum value + _max : the maximum value + _median : the median value + _q1 : the first quartile + _q3 : the third quartile + _iqr : the interquartile ratio + """ + values = torch.tensor([item[key] for item in items]) + quantiles = torch.tensor([0.25, 0.5, 0.75]) + q1, median, q3 = values.quantile(quantiles) + stats = { + "mean": values.mean(), + "std": values.std(), + "min": values.min(), + "max": values.max(), + "median": median, + "q1": q1, + "q3": q3, + "iqr": q3 - q1, + } + return { + f"{key}_{stat_key}": value.item() for stat_key, value in stats.items() + } + + +def flatten(value): + """Converts tensors to scalars and lists of strings to strings + + Arguments + --------- + value : dict + the dictionary to flatten + + Returns + ------- + result : dict + a flattened dictionary + """ + return { + key: item_value.item() if torch.is_tensor(item_value) else item_value + for key, item_value in value.items() + } + + +RE_INTEGER = re.compile(r"^-?\d+$") +RE_FLOAT = re.compile(r"^-?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?$") + + +def handle_number(value): + """Converts a value to a number, if applicable. Strings + that look like integers or floats will be converted to integers + or floats. 
+ + Arguments + --------- + value : str + a string value + + Returns + ------- + result : object + The processed result""" + if RE_INTEGER.match(value): + value = int(value) + elif RE_FLOAT.match(value): + value = float(value) + return value diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt new file mode 100644 index 000000000..105a1dd9d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/arpabet.txt @@ -0,0 +1,50 @@ +AA +AE +AH +AO +AW +AY +B +CH +D +DH +EH +ER +EY +F +G +HH +IH +IY +JH +K +L +M +N +NG +OW +OY +P +R +S +SH +T +TH +UH +UW +V +W +Y +Z +ZH +' +" +! +( +) +, +- +. +: +; +? diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt new file mode 100644 index 000000000..f43d3b08d --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/char_en.txt @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +' +" +! +( +) +, +- +. +: +; +? + \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index e69de29bb..129cf9337 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -0,0 +1,57 @@ +eval_dataset: valid +eval_suffix: "" +eval_sample_rate: 16000 +eval_spk_sim_sample_rate: 16000 +eval_samples: null +eval_interval: 1 +eval_subset: null +eval_asr_beam_size: 66 +eval_asr_type: encoder_decoder +eval_asr_source: openai/whisper-small +eval_spk_sim_source: microsoft/wavlm-base-sv +evaluations: utmos,asr,spk_sim +tmp_folder: null +eval_utmos_source: chaanks/wav2vec2-small +eval_utmos_save_path: !ref /utmos +eval_utmos_model_name: utmos.ckpt +eval_utmos_model_url: https://huggingface.co/chaanks/UTMOS/resolve/main +eval_utmos_domain_id: null +eval_utmos_judge_id: null +eval_perf: False + + +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref + +eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator + source: !ref + save_path: !ref + model_name: !ref + model_url: !ref + domain_id: !ref + judge_id: !ref + +eval_spk_sim: !name:utils.eval.SpkSimWavLM + source: !ref + savedir: !ref + model_sample_rate: !ref + +evaluators: + utmos: !ref + asr: !ref + spk_sim: !ref + +eval_summary: + asr: + descriptive: ["wer", "cer", "wer_ref", "cer_ref", "dwer", "dcer"] + utmos: + descriptive: ["utmos"] + spk_sim: + descriptive: ["score"] + +eval_summary_log: + utmos: utmos_utmos_mean + dwer: asr_dwer_median + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 31ed1cf23..51315b9eb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -20,16 +20,24 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. 
# Data files data_folder: !PLACEHOLDER -cached_data_folder: !PLACEHOLDER -prepare_save_folder: !ref +cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref -vocoder_model_name: !ref unithifigan-dasb--discrete -vocoder_model_path: !ref / prepare_archive_path: null prepare_skip_ignore_folders: False +data_mode: lite train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress @@ -37,6 +45,7 @@ progress_current: !ref /current progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 + tokens_folder: !PLACEHOLDER tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref @@ -60,6 +69,7 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice ssl_model_layers: [1, 3, 7, 12, 18, 23] token_model_layers: !ref +flip_layers: false token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False @@ -100,6 +110,7 @@ max_audio_length: 2000 text_max_length: 500 n_ctx: !ref + infer_max_audio_length: !ref +max_length_ratio: 10.0 debug_infer_max_audio_length: 10 # Label encoder diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index e541f4ae0..9d9c1f278 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -14,21 +14,29 @@ save_folder: !ref /save train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. -token_model_src: "facebook/encodec_24khz" -g2p_src: flexthink/soundchoice-g2p -# Model type -representation_mode: discrete # Data files data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech pretrained_model_save_folder: !ref +ssl_model_type: wavlm +representation_mode: discrete prepare_archive_path: null prepare_skip_ignore_folders: False +data_mode: lite train_json: !ref /train.json valid_json: !ref /valid.json test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] frozen_split_path: null sample_path: null progress_folder: !ref /progress @@ -37,15 +45,13 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
- tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref - +flip_layers: True splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] - - ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -63,26 +69,26 @@ overfit_test_epoch_data_count: 1000 # index pad_index: 0 -bos_index: 0 -bos_width: 1 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 # stages related parameters lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. # Feature parameters sample_rate: 22050 -model_sample_rate: 24000 -max_audio_length: 1000 +model_sample_rate: 16000 +max_audio_length: 2000 +text_max_length: 500 +n_ctx: !ref + infer_max_audio_length: !ref +max_length_ratio: 10.0 debug_infer_max_audio_length: 10 # Label encoder @@ -95,14 +101,6 @@ token_list_file: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref -# Gate offset -gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp - beta: !ref - gamma: !ref - max_weight: !ref - -silence_padding: !ref - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -136,16 +134,14 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer -d_model: 512 -nhead: 4 -enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" -dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" -d_ffn: 2048 -transformer_dropout: 0.2 -target_dropout: 0.2 -activation: !name:torch.nn.GELU -audio_num_tokens: 1024 -audio_emb_size: 1024 +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False text_num_tokens: 39 @@ -155,31 +151,43 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref -audio_tokens_per_step: 2 + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 bandwidth: 1.5 attention_type: regularMHA ############################## models ################################ -model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length - input_num_tokens: !ref - audio_num_tokens: !ref - audio_tokens_per_step: !ref - d_model: !ref - d_ffn: !ref - nhead: !ref - enc_num_layers: !ref - dec_num_layers: !ref - dropout: !ref - target_dropout: !ref - activation: !ref - attention_type: !ref - gate_threshold: !ref - gate_offset: !ref - audio_emb_size: !ref - audio_emb_freeze: !ref - max_audio_length: !ref - infer_max_audio_length: !ref +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio @@ -198,14 +206,10 @@ modules: 
opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !new:model.Tokotron.TokotronLoss - guided_attention_weight: !ref - guided_attention_sigma: !ref - gate_weight: !ref - gate_beta: !ref - gate_gamma: !ref - gate_max_weight: !ref - silence_padding: !ref +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index ebcc78015..4045e89ca 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -78,6 +78,7 @@ def create_waveform(self, audio, length): audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) + wav = wav.to(self.device) return wav def compute_forward(self, batch, stage): @@ -401,7 +402,7 @@ def _get_inference_opts(self): track_end = track_start + self.hparams.vocab_size mask = ( ((idx >= track_start) & (idx < track_end)) - | (idx == self.hparams.bos_index) + | (idx == self.hparams.eos_index) ).logical_not() return self.hparams.inference_opts( masks={ @@ -698,14 +699,17 @@ def apply_overfit_test(hparams, dataset): """ if hparams["overfit_test"]: if isinstance(dataset, tuple): - dataset_train, _, _ = dataset + dataset_train, dataset_valid, _ = dataset dataset_train = apply_overfit_test(hparams, dataset_train) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys(list(dataset_valid.pipeline.output_mapping.keys())) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) + dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) @@ -797,10 +801,8 @@ def undo_padding_tensor(batch, lengths): overrides=overrides, ) - from ljspeech_prepare import prepare_ljspeech # Data preparation, to be run on only one process. - if not hparams["skip_prep"]: from libritts_prepare import prepare_libritts # Data preparation, to be run on only one process. 
diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 245ac0fd9..b85e68345 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -262,6 +262,10 @@ def inference( modality_index = prev_tok.flatten() mask = modality_index_to_mask(modality_index, opts) mask_cache = [] + modality_tokens = torch.tensor( + list(opts.masks.keys()), + device=prefix.device + ) for step in range(maxlen): # (3.2) AR loop @@ -288,9 +292,14 @@ def inference( # (3.3) detect modality swtich mask_cache.append(mask.clone()) - modality_change_mask = torch.logical_and( - prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, + modality_change_mask = torch.isin( + prev_tok[:, 0], + modality_tokens ) + # Note: The ESPNET VALL-E had + # modality_change_mask = torch.logical_and( + # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, + #) if torch.any(modality_change_mask): modality_index = torch.where( modality_change_mask, prev_tok[:, 0], modality_index, From ba6bddb896dd40b45fe954c8d8067c831cfe0cc3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 28 Jan 2025 00:21:24 -0500 Subject: [PATCH 087/270] DASB: VALL-E: Fixes/Updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 3 + .../TTS/valle/hparams/train_encodec.yaml | 8 +- .../LibriTTS/TTS/valle/libritts_prepare.py | 1 + benchmarks/DASB/LibriTTS/TTS/valle/train.py | 130 +++++++++++++++++- 4 files changed, 136 insertions(+), 6 deletions(-) create mode 120000 benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 51315b9eb..36002334a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -271,3 +271,6 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 9d9c1f278..8ef4e455f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -50,7 +50,7 @@ g2p_src: flexthink/soundchoice-g2p tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref -flip_layers: True +flip_layers: False splits: ["train", "valid", "test"] ckpt_interval_minutes: 30 # save checkpoint every N min @@ -86,7 +86,8 @@ sample_rate: 22050 model_sample_rate: 16000 max_audio_length: 2000 text_max_length: 500 -n_ctx: !ref + +spk_prompt_length: 150 +n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 debug_infer_max_audio_length: 10 @@ -227,3 +228,6 @@ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py new file mode 120000 index 000000000..489ab4011 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/libritts_prepare.py @@ -0,0 +1 @@ +../../libritts_prepare.py \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 4045e89ca..14259634c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -22,8 +22,10 @@ from hyperpyyaml import load_hyperpyyaml from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio from speechbrain.dataio.dataio import write_audio +from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right +from functools import partial import re import string @@ -489,6 +491,7 @@ def dataio_prepare(hparams): offsets = offsets.flip(-1) tokens_loader = hparams.get("tokens_loader") + spk_prompt_length = hparams["spk_prompt_length"] @sb.utils.data_pipeline.takes("label") @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") @@ -505,13 +508,29 @@ def tokens_pipeline(label): """Processes the transcriptions to generate proper labels""" return label_encoder.encode_sequence_torch(label) - @sb.utils.data_pipeline.takes("uttid", "tokens") + def spk_prompt(uttid, spk_sample): + # Sample a speaker-matched embedding + selected_uttid = spk_sample[uttid] + audio = tokens_loader.tokens_by_uttid( + selected_uttid, num_codebooks=hparams["audio_tokens_per_step"] + ) + if audio.size(0) > spk_prompt_length: + offset = torch.randint(0, audio.size(0), (1,)).item() + else: + offset = 0 + # Retrieve the embedding value from the dataset + audio_spk_prompt, _ = pad_right_to( + audio[offset:offset + spk_prompt_length], + (spk_prompt_length, audio.size(1)) + ) + return audio_spk_prompt + + @sb.utils.data_pipeline.takes("uttid", "tokens", "spk_prompt") @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") - def prompt_pipeline(id, tokens): + def prompt_pipeline(id, tokens, spk_prompt): audio = tokens_loader.tokens_by_uttid( id, num_codebooks=hparams["audio_tokens_per_step"] ) - if hparams["flip_layers"]: audio = audio.flip(-1) yield audio @@ -521,6 +540,8 @@ def prompt_pipeline(id, tokens): torch.ones(1, num_tracks) * hparams["bos_index"], tokens.unsqueeze(-1).expand(len(tokens), num_tracks), torch.ones(1, num_tracks) * hparams["eot_index"], + spk_prompt + hparams["audio_token_shift"] + offsets, + torch.ones(1, num_tracks) * hparams["eop_index"], ] ) yield prefix @@ -542,7 +563,7 @@ def sig_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) return sig - dynamic_items = [text_pipeline, tokens_pipeline, 
prompt_pipeline] + dynamic_items = [text_pipeline, tokens_pipeline] init_sequence_encoder(hparams) use_spk_emb = hparams.get("use_spk_emb", False) @@ -560,6 +581,7 @@ def sig_pipeline(wav): prepared_features.append("spk_emb") output_keys.append("spk_emb") + resample_fn = {} for dataset in data_info: dataset_dynamic_items = list(dynamic_items) dataset_output_keys = list(output_keys) @@ -572,6 +594,27 @@ def sig_pipeline(wav): dynamic_items=dataset_dynamic_items, output_keys=dataset_output_keys, ) + spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) + spk_sample = {} + spk_prompt_pipeline = partial( + spk_prompt, + spk_sample=spk_sample, + ) + dynamic_dataset.add_dynamic_item( + func=spk_prompt_pipeline, + takes=["uttid"], + provides=["spk_prompt"], + ) + dynamic_dataset.add_dynamic_item(prompt_pipeline) + resample_fn[dataset] = partial( + resample_spk, + spk_idx=spk_idx, + sample=spk_sample, + dataset=dynamic_dataset, + spk_samplers=spk_samplers, + ) + resample_fn[dataset](epoch=0) + datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False @@ -597,6 +640,7 @@ def sig_pipeline(wav): raise NotImplementedError( "sorting must be random, ascending or descending" ) + return datasets @@ -613,6 +657,84 @@ def get_offsets(vocab_size, tracks): return torch.arange(tracks) * vocab_size +def group_by_speaker(dataset, hparams): + """Groups utterance IDs in a dataset by speaker, for selection. The selection + is stable based on the seed - calling this method multiple times will always + result in the same order + + Arguments + --------- + dataset : torch.Tensor + the dataset from which to select items + hparams : dict + hyperparameters + + Returns + ------- + spk_idx : dict + a str -> str with a list of utterance IDs + for every speaker + spk_samplers : dict + a reproducible sampler for every speaker + spk_samplers_it : dict + an iterator for each sampler + """ + spk_uttid = {} + spk_samplers = {} + speakers = [] + generator = torch.Generator() + generator.manual_seed(hparams["seed"]) + + # Group by speaker + with dataset.output_keys_as(["spk_id", "uttid"]): + for idx, item in enumerate(dataset): + spk_id = item["spk_id"] + if spk_id not in spk_uttid: + spk_uttid[spk_id] = [] + spk_uttid[spk_id].append(item["uttid"]) + speakers.append(spk_id) + + # Create a reproducible sampler + for spk_id in speakers: + sampler = hparams["spk_sampler"](data_source=spk_uttid[spk_id]) + spk_samplers[spk_id] = sampler + + return spk_uttid, spk_samplers + + +def resample_spk(sample, spk_idx, spk_samplers, dataset, epoch): + """Selects new samples + + Arguments + --------- + spk_idx : dict + Data item indexes grouped by speaker + spk_samplers : dict + A sampler for each speaker + spk_samplers_it : dict + An iterator for each speaker + epoch : int + The epoch number + + Returns + ------- + sample : dict + a dictionary with uttids as keys and matching + indexes as values + """ + if epoch is None: + epoch = 0 + spk_samplers_it = {} + for spk_id, sampler in spk_samplers.items(): + sampler.set_epoch(epoch) + spk_samplers_it[spk_id] = iter(sampler) + with dataset.output_keys_as(["uttid", "spk_id"]): + for item in dataset: + spk_item_idx = next(spk_samplers_it[item["spk_id"]]) + dataset_item_idx = spk_idx[item["spk_id"]][spk_item_idx] + sample[item["uttid"]] = dataset_item_idx + + def init_sequence_encoder(hparams): """Initialize a sequence encoder From d8a720c966c1ebbf0f1ffe9e1f680e00fd0f5b9b Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 28 Jan 2025 11:07:20 -0500 Subject: 
[PATCH 088/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 14259634c..43bcb1745 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -830,11 +830,11 @@ def apply_overfit_test(hparams, dataset): result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) - dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) - dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + result = { "train": dataset_train, "valid": dataset_eval, From 11c427bab53b27aa9f596cdd0d0756b77ab49f65 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 28 Jan 2025 15:30:36 -0500 Subject: [PATCH 089/270] DASB: VALL-E: Fixes --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 11 +++++++---- .../LibriTTS/TTS/valle/hparams/train_encodec.yaml | 13 ++++++++----- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 14 ++++++-------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 36002334a..77e1e56a7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -81,6 +81,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 50 batch_size: 16 +valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random @@ -102,10 +103,11 @@ special_num_tokens: 4 lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step +betas: [0.9, 0.95] # Feature parameters -sample_rate: 22050 -model_sample_rate: 16000 +sample_rate: 24000 +model_sample_rate: 24000 max_audio_length: 2000 text_max_length: 500 n_ctx: !ref + @@ -155,7 +157,7 @@ train_dataloader_opts: padding_kwargs: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: @@ -244,8 +246,9 @@ modules: model: !ref tokenizer: !ref -opt_class: !name:torch.optim.Adam +opt_class: !name:torch.optim.AdamW lr: !ref + betas: !ref compute_cost: !name:model.valle.masked_nll_loss diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 8ef4e455f..74b31ecad 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -57,7 +57,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 +valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random @@ -80,10 +81,11 @@ special_num_tokens: 5 lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step +betas: [0.9, 0.95] # Feature parameters -sample_rate: 22050 -model_sample_rate: 16000 +sample_rate: 24000 +model_sample_rate: 24000 max_audio_length: 2000 text_max_length: 500 spk_prompt_length: 150 @@ -112,7 
+114,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: @@ -204,8 +206,9 @@ modules: tokenizer: !ref # define two optimizers here for two-stage training -opt_class: !name:torch.optim.Adam +opt_class: !name:torch.optim.AdamW lr: !ref + betas: !ref compute_cost: !name:model.valle.masked_nll_loss diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 43bcb1745..8c8b1ada7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -77,7 +77,6 @@ def create_waveform(self, audio, length): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(min=0.).int() wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) wav = wav.to(self.device) @@ -384,10 +383,11 @@ def inference(self, batch): for prefix_item in prefix_items ] inferred_tokens = [ - result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step) + result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step, device=self.device) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) + audio_length = audio_length.to(self.device) audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) return audio, audio_length @@ -418,7 +418,7 @@ def save_samples(self, batch, wav, length, stage): samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): file_name = output_folder / f"pred_{uttid}.wav" - write_audio(file_name, sample, self.hparams.model_sample_rate) + write_audio(file_name, sample.cpu(), self.hparams.model_sample_rate) def save_eval(self, stage): """Saves evaluation results @@ -563,7 +563,7 @@ def sig_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) return sig - dynamic_items = [text_pipeline, tokens_pipeline] + dynamic_items = [text_pipeline, tokens_pipeline, sig_pipeline] init_sequence_encoder(hparams) use_spk_emb = hparams.get("use_spk_emb", False) @@ -586,7 +586,6 @@ def sig_pipeline(wav): dataset_dynamic_items = list(dynamic_items) dataset_output_keys = list(output_keys) if dataset != "train": - dataset_dynamic_items.append(sig_pipeline) dataset_output_keys += ["sig", "label_norm_eval", "prefix"] dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( json_path=data_info[dataset], @@ -633,9 +632,8 @@ def sig_pipeline(wav): hparams["train_dataloader_opts"]["shuffle"] = False elif hparams["sorting"] == "random": - hparams["train_dataloader_opts"]["shuffle"] = True - pass - + if not hparams["overfit_test"]: + hparams["train_dataloader_opts"]["shuffle"] = True else: raise NotImplementedError( "sorting must be random, ascending or descending" From 2d1a46a0384f89559dd25a4c227c3859f190415f Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 29 Jan 2025 13:36:33 -0500 Subject: [PATCH 090/270] DASB: Fix ST extraction --- .../DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml | 2 +- .../DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml index f91d34908..155960c27 
100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -40,7 +40,7 @@ freeze_embedding: False save_embedding: False -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 9a53ed27b..85148db9d 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -41,7 +41,7 @@ freeze_embedding: False save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) -tokenizer: !new:utils.tokenizer_interface.SpeechTokenizer +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio save_path: !ref From e53d7c69e8f9f099d7006ae5edf5f977132cb942 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 30 Jan 2025 01:30:21 -0500 Subject: [PATCH 091/270] DASB: Add support for using Orion Trial IDs instead of randomness --- benchmarks/DASB/run_experiments.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) mode change 100644 => 100755 benchmarks/DASB/run_experiments.sh diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh old mode 100644 new mode 100755 index e0f848aef..aacbc381e --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -149,8 +149,13 @@ seed="${seed:-$RANDOM}" if [ "$rnd_dir" = True ]; then - rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) - output_folder="$output_folder/$rnd_dirname" + if [[ ! -z "$ORION_TRIAL_ID" ]]; then + # Use the Orion Trial ID to ensure interrupted trials are resumed + output_folder="$output_folder/$ORION_TRIAL_ID" + else + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" + fi fi # Make sure the output_folder is created @@ -201,4 +206,4 @@ done echo 'Final Results (Performance Aggregation)' -python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt \ No newline at end of file +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt From 602f41f0c2f1a76a5fbdeecabb579f77c58967ed Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Thu, 30 Jan 2025 10:35:43 -0500 Subject: [PATCH 092/270] Update run_experiments.sh fix bug for orion resuming --- benchmarks/DASB/run_experiments.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh index e0f848aef..aacbc381e 100644 --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -149,8 +149,13 @@ seed="${seed:-$RANDOM}" if [ "$rnd_dir" = True ]; then - rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) - output_folder="$output_folder/$rnd_dirname" + if [[ ! 
-z "$ORION_TRIAL_ID" ]]; then + # Use the Orion Trial ID to ensure interrupted trials are resumed + output_folder="$output_folder/$ORION_TRIAL_ID" + else + rnd_dirname=$(tr -dc 'a-zA-Z' < /dev/urandom | head -c 6) + output_folder="$output_folder/$rnd_dirname" + fi fi # Make sure the output_folder is created @@ -201,4 +206,4 @@ done echo 'Final Results (Performance Aggregation)' -python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt \ No newline at end of file +python utils/aggregate_results.py $output_folder "$eval_metric" | tee -a $output_folder/aggregated_performance.txt From 4f5153e3412229dbdc0b925a4d2758698df13771 Mon Sep 17 00:00:00 2001 From: Pooneh Mousavi Date: Thu, 30 Jan 2025 12:13:41 -0500 Subject: [PATCH 093/270] Update run_hparam_optimization.sh fix final run resuming --- benchmarks/DASB/run_hparam_optimization.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 2ad1dddf3..bf7c9b1fa 100644 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -415,6 +415,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing True $additional_flags + --rnd_dir False --testing True $additional_flags -echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file +echo "The test performance with best hparams is available at $output_folder/best" From 1d0aec0f8f3ecb5561b36b5deda6625b1546f19c Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 30 Jan 2025 12:17:04 -0500 Subject: [PATCH 094/270] DASB: Disable random directory name generation for the final test phase --- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 benchmarks/DASB/run_hparam_optimization.sh diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh old mode 100644 new mode 100755 index 2ad1dddf3..468015d08 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -415,6 +415,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir $store_all --testing True $additional_flags + --rnd_dir False --testing True $additional_flags echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file From e0bb2655956878e84792cf029377e9f78f300914 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 30 Jan 2025 23:05:54 -0500 Subject: [PATCH 095/270] DASB: Fixed the codebook count --- benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml index 22e15ef75..482f3739f 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml +++ 
b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml @@ -36,7 +36,7 @@ dataloader_opts: # Tokenizer parameters model_hub: kyutai/mimi vocab_size: 1024 -num_codebooks: 23 +num_codebooks: 32 sample_rate: 24000 # Feature parameters encoder_dim: 1024 From 5f5105f1d76d9d96e566f2eb4c8f7790ab2b9386 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 30 Jan 2025 23:19:04 -0500 Subject: [PATCH 096/270] DASB: Extraction fixes/updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 7 +-- .../extraction/hparams/wavtokenizer.yaml | 2 +- .../LibriTTS/extraction/hparams/mimi.yaml | 57 ++++++++++++++++++ .../LibriTTS/extraction/hparams/sqcodec.yaml | 57 ++++++++++++++++++ .../extraction/hparams/wavtokenizer.yaml | 59 +++++++++++++++++++ 5 files changed, 177 insertions(+), 5 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index c6ec91445..8f1c22767 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -57,14 +57,13 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS -ssl_model_layers: [1, 3, 7, 12, 18, 23] +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref flip_layers: True -token_model_layers: !ref token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -175,7 +174,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref token_model_kwargs: - SSL_layers: !ref + SSL_layers: !ref ####################### Model parameters ########################### # Transformer diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml index 5fe91bbce..3a0a935ff 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -32,7 +32,7 @@ dataloader_opts: shuffle: True num_workers: !ref -# EnCodec parameters +# WavTokenizer parameters model_hub: novateur/WavTokenizer-medium-music-audio-75token config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml index e69de29bb..9e64347c7 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., 
/path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +####################### Model parameters ########################### +# Tokenizer parameters +model_hub: kyutai/mimi +vocab_size: 1024 +num_codebooks: 32 +sample_rate: 24000 +encoder_dim: 1024 +freeze_embedding: False +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml new file mode 100644 index 000000000..cf46b3f5a --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml @@ -0,0 +1,57 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/sqcodec +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# SQCodec parameters +config: config.yaml +checkpoint: ckpt_00190000.pth +sample_rate: 16000 +save_embedding: False +num_codebooks: 4 +save_path: /home/ubuntu/sq-codec/SQ-Codec + + +# SQCodec model +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml new file mode 100644 index 000000000..c7581bbe7 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml @@ -0,0 +1,59 @@ +# ############################################################################ +# Auido Tokenizer: Speech Tokenizer +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/speech_tokenizer +save_folder: !ref /save +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech 
+train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# WavTokenizer parameters +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +sample_rate: 24000 +save_embedding: False +num_codebooks: 1 +vocab_size: 4096 + +# wavtokenizer model +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref From d02e8702b936ca40c68ba1784b9536e38eafb90c Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 00:11:38 -0500 Subject: [PATCH 097/270] DASB: Clean-up --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 77e1e56a7..688b7ace3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -38,14 +38,8 @@ train_split: !apply:speechbrain.utils.hparams.choice full: ["train-clean-100", "train-clean-360", "train-other-500"] valid_split: ["dev-clean"] test_split: ["test-clean"] -frozen_split_path: null -sample_path: null progress_folder: !ref /progress progress_current: !ref /current -progress_meta: !ref /meta.yaml -num_audio_samples: 32 -samples_interval: 5 - tokens_folder: !PLACEHOLDER tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref @@ -66,15 +60,12 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice hubert: speechbrain/hifigan-hubert-k1000-LibriTTS wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS - -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_model_layers: !ref +speech_model_layers: [1, 3, 7, 12, 18, 23] flip_layers: false token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -175,7 +166,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref token_model_kwargs: - SSL_layers: !ref + SSL_layers: !ref ####################### Model parameters ########################### # Transformer From 0f2561d55491bcf02b38a415480f1cf1eab3367e Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 01:05:40 -0500 Subject: [PATCH 098/270] DASB: Tokotron: Config updates --- benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml | 4 ++-- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml index 0117d9afe..378315bcf 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml +++ 
b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -32,7 +32,7 @@ dataloader_opts: shuffle: True num_workers: !ref -# EnCodec parameters +# SQCodec parameters config: config.yaml checkpoint: ckpt_00190000.pth sample_rate: 16000 @@ -40,7 +40,7 @@ save_embedding: False num_codebooks: 4 save_path: /home/ubuntu/sq-codec/SQ-Codec -# wavtokenizer model +# SQCodec model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref checkpoint: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 11b7e5af6..ce3347e54 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -14,8 +14,9 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER data_folder_alignments: null # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared +prepare_save_folder: !ref pretrained_model_save_folder: !ref ssl_model_type: wavlm representation_mode: discrete From c9578e85e9ab88103fd425f07b6ce2dc8d15ef72 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 11:31:18 -0500 Subject: [PATCH 099/270] DASB: Cosmetic changes (pre-commit hooks) --- .../LJSpeech/TTS/tokotron/hparams/eval.yaml | 22 +--- .../TTS/tokotron/hparams/train_dac.yaml | 2 +- .../tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../hparams/train_speech_tokenizer.yaml | 2 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 2 +- .../tokotron/hparams/train_wavtokenizer.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 29 ++--- .../DASB/LJSpeech/TTS/valle/hparams/eval.yaml | 22 +--- .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../LibriTTS/TTS/tokotron/hparams/eval.yaml | 4 +- .../TTS/tokotron/hparams/train_dac.yaml | 2 +- .../tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../hparams/train_speech_tokenizer.yaml | 2 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 4 +- .../TTS/valle/hparams/train_encodec.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 109 +++++++++--------- 19 files changed, 95 insertions(+), 121 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index f805e23f6..9cdf08aab 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -2,11 +2,7 @@ eval_sample_rate: 16000 eval_samples: null eval_interval: 1 eval_asr_type: whisper -eval_asr_source: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech - whisper: openai/whisper-small +eval_asr_source: openai/whisper-small evaluations: utmos,asr tmp_folder: null eval_utmos_source: chaanks/wav2vec2-small @@ -26,18 +22,10 @@ eval_utmos: !name:eval.UTMOSSpeechEvaluator domain_id: !ref judge_id: !ref -eval_asr: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: !name:eval.EncoderDecoderASRSpeechEvaluator - source: !ref - sample_rate: !ref - overrides: - lm_weight: 0.0 - whisper: !name:eval.WhisperASRSpeechEvaluator - source: !ref - sample_rate: !ref - savedir: !ref +eval_asr: 
!name:eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref evaluators: utmos: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index 1ae232aca..f94d25d74 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index a1be07c07..1c0c765f7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -71,7 +71,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 3c7284821..3355ac511 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -51,7 +51,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index eac124447..e80edb2b0 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -54,7 +54,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 5.0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 6b8888153..fb839c897 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -54,7 +54,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index e7af427ad..3b667e2f8 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index 07e63e45b..81bcee2ca 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -56,7 +56,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 150 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 5.0 sorting: random diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 05d8805c6..8e6e59197 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -275,7 +275,9 @@ def on_stage_start(self, stage, epoch): self.is_evaluating = True self.audio_token_offsets = self.get_token_offsets() - self.token_model_kwargs = getattr(self.hparams, "token_model_kwargs", {}) + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. @@ -448,7 +450,9 @@ def create_waveform(self, audio, length): with torch.no_grad(): if self.audio_token_offsets is not None: audio = clean_padding(audio + self.audio_token_offsets, length) - wav = self.modules.tokenizer.tokens_to_sig(audio, **self.token_model_kwargs) + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) wav = clean_padding(wav, length) wav = wav.to(self.device) return wav @@ -580,16 +584,12 @@ def audio_ref_pipeline(wav): "num_codebooks": get_selected_layer_indexes(hparams) } else: - tokens_loader_kwargs = { - "num_codebooks": audio_tokens_per_step - } + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") def audio_pipeline(id): - audio = tokens_loader.tokens_by_uttid( - id, **tokens_loader_kwargs - ) + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -699,10 +699,7 @@ def get_selected_layer_indexes(hparams): available_layers = hparams.get("available_speech_model_layers") if not (selected_layers and available_layers): return None - layer_idx = [ - available_layers.index(layer) - for layer in selected_layers - ] + layer_idx = [available_layers.index(layer) for layer in selected_layers] return layer_idx @@ -770,14 +767,18 @@ def apply_overfit_test(hparams, dataset): dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) - dataset_eval.set_output_keys(list(dataset_valid.pipeline.output_mapping.keys())) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) - dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) result = { "train": dataset_train, "valid": dataset_eval, diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml index b80347c82..08587ce23 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/eval.yaml @@ -2,11 +2,7 @@ eval_sample_rate: 16000 eval_samples: null eval_interval: 1 eval_asr_type: whisper -eval_asr_source: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: speechbrain/asr-transformer-transformerlm-librispeech - whisper: openai/whisper-small +eval_asr_source: openai/whisper-small evaluations: utmos,asr tmp_folder: null eval_utmos_source: chaanks/wav2vec2-small @@ -26,18 +22,10 @@ eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator domain_id: !ref judge_id: !ref -eval_asr: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - encoder_decoder: !name:utils.eval.EncoderDecoderASRSpeechEvaluator - source: !ref - sample_rate: !ref - overrides: - lm_weight: 0.0 - whisper: !name:utils.eval.WhisperASRSpeechEvaluator - source: !ref - sample_rate: !ref - savedir: !ref +eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator + source: !ref + sample_rate: !ref + savedir: !ref evaluators: utmos: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 8f1c22767..f4a003a0d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -69,7 +69,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 +batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 sorting: random diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml index 9e9d91dc3..b39b11009 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/eval.yaml @@ -7,7 +7,7 @@ eval_interval: 1 eval_subset: null eval_asr_beam_size: 66 eval_asr_type: encoder_decoder -eval_asr_source: openai/whisper-small +eval_asr_source: openai/whisper-small eval_spk_sim_source: microsoft/wavlm-base-sv evaluations: utmos,asr,spk_sim tmp_folder: null @@ -54,4 +54,4 @@ eval_summary: eval_summary_log: utmos: utmos_utmos_mean dwer: asr_dwer_median - spk_sim: spk_sim_score_mean \ No newline at end of file + spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 805384b8d..1a55d1c02 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -55,7 +55,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index ce3347e54..0b128b7a9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -94,7 +94,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 batch_size_guided: 2 extract_features_batch_size: 32 grad_accumulation_factor: 1 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml 
b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 9a5838923..9e5c6826a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -66,7 +66,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 703878092..86ebee501 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -58,7 +58,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min input: text number_of_epochs: 1000 reset_annealing_epoch: null -batch_size: 16 +batch_size: 16 extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 688b7ace3..8ee2a0468 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -61,7 +61,7 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS speech_model_layers: [1, 3, 7, 12, 18, 23] -flip_layers: false +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False @@ -71,7 +71,7 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 -batch_size: 16 +batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 74b31ecad..f1981bd88 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -213,7 +213,7 @@ opt_class: !name:torch.optim.AdamW compute_cost: !name:model.valle.masked_nll_loss log_softmax: !new:speechbrain.nnet.activations.Softmax - apply_log: True + apply_log: True lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 8c8b1ada7..4f11022f4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -20,8 +20,11 @@ import shutil from pathlib import Path from hyperpyyaml import load_hyperpyyaml -from speechbrain.dataio.dataio import clean_padding_, length_to_mask, write_audio -from speechbrain.dataio.dataio import write_audio +from speechbrain.dataio.dataio import ( + clean_padding_, + length_to_mask, + write_audio, +) from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right @@ -32,7 +35,7 @@ base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) -from evaluation import SpeechEvaluationMetricStats +from evaluation import SpeechEvaluationMetricStats # noqa: E402 
logger = logging.getLogger(__name__) @@ -57,7 +60,7 @@ def __init__( self.evaluation_metric = SpeechEvaluationMetricStats( self.hparams, self.device ) - + def create_waveform(self, audio, length): """Creates a waveform from a discrete or continuous audio representation @@ -101,14 +104,13 @@ def compute_forward(self, batch, stage): prompt, prompt_length = batch.prompt batch_size, prompt_max_len, num_tracks = prompt.shape nar_track = torch.randint( - 1, num_tracks, (batch_size,), - device=self.device + 1, num_tracks, (batch_size,), device=self.device ) logits_ar, logits_nar = self.modules.model( dec_seq=batch.prompt.data, dec_seq_lengths=batch.prompt.lengths, prefix_len=batch.prefix_length / prompt_max_len, - nar_level_idx=nar_track + nar_level_idx=nar_track, ) return logits_ar, logits_nar, nar_track @@ -144,14 +146,16 @@ def compute_objectives(self, predictions, batch, stage): batch_idx = torch.arange(batch_size, device=prompt.device) targets_nar = prompt[batch_idx, 1:, nar_track] prompt_max_len = prompt.size(1) - length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len) - prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not() + length_mask = length_to_mask( + prompt_length * prompt_max_len, prompt_max_len + ) + prefix_mask = length_to_mask( + prefix_length, prompt_max_len + ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, - targets=targets_ar, - mask=mask + log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask ) self.loss_metric_ar.append( ids=batch.uttid, @@ -161,9 +165,7 @@ def compute_objectives(self, predictions, batch, stage): reduction="batch", ) loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, - targets=targets_nar, - mask=mask, + log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, ) self.loss_metric_nar.append( ids=batch.uttid, @@ -187,20 +189,17 @@ def on_stage_start(self, stage, epoch): `None` during the test stage. 
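For reference, the masking used in compute_objectives above works as follows: the loss mask multiplies an utterance-length mask by the inverse of a prompt-prefix mask, then drops the first position so it lines up with the shifted next-token targets. A toy reproduction, with invented shapes, looks like this:

    import torch
    from speechbrain.dataio.dataio import length_to_mask

    prompt_max_len = 10
    prompt_length = torch.tensor([1.0, 0.8])    # relative lengths, as in PaddedBatch
    prefix_length = torch.tensor([3, 4])        # absolute prompt-prefix lengths
    length_mask = length_to_mask(prompt_length * prompt_max_len, prompt_max_len)
    prefix_mask = length_to_mask(prefix_length, prompt_max_len).logical_not()
    mask = (length_mask * prefix_mask)[:, 1:]   # [batch, prompt_max_len - 1]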
""" self.offsets = get_offsets( - self.hparams.vocab_size, - self.hparams.audio_tokens_per_step, + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.hparams.compute_cost, batch_eval=True, ) self.loss_metric_ar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, - batch_eval=True, + metric=self.hparams.compute_cost, batch_eval=True, ) self.loss_metric_nar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, - batch_eval=True, + metric=self.hparams.compute_cost, batch_eval=True, ) # TOOO: Reestablish evaluation @@ -290,10 +289,7 @@ def evaluate_batch(self, batch, stage): wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) self.save_samples( - batch=batch, - wav=wav, - length=audio_length, - stage=stage + batch=batch, wav=wav, length=audio_length, stage=stage ) self.evaluation_metric.append( ids=batch.uttid, @@ -377,13 +373,16 @@ def inference(self, batch): prefix_items = undo_padding_tensor(prefix.int(), prefix_length) inference_results = [ self.modules.model.inference( - prefix=prefix_item.unsqueeze(0), - opts=self._get_inference_opts() - ) + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() + ) for prefix_item in prefix_items ] inferred_tokens = [ - result[0][0] if result[0] else torch.zeros(1000, self.hparams.audio_tokens_per_step, device=self.device) + result[0][0] + if result[0] + else torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) @@ -392,8 +391,12 @@ def inference(self, batch): return audio, audio_length def _get_inference_opts(self): - idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[None, :] - tracks = torch.arange(self.hparams.audio_tokens_per_step, device=self.device)[:, None] + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] track_start = ( self.hparams.text_num_tokens + self.hparams.special_num_tokens @@ -407,10 +410,7 @@ def _get_inference_opts(self): | (idx == self.hparams.eos_index) ).logical_not() return self.hparams.inference_opts( - masks={ - self.hparams.bos_index: mask - }, - device=self.device, + masks={self.hparams.bos_index: mask}, device=self.device, ) def save_samples(self, batch, wav, length, stage): @@ -438,7 +438,7 @@ def _get_eval_output_folder(self, stage): Path(self.hparams.output_folder) / "eval" / stage.name.lower() ) if epoch is not None: - output_folder = output_folder / str(epoch) + output_folder = output_folder / str(epoch) output_folder.mkdir(exist_ok=True, parents=True) return output_folder @@ -484,12 +484,11 @@ def dataio_prepare(hparams): label_encoder = hparams["label_encoder"] input_feature = INPUT_FEATURE_MAP[hparams["input"]] offsets = get_offsets( - hparams["vocab_size"], - hparams["audio_tokens_per_step"] + hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) if hparams["flip_layers"]: offsets = offsets.flip(-1) - + tokens_loader = hparams.get("tokens_loader") spk_prompt_length = hparams["spk_prompt_length"] @@ -520,13 +519,15 @@ def spk_prompt(uttid, spk_sample): offset = 0 # Retrieve the embedding value from the dataset audio_spk_prompt, _ = pad_right_to( - audio[offset:offset + spk_prompt_length], - (spk_prompt_length, audio.size(1)) + audio[offset : offset + 
spk_prompt_length], + (spk_prompt_length, audio.size(1)), ) return audio_spk_prompt @sb.utils.data_pipeline.takes("uttid", "tokens", "spk_prompt") - @sb.utils.data_pipeline.provides("audio", "prefix", "prompt", "prefix_length", "length") + @sb.utils.data_pipeline.provides( + "audio", "prefix", "prompt", "prefix_length", "length" + ) def prompt_pipeline(id, tokens, spk_prompt): audio = tokens_loader.tokens_by_uttid( id, num_codebooks=hparams["audio_tokens_per_step"] @@ -575,7 +576,7 @@ def sig_pipeline(wav): "audio", "prompt", "prefix_length", - "length" + "length", ] if use_spk_emb: prepared_features.append("spk_emb") @@ -595,14 +596,9 @@ def sig_pipeline(wav): ) spk_idx, spk_samplers = group_by_speaker(dynamic_dataset, hparams) spk_sample = {} - spk_prompt_pipeline = partial( - spk_prompt, - spk_sample=spk_sample, - ) + spk_prompt_pipeline = partial(spk_prompt, spk_sample=spk_sample,) dynamic_dataset.add_dynamic_item( - func=spk_prompt_pipeline, - takes=["uttid"], - provides=["spk_prompt"], + func=spk_prompt_pipeline, takes=["uttid"], provides=["spk_prompt"], ) dynamic_dataset.add_dynamic_item(prompt_pipeline) resample_fn[dataset] = partial( @@ -612,8 +608,7 @@ def sig_pipeline(wav): dataset=dynamic_dataset, spk_samplers=spk_samplers, ) - resample_fn[dataset](epoch=0) - + resample_fn[dataset](epoch=0) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False @@ -824,14 +819,18 @@ def apply_overfit_test(hparams, dataset): dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) - dataset_eval.set_output_keys(list(dataset_valid.pipeline.output_mapping.keys())) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) - dataset_eval.set_output_keys(list(dataset["valid"].pipeline.output_mapping.keys())) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) result = { "train": dataset_train, @@ -921,7 +920,6 @@ def undo_padding_tensor(batch, lengths): overrides=overrides, ) - # Data preparation, to be run on only one process. from libritts_prepare import prepare_libritts @@ -951,7 +949,6 @@ def undo_padding_tensor(batch, lengths): }, ) - # We can now directly create the datasets for training, valid, and test datasets = dataio_prepare(hparams) From 7270d4ecc2b374c09c3bfc3a5c58f693893c4096 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 15:05:16 -0500 Subject: [PATCH 100/270] DASB: Add the ability to turn off evaluation for debugging purposes. 
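This change wraps the evaluation setup in on_stage_start behind a new eval_enabled hyperparameter, so the UTMOS/ASR evaluation loop can be skipped while debugging training; the default value is added to hparams/eval.yaml in the following commit. The gating reduces to roughly this simplified sketch (names as in the recipe):

    # Simplified view of the gating added below
    if self.hparams.eval_enabled:
        if stage == sb.Stage.VALID and self.is_eval_epoch(epoch):
            self.evaluator.on_evaluate_start(stage, epoch)
            self.is_evaluating = True
        elif stage == sb.Stage.TEST:
            self.evaluator.on_evaluate_start(stage, epoch)
            self.is_evaluating = True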
--- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 8e6e59197..506207f96 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -264,15 +264,16 @@ def on_stage_start(self, stage, epoch): self.use_spk_emb = getattr(self.hparams, "use_spk_emb", False) self.is_evaluating = False - if stage == sb.Stage.VALID: - if self.is_eval_epoch(epoch): + if self.hparams.eval_enabled: + if stage == sb.Stage.VALID: + if self.is_eval_epoch(epoch): + self.evaluator.on_evaluate_start(stage, epoch) + self.is_evaluating = True + else: + logger.info("No evaluation on epoch %d", epoch) + elif stage == sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True - else: - logger.info("No evaluation on epoch %d", epoch) - elif stage == sb.Stage.TEST: - self.evaluator.on_evaluate_start(stage, epoch) - self.is_evaluating = True self.audio_token_offsets = self.get_token_offsets() self.token_model_kwargs = getattr( From 2b22169c285f7f79edf241408e6ad1fbde6a6904 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 31 Jan 2025 16:41:22 -0500 Subject: [PATCH 101/270] DASB: Add the ability to turn off evaluation --- benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml index 9cdf08aab..dcdc6d920 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/eval.yaml @@ -1,3 +1,11 @@ +# ############################################################################ +# Evaluation Hyperparameters +# Common to old models, appended to main hyperparameters +# +# Authors: Artem Ploujnikov +# ############################################################################ + +eval_enabled: True eval_sample_rate: 16000 eval_samples: null eval_interval: 1 From 6eaa206f97d6f859a64b08215eb2972d670b5d02 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 2 Feb 2025 23:25:58 -0500 Subject: [PATCH 102/270] DASB: Tokotron: SQCodec update to use ternary coding --- .../TTS/tokotron/hparams/train_sqcodec.yaml | 33 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 5 + benchmarks/DASB/model/Tokotron.py | 332 ++++++++++++++++-- 3 files changed, 330 insertions(+), 40 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 3b667e2f8..21dee91e3 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -140,6 +140,8 @@ sample_dataloader_opts: padding_kwargs: value: !ref +transform_audio: !name:model.Tokotron.tokens_to_ternary + ####################### Model parameters ########################### # Transformer d_model: 512 @@ -154,7 +156,7 @@ audio_num_tokens: 19683 audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False -audio_token_offsets: True +audio_token_offsets: False text_num_tokens: 39 phn_num_tokens: 52 input_num_tokens: !apply:speechbrain.utils.hparams.choice @@ -162,7 +164,9 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice choices: text: !ref phonemes: !ref -audio_tokens_per_step: 1 +audio_tokens_per_step: 4 +ternary_num_digits: 9 
+ternary_num_positions: !ref * bandwidth: 1.5 attention_type: regularMHA @@ -187,6 +191,29 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul audio_emb_freeze: !ref max_audio_length: !ref infer_max_audio_length: !ref + audio_emb: !ref + out_proj: !ref + multihead_input: False + inference: !ref + +inference: !new:model.Tokotron.TokotronTransformerAutoregressiveInference + gate_offset: !ref + gate_threshold: !ref + tokens_per_step: !ref + bos_idx: !ref + audio_token_shift: 0 + max_steps: !ref + representation_mode: !ref + transform_audio: !name:model.Tokotron.tokens_to_ternary + feed_audio: !name:model.Tokotron.ternary_logits_to_tokens + +audio_emb: !new:model.Tokotron.TernaryInput + emb_size: !ref + num_positions: !ref + +out_proj: !new:model.Tokotron.TernaryPredictionHead + d_model: !ref + num_positions: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref @@ -209,6 +236,8 @@ compute_cost: !new:model.Tokotron.TokotronLoss gate_gamma: !ref gate_max_weight: !ref silence_padding: !ref + seq_cost: !name:model.Tokotron.ternary_loss + multihead_output: False lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 506207f96..d40ec20f0 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -88,6 +88,7 @@ def compute_forward(self, batch, stage): if self.use_spk_emb: emb = {"spk": batch.spk_emb.data.squeeze(1)} + audio = self.transform_audio(audio) predictions = self.modules.model( input_tokens=tokens, input_length=tokens_length, @@ -210,6 +211,8 @@ def compute_objectives(self, predictions, batch, stage): batch = batch.to(self.device) predictions, features = predictions _, _, audio_tgt, audio_tgt_length = features + + audio_tgt = self.transform_audio(audio_tgt) loss_details = self.hparams.compute_cost( predictions=predictions, audio=audio_tgt, @@ -280,6 +283,8 @@ def on_stage_start(self, stage, epoch): self.hparams, "token_model_kwargs", {} ) + self.transform_audio = getattr(self.hparams, "transform_audio", torch.nn.Identity()) + def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. 
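The ternary coding introduced in this commit relies on SQ-Codec's codebook size of 19683 being exactly 3**9: every token id can be written as 9 ternary digits, and with 4 codebooks per step this presumably yields the 36 positions configured above. A hedged sketch of the expansion (the recipe's own tokens_to_ternary and decimal_to_ternary_matrix helpers may order or shift the digits differently):

    import torch

    def to_ternary(tokens, num_digits=9):
        """Expand integer token ids into per-digit ternary values shifted to {-1, 0, 1}.

        tokens:  [batch, length, codebooks], values in [0, 3 ** num_digits)
        returns: [batch, length, codebooks * num_digits]
        """
        x = tokens.clone()
        digits = []
        for _ in range(num_digits):
            digits.append(x % 3 - 1)  # map {0, 1, 2} -> {-1, 0, 1}
            x = x // 3
        return torch.stack(digits, dim=-1).flatten(start_dim=-2)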
diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 010f3b26b..c795f049b 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -25,10 +25,11 @@ from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear -from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss +from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, bce_loss from speechbrain.dataio.dataio import length_to_mask from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler +from model.sq_codec import decimal_to_ternary_matrix from enum import Enum from collections import namedtuple @@ -157,8 +158,10 @@ def __init__( show_inference_progress=True, audio_token_shift=0, multihead_input=True, + multihead_output=True, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, + out_proj=None, ): super().__init__() self.num_tokens = num_tokens @@ -182,9 +185,11 @@ def __init__( if self.representation_mode == RepresentationMode.DISCRETE else audio_dim ) - self.out_proj = Linear( - input_size=d_model, n_neurons=self.out_dim * tokens_per_step, - ) + if out_proj is None: + out_proj = Linear( + input_size=d_model, n_neurons=self.out_dim * tokens_per_step, + ) + self.out_proj = out_proj self.gate = Linear(input_size=d_model, n_neurons=1) if audio_emb is None: if self.representation_mode == RepresentationMode.DISCRETE: @@ -222,6 +227,7 @@ def __init__( self.multihead_input = multihead_input self.d_model = d_model self.d_model_sqrt = math.sqrt(d_model) + self.multihead_output = multihead_output def decode( self, @@ -371,16 +377,17 @@ def forward( pos_embs_src, ) lin_out = self.out_proj(dec_out) - batch_size, audio_max_len, num_tokens = lin_out.shape - lin_out_heads = lin_out.reshape( - batch_size, - audio_max_len, - self.tokens_per_step, - num_tokens // self.tokens_per_step, - ) + if self.multihead_output: + batch_size, audio_max_len, num_tokens = lin_out.shape + lin_out = lin_out.reshape( + batch_size, + audio_max_len, + self.tokens_per_step, + num_tokens // self.tokens_per_step, + ) gate_out = self.gate(dec_out).squeeze(-1) return TokotronDecoderOutput( - lin_out_heads, + lin_out, gate_out, dec_self_attn, dec_attn, @@ -400,6 +407,68 @@ def init_audio_emb(self, emb): self.audio_emb.initialize(emb) +class TernaryPredictionHead(nn.Module): + """An alternative prediction head that predicts a fixed number of ternary digits + for each position (as used in SQ-Codec) + + Arguments + --------- + d_model : int + The model dimension + num_positions : int + the number of positions + """ + def __init__(self, d_model, num_positions): + super().__init__() + self.num_positions = num_positions + self.d_model = d_model + self.num_positions = num_positions + self.lin_p = Linear( + input_size=d_model, + n_neurons=num_positions * 2 + ) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The decoder output (Batch x Length x d_model) + + Returns + ------- + p : torch.Tensor + A tensor of shape (Batch x Length x num_positions x 2) where + p[:, :, :, 0] -> the probability of the ternary digit being at least 0 + p[:, :, :, 0] -> the probability of the ternary digit being at least 1 + """ + batch_size, max_len, _ = x.shape + p = self.sigmoid(self.lin_p(x)) + p = p.reshape(batch_size, max_len, self.num_positions, 2) + return p 
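# A plausible decoding for the head defined above (hedged): with
# p[..., 0] = P(digit >= 0) and p[..., 1] = P(digit >= 1), counting how many
# thresholds are exceeded recovers a digit in {-1, 0, 1}. The recipe's own
# ternary_logits_to_tokens may implement this differently.
#
#     p = head(dec_out)                    # [batch, length, num_positions, 2]
#     digits = (p >= 0.5).sum(dim=-1) - 1  # ternary digits in {-1, 0, 1}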
+ + +class TernaryInput(nn.Module): + def __init__(self, emb_size, num_positions): + super().__init__() + self.num_positions = num_positions + self.in_proj = Linear( + input_size=num_positions * 3, + n_neurons=emb_size, + ) + + def forward(self, x): + batch_size, max_len = x.shape[:2] + x_onehot = torch.nn.functional.one_hot( + (x + 1).long(), + 3 + ).reshape(batch_size, max_len, self.num_positions * 3) + in_proj = self.in_proj(x_onehot.float()) + return in_proj + + class TokotronTransformerAutoregressiveInference(nn.Module): """A greedy autoregressive inference implementation @@ -439,6 +508,8 @@ def __init__( representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, show_inference_progress=True, + transform_audio=None, + feed_audio=None ): super().__init__() self.decoder = None @@ -451,6 +522,10 @@ def __init__( self.representation_mode = RepresentationMode(representation_mode) self.audio_dim = audio_dim self.show_inference_progress = show_inference_progress + if transform_audio is None: + transform_audio = nn.Identity() + self.transform_audio = transform_audio + self.feed_audio = feed_audio def bind(self, model): """Binds this inference implementation to a model @@ -522,6 +597,7 @@ def forward(self, enc_out, length, emb=None): steps_range = tqdm(steps_range, desc="Inference") for idx in steps_range: # One autoregressive step + audio = self.transform_audio(audio) step_out = self.decoder.forward( enc_out=enc_out, src_length=length, @@ -530,7 +606,9 @@ def forward(self, enc_out, length, emb=None): ) audio_out = step_out.out - if self.representation_mode == RepresentationMode.DISCRETE: + if self.feed_audio: + audio_out = self.feed_audio(audio_out) + elif self.representation_mode == RepresentationMode.DISCRETE: audio_out = audio_out.argmax(-1) # The model outputs predictions without BOS. 
Add the BOS back for the @@ -701,11 +779,13 @@ def __init__( eos_mode=EosMode.GATE, inference=None, audio_token_shift=0, - decoder_mode=DecoderMode.AUTOREGRESSIVE, scale_factor=5.0, representation_mode=RepresentationMode.DISCRETE, audio_dim=1024, emb=None, + audio_emb=None, + out_proj=None, + multihead_input=False ): super().__init__() self.in_emb = Embedding( @@ -724,11 +804,6 @@ def __init__( activation=activation, normalize_before=True, ) - self.decoder_mode = DecoderMode(decoder_mode) - audio_emb = None - if self.decoder_mode == DecoderMode.FORWARD: - audio_emb = nn.Identity() - audio_emb_size = d_model self.decoder = TokotronTransformerDecoder( num_tokens=audio_num_tokens + self.audio_token_shift, tokens_per_step=audio_tokens_per_step, @@ -748,9 +823,11 @@ def __init__( gate_threshold=gate_threshold, gate_offset=gate_offset, audio_token_shift=audio_token_shift, - multihead_input=self.decoder_mode == DecoderMode.AUTOREGRESSIVE, + multihead_input=multihead_input, + multihead_output=out_proj is None, representation_mode=representation_mode, audio_dim=audio_dim, + out_proj=out_proj, ) self.bos_idx = bos_idx self.attention_type = attention_type @@ -904,17 +981,11 @@ def forward( src_key_padding_mask=src_key_padding_mask, pos_embs=pos_embs_encoder, ) - if self.decoder_mode == DecoderMode.AUTOREGRESSIVE: - tgt = audio - tgt_length = audio_length - else: - tgt = scale(enc_out, self.scale_factor) - tgt_length = input_length enc_out = self.add_emb(enc_out, emb) dec_out = self.decoder( enc_out=enc_out, - tgt=tgt, - tgt_length=tgt_length, + tgt=audio, + tgt_length=audio_length, src_length=input_length, src_key_padding_mask=src_key_padding_mask, pos_embs_src=pos_embs_encoder, @@ -1218,6 +1289,7 @@ def __init__( representation_mode=RepresentationMode.DISCRETE, audio_clip_min=-10.0, audio_clip_max=10.0, + multihead_output=True, ): super().__init__() self.guided_attention_weight = guided_attention_weight @@ -1246,6 +1318,7 @@ def __init__( self.register_buffer("audio_eos", audio_eos) self.audio_clip_min = audio_clip_min self.audio_clip_max = audio_clip_max + self.multihead_output = multihead_output def forward( self, @@ -1278,9 +1351,12 @@ def forward( out = out.log_softmax(dim=-1) batch_size, out_len, heads, tok_dim = out.shape max_len = out_len - 1 - out_reshaped = ( - out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) - )[:, :max_len] + if self.multihead_output: + out_reshaped = ( + out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) + )[:, :max_len] + else: + out_reshaped = out if self.eos_mode == EosMode.TOKEN: # NOTE: Shift only the tokens, but not EOS padding_lengths = torch.ones(batch_size, device=audio.device) @@ -1294,7 +1370,10 @@ def forward( ) tok_len = audio.size(1) - if self.representation_mode == RepresentationMode.DISCRETE: + if not self.multihead_output: + audio_reshaped = audio + lengths_reshaped = audio_length + elif self.representation_mode == RepresentationMode.DISCRETE: audio_reshaped = audio.transpose(1, 2).reshape( batch_size * heads, max_len ) @@ -1313,18 +1392,21 @@ def forward( ) audio_reshaped = audio_reshaped[:, :max_len] - lengths_reshaped = ( - audio_length.unsqueeze(-1) - .expand(batch_size, heads) - .reshape(batch_size * heads) - ) + if self.multihead_output: + lengths_reshaped = ( + audio_length.unsqueeze(-1) + .expand(batch_size, heads) + .reshape(batch_size * heads) + ) + else: + lengths_reshaped = audio_length seq_loss = self.seq_cost( out_reshaped[:, :tok_len], audio_reshaped, length=lengths_reshaped, reduction=reduction, ) - if 
reduction == "batch": + if reduction == "batch" and self.multihead_output: seq_loss = seq_loss.reshape(batch_size, heads).mean(-1) lengths_abs = audio_length * out_len @@ -2252,3 +2334,177 @@ def use_silence_padding(dataloader_opts, silence_token, token_keys): token_collate_fn, silence_token=silence_token, token_keys=token_keys ), } + + +def ternary_matrix_to_decimal(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. + """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 **torch.arange(D) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, None] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + +def logits_to_ternary(logits): + """Converts a tensor with two logits to a ternary matrix + + Arguments + --------- + logits : torch.Tensor + The logits (Batch x Length x num_positions x 2) + + Returns + ------- + result : torch.Tensor + The corresponding ternary matrix + """ + gte0 = logits[..., 0] >= 0.5 + gte1 = logits[..., 1] >= 0.5 + val_minus_1 = torch.tensor(-1, device=logits.device) + val_zero = torch.tensor(0, device=logits.device) + val_plus_1 = torch.tensor(1, device=logits.device) + return torch.where( + gte0, + torch.where( + gte1, + val_plus_1, + val_zero + ), + val_minus_1 + ) + +def ternary_matrix_to_decimal(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. 
+ """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, None] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + +def ternary_to_decimal(ternary, n_codebook=4): + """Converts ternary digits to their decimal equivalent + + Arguments + --------- + ternary : torch.Tensor + (Batch x Length x num_positions) - ternary digits + n_codebooks : torch.Tensor + The number of coedbooks""" + chunks = ternary.chunk(n_codebook, dim=1) + codec_ls = [] + # TODO: Vectorize + for i, chunk in enumerate(chunks): + chunk = chunk + 1 + tmp_codec = ternary_matrix_to_decimal(chunk) + codec_ls.append(tmp_codec) + codec_ls = torch.stack(codec_ls) + return codec_ls.permute(1, 2, 0) + + +def ternary_logits_to_tokens(logits): + """Converts ternary logits to tokens (as used for SQ-Codec) + + Arguments + --------- + logits : torch.Tensor + The logits + + Returns + ------- + tokens : torch.Tensor + Token IDs + """ + ternary_matrix = logits_to_ternary(logits) + tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2)) + return tokens + + +def tokens_to_ternary(tokens): + """Converts a sequence of tokens to a ternary matrix + + Arguments + --------- + tokens : torch.Tensor + A (Batch x Length x Codebooks) tensor of tokens + + Returns + ------- + result : t""" + batch_size = tokens.size(0) + n_codebook = tokens.size(2) + tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() + ternary_matrix = torch.cat([ + decimal_to_ternary_matrix(item, D=9) - 1 + for item in tokens + ], dim=1) + return ternary_matrix.transpose(1, 2) + + +def ternary_loss(predictions, targets, length=None, reduction="mean"): + tgt_gte0 = targets >= 0. + tgt_gte1 = targets >= 1. + loss_gte0 = bce_loss( + predictions[:, :, :, 0], + tgt_gte0, + length=length, + reduction=reduction, + ) + loss_gte1 = bce_loss( + predictions[:, :, :, 0], + tgt_gte1, + length=length, + reduction=reduction, + ) + loss = loss_gte0 + loss_gte1 + return loss \ No newline at end of file From a99fddb94def647a02171c13cf93702901fcc34d Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 2 Feb 2025 23:54:39 -0500 Subject: [PATCH 103/270] DASB: Device fix --- benchmarks/DASB/model/sq_codec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 0e1ffe3f8..7901675e1 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1300,7 +1300,7 @@ def decimal_to_ternary_matrix(decimals, D): corresponds to a batch, and each column is represented as a ternary number. 
""" B, T = decimals.shape - ternary_matrix = torch.zeros((B, D, T), dtype=torch.long) + ternary_matrix = torch.zeros((B, D, T), dtype=torch.long, device=decimals.device) for pos in range(D): ternary_matrix[:, pos, :] = decimals % 3 # Modulo operation decimals //= 3 # Floor division for next ternary digit From 650cf2e60d76bd289334d6597232270c6f04b510 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 3 Feb 2025 11:49:16 -0500 Subject: [PATCH 104/270] DASB: Tokotron: Add the ability to add an "initialization model" when no checkpoint is available --- .../TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 13 ++++++++++++- benchmarks/DASB/model/Tokotron.py | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 0b128b7a9..233aee30a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -38,7 +38,7 @@ frozen_split_path: null sample_path: null progress_folder: !ref /progress progress_current: !ref /current -progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 590fb10f7..fa65e2d10 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -353,9 +353,20 @@ def on_fit_start(self): if self.checkpointer is not None and not getattr( self, "_ckpt_recovered", False ): - self.checkpointer.recover_if_possible() + checkpoint = self.checkpointer.recover_if_possible() + if not checkpoint: + self.check_init() self._ckpt_recovered = True + def check_init(self): + init_from = getattr(self.hparams, "init_from", None) + if init_from is not None: + init_from_path = Path(init_from) + model_path = init_from_path / "model.ckpt" + with open(model_path, "rb") as model_file: + model_state_dict = torch.load(model_file, map_location=self.device) + self.modules.model.load_state_dict(model_state_dict) + @torch.no_grad() def evaluate_batch(self, batch, stage): """Evaluate one batch, override for different procedure than train. 
diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index c795f049b..92a1cbd49 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -785,7 +785,7 @@ def __init__( emb=None, audio_emb=None, out_proj=None, - multihead_input=False + multihead_input=True ): super().__init__() self.in_emb = Embedding( From b43b5652c6c94075ab2585978695b22807c0de99 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 3 Feb 2025 12:05:52 -0500 Subject: [PATCH 105/270] DASB: A small fix for cases where strides are not compatble (not necessarily a bug - it depends on how the tensor was obtained) --- benchmarks/DASB/utils/tokenizer_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index be73fda74..0ab019b58 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -499,7 +499,7 @@ def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): self.eval() - signal = self.decode(tokens.view(tokens.shape[0], -1), **kwargs) + signal = self.decode(tokens.reshape(tokens.shape[0], -1), **kwargs) return signal.squeeze(1) @torch.no_grad() From 693d499c883bb893e185bed36be8402616ef4669 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 13:36:13 -0500 Subject: [PATCH 106/270] DASB: Extra logging --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index fa65e2d10..2d5ff461a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -361,11 +361,13 @@ def on_fit_start(self): def check_init(self): init_from = getattr(self.hparams, "init_from", None) if init_from is not None: + logger.info("Initializing with pre-trained weights from %s", init_from) init_from_path = Path(init_from) model_path = init_from_path / "model.ckpt" with open(model_path, "rb") as model_file: model_state_dict = torch.load(model_file, map_location=self.device) self.modules.model.load_state_dict(model_state_dict) + logger.info("Successfully initialized with pre-trained weights from %s", init_from) @torch.no_grad() def evaluate_batch(self, batch, stage): From 7b79ffcc42e70e1864e866231b23a0e5941eab71 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 14:58:09 -0500 Subject: [PATCH 107/270] DASB: Fix maximum validation set size --- benchmarks/DASB/LibriTTS/extraction/extract.py | 1 + benchmarks/DASB/LibriTTS/libritts_prepare.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py index 87de6f84b..a3db84984 100644 --- a/benchmarks/DASB/LibriTTS/extraction/extract.py +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -51,6 +51,7 @@ "save_json_test": hparams["test_json"], "sample_rate": hparams["sample_rate"], "skip_prep": hparams["skip_prep"], + "max_valid_size": None }, ) diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py index 6d0ca9f0a..cb26eb085 100644 --- a/benchmarks/DASB/LibriTTS/libritts_prepare.py +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -105,7 +105,7 @@ def prepare_libritts( if valid_split: wav_list = prepare_split(data_folder, valid_split) # 
TODO add better way to speedup evaluation - if len(wav_list) > max_valid_size: + if max_valid_size is not None and len(wav_list) > max_valid_size: wav_list = random.sample(wav_list, max_valid_size) create_json(wav_list, save_json_valid, sample_rate, model_name) if test_split: From 24bebfe7fcda4ea400b6c028503b94b065ed7555 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 17:21:12 -0500 Subject: [PATCH 108/270] DASB: Add the ability to change the saved folder for Encodec --- benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml index d3cd83c3e..b7ae76969 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/encodec save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -47,7 +48,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 7ede118828fa5631b067e6e72e7e69e2b27a052a Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 21:59:16 -0500 Subject: [PATCH 109/270] DASB: Fixes --- .../tokotron/hparams/train_discrete_ssl.yaml | 34 ++----------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 233aee30a..2db7cd944 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -70,8 +70,8 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice hubert: speechbrain/hifigan-hubert-k1000-LibriTTS wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS -ssl_model_layers: [1, 3, 7, 12, 18, 23] -token_model_layers: !ref +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref select_layers: null token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec @@ -166,15 +166,6 @@ gate_offset: !apply:Tokotron.distance_diff_loss_ramp silence_padding: !ref use_silence_padding: True -# Guides -guides_enabled: False -guides_start_epoch: 40 -guides_spk: False -guides_spk_discrete: True -guides_spk_loss_weight: 0.2 -guides_asr: True -guides_asr_loss_weight: 0.1 - # Token model (pretrained) ssl_model: !apply:speechbrain.utils.hparams.choice @@ -201,14 +192,6 @@ spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_h source: !ref savedir: !ref /ecapa -spk_emb_discrete_guide: !name:speechbrain.inference.interfaces.foreign_class - source: !ref - savedir: !ref /ecapa- - pymodule_file: custom_interface.py - classname: DiscreteSpkEmb - overrides: - ssl_layer_num_selected: !ref - # Dataloader options train_dataloader_opts: batch_size: !ref @@ -240,19 +223,8 @@ sample_dataloader_opts: value: !ref token_model_kwargs: - SSL_layers: !ref + SSL_layers: !ref -extract_features_opts: - dataloader_opts: - batch_size: !ref - 
num_workers: !ref - ssl_model: !ref - ssl_model_layers: !ref - token_model_layers: !ref - sample_rate: !ref - model_sample_rate: !ref - spk_emb_model: !ref - data_folder_alignments: !ref ####################### Model parameters ########################### # Transformer d_model: 512 From 123248d9099acf22faa3c7faedcbb92dca7b3ac3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 23:23:55 -0500 Subject: [PATCH 110/270] DASB: Tokotron: Fixes --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 2d5ff461a..47c3b8939 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -827,7 +827,10 @@ def read_token_list(file_name): result: list a list of tokens """ - if not Path(file_name).exists(): + file_name = Path(file_name) + if not file_name.is_absolute(): + file_name = Path(__file__).parent / "hparams" / file_name + if not file_name.exists(): raise ValueError(f"Token file {file_name} not found") with open(file_name) as token_file: return [line.strip("\r\n") for line in token_file if line] From 0b11188b9d32b2f8745c0b983902b1c7561d507c Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 23:34:36 -0500 Subject: [PATCH 111/270] DASB: Tokotron: Fixes --- .../LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 2db7cd944..efcde8c58 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -149,8 +149,8 @@ debug_infer_max_audio_length: 10 # Label encoder label_encoder: !new:speechbrain.dataio.encoder.TextEncoder -token_list_file_text: ./hparams/char_en.txt -token_list_file_phn: ./hparams/arpabet.txt +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt token_list_file: !apply:speechbrain.utils.hparams.choice value: !ref choices: From 4eaa7cdf6e6d50e09437e245bef02eb88aeb26e9 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 4 Feb 2025 23:52:36 -0500 Subject: [PATCH 112/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/tokotron/train.py | 59 ++++++++----------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 47c3b8939..f11269392 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -163,38 +163,6 @@ def _compute_spk(self, wav, wav_length): ) return spk_emb_pred - def _get_selected_layer_idx(self): - selected_layers = None - if ( - hasattr(self.hparams, "select_layers") - and self.hparams.select_layers - ): - layers = self.hparams.select_layers - model_layers_map = { - layer: idx - for idx, layer in enumerate(self.hparams.token_model_layers) - } - selected_layers = [model_layers_map[layer] for layer in layers] - return selected_layers - - # TODO: Move this elsewhere - def select_layers(self, audio_ssl): - """Applies layer squishing, if enabled - - Arguments - --------- - audio_ssl : torch.Tensor - SSL features - - Returns - ------- - audio_ssl : torch.Tensor - SSL features, squished if enabled - """ - if self.layer_idx: - audio_ssl = audio_ssl[:, :, 
self.layer_idx] - return audio_ssl - def compute_objectives(self, predictions, batch, stage): """Computes the loss given the predicted and targeted outputs. We here do multi-task learning and the loss is a weighted sum of the ctc + seq2seq @@ -258,7 +226,6 @@ def on_stage_start(self, stage, epoch): self.modules.vocoder, "model" ): self.modules.vocoder.model.device = self.device - self.layer_idx = self._get_selected_layer_idx() self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.hparams.compute_cost, batch_eval=True, ) @@ -558,13 +525,17 @@ def tokens_pipeline(label): ) tokens_loader = hparams.get("tokens_loader") + if "speech_model_layers" in hparams: + tokens_loader_kwargs = { + "num_codebooks": get_selected_layer_indexes(hparams) + } + else: + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") def audio_pipeline(id): - audio = tokens_loader.tokens_by_uttid( - id, num_codebooks=audio_tokens_per_step - ) + audio = tokens_loader.tokens_by_uttid(id, **tokens_loader_kwargs) audio_pad = feature_pad_to( audio, len(audio) + silence_padding_len, silence_padding ) @@ -813,6 +784,22 @@ def init_sequence_encoder(hparams): return encoder +def get_selected_layer_indexes(hparams): + """Finds the layers of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + def read_token_list(file_name): """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed one per line From 60e7d9eb21bc73f580f15d8d44c6c46ea4fc94ae Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Feb 2025 00:48:14 -0500 Subject: [PATCH 113/270] DASB: Tokotron: Fixes --- .../DASB/LibriTTS/TTS/tokotron/train.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index f11269392..9d18705e2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -333,7 +333,15 @@ def check_init(self): model_path = init_from_path / "model.ckpt" with open(model_path, "rb") as model_file: model_state_dict = torch.load(model_file, map_location=self.device) - self.modules.model.load_state_dict(model_state_dict) + tgt_state_dict = self.modules.model.state_dict() + ignore_keys = [] + for k, v in model_state_dict.items(): + if k in tgt_state_dict and tgt_state_dict[k].shape != v.shape: + logger.warning("Ignoring shape mismatch for %s", k) + ignore_keys.append(k) + for k in ignore_keys: + del model_state_dict[k] + self.modules.model.load_state_dict(model_state_dict, strict=False) logger.info("Successfully initialized with pre-trained weights from %s", init_from) @torch.no_grad() @@ -499,6 +507,9 @@ def tokens_pipeline(label): audio_tokens_per_step = len(hparams["token_model_layers"]) else: audio_tokens_per_step = hparams["audio_tokens_per_step"] + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes(hparams) if use_silence_padding: if representation_mode == RepresentationMode.DISCRETE: silence_padding = get_silence_token( @@ -514,6 +525,10 @@ def tokens_pipeline(label): ) silence_padding = silence_padding.cpu() + if layer_idx: + silence_padding = silence_padding[layer_idx] + else: + silence_padding = silence_padding[:audio_tokens_per_step] silence_padding_len = int(math.ceil(hparams["silence_padding"])) bos_width = hparams.get("bos_width", 1) audio_bos_prefix = ( @@ -525,9 +540,9 @@ def tokens_pipeline(label): ) tokens_loader = hparams.get("tokens_loader") - if "speech_model_layers" in hparams: + if layer_idx is not None: tokens_loader_kwargs = { - "num_codebooks": get_selected_layer_indexes(hparams) + "num_codebooks": layer_idx } else: tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} From 54df7ed4912375a393d3d89ba7dc22d8268f40dd Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Feb 2025 10:57:43 -0500 Subject: [PATCH 114/270] DASB: Tokotron LibriTTS: Fixes --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 9d18705e2..f167e2f64 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -89,8 +89,12 @@ def create_waveform(self, audio, length, emb): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) - clean_padding_(wav, length) + with torch.no_grad(): + wav = self.modules.tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + clean_padding_(wav, length) + wav = wav.to(self.device) return wav def compute_forward(self, batch, stage): @@ -279,6 +283,9 @@ def on_stage_start(self, stage, epoch): elif stage == 
sb.Stage.TEST: self.evaluator.on_evaluate_start(stage, epoch) self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed From 3aa7de39695cc308cd7471d18f5bb3974021570b Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 00:58:29 -0500 Subject: [PATCH 115/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 9e5c6826a..cbef6a840 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -240,7 +240,7 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 10f820221fd744a11151579e5399e328d5ccdc6e Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 01:05:30 -0500 Subject: [PATCH 116/270] DASB: Fixes --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index 3355ac511..c2ffc13bf 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -184,7 +184,7 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 2cd7c6a2a32bf6e1549d075d7f3036c7adb6c274 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 01:08:04 -0500 Subject: [PATCH 117/270] DASB: Fixes --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index c2ffc13bf..d16403558 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -184,7 +184,7 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index cbef6a840..c7ca08adc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -240,7 +240,7 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- tokenizer: 
!new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 7e1bf0f5286db1f7ac6d080d0a3d51383153ebfa Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 09:48:49 -0500 Subject: [PATCH 118/270] DASB: Fixes --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 1 - .../DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml | 2 +- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 3 +-- .../DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 1c0c765f7..b92a76255 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -233,7 +233,6 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rule eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref representation_mode: discrete diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 3090e9f79..9c8baf3bf 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -45,7 +45,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 86ebee501..b48bb66fa 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -236,13 +236,12 @@ model: !new:benchmarks.DASB.model.Tokotron.TokotronTransformerModel # yamllint d eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref emb: !ref tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref modules: model: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 85148db9d..931e448cd 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -43,7 +43,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref From 2c72caf7a1b998814cdff6f08682d3255b1892b0 Mon Sep 17 00:00:00 2001 From: 
flexthink Date: Thu, 6 Feb 2025 09:54:20 -0500 Subject: [PATCH 119/270] DASB: Fixes --- .../LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 2 -- .../DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index fb839c897..4b2fb6553 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -188,7 +188,7 @@ model: !new:model.Tokotron.TokotronTransformerModel # yamllint disable-line rul tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref modules: model: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index c7ca08adc..258065779 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -101,7 +101,6 @@ gate_loss_max_weight: 1. # Inference parameters inference_mode: autoregressive eos_mode: gate -decoder_mode: autoregressive scale_factor: 4 # Embedding Injection @@ -233,7 +232,6 @@ model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line- eos_mode: !ref infer_max_audio_length: !ref audio_token_shift: !ref - decoder_mode: !ref scale_factor: !ref emb: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 931e448cd..85148db9d 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -43,7 +43,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref From 4b5164420a701ebd093e32c308dc47ec153f8d6a Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 11:07:20 -0500 Subject: [PATCH 120/270] VALL-E: Cosmetic changes, hparams updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 10 +- benchmarks/DASB/model/valle.py | 406 +++++++++++++----- 2 files changed, 309 insertions(+), 107 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 8ee2a0468..3aa7690a1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -87,11 +87,12 @@ pad_index: 0 bos_index: 1 eos_index: 2 eot_index: 3 -special_tokens: ["", "", ""] -special_num_tokens: 4 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr: 0.002 # @orion_step1: --lr~"loguniform(0.00001,0.005)" lr_warmup_steps: 10000 lr_annealing_mode: step betas: [0.9, 0.95] @@ -101,7 +102,8 @@ sample_rate: 24000 
model_sample_rate: 24000 max_audio_length: 2000 text_max_length: 500 -n_ctx: !ref + +spk_prompt_length: 150 +n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 debug_infer_max_audio_length: 10 diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index b85e68345..ab233efff 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -14,7 +14,7 @@ import logging import torch -from typing import Dict, Tuple, Optional +from typing import Tuple, Optional from speechbrain.dataio.dataio import length_to_mask from torch import Tensor @@ -144,6 +144,13 @@ def forward( Lengths of condition part in dec_seq (B,). nar_level_idx : int the index of the non-autoregressive level to train + + Returns + ------- + logits_ar : torch.Tensor + Autoregressive predictions + logits_nar : torch.Tensor + Non-autoregressive predictions """ assert dec_seq.dim() == 3 @@ -202,11 +209,7 @@ def prepare_input(self, dec_seq_emb, prefix_len, level): @torch.no_grad() def inference( - self, - prefix, - opts, - enc_seq=None, - suffix=None, + self, prefix, opts, enc_seq=None, suffix=None, ): """Vall-E Inference. @@ -221,6 +224,13 @@ def inference( suffix : torch.Tensor suffix part of dec_seq (B, T, nq), usually the target sequence for teacher-forcing. + + Returns + ------- + gen_tokens_list : list + Generated tokens + gen_scores_list : list + The scores associated with the generated tokens """ # (1) initialization @@ -263,8 +273,7 @@ def inference( mask = modality_index_to_mask(modality_index, opts) mask_cache = [] modality_tokens = torch.tensor( - list(opts.masks.keys()), - device=prefix.device + list(opts.masks.keys()), device=prefix.device ) for step in range(maxlen): @@ -292,14 +301,11 @@ def inference( # (3.3) detect modality swtich mask_cache.append(mask.clone()) - modality_change_mask = torch.isin( - prev_tok[:, 0], - modality_tokens - ) + modality_change_mask = torch.isin(prev_tok[:, 0], modality_tokens) # Note: The ESPNET VALL-E had # modality_change_mask = torch.logical_and( # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, - #) + # ) if torch.any(modality_change_mask): modality_index = torch.where( modality_change_mask, prev_tok[:, 0], modality_index, @@ -434,14 +440,33 @@ def _initialize(self): class ResidualAttentionBlock(nn.Module): + """A VALL-E residual attention block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of heads + cross_attention : bool + Whether to use cross-attention + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + """ + def __init__( self, - n_state: int, - n_head: int, - cross_attention: bool = False, - causal: bool = False, - qk_norm: bool = False, - dropout: float = 0.0, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, ): super().__init__() @@ -471,12 +496,20 @@ def __init__( self.mlp_dropout = nn.Dropout(p=dropout) def forward( - self, - x: Tensor, - xa: Optional[Tensor] = None, - mask: Optional[Tensor] = None, - kv_cache: Optional[dict] = None, + self, x, xa=None, mask=None, kv_cache=None, ): + """The forward pass implementation + + Arguments + --------- + x : torch.Tensor + the feature tensor + xa : torch.Tensor + The tensor for cross-attention + mask : torch.Tensor + The attention mask to be applied + + """ x = x + self.attn_dropout( self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) ) @@ -491,15 +524,37 @@ def forward( class TransformerDecoder(nn.Module): def __init__( self, - n_ctx: int, - n_state: int, - n_head: int, - n_layer: int, - causal: bool = True, - qk_norm: bool = False, - dropout: float = 0.0, + n_ctx, + n_state, + n_head, + n_layer, + causal=True, + qk_norm=False, + dropout=0.0, layer_class=ResidualAttentionBlock, ): + """A custom transformer decoder implementation for VALL-E + + Arguments + --------- + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + layer_class : type + The layer type to be used + """ super().__init__() self.pos_emb = nn.Embedding(n_ctx, n_state) @@ -523,11 +578,24 @@ def __init__( self.kv_cache = None def forward( - self, - x: Tensor, - mask: torch.Tensor = None, - kv_cache: Optional[dict] = None, + self, x, mask=None, kv_cache=None, ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + the feature tensor + mask : torch.Tensor + The attention mask to be applied + kv_cache : dict + The key/value cache (for inference) + + Returns + ------- + result : torch.Tensor + The decoder output + """ if self.causal and mask is not None: raise ValueError("Causal Transformer dones't allow mask") @@ -541,17 +609,33 @@ def forward( return x def init(self): + """Initializes the key/value cache and the hooks to update it""" self.kv_cache, self.hooks = install_kv_cache_hook(self, self.kv_cache) return self.kv_cache - def reset(self,): + def reset(self): + """Resets the key-value cache""" for hook in self.hooks: hook.remove() self.kv_cache = None class LayerNorm(nn.LayerNorm): - def forward(self, x: Tensor) -> Tensor: + """A layer normalziation wrapper""" + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor to be normalized + + Returns + ------- + result : torch.Tensor + A normalzied tensor + """ return super().forward(x.float()).type(x.dtype) @@ -565,14 +649,35 @@ def forward(self, x: Tensor) -> Tensor: class ResidualAttentionBlockAdaLN(ResidualAttentionBlock): + """"The Vall-E Adaptive Residual Attention Block + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of states + n_head : int + The number of attention heads + cross_attention : bool + The number of attention heads + 
causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + def __init__( self, - n_state: int, - n_head: int, - cross_attention: bool = False, - causal: bool = False, - qk_norm: bool = False, - dropout: float = 0.0, + n_state, + n_head, + cross_attention=False, + causal=False, + qk_norm=False, + dropout=0.0, ): super(ResidualAttentionBlockAdaLN, self).__init__( n_state=n_state, @@ -587,13 +692,23 @@ def __init__( self.mlp_ln = AdaLN(n_state) def forward( - self, - x: Tensor, - level: Tensor, - xa: Optional[Tensor] = None, - mask: Optional[Tensor] = None, - kv_cache: Optional[dict] = None, + self, x, level, xa=None, mask=None, kv_cache=None, ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ x = x + self.attn_dropout( self.attn(self.attn_ln(x, level), mask=mask, kv_cache=kv_cache) ) @@ -610,17 +725,40 @@ def forward( class ValleNARDecoder(TransformerDecoder): def __init__( self, - n_level: int, - n_ctx: int, - n_state: int, - n_head: int, - n_layer: int, - causal: bool = False, - qk_norm: bool = False, - dropout: float = 0.0, + n_level, + n_ctx, + n_state, + n_head, + n_layer, + causal=False, + qk_norm=False, + dropout=0.0, layer_class=ResidualAttentionBlockAdaLN, ): + """The VALL-E non-autoregressive decoder + Arguments + --------- + n_level : int + The number of levels + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of attention heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + layer_class : type + The layer class to use + """ super().__init__( n_ctx=n_ctx, n_state=n_state, @@ -636,12 +774,21 @@ def __init__( self.ln = AdaLN(n_state) def forward( - self, - x: Tensor, - level: Tensor, - mask: Tensor = None, - kv_cache: Optional[dict] = None, + self, x, level, mask=None, kv_cache=None, ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + level : torch.Tensor + The level numbers for each batch element + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ if self.causal and mask is not None: raise ValueError("mask is not allowed when causal") @@ -658,13 +805,25 @@ def forward( class MultiHeadAttention(nn.Module): + """A Multi-Head Attention implementation + + Arguments + --------- + n_state : int + The number of states + n_head : int + The number of attention heads + causal : bool + Whether to operate in causal mode (i.e. 
avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + """ + def __init__( - self, - n_state: int, - n_head: int, - causal: bool = False, - qk_norm: bool = False, - dropout: float = 0.0, + self, n_state, n_head, causal=False, qk_norm=False, dropout=0.0, ): super().__init__() assert n_state % n_head == 0 @@ -681,23 +840,22 @@ def __init__( self.q_norm = LayerNorm(n_state // n_head) self.k_norm = LayerNorm(n_state // n_head) - if not hasattr(F, "scaled_dot_product_attention"): - raise ValueError("Install torch 2.0.1+ to support Flash Attention") - - try: - from flash_attn import flash_attn_func - - self.flash_attn_func = flash_attn_func - except ImportError: - self.flash_attn_func = None - def forward( - self, - x: Tensor, - xa: Optional[Tensor] = None, - mask: Optional[Tensor] = None, - kv_cache: Optional[dict] = None, + self, x, xa=None, mask=None, kv_cache=None, ): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The source tensor + xa : torch.Tensor + The sequence for cross attention + mask : torch.Tensor + The attention mask + kv_cache : dict + The key/value cache (for inference) + """ q = self.query(x) if kv_cache is None or xa is None or self.key not in kv_cache: @@ -714,9 +872,23 @@ def forward( return self.out(wv) - def qkv_attention( - self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None - ): + def qkv_attention(self, q, k, v, mask=None): + """Computes self-attention + + Arguments + --------- + q : torch.Tensor + The queries tensor + k : torch.Tensor + The keys tensor + v : torch.Tensor + The values tensor + + Returns + ------- + wv : torch.Tensor + The attention output + """ if self.causal and mask is not None: raise ValueError("mask is not allowed when the attention is causal") @@ -732,16 +904,6 @@ def qkv_attention( if self.qk_norm: q = self.q_norm(q) k = self.k_norm(k) - - if self.flash_attn_func is not None and mask is None and self.training: - wv = self.flash_attn_func( - q.transpose(1, 2), - k.transpose(1, 2), - v.transpose(1, 2), - dropout_p=self.dropout, - causal=causal, - ).flatten(start_dim=2) - else: wv = ( F.scaled_dot_product_attention( q, k, v, mask, is_causal=causal, dropout_p=self.dropout @@ -754,6 +916,17 @@ def qkv_attention( class AdaLN(nn.Module): + """Adaptive Layer Normalization, a Layer Norm implementation + that learns an affine transformation based on the level + embedding + + Arguemnts + --------- + n_state : int + The number of states + eps : float + The layer norm epsilon parameter""" + def __init__(self, n_state, eps=1e-5): super().__init__() self.weight = nn.Linear(n_state, n_state, bias=False) @@ -764,7 +937,16 @@ def __init__(self, n_state, eps=1e-5): self.n_state = n_state self.eps = eps - def forward(self, x: Tensor, level_emb: Tensor): + def forward(self, x, level_emb): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The tensor + level_emb : torch.Tensor + The level embedding + """ w = self.weight(level_emb).unsqueeze(1) b = self.bias(level_emb).unsqueeze(1) x = nn.functional.layer_norm(x, (self.n_state,), eps=self.eps) @@ -773,6 +955,22 @@ def forward(self, x: Tensor, level_emb: Tensor): def install_kv_cache_hook(model, cache): + """Sets up the key/value cache hook + + Arguments + --------- + model : torch.nn.Module + The model + cache : dict + The cache content + + Returns + ------- + cache : torch.Tensor + The cache dictionary (new or copied) + hooks : torch.Tensor + The installed hooks + """ cache 
= {**cache} if cache is not None else {} hooks = [] @@ -794,12 +992,7 @@ def install_hooks(layer: torch.nn.Module): def logits_to_tokens( - logits: torch.Tensor, - opts: SpeechLMInferenceOptions, - mask: torch.Tensor, - search_algo: str = None, - allow_eos: bool = True, - nq_level: int = None, + logits, opts, mask, search_algo=None, allow_eos=True, nq_level=None, ): """ Select the generated tokens and their scores based on logits prediction. @@ -818,6 +1011,13 @@ def logits_to_tokens( whether to allow end-of-sentence prediction nq_level : int, optional if not None, only conpute the specified codec level nq. + + Returns + ------- + gen_token_idx : torch.Tensor + The token indexes + gen_token_score : torch.Tensor + The token scores """ assert logits.dim() == 4 From 748cc860b30bb54d49e6ec4a45a130924346c637 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 11:35:54 -0500 Subject: [PATCH 121/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 3aa7690a1..be7e49c81 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -23,7 +23,7 @@ data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER # e.g., path/to/cache data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref /prepared -pretrained_model_save_folder: !ref +prepare_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False data_mode: lite From 858b5d47ef35736ea7511cd2aa1fd50aad1cff36 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 11:37:28 -0500 Subject: [PATCH 122/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index be7e49c81..4b1a44a29 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -22,7 +22,6 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. 
data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER # e.g., path/to/cache data_folder_alignments: null # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared prepare_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False From 7a5ea84ca99b33f027459bbc2e2120ddf5e1e456 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 11:38:44 -0500 Subject: [PATCH 123/270] DASB: Fixes --- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 4b1a44a29..5c8f608d2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -23,6 +23,7 @@ data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER # e.g., path/to/cache data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref +pretrained_model_save_folder: !ref prepare_archive_path: null prepare_skip_ignore_folders: False data_mode: lite From 30ee0c06f32efbc0d3c3be9290217f00810b1c14 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 14:29:52 -0500 Subject: [PATCH 124/270] DASB: Fix prefix masking for VALL-E --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 3 ++- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 92ea570da..dd619fede 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -152,7 +152,7 @@ def compute_objectives(self, predictions, batch, stage): prompt_length * prompt_max_len, prompt_max_len ) prefix_mask = length_to_mask( - prefix_length, prompt_max_len + prefix_length * prompt_max_len, prompt_max_len ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] @@ -480,6 +480,7 @@ def dataio_prepare(hparams): "valid": hparams["valid_json"], "test": hparams["test_json"], } + label_encoder = hparams["label_encoder"] input_feature = INPUT_FEATURE_MAP[hparams["input"]] offsets = get_offsets( diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 4f11022f4..6b7b7d207 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -150,7 +150,7 @@ def compute_objectives(self, predictions, batch, stage): prompt_length * prompt_max_len, prompt_max_len ) prefix_mask = length_to_mask( - prefix_length, prompt_max_len + prefix_length * prompt_max_len, prompt_max_len ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] From 3d89d2d6bd7194d9f31300ea431856b036310113 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 15:06:00 -0500 Subject: [PATCH 125/270] DASB: Update loss calculation to match ESPNet --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index dd619fede..bfcd76403 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -176,7 +176,7 @@ def compute_objectives(self, predictions, batch, stage): mask=mask, reduction="batch", ) - loss = loss_ar + loss_nar + loss = 0.5 * (loss_ar + loss_nar) return 
loss def on_stage_start(self, stage, epoch): diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 6b7b7d207..d51d53878 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -174,7 +174,7 @@ def compute_objectives(self, predictions, batch, stage): mask=mask, reduction="batch", ) - loss = loss_ar + loss_nar + loss = 0.5 * (loss_ar + loss_nar) return loss def on_stage_start(self, stage, epoch): From 779bf9932adf4921af172dc80baac4be935b4a1e Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 15:20:43 -0500 Subject: [PATCH 126/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index bfcd76403..c21f29c2d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -152,7 +152,7 @@ def compute_objectives(self, predictions, batch, stage): prompt_length * prompt_max_len, prompt_max_len ) prefix_mask = length_to_mask( - prefix_length * prompt_max_len, prompt_max_len + prefix_length, prompt_max_len ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d51d53878..18adcf75f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -150,7 +150,7 @@ def compute_objectives(self, predictions, batch, stage): prompt_length * prompt_max_len, prompt_max_len ) prefix_mask = length_to_mask( - prefix_length * prompt_max_len, prompt_max_len + prefix_length, prompt_max_len ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] From 92c40b69e0ae69d0be0c4323ec25bbaeaf1cc73c Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 15:29:36 -0500 Subject: [PATCH 127/270] VALL-E: Hyperparameter updates --- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 5 +++-- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index f4a003a0d..827ffe2e3 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -64,6 +64,7 @@ token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -88,8 +89,8 @@ special_tokens: ["", "", ""] special_num_tokens: 4 # stages related parameters -lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" -lr_warmup_steps: 10000 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 lr_annealing_mode: step guided_attention_weight: 50.0 guided_attention_sigma: 0.5 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 5c8f608d2..2f53cc210 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -92,7 +92,7 @@ special_tokens: ["", "", "", ""] special_num_tokens: 5 # stages related parameters -lr: 0.002 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" lr_warmup_steps: 10000 lr_annealing_mode: step betas: [0.9, 0.95] diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index f1981bd88..4bbde09be 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -78,8 +78,8 @@ special_tokens: ["", "", "", ""] special_num_tokens: 5 # stages related parameters -lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.005)" -lr_warmup_steps: 10000 +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 lr_annealing_mode: step betas: [0.9, 0.95] From 56187971ac86b28621b63df23765449aa63d6d94 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Feb 2025 21:43:04 -0500 Subject: [PATCH 128/270] DASB: Fix the sample rate --- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 2f53cc210..dc6fe1ebc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -93,13 +93,13 @@ special_num_tokens: 5 # stages related parameters lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" -lr_warmup_steps: 10000 +lr_warmup_steps: 70000 lr_annealing_mode: step betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 -model_sample_rate: 24000 +model_sample_rate: 16000 max_audio_length: 2000 text_max_length: 500 spk_prompt_length: 150 From 71cd31618563e629564a7bb15a2bea2b20c29ccb Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 01:07:57 -0500 Subject: [PATCH 129/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 4bbde09be..1a3e332c2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -194,7 +194,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False From 9e4c550841abe2b58c5c237481684457a487b872 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 12:23:44 -0500 Subject: [PATCH 130/270] DASB: Encodec: Small fix --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 1a3e332c2..8d415323d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -86,7 +86,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 
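Note: sample_rate above is the rate of the prepared LibriTTS audio, while model_sample_rate is the rate the tokenizer itself runs at (16 kHz for the discrete SSL tokenizer, as fixed a few hunks above; 24 kHz for EnCodec, so the two coincide here). The recipes are assumed to resample between the two before token extraction and after decoding; a minimal illustrative sketch of that step, not the recipe's actual code:

    import torch
    import torchaudio

    sample_rate = 24000        # rate of the prepared audio
    model_sample_rate = 16000  # rate expected by the discrete SSL tokenizer
    sig = torch.randn(1, sample_rate)  # one second of dummy audio
    if sample_rate != model_sample_rate:
        sig = torchaudio.functional.resample(sig, sample_rate, model_sample_rate)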
model_sample_rate: 24000 -max_audio_length: 2000 +max_audio_length: 2300 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + From 165eaac5b293f8a16cfa28a3be4328e4cf97bbb5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:06:53 -0500 Subject: [PATCH 131/270] DASB: Add Mimi, fix defaults for VALL-E Encodec --- .../TTS/tokotron/hparams/train_mimi.yaml | 278 ++++++++++++++++++ .../TTS/valle/hparams/train_encodec.yaml | 2 +- benchmarks/DASB/model/Tokotron.py | 11 +- 3 files changed, 284 insertions(+), 7 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml new file mode 100644 index 000000000..515537417 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -0,0 +1,278 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 
+ vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 8d415323d..57aac47c3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -168,7 +168,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -bandwidth: 1.5 +bandwidth: 6 attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 92a1cbd49..d86d52273 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -2142,7 +2142,8 @@ def get_silence_token( sample_length=100000, unsqueeze=False, device=None, - model_kwargs=None, + num_codebooks=None, + ): """Attempts to find out the silence tokens for a given model, if applicable @@ -2157,8 +2158,8 @@ def get_silence_token( Whether to add an extra dimension to the audio (needed for DAC) device : str | torch.Device The device to use - model_kwargs : dict - Additional arguments to pass to the model + num_codebooks : int | list + The number of codebooks or the codebooks to use Returns ------- @@ -2171,8 +2172,6 @@ def get_silence_token( """ if device is None: device = next(model.parameters()).device - if model_kwargs is None: - model_kwargs = {} audio = torch.zeros(1, sample_length, device=device) if unsqueeze: @@ -2180,7 +2179,7 @@ def get_silence_token( length = torch.ones(1, device=device) model_training = model.training model.eval() - tokens = model.sig_to_tokens(audio, length) + tokens = model.sig_to_tokens(audio, length, num_codebooks=num_codebooks) if model_training: model.train() tokens = 
tokens.squeeze(0) From c1b30dbf8af1e573cf02cb360c6dea0f53136942 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:21:19 -0500 Subject: [PATCH 132/270] DASB: mimi fixes --- benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index 515537417..ff173e8b5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -13,8 +13,9 @@ train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files +cached_data_folder: !PLACEHOLDER data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech -prepare_save_folder: !ref /prepared +prepare_save_folder: !ref pretrained_model_save_folder: !ref representation_mode: discrete vocoder_model_name: vocos From c3b647e0730c36fa4aa5d339f82df5e3ad882a56 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:29:16 -0500 Subject: [PATCH 133/270] DASB: add init_from --- benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index ff173e8b5..3c06d761f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -39,6 +39,7 @@ sample_path: null progress_folder: !ref /progress progress_current: !ref /current progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 From f27ebad0c99ea3857d0cb657632395f321ae981a Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:34:12 -0500 Subject: [PATCH 134/270] DASB: small updates --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index d40ec20f0..dd9d61762 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -567,7 +567,12 @@ def audio_ref_pipeline(wav): and representation_mode == RepresentationMode.DISCRETE ): silence_token = get_silence_token( - hparams[model_key], model_kwargs=hparams.get("token_model_kwargs"), + hparams[model_key], + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in hparams + else audio_tokens_per_step + ) ) if silence_token.dim() == 2: silence_token = silence_token.squeeze(-1) From 98408241422f8f87d1429bbdf3fe6e6116538c3c Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Feb 2025 18:35:08 -0500 Subject: [PATCH 135/270] DASB: small updates --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index f167e2f64..b160ab3a5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -521,7 +521,11 @@ def tokens_pipeline(label): if representation_mode == RepresentationMode.DISCRETE: silence_padding = get_silence_token( hparams["tokenizer"], - model_kwargs=hparams.get("token_model_kwargs"), + num_codebooks=( + hparams["speech_model_layers"] + if "speech_model_layers" in 
hparams + else audio_tokens_per_step + ) ) else: silence_padding = get_silence_repr(hparams["ssl_model"],) From b4afc68968fe82aab5d53378b709f360585e6b0f Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 8 Feb 2025 21:57:13 -0500 Subject: [PATCH 136/270] DASB: Add support for alignments --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_encodec.yaml | 1 + benchmarks/DASB/LibriTTS/libritts_prepare.py | 181 +++++++++++++++++- 3 files changed, 176 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index dc6fe1ebc..f8af5fb45 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -21,6 +21,7 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER # e.g., path/to/cache +alignments_folder: null data_folder_alignments: null # e.g., /path/to/LibriSpeech prepare_save_folder: !ref pretrained_model_save_folder: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 57aac47c3..1a125cc81 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -18,6 +18,7 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER cached_data_folder: !PLACEHOLDER +alignments_folder: null prepare_save_folder: !ref data_folder_alignments: null # e.g., /path/to/LibriSpeech pretrained_model_save_folder: !ref diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py index cb26eb085..6ec1a3a96 100644 --- a/benchmarks/DASB/LibriTTS/libritts_prepare.py +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -11,12 +11,14 @@ import torch import torchaudio +import re from tqdm import tqdm from speechbrain.inference.text import GraphemeToPhoneme from speechbrain.utils.data_utils import get_all_files from speechbrain.utils.logger import get_logger from speechbrain.utils.text_to_sequence import _g2p_keep_punctuations +from pathlib import Path logger = get_logger(__name__) LIBRITTS_URL_PREFIX = "https://www.openslr.org/resources/60/" @@ -38,6 +40,7 @@ def prepare_libritts( seed=1234, model_name=None, max_valid_size=500, + alignments_folder=None, skip_prep=False, ): """ @@ -75,6 +78,8 @@ def prepare_libritts( Seed value model_name : str Model name (used to prepare additional model specific data) + alignments_path : None + The path to alignments files skip_prep: Bool If True, skip preparation. 
@@ -101,16 +106,16 @@ def prepare_libritts( # If specific splits are provided, creates data manifest files accordingly if train_split: wav_list = prepare_split(data_folder, train_split) - create_json(wav_list, save_json_train, sample_rate, model_name) + create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name) if valid_split: wav_list = prepare_split(data_folder, valid_split) # TODO add better way to speedup evaluation if max_valid_size is not None and len(wav_list) > max_valid_size: wav_list = random.sample(wav_list, max_valid_size) - create_json(wav_list, save_json_valid, sample_rate, model_name) + create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name) if test_split: wav_list = prepare_split(data_folder, test_split) - create_json(wav_list, save_json_test, sample_rate, model_name) + create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name) if skip(save_json_train, save_json_valid, save_json_test): logger.info("Preparation completed.") @@ -124,12 +129,12 @@ def prepare_libritts( data_split = split_sets(wav_list, split_ratio) # Creating json files create_json( - data_split["train"], save_json_train, sample_rate, model_name + data_split["train"], save_json_train, sample_rate, alignments_folder, model_name ) create_json( - data_split["valid"], save_json_valid, sample_rate, model_name + data_split["valid"], save_json_valid, sample_rate, alignments_folder, model_name ) - create_json(data_split["test"], save_json_test, sample_rate, model_name) + create_json(data_split["test"], save_json_test, sample_rate, alignments_folder, model_name) def prepare_split(data_folder, split_list): @@ -172,7 +177,7 @@ def prepare_split(data_folder, split_list): return wav_list -def create_json(wav_list, json_file, sample_rate, model_name=None): +def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None): """ Creates the json file given a list of wav files. 
Arguments @@ -183,6 +188,10 @@ def create_json(wav_list, json_file, sample_rate, model_name=None): The path of the output json file sample_rate : int The sample rate to be used for the dataset + data_folder : str + The path to LibriTTS + alignments_folder : str + The path to LibriTTS alignments model_name : str Model name (used to prepare additional model specific data) """ @@ -250,6 +259,10 @@ def create_json(wav_list, json_file, sample_rate, model_name=None): "label": normalized_text, "segment": True if "train" in json_file else False, } + if alignments_folder is not None: + alignments_file_name = get_alignment_path(data_folder, alignments_folder, wav_file) + alignments = parse_alignments(alignments_file_name) + json_dict[uttid].update(alignments) # Characters are used for Tacotron2, phonemes may be needed for other models if model_name not in ["Tacotron2", "HiFi-GAN"] and g2p is not None: @@ -264,6 +277,39 @@ def create_json(wav_list, json_file, sample_rate, model_name=None): logger.info(f"{json_file} successfully created!") +def get_alignment_path(data_folder, alignments_folder, file_name): + """Returns the path in the LibriSpeech-Alignments dataset + corresponding to the specified file path in LibriSpeech + + Arguments + --------- + data_folder: str + the path to LibriSpeech + alignments_folder: str + the path to LibriSpeech-Alignments + file_name: str + the file name within LibriSpeech + + Returns + ------- + file_name: str + the alignment file path + """ + file_name = Path(file_name) + data_folder = Path(data_folder) + if file_name.parts[0] == "{data_root}": + file_name_rel = file_name.relative_to("{data_root}") + else: + file_name_rel = file_name.relative_to(data_folder) + data_slice = file_name_rel.parts[0] + + textgrid_folder = file_name_rel.relative_to(Path(data_slice) / "LibriTTS" / data_slice).parent.parent + textgrid_file_name = f"{file_name_rel.stem}.TextGrid" + textgrid_path = Path(alignments_folder) / data_slice / textgrid_folder / textgrid_file_name + + return textgrid_path + + def skip(*filenames): """ Detects if the data preparation has been already done. 
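Note: together with parse_alignments, added in the next hunk, the per-utterance flow wired into create_json above is roughly the following (illustrative sketch; it assumes the praat-textgrids package is installed and the alignments corpus is laid out next to LibriTTS as get_alignment_path expects):

    textgrid_path = get_alignment_path(data_folder, alignments_folder, wav_file)
    details = parse_alignments(textgrid_path)
    # details["wrd"], details["wrd_start"], details["wrd_end"]: word labels and boundaries
    # details["phn"], details["phn_stress"]: phonemes without and with stress marks
    # details["has_alignments"] is False when no TextGrid exists for the utterance
    json_dict[uttid].update(details)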
@@ -329,3 +375,124 @@ def check_folders(*folders): if not os.path.exists(folder): return False return True + +def parse_alignments(file_name): + """Parses a given LibriSpeech-Alignments TextGrid file and + converts the results to the desired format (to be used in JSON + metadata) + + Arguments + --------- + file_name : path-like + the file name of the TextGrid file + + Returns + ------- + details: dict + the metadata details + """ + try: + import textgrids + except ImportError: + logger.error( + "Parsing LibriSpeech-alignments requires the" + "praat-textgrids package" + ) + raise + if not file_name.exists(): + return { + "has_alignments": False, + "phn": [], + "phn_stress": [], + "phn_start": [], + "phn_end": [], + "phn_count": 0, + "wrd": [], + "wrd_start": [], + "wrd_end": [], + "wrd_count": 0, + "unk_count": None + } + + text_grid = textgrids.TextGrid() + text_grid.read(file_name) + word_intervals = [ + {**word, "label": word["label"].upper()} + for word in text_grid.interval_tier_to_array("words") + ] + phn_intervals = text_grid.interval_tier_to_array("phones") + details = {} + details.update(intervals_to_dict(word_intervals, "wrd")) + phn = intervals_to_dict(phn_intervals, "phn") + phn_stress = phn["phn"] + phn_nostress = remove_stress_marks(phn_stress) + phn["phn"] = phn_nostress + phn["phn_stress"] = phn_stress + details.update(phn) + details["unk_count"] = sum(wrd == "" for wrd in details["wrd"]) + details["has_alignments"] = True + + return details + + +INTERVAL_MAP = [("label", ""), ("begin", "_start"), ("end", "_end")] +INTERVAL_EMPTY_LABELS = {"", "sil", "sp", "spn"} + + +def intervals_to_dict(intervals, prefix): + """ + Converts a parsed list of intervals from PRAAT TextGrid + to a learning-friendly array + + Arguments + --------- + intervals: list + A list of raw TextGrid intervals, as returned by + TextGrid.interval_tier_to_array + prefix: str + the prefix to add + + Returns + ------- + result: dict + A dictionary of the form + { + "{prefix}": , + "{prefix}_start": , + "{prefix}_end": , + "{prefix}_count: + } + + """ + # Remove meaningless labels + intervals_clean = [ + interval + for interval in intervals + if interval["label"] not in INTERVAL_EMPTY_LABELS + ] + result = { + f"{prefix}{suffix}": [interval[key] for interval in intervals_clean] + for key, suffix in INTERVAL_MAP + } + # This will map space labels to a single one + result[f"{prefix}_count"] = len(intervals_clean) + return result + + +RE_STRESS_MARK = re.compile(r"\d$") + + +def remove_stress_marks(phn): + """Removes stress marks from a phoneme annotation + + Arguments + --------- + phn: list + a list of phoneme annotations with or without stress marks + + Returns + ------- + result: list + a list of phoneme annotations without stress marks + """ + return [RE_STRESS_MARK.sub("", item) for item in phn] From cbea7f714f670ee8e701b45c5052741d912b3010 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 00:55:06 -0500 Subject: [PATCH 137/270] DASB: Fixed --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 18adcf75f..ec802fbd5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -945,6 +945,7 @@ def undo_padding_tensor(batch, lengths): else None ), "seed": hparams["seed"], + "alignments_folder": hparams.get("alignments_folder"), "model_name": hparams["model"].__class__.__name__, }, ) From 
e48a91f72ea0583d69b96f9c3c2f7200c01824a2 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 20:05:31 -0500 Subject: [PATCH 138/270] VALL-E: Fixes, add encodec --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 14 +------------- .../LibriTTS/TTS/valle/hparams/train_encodec.yaml | 1 - 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 827ffe2e3..2f6f34297 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/discrete_ssl +experiment_name: valle/discrete_ssl # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 __set_seed: !apply:torch.manual_seed [!ref ] @@ -92,18 +92,6 @@ special_num_tokens: 4 lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" lr_warmup_steps: 70000 lr_annealing_mode: step -guided_attention_weight: 50.0 -guided_attention_sigma: 0.5 -gate_loss_weight: 1.0 -gate_threshold: 0.5 -gate_loss_beta: 0.2 -gate_loss_gamma: 0.01 -gate_loss_max_weight: 1. - -# Inference parameters -eos_mode: gate -decoder_mode: autoregressive -scale_factor: 4 # Feature parameters sample_rate: 22050 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 1a125cc81..6fc0f4b58 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -22,7 +22,6 @@ alignments_folder: null prepare_save_folder: !ref data_folder_alignments: null # e.g., /path/to/LibriSpeech pretrained_model_save_folder: !ref -ssl_model_type: wavlm representation_mode: discrete prepare_archive_path: null prepare_skip_ignore_folders: False From 45d613079d8ff5bcb0c2ac441591a0047caee1b9 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 21:11:46 -0500 Subject: [PATCH 139/270] DASB: Add encodec --- .../TTS/valle/hparams/train_encodec.yaml | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml new file mode 100644 index 000000000..b6de2eb8a --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -0,0 +1,227 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
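Note on the EnCodec settings further down in this file: audio_tokens_per_step and bandwidth need to stay consistent. For the standard 24 kHz EnCodec model (75 frames per second, 1024-entry codebooks, i.e. 10 bits per code) each codebook costs roughly 0.75 kbps, so 6 kbps corresponds to 8 codebooks and 1.5 kbps to 2. A quick illustrative check:

    frame_rate = 75      # EnCodec 24 kHz frame rate
    bits_per_code = 10   # log2(1024) entries per codebook
    n_codebooks = 8      # audio_tokens_per_step
    bandwidth_kbps = frame_rate * bits_per_code * n_codebooks / 1000  # -> 6.0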
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: 
!ref + + +audio_tokens_per_step: 6 +bandwidth: 1.5 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref From e1635df4f8ea635afaf2f462ccb29e254b390ce5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 21:28:23 -0500 Subject: [PATCH 140/270] DASB: fixes --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index c21f29c2d..c8c0198a6 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -448,7 +448,7 @@ def fit_batch(self, batch): return loss -INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} +INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phonemes"} def dataio_prepare(hparams): From 64b73e74375a89e24f93827db50f4397e4461582 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 21:31:24 -0500 Subject: [PATCH 141/270] DASB: Fixes --- benchmarks/DASB/LJSpeech/ljspeech_prepare.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index 06292fd34..08d7297e5 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -53,8 +53,6 @@ def prepare_ljspeech( pitch_max_f0=400, skip_prep=False, use_custom_cleaner=False, - extract_features=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", skip_ignore_folders=False, @@ -404,10 +402,6 @@ def prepare_json( pitch_min_f0, pitch_max_f0, use_custom_cleaner=False, - extract_features=None, - extract_features_context=None, - extract_features_folder=None, - extract_features_opts=None, extract_phonemes=False, g2p_src="speechbrain/soundchoice-g2p", device="cpu", @@ -467,7 +461,7 @@ def prepare_json( extract_phonemes = True if extract_phonemes: logger.info( - "Computing phonemes for LJSpeech labels using SpeechBrain G2P. This may take a while." + "Computing phonemes for LJSpeech labels using SpeechBrain f This may take a while." 
) g2p = GraphemeToPhoneme.from_hparams( g2p_src, run_opts={"device": device} From 79ca7a6c3f3db20d10127f25a0ec21b5c24348ba Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 23:09:02 -0500 Subject: [PATCH 142/270] DASB: Vall-E: Multi-GPU inference fix --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 7 ++++++- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index c8c0198a6..b77ad4afa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -373,8 +373,13 @@ def inference(self, batch): prefix, prefix_length = batch.prefix # NOTE: ESPNET VALL-E does not support batched inference prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) inference_results = [ - self.modules.model.inference( + inference( prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() ) for prefix_item in prefix_items diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index ec802fbd5..d75defb0a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -371,8 +371,13 @@ def inference(self, batch): prefix, prefix_length = batch.prefix # NOTE: ESPNET VALL-E does not support batched inference prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = ( + self.modules.model.module.inference + if hasattr(self.modules.model, "module") + else self.modules.model.inference + ) inference_results = [ - self.modules.model.inference( + inference( prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() ) for prefix_item in prefix_items @@ -946,6 +951,7 @@ def undo_padding_tensor(batch, lengths): ), "seed": hparams["seed"], "alignments_folder": hparams.get("alignments_folder"), + "extract_phonemes": hparams["input"] == "phonemes", "model_name": hparams["model"].__class__.__name__, }, ) From c6c6cf64226da780debda96014612efc94ea906a Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 23:31:36 -0500 Subject: [PATCH 143/270] DASB: Fixes --- benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index b6de2eb8a..6ea447070 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -163,8 +163,8 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice text: !ref + phonemes: !ref + -audio_tokens_per_step: 6 -bandwidth: 1.5 +audio_tokens_per_step: 8 +bandwidth: 6 ############################## models ################################ From e25d1469e15fd85bf715bcdbf1c0160a0914c83b Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Feb 2025 23:41:50 -0500 Subject: [PATCH 144/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d75defb0a..87e70b407 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -951,7 +951,6 @@ def 
undo_padding_tensor(batch, lengths): ), "seed": hparams["seed"], "alignments_folder": hparams.get("alignments_folder"), - "extract_phonemes": hparams["input"] == "phonemes", "model_name": hparams["model"].__class__.__name__, }, ) From 45b3d1ba99e18260b2afc2b93b05825e249979ce Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Feb 2025 06:59:59 -0500 Subject: [PATCH 145/270] DASB: CPU/GPU fixes --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index b77ad4afa..9cd5fb3f1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -422,7 +422,7 @@ def save_samples(self, batch, wav, length, stage): samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): file_name = output_folder / f"pred_{uttid}.wav" - write_audio(file_name, sample, self.hparams.model_sample_rate) + write_audio(file_name, sample.detach().cpu(), self.hparams.model_sample_rate) def save_eval(self, stage): """Saves evaluation results From 370ab8e390082e0ef5bf388e6c01dfa3f7686a16 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Feb 2025 17:20:03 -0500 Subject: [PATCH 146/270] DASB: Minor fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 11 ++++++++--- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 14 ++++++++------ benchmarks/DASB/run_experiments.sh | 2 +- benchmarks/DASB/run_hparam_optimization.sh | 4 ++-- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index dd9d61762..86e0efc26 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -914,9 +914,14 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - tts_brain.evaluate( - test_set=datasets["test"], test_loader_kwargs=test_dataloader_opts, - ) + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index b160ab3a5..9164d31e0 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -1015,9 +1015,11 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=use_silence_padding( - hparams["test_dataloader_opts"], silence_padding, audio_keys - ), - ) + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) \ No newline at end of file diff --git a/benchmarks/DASB/run_experiments.sh b/benchmarks/DASB/run_experiments.sh index aacbc381e..5dcd6b397 100755 --- a/benchmarks/DASB/run_experiments.sh +++ b/benchmarks/DASB/run_experiments.sh @@ -186,7 +186,7 
@@ mkdir -p $cached_data_folder # Function to run the training experiment run_experiment() { -python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ +eval python $dataset/$task/train.py $hparams --cached_data_folder=$cached_data_folder --seed=$seed --data_folder=$data_folder --output_folder=$output_folder_exp \ $additional_flags } diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 468015d08..3029a3678 100755 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -365,7 +365,7 @@ while [ -n "$opt_flags" ]; do eval $orion_hunt_command # Compress the exp folder (if required) - if [ "$compress_exp" = True ]; then + if [ "$compress_exp" = True ] && [ ! -e "$output_folder_step/exp.tar.gz" ]; then tar -czf "$output_folder_step/exp.tar.gz" "$output_folder_step/exp" if [ -d "$output_folder_step/exp" ]; then rm -rf "$output_folder_step/exp" @@ -417,4 +417,4 @@ scp $best_yaml_file $final_yaml_file --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ --rnd_dir False --testing True $additional_flags -echo "The test performance with best hparams is available at $output_folder/best" \ No newline at end of file +echo "The test performance with best hparams is available at $output_folder/best" From 256fa35e4ba1ac708efe4b005fce13c1e25db58d Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Feb 2025 22:17:24 -0500 Subject: [PATCH 147/270] DASB: Fixes --- .../TTS/valle/hparams/train_encodec.yaml | 3 +- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 36 +++++++++++-------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 6ea447070..3a9a1347d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -84,7 +84,7 @@ model_sample_rate: 24000 max_audio_length: 1000 text_max_length: 500 n_ctx: !ref + -infer_max_audio_length: !ref +infer_top_k: 20 max_length_ratio: 10.0 debug_infer_max_audio_length: 10 @@ -187,6 +187,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 9cd5fb3f1..c8b35c57d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -78,11 +78,6 @@ def create_waveform(self, audio, length): if hasattr(self.modules.tokenizer, "codec_vocoder"): self.modules.tokenizer.codec_vocoder.to(self.device) self.modules.tokenizer.codec_vocoder.device = self.device - audio = ( - (audio - hparams["audio_token_shift"] - self.offsets) - .clip(min=0.0) - .int() - ) wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) return wav @@ -288,7 +283,7 @@ def evaluate_batch(self, batch, stage): audio_tokens, audio_length = self.inference(batch) if self.hparams.flip_layers: audio_tokens = audio_tokens.flip(2) - wav = self.create_waveform(audio_tokens, audio_length) + wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) self.save_samples( batch=batch, wav=wav, length=audio_length, 
stage=stage @@ -391,7 +386,10 @@ def inference(self, batch): for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) - audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + offsets = self.offsets + if self.hparams.flip_layers: + offsets = offsets.flip(2) + audio = (audio - self.hparams.audio_token_shift - offsets).clip(0) return audio, audio_length def _get_inference_opts(self): @@ -550,7 +548,7 @@ def sig_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) return sig - dynamic_items = [text_pipeline, tokens_pipeline, prompt_pipeline] + dynamic_items = [sig_pipeline, text_pipeline, tokens_pipeline, prompt_pipeline] init_sequence_encoder(hparams) use_spk_emb = hparams.get("use_spk_emb", False) @@ -572,7 +570,6 @@ def sig_pipeline(wav): dataset_dynamic_items = list(dynamic_items) dataset_output_keys = list(output_keys) if dataset != "train": - dataset_dynamic_items.append(sig_pipeline) dataset_output_keys += ["sig", "label_norm_eval", "prefix"] dynamic_dataset = sb.dataio.dataset.DynamicItemDataset.from_json( json_path=data_info[dataset], @@ -707,17 +704,23 @@ def apply_overfit_test(hparams, dataset): """ if hparams["overfit_test"]: if isinstance(dataset, tuple): - dataset_train, _, _ = dataset + dataset_train, dataset_valid, _ = dataset dataset_train = apply_overfit_test(hparams, dataset_train) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset_valid.pipeline.output_mapping.keys()) + ) result = dataset_train, dataset_eval, dataset_eval elif isinstance(dataset, dict): dataset_train = apply_overfit_test(hparams, dataset["train"]) dataset_eval = dataset_train.filtered_sorted( select_n=hparams["overfit_test_sample_count"] ) + dataset_eval.set_output_keys( + list(dataset["valid"].pipeline.output_mapping.keys()) + ) result = { "train": dataset_train, "valid": dataset_eval, @@ -831,6 +834,7 @@ def undo_padding_tensor(batch, lengths): datasets = dataio_prepare(hparams) # Apply overfit test settings + datasets["train"].data_ids = ["LJ001-0023"] datasets = apply_overfit_test(hparams, datasets) audio_keys = ["audio_tokens"] @@ -857,7 +861,11 @@ def undo_padding_tensor(batch, lengths): # Load best checkpoint for evaluation if hparams["testing"]: - tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=hparams["test_dataloader_opts"], - ) + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + ) From 9f27332c1c0f2b25f574a2d2915cbb7aaefa1d45 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Feb 2025 22:35:18 -0500 Subject: [PATCH 148/270] DASB: Review debugging code --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index c8b35c57d..d7eec7079 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -834,7 +834,6 @@ def undo_padding_tensor(batch, lengths): datasets = dataio_prepare(hparams) # Apply overfit test settings - datasets["train"].data_ids = ["LJ001-0023"] datasets = apply_overfit_test(hparams, datasets) audio_keys = ["audio_tokens"] From bad8999b564b8569101eb3bcb30bf000f8b298e6 Mon Sep 17 00:00:00 2001 
From: flexthink Date: Tue, 11 Feb 2025 11:34:49 -0500 Subject: [PATCH 149/270] VALL-E: Update token sequence initialization to account for special tokens --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 10 ++++++---- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index d7eec7079..d823530a9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -37,8 +37,6 @@ logger = logging.getLogger(__name__) -SPECIAL_TOKEN_COUNT = 1 - # Brain class for speech recognition training class VALLEBrain(sb.Brain): @@ -638,10 +636,14 @@ def init_sequence_encoder(hparams): an encoder instance""" encoder = hparams["label_encoder"] token_list_file_name = hparams["token_list_file"] - tokens = read_token_list(token_list_file_name) + tokens = read_token_list(token_list_file_name) encoder.add_unk() + for token in hparams["special_tokens"]: + token_key = token.replace("<", "").replace(">", "") + token_index = hparams[f"{token_key}_index"] + encoder.insert_label(token, token_index) encoder.update_from_iterable(tokens, sequence_input=False) - encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + encoder.expect_len(len(tokens) + hparams["special_num_tokens"]) return encoder diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 87e70b407..7fa415230 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -39,9 +39,6 @@ logger = logging.getLogger(__name__) -SPECIAL_TOKEN_COUNT = 1 - - # Brain class for speech recognition training class VALLEBrain(sb.Brain): """Class that manages the training loop. 
See speechbrain.core.Brain.""" @@ -755,8 +752,13 @@ def init_sequence_encoder(hparams): token_list_file_name = hparams["token_list_file"] tokens = read_token_list(token_list_file_name) encoder.add_unk() + for token in hparams["special_tokens"]: + token_key = token.replace("<", "").replace(">", "") + token_index = hparams[f"{token_key}_index"] + encoder.insert_label(token, token_index) + encoder.update_from_iterable(tokens, sequence_input=False) - encoder.expect_len(len(tokens) + SPECIAL_TOKEN_COUNT) + encoder.expect_len(len(tokens) + hparams["special_num_tokens"]) return encoder From 39ddfd16619ed6afb1797e420691a3e9dbd73a01 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 11 Feb 2025 16:47:55 -0500 Subject: [PATCH 150/270] DASB: hparam file updates, new hparams for additional tokenizers --- .../TTS/valle/hparams/train_mimi.yaml | 224 ++++++++++++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 229 ++++++++++++++ .../extraction/hparams/speech_tokenizer.yaml | 2 +- .../LibriSpeech/extraction/hparams/mimi.yaml | 2 +- .../extraction/hparams/speech_tokenizer.yaml | 2 +- .../extraction/hparams/wavtokenizer.yaml | 2 +- .../TTS/tokotron/hparams/train_encodec.yaml | 1 + .../tokotron/hparams/train_wavtokenizer.yaml | 280 ++++++++++++++++++ 8 files changed, 738 insertions(+), 4 deletions(-) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..e6d0ad87c --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,224 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/mimi +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
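Illustrative sketch of the init_sequence_encoder update above: the special tokens are pinned to the indices configured in the hparams before the regular token list is added, so their IDs line up with the model's reserved slots, and expect_len now counts special_num_tokens instead of the removed SPECIAL_TOKEN_COUNT constant. The token strings below ("<bos>", "<eos>", "<eot>") are assumptions inferred from the *_index keys; the actual strings are not recoverable from this copy of the patch.

from speechbrain.dataio.encoder import TextEncoder

# hypothetical hparams mirroring the YAML layout above
hparams = {
    "special_tokens": ["<bos>", "<eos>", "<eot>"],  # assumed strings
    "bos_index": 1,
    "eos_index": 2,
    "eot_index": 3,
    "special_num_tokens": 4,  # three specials plus the <unk> slot
}
tokens = ["AA", "AE", "AH"]  # stand-in for read_token_list(token_list_file)

encoder = TextEncoder()
encoder.add_unk()
for token in hparams["special_tokens"]:
    token_key = token.replace("<", "").replace(">", "")
    encoder.insert_label(token, hparams[f"{token_key}_index"])
encoder.update_from_iterable(tokens, sequence_input=False)
encoder.expect_len(len(tokens) + hparams["special_num_tokens"])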
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 2048 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + 
text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..b922c7489 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,229 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/wavtokenizer +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
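The model_vocab_size and audio_token_shift expressions in the Mimi configuration above have lost the contents of their !ref placeholders in this copy; read together with the surrounding keys, they most plausibly pack the text tokens, the special tokens, and one vocab_size-sized region per codebook into a single language-model vocabulary. Under that assumption, for text input the shift would be text_num_tokens + special_num_tokens = 39 + 4 = 43, and the total vocabulary would be 39 + (2048 × 8) + 4 = 16427 entries; the wavtokenizer configuration that follows uses the same pattern with vocab_size 1024 and a single codebook.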
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: True +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: 
!apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 +bandwidth: 6 + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml index 155960c27..9d6ba7130 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -42,7 +42,7 @@ save_embedding: False tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml index f9720b170..7871d6212 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/mimi.yaml @@ -47,7 +47,7 @@ save_embedding: False tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref - save_path: !ref + save_path: !ref num_codebooks: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml index 9c8baf3bf..3090e9f79 100644 --- a/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/speech_tokenizer.yaml @@ -45,7 +45,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref tokens_extractor: !new:utils.tokens.TokensExtractor diff --git a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml index 976614a3d..9a8b754eb 100644 --- 
a/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriSpeech/extraction/hparams/wavtokenizer.yaml @@ -47,7 +47,7 @@ vocab_size: 4096 # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref sample_rate: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 258065779..4a2a7b033 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -14,6 +14,7 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. # Data files data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null prepare_save_folder: !ref /prepared pretrained_model_save_folder: !ref representation_mode: discrete diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..3c06d761f --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -0,0 +1,280 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +cached_data_folder: !PLACEHOLDER +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: vocos +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + + +freeze_token_model: True +model_hub: kyutai/mimi +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +decoder_chunk_size: -1 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 2048 +audio_emb_size: 1024 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 8 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 
+ vocoder: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:model.Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref From 5acd1d3528bd850d607804a0e144545ab00f57e3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Feb 2025 11:49:20 -0500 Subject: [PATCH 151/270] VALL-E: Add files for multiple configurations --- .../TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../TTS/tokotron/hparams/train_mimi.yaml | 2 +- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 239 ++++++++++++++++++ .../TTS/valle/hparams/train_discrete_ssl.yaml | 11 +- .../TTS/valle/hparams/train_encodec.yaml | 11 +- .../TTS/valle/hparams/train_mimi.yaml | 234 +++++++++++++++++ .../valle/hparams/train_speech_tokenizer.yaml | 234 +++++++++++++++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 239 ++++++++++++++++++ 8 files changed, 963 insertions(+), 9 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index e80edb2b0..b99ac7980 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -150,7 +150,7 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU audio_num_tokens: 2048 -audio_emb_size: 1024 +audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index 3c06d761f..4f0772f47 100644 --- 
a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -112,7 +112,7 @@ beam_size: 5 # Feature parameters sample_rate: 22050 -model_sample_rate: 16000 +model_sample_rate: 24000 max_audio_length: 5000 infer_max_audio_length: 1000 debug_infer_max_audio_length: 10 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml new file mode 100644 index 000000000..5cb2f4050 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -0,0 +1,239 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/dac + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
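The model_sample_rate fix for the Mimi Tokotron configuration above (16000 → 24000) matches Mimi's native 24 kHz rate, while the configured sample_rate stays at 22050 in the same hunk, implying a resampling step around tokenization and vocoding. A hedged sketch of that step using torchaudio directly; the actual recipes may rely on SpeechBrain's own resampling utilities instead.

import torchaudio


def to_model_rate(sig, orig_freq=22050, new_freq=24000):
    # sig: [time] or [batch, time] waveform at the configured dataset rate
    return torchaudio.functional.resample(sig, orig_freq=orig_freq, new_freq=new_freq)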
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 60 +epoch_size: 10000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 2 + +# Model Settings +model_type: 24khz +model_bitrate: 8kbps + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.DACTokenizer + model_type: !ref + model_bitrate: !ref + n_codebooks: !ref + load_pretrained: True + tag: latest + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: 
!name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index f8af5fb45..08d53d66b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -10,7 +10,6 @@ __set_seed: !apply:torch.manual_seed [!ref ] run_name: !PLACEHOLDER # Model Type -ssl_model_type: wavlm representation_mode: discrete output_folder: !ref results/tokotron/// save_folder: !ref /save @@ -45,6 +44,9 @@ tokens_folder: !PLACEHOLDER tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref freeze_token_model: True + +# Model Settings +ssl_model_type: wavlm token_model_src: !apply:speechbrain.utils.hparams.choice value: !ref choices: @@ -63,7 +65,8 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS speech_model_layers: [1, 3, 7, 12, 18, 23] flip_layers: False -token_offset: 1 + +# Speaker Embeddings spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] @@ -71,7 +74,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 50 +number_of_epochs: 1200 +epoch_size: 10000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 @@ -147,6 +151,7 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref + looped_nominal_epoch: !ref // collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 6fc0f4b58..3371621d0 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -45,7 +45,6 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 -token_model_src: "facebook/encodec_24khz" g2p_src: flexthink/soundchoice-g2p tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
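The looped_nominal_epoch entry added to the train dataloader options above (its !ref expression is stripped in this copy, but it most plausibly reads epoch_size // batch_size) makes SpeechBrain treat a fixed number of batches as one epoch instead of a full pass over the data: with the epoch_size of 10000 and batch_size of 16 introduced here, that is 10000 // 16 = 625 batches, roughly 10000 utterances per nominal epoch, which keeps the checkpointing and LR-scheduling cadence stable when the epoch count or dataset size changes.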
tokens_loader: !new:utils.tokens.TokensLoader @@ -56,7 +55,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 150 +number_of_epochs: 60 +epoch_size: 10000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 @@ -109,6 +109,7 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref + looped_nominal_epoch: !ref // collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref @@ -168,8 +169,10 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 + +# Model Settings +model_hub: facebook/encodec_24khz bandwidth: 6 -attention_type: regularMHA ############################## models ################################ @@ -193,7 +196,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer - source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + source: !ref save_path: !ref sample_rate: !ref bandwidth: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml new file mode 100644 index 000000000..d41dd3b98 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -0,0 +1,234 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/mimi + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 60 +epoch_size: 10000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 + +# Model Settings +model_hub: kyutai/mimi + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.MimiTokenizer + source: !ref + save_path: !ref + num_codebooks: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: 
!new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml new file mode 100644 index 000000000..4928b2dd3 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -0,0 +1,234 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 60 +epoch_size: 10000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 + +# Model Settings +model_hub: fnlp/SpeechTokenizer + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper + source: !ref # Only the 24kHz version supports mono audio + save_path: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + 
+log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml new file mode 100644 index 000000000..1796c4425 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -0,0 +1,239 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/wavtokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 60 +epoch_size: 10000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 5.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 1 + +# Model Settings +model_hub: novateur/WavTokenizer-medium-music-audio-75token +config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml +checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper + source: !ref + save_path: !ref + checkpoint: !ref + config: !ref + freeze: True + +modules: + model: !ref + 
tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref From a78f011ff4a59700d9ecb37c9f11879e30bb1409 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Feb 2025 19:00:33 -0500 Subject: [PATCH 152/270] DASB: Add Lifeteng-style curriculum, some config updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 + .../TTS/valle/hparams/train_encodec.yaml | 2 + .../TTS/valle/hparams/train_mimi.yaml | 2 + .../TTS/valle/hparams/train_wavtokenizer.yaml | 2 + benchmarks/DASB/LJSpeech/TTS/valle/train.py | 147 +++++++++++++----- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 8 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 8 +- .../TTS/valle/hparams/train_encodec.yaml | 8 +- .../TTS/valle/hparams/train_mimi.yaml | 8 +- .../valle/hparams/train_speech_tokenizer.yaml | 8 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 8 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 141 ++++++++++++----- benchmarks/DASB/model/valle.py | 44 ++++-- 13 files changed, 273 insertions(+), 115 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 2f6f34297..3e34a6bce 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -70,6 +70,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 3a9a1347d..0255373ad 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -55,6 +55,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index e6d0ad87c..aeb97d1c3 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -56,6 +56,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index 
b922c7489..1867c0c1c 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -58,6 +58,8 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null batch_size: 16 grad_accumulation_factor: 1 max_grad_norm: 0.01 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index d823530a9..02a76476b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -1,12 +1,10 @@ #!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio +"""Recipe for training VALL-E -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model +Based on ESPNET VALL-E +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e Authors * Artem Ploujnikov 2024 @@ -98,14 +96,19 @@ def compute_forward(self, batch, stage): batch = batch.to(self.device) prompt, prompt_length = batch.prompt batch_size, prompt_max_len, num_tracks = prompt.shape - nar_track = torch.randint( - 1, num_tracks, (batch_size,), device=self.device - ) + if self.train_nar: + nar_track = torch.randint( + 1, num_tracks, (batch_size,), device=self.device + ) + else: + nar_track = None logits_ar, logits_nar = self.modules.model( dec_seq=batch.prompt.data, dec_seq_lengths=batch.prompt.lengths, prefix_len=batch.prefix_length / prompt_max_len, nar_level_idx=nar_track, + predict_ar=self.train_ar, + predict_nar=self.train_nar, ) return logits_ar, logits_nar, nar_track @@ -134,13 +137,8 @@ def compute_objectives(self, predictions, batch, stage): prompt, prompt_length = batch.prompt prefix_length = batch.prefix_length - logits_ar_sm = self.hparams.log_softmax(logits_ar) - logits_nar_sm = self.hparams.log_softmax(logits_nar) - batch_size, max_len, _ = prompt.shape - targets_ar = prompt[:, 1:, 0] + batch_size, prompt_max_len, _ = prompt.shape batch_idx = torch.arange(batch_size, device=prompt.device) - targets_nar = prompt[batch_idx, 1:, nar_track] - prompt_max_len = prompt.size(1) length_mask = length_to_mask( prompt_length * prompt_max_len, prompt_max_len ) @@ -149,29 +147,81 @@ def compute_objectives(self, predictions, batch, stage): ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] - loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask - ) - self.loss_metric_ar.append( - ids=batch.uttid, - log_probabilities=logits_ar_sm, - targets=targets_ar, - mask=mask, - reduction="batch", - ) - loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, - ) - self.loss_metric_nar.append( + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + targets_ar = prompt[:, 1:, 0] + loss_ar = self.hparams.compute_cost( + log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + 
logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( ids=batch.uttid, - log_probabilities=logits_nar_sm, - targets=targets_nar, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, mask=mask, reduction="batch", ) - loss = 0.5 * (loss_ar + loss_nar) + + loss = torch.mean(torch.stack(loss_components)) return loss + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch" + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + log_probabilities=logits_ar, targets=targets_ar, mask=mask, + reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + log_probabilities=logits_nar, targets=targets_nar, mask=mask, + reduction=reduction, + ) + return stats + def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch. @@ -188,16 +238,10 @@ def on_stage_start(self, stage, epoch): )[None, None, :].to(self.device) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( - metric=self.hparams.compute_cost, batch_eval=True, - ) - self.loss_metric_ar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, batch_eval=True, - ) - self.loss_metric_nar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, batch_eval=True, + metric=self.compute_loss_stats, batch_eval=True, ) + self.apply_curriculum() - # TOOO: Reestablish evaluation self.is_evaluating = False if stage == sb.Stage.VALID: if self.is_eval_epoch(epoch): @@ -209,6 +253,22 @@ def on_stage_start(self, stage, epoch): self.evaluation_metric.on_evaluation_start() self.is_evaluating = True + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + if self.hparams.audio_tokens_per_step == 1: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: + self.train_nar = False + elif ( + self.hparams.number_of_epochs_nar is not None + and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + ): + self.train_ar = False + def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed in the specieied epoch @@ -226,7 +286,12 @@ def is_eval_epoch(self, epoch): otherwise""" if epoch is None: epoch = self.hparams.epoch_counter.current - return epoch % self.hparams.eval_interval == 0 + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) def on_fit_start(self): """Gets called at the beginning of ``fit()``, on multiple processes diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 5cb2f4050..c60540da2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -56,12 +56,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 08d53d66b..d3e30a4f4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -74,12 +74,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 1200 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 0.01 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 3371621d0..34c5e6cb2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -55,12 +55,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index d41dd3b98..c585acfba 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -55,12 +55,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 4928b2dd3..fd0e3daaf 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -55,12 +55,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git 
a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 1796c4425..e59ccd34f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -55,12 +55,14 @@ ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters input: text -number_of_epochs: 60 -epoch_size: 10000 +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 batch_size: 16 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 5.0 +max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 7fa415230..47e0a25e4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1,12 +1,10 @@ #!/usr/bin/env/python3 -"""Recipe for training a Text-to-Speech system based on tokenized audio +"""Recipe for training VALL-E -Inspired by WhisperSpeech -https://github.com/collabora/WhisperSpeech - -However, this is not an implementation of WhisperSpeech, but rather -a radical simplification of it that uses only an acoustic model +Based on ESPNET VALL-E +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e Authors * Artem Ploujnikov 2024 @@ -108,6 +106,8 @@ def compute_forward(self, batch, stage): dec_seq_lengths=batch.prompt.lengths, prefix_len=batch.prefix_length / prompt_max_len, nar_level_idx=nar_track, + predict_ar=self.train_ar, + predict_nar=self.train_nar, ) return logits_ar, logits_nar, nar_track @@ -136,13 +136,8 @@ def compute_objectives(self, predictions, batch, stage): prompt, prompt_length = batch.prompt prefix_length = batch.prefix_length - logits_ar_sm = self.hparams.log_softmax(logits_ar) - logits_nar_sm = self.hparams.log_softmax(logits_nar) - batch_size, max_len, _ = prompt.shape - targets_ar = prompt[:, 1:, 0] + batch_size, prompt_max_len, _ = prompt.shape batch_idx = torch.arange(batch_size, device=prompt.device) - targets_nar = prompt[batch_idx, 1:, nar_track] - prompt_max_len = prompt.size(1) length_mask = length_to_mask( prompt_length * prompt_max_len, prompt_max_len ) @@ -151,28 +146,80 @@ def compute_objectives(self, predictions, batch, stage): ).logical_not() mask = (length_mask * prefix_mask)[:, 1:] - loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask - ) - self.loss_metric_ar.append( - ids=batch.uttid, - log_probabilities=logits_ar_sm, - targets=targets_ar, - mask=mask, - reduction="batch", - ) - loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, - ) - self.loss_metric_nar.append( + loss_components = [] + + if self.train_ar: + logits_ar_sm = self.hparams.log_softmax(logits_ar) + targets_ar = prompt[:, 1:, 0] + loss_ar = self.hparams.compute_cost( + log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask + ) + loss_components.append(loss_ar) + else: + logits_ar_sm, targets_ar = None, None + if self.train_nar: + logits_nar_sm = self.hparams.log_softmax(logits_nar) + targets_nar = prompt[batch_idx, 1:, nar_track] + loss_nar = self.hparams.compute_cost( + log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, + ) + loss_components.append(loss_nar) + else: + logits_nar_sm, targets_nar = None, None + + self.loss_metric.append( ids=batch.uttid, - 
log_probabilities=logits_nar_sm, - targets=targets_nar, + logits_ar=logits_ar_sm, + targets_ar=targets_ar, + logits_nar=logits_nar_sm, + targets_nar=targets_nar, mask=mask, reduction="batch", ) - loss = 0.5 * (loss_ar + loss_nar) + + loss = torch.mean(torch.stack(loss_components)) return loss + + def compute_loss_stats( + self, + logits_ar, + targets_ar, + logits_nar, + targets_nar, + mask, + reduction="batch" + ): + """Computes an autoregressive/non-autoregressive loss breakdown, + to be used for metrics/stats + + Arguments + --------- + logits_ar : torch.Tensor + The autoregressive predictions + targets_ar : torch.Tensor + The targets for autoregressive predictions + logits_nar : torch.Tensor + The non-autoregressive predictions + targets_nar : torch.Tensor + The targets for non-autoregressive prediction + + Returns + ------- + stats: dict + statistics + """ + stats = {} + if self.train_ar: + stats["loss_ar"] = self.hparams.compute_cost( + log_probabilities=logits_ar, targets=targets_ar, mask=mask, + reduction=reduction, + ) + if self.train_nar: + stats["loss_nar"] = self.hparams.compute_cost( + log_probabilities=logits_nar, targets=targets_nar, mask=mask, + reduction=reduction, + ) + return stats def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch. @@ -190,16 +237,10 @@ def on_stage_start(self, stage, epoch): )[None, None, :].to(self.device) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( - metric=self.hparams.compute_cost, batch_eval=True, - ) - self.loss_metric_ar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, batch_eval=True, - ) - self.loss_metric_nar = sb.utils.metric_stats.MetricStats( - metric=self.hparams.compute_cost, batch_eval=True, + metric=self.compute_loss_stats, batch_eval=True, ) + self.apply_curriculum() - # TOOO: Reestablish evaluation self.is_evaluating = False if stage == sb.Stage.VALID: if self.is_eval_epoch(epoch): @@ -211,6 +252,22 @@ def on_stage_start(self, stage, epoch): self.evaluation_metric.on_evaluation_start() self.is_evaluating = True + def apply_curriculum(self): + """Applies curriculum settings, if specified, training only the autoregressive part - or + only the non-autoregressive part""" + epoch = self.hparams.epoch_counter.current + self.train_ar, self.train_nar = True, True + if self.hparams.audio_tokens_per_step == 1: + # NOTE: If there is only one track it's autoregressive + self.train_nar = False + elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: + self.train_nar = False + elif ( + self.hparams.number_of_epochs_nar is not None + and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + ): + self.train_ar = False + def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed in the specieied epoch @@ -228,7 +285,12 @@ def is_eval_epoch(self, epoch): otherwise""" if epoch is None: epoch = self.hparams.epoch_counter.current - return epoch % self.hparams.eval_interval == 0 + # NOTE: Need to get past AR-only training to be able to evaluate + can_evaluate = not ( + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ) + return can_evaluate and (epoch % self.hparams.eval_interval == 0) def on_fit_start(self): """Gets called at the beginning of ``fit()``, on multiple processes @@ -379,8 +441,7 @@ def inference(self, batch): ) for prefix_item in prefix_items ] - inferred_tokens = [ - result[0][0] + inferred_tokens = [ result[0][0] if 
result[0] else torch.zeros( 1000, self.hparams.audio_tokens_per_step, device=self.device diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index ab233efff..5805cb061 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -125,6 +125,8 @@ def forward( prefix_len=None, conti_feats=None, nar_level_idx=1, + predict_ar=True, + predict_nar=True, ): """Vall-E forward for training @@ -144,6 +146,10 @@ def forward( Lengths of condition part in dec_seq (B,). nar_level_idx : int the index of the non-autoregressive level to train + predict_ar : bool + Whether to make an autoregressive prediction + predict_nar : bool + Whether to make a non-autoregressive prediction Returns ------- @@ -161,24 +167,30 @@ def forward( ) # Auto-Regressive part - input_ar_emb = self.prepare_input(dec_seq_emb, prefix_len, 1)[ - :, :-1 - ] # [B, T, D] - h_ar = self.ar_decoder(input_ar_emb) + if predict_ar: + input_ar_emb = self.prepare_input(dec_seq_emb, prefix_len, 1)[ + :, :-1 + ] # [B, T, D] + h_ar = self.ar_decoder(input_ar_emb) # Non-Auto-Regressive part - input_nar_emb = self.prepare_input( - dec_seq_emb, prefix_len, nar_level_idx - )[ - :, 1: - ] # [B, T, V] - max_len = dec_seq.size(1) - mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() - mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] - h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) - - logits_ar = self.lm_head(h_ar) - logits_nar = self.lm_head(h_nar) + if predict_nar: + input_nar_emb = self.prepare_input( + dec_seq_emb, prefix_len, nar_level_idx + )[ + :, 1: + ] # [B, T, V] + max_len = dec_seq.size(1) + mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) + + # Logits + logits_ar, logits_nar = None, None + if predict_ar: + logits_ar = self.lm_head(h_ar) + if predict_nar: + logits_nar = self.lm_head(h_nar) return logits_ar, logits_nar From 953540baa3c6f7dba1a401a75c972306e5ea6dd4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 13 Feb 2025 02:03:19 -0500 Subject: [PATCH 153/270] DASB: Add init_from --- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index b48bb66fa..937db0812 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -38,6 +38,7 @@ sample_path: null progress_folder: !ref /progress progress_current: !ref /current progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 From f8b9a6721d1f18a974708e6456cfaf50bd9d1a45 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 13 Feb 2025 02:13:32 -0500 Subject: [PATCH 154/270] DASB: Add init_from --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index c585acfba..c6d1a4dfb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -42,6 +42,7 @@ sample_path: null progress_folder: !ref /progress progress_current: !ref /current progress_meta: !ref 
/meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 From 4f8cc9c96ab284b18f835e405545e13ccf251d43 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 13 Feb 2025 13:45:59 -0500 Subject: [PATCH 155/270] DASB: VALL-E: Implement checkpoint retention based on dWER --- .../tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 1 - .../TTS/valle/hparams/train_discrete_ssl.yaml | 10 +++++-- .../TTS/valle/hparams/train_encodec.yaml | 7 ++++- .../TTS/valle/hparams/train_mimi.yaml | 7 ++++- .../TTS/valle/hparams/train_wavtokenizer.yaml | 7 ++++- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 15 ++++++++++- .../TTS/tokotron/hparams/train_encodec.yaml | 1 + .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 6 +++++ .../TTS/valle/hparams/train_discrete_ssl.yaml | 8 +++++- .../TTS/valle/hparams/train_encodec.yaml | 6 +++++ .../TTS/valle/hparams/train_mimi.yaml | 6 +++++ .../valle/hparams/train_speech_tokenizer.yaml | 6 +++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 6 +++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 26 ++++++++++++++++--- 15 files changed, 102 insertions(+), 12 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index b92a76255..afdac42b7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -12,7 +12,7 @@ run_name: !PLACEHOLDER # Model Type ssl_model_type: wavlm representation_mode: discrete -output_folder: !ref results/tokotron/// +output_folder: !ref results/// save_folder: !ref /save train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 21dee91e3..d101e1d85 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -50,7 +50,6 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 3e34a6bce..1fd2aa3aa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -11,7 +11,7 @@ run_name: !PLACEHOLDER # Model Type ssl_model_type: wavlm -output_folder: !ref results/tokotron/// +output_folder: !ref results/// save_folder: !ref /save train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. 
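For readers following the AR/NAR curriculum introduced earlier in this series (the number_of_epochs_ar / number_of_epochs_nar hparams and the apply_curriculum method), the schedule resolves to three phases: an optional AR-only warm-up, an optional NAR-only phase, and joint training afterwards. The standalone sketch below mirrors the apply_curriculum logic added above; the function and variable names are illustrative only and are not part of the recipe.

def curriculum(epoch, epochs_ar, epochs_nar, audio_tokens_per_step=8):
    # Mirrors apply_curriculum(): decide which parts to train in this epoch
    train_ar, train_nar = True, True
    if audio_tokens_per_step == 1:
        train_nar = False  # a single track is purely autoregressive
    elif epochs_ar is not None and epoch <= epochs_ar:
        train_nar = False  # AR-only warm-up phase
    elif epochs_nar is not None and epoch <= (epochs_ar + epochs_nar):
        train_ar = False   # NAR-only phase
    return train_ar, train_nar

# With epochs_ar=10 and epochs_nar=10:
#   epochs 1-10 -> AR only, epochs 11-20 -> NAR only, epoch 21+ -> both jointly
assert curriculum(5, 10, 10) == (True, False)
assert curriculum(15, 10, 10) == (False, True)
assert curriculum(25, 10, 10) == (True, True)

Note that the NAR-only branch assumes number_of_epochs_ar is also set; leaving both values at null (the default added to the hparams files) keeps joint AR+NAR training for every epoch.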
@@ -59,14 +59,20 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref -flip_layers: True +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min + # Training parameters input: text number_of_epochs: 50 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 0255373ad..fd1fea7cc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -44,12 +44,17 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref -flip_layers: True +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index aeb97d1c3..eb1605f4e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -45,12 +45,17 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref -flip_layers: True +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index 1867c0c1c..dd9d60798 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -47,12 +47,17 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref -flip_layers: True +flip_layers: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 02a76476b..73af26870 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -391,6 +391,8 @@ def on_stage_end(self, stage, stage_loss, epoch): for key, value in self.hparams.eval_summary_log.items() } stage_stats.update(eval_summary_stats) + else: + eval_summary_stats = {} # Perform end-of-iteration things, like annealing, logging, etc. 
if stage == sb.Stage.VALID: @@ -409,8 +411,13 @@ def on_stage_end(self, stage, stage_loss, epoch): ) # Save the current checkpoint and delete previous checkpoints. + ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs ) def inference(self, batch): @@ -931,7 +938,13 @@ def undo_padding_tensor(batch, lengths): if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs ) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index 4a2a7b033..e85d37ff8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -39,6 +39,7 @@ sample_path: null progress_folder: !ref /progress progress_current: !ref /current progress_meta: !ref /meta.yaml +init_from: null num_audio_samples: 32 samples_interval: 5 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index c60540da2..588fd6b55 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -52,6 +52,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index d3e30a4f4..161c911e6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -11,7 +11,7 @@ run_name: !PLACEHOLDER # Model Type representation_mode: discrete -output_folder: !ref results/tokotron/// +output_folder: !ref results/// save_folder: !ref /save train_log: !ref /train_log.txt testing: True # If set to True, the test evlaution is done, otherwise skipped. 
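The ckpt_key / ckpt_key_kind / test_key / test_key_kind hparams added in this commit are consumed generically: the "kind" ("min" or "max") is turned into the matching keyword-argument name for SpeechBrain's checkpoint retention and for evaluate(), as in the train.py hunks above. A minimal illustration of that mapping (plain Python; the dictionary below is a placeholder standing in for the loaded hparams, not recipe code):

hparams = {
    "ckpt_key": "dwer", "ckpt_key_kind": "min", "ckpt_keep": 2,
    "test_key": "dwer", "test_key_kind": "min",
}

# Validation: keep the ckpt_keep checkpoints with the lowest dWER
ckpt_kwargs = {f"{hparams['ckpt_key_kind']}_keys": [hparams["ckpt_key"]]}
assert ckpt_kwargs == {"min_keys": ["dwer"]}
# passed as checkpointer.save_and_keep_only(meta=..., num_to_keep=hparams["ckpt_keep"], **ckpt_kwargs)

# Test: reload the single best checkpoint under the same criterion
eval_kwargs = {f"{hparams['test_key_kind']}_key": hparams["test_key"]}
assert eval_kwargs == {"min_key": "dwer"}
# passed as brain.evaluate(test_set=..., **eval_kwargs)

The retention key only has an effect if the evaluation step writes it into the checkpoint meta (as done through eval_summary_stats above); a key that never appears in meta gives the min/max ranking nothing to compare.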
@@ -70,6 +70,12 @@ flip_layers: False spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 34c5e6cb2..ad79fb3de 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -51,6 +51,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index c6d1a4dfb..2b3b5d5b3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -52,6 +52,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index fd0e3daaf..eae14615f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -51,6 +51,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index e59ccd34f..4120d37f5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -51,6 +51,12 @@ tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 47e0a25e4..bde1deb8a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -402,14 +402,24 @@ def on_stage_end(self, stage, stage_loss, epoch): # The train_logger writes a summary to stdout and to the logfile. self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr": lr}, + stats_meta={"epoch": epoch, "lr": lr, **eval_summary_stats}, train_stats=self.train_stats, valid_stats=stage_stats, ) + ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } # Save the current checkpoint and delete previous checkpoints. 
self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + meta={"loss": stage_stats["loss"]}, + **ckpt_kwargs + ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"epoch": epoch, "lr": lr}, + train_stats=self.train_stats, + valid_stats=stage_stats, ) def inference(self, batch): @@ -1047,8 +1057,18 @@ def undo_padding_tensor(batch, lengths): ) # Load best checkpoint for evaluation - if hparams["testing"]: + + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs ) From 856df20804b0f95956df38faa51adefe28777ebd Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 13 Feb 2025 18:24:31 -0500 Subject: [PATCH 156/270] DASB: ESPNet Encodec support --- .../LJSpeech/extraction/hparams/encodec.yaml | 3 +- .../extraction/hparams/espnet_encodec.yaml | 66 ++++ .../LJSpeech/extraction/hparams/mimi.yaml | 3 +- .../extraction/hparams/speech_tokenizer.yaml | 1 + .../LJSpeech/extraction/hparams/sqcodec.yaml | 3 +- .../extraction/hparams/wavtokenizer.yaml | 3 +- .../hparams/train_espnet_encodec.yaml | 293 ++++++++++++++++++ .../DASB/LibriTTS/extraction/extract.py | 2 - .../extraction/hparams/espnet_encodec.yaml | 66 ++++ benchmarks/DASB/utils/tokenizer_interface.py | 116 +++++++ 10 files changed, 550 insertions(+), 6 deletions(-) create mode 100644 benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml index 6de95de73..869d1c503 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/encodec.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/encodec save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -47,7 +48,7 @@ save_embedding: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: facebook/encodec_24khz # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref sample_rate: !ref bandwidth: !ref flat_embeddings: False diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..c03ffa936 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,66 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref 
/save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LJSpeech +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +frozen_split_path: null +skip_prep: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml index 482f3739f..c534bef0f 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/mimi.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/mimi save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -44,7 +45,7 @@ save_embedding: False tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref - save_path: !ref + save_path: !ref num_codebooks: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml index 9d6ba7130..d036e05a3 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/speech_tokenizer.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/speech_tokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml index 378315bcf..28c7c9be9 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/extraction/hparams/sqcodec.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/sqcodec save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -42,7 +43,7 @@ save_path: /home/ubuntu/sq-codec/SQ-Codec # SQCodec model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref diff --git a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml index 3a0a935ff..a23c29e59 100644 --- a/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml +++ 
b/benchmarks/DASB/LJSpeech/extraction/hparams/wavtokenizer.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/wavtokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -44,7 +45,7 @@ vocab_size: 4096 # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref freeze: True diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..e85d37ff8 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml @@ -0,0 +1,293 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +alignments_folder: null +prepare_save_folder: !ref /prepared +pretrained_model_save_folder: !ref +representation_mode: discrete +vocoder_model_name: encodec +vocoder_model_path: !ref / +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
+ +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +freeze_token_model: True +token_model_src: "facebook/encodec_24khz" +g2p_src: flexthink/soundchoice-g2p +token_offset: 1 +vocoder_type: encodec +vocoder_src: "charactr/vocos-encodec-24khz" +vocoder_takes_spk_emb: False +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. + +# Inference parameters +inference_mode: autoregressive +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 5000 +infer_max_audio_length: 1000 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: ./hparams/char_en.txt +token_list_file_phn: ./hparams/arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +# Guides +guides_enabled: False + + +silence_padding: !ref +use_silence_padding: True + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +audio_num_tokens: 1024 +audio_emb_size: 128 +audio_emb_freeze: True +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 2 
+bandwidth: 1.5 +attention_type: regularMHA + +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + vocoder: !ref + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel # yamllint disable-line rule:line-length + input_num_tokens: !ref + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + emb: !ref + + +tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer + source: facebook/encodec_24khz # Only the 24kHz version supports mono audio + save_path: !ref + sample_rate: !ref + bandwidth: !ref + flat_embeddings: False + freeze: True + renorm_embeddings: False + + +modules: + model: !ref + compute_cost: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py index a3db84984..00799eeb4 100644 --- a/benchmarks/DASB/LibriTTS/extraction/extract.py +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -17,8 +17,6 @@ base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) sys.path.append(base_dir) -print(base_dir) - logger = logging.getLogger(__name__) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml new file mode 100644 index 000000000..a6630188f --- /dev/null +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml @@ -0,0 +1,66 @@ +# ############################################################################ +# Auido Tokenizer: Encodec +# Extraction: Librispeech 960h +# Authors: Jarod Duret 2024 +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 1986 +__set_seed: !apply:torch.manual_seed [!ref ] +output_folder: !ref results/encodec +save_folder: !ref /save +pretrained_model_save_folder: !ref +train_log: !ref /extraction_log.txt + +# Data files +data_folder: !PLACEHOLDER # e.g., /path/to/LibriSpeech +train_splits: ["train-clean-100"] #, "train-clean-360", "train-other-500" +dev_splits: ["dev-clean"] +test_splits: ["dev-clean", "test-clean", "test-other"] +skip_prep: False +train_json: !ref /train.json 
+valid_json: !ref /dev-clean.json +test_json: !ref /test.json + + +batch_size: 8 +num_workers: 8 +src_key: wav +id_key: id + +# Dataloader options +dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + +# EnCodec parameters +# sample_rate: [24000, 24000, 24000, 24000] +# vocab_size: [1024, 1024, 1024, 1024] +# bandwidth: [1.5, 3.0, 6.0, 12.0, 24.0] +# num_codebooks: [2, 4, 8, 16, 32] +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml +num_codebooks: 32 +vocab_size: 1024 +sample_rate: 24000 +save_embedding: False + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +tokens_extractor: !new:utils.tokens.TokensExtractor + tokenizer: !ref + sample_rate: !ref + src_key: !ref + id_key: !ref + dataloader_opts: !ref diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index 0ab019b58..1ba9bc21a 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -10,7 +10,9 @@ import sys import os import torch +import re from abc import ABC, abstractmethod +from pathlib import Path from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec from speechbrain.lobes.models.huggingface_transformers.discrete_ssl import ( DiscreteSSL, @@ -19,6 +21,14 @@ from speechbrain.lobes.models.discrete.speechtokenizer import SpeechTokenizer from speechbrain.lobes.models.discrete.wavtokenizer import WavTokenizer from speechbrain.lobes.models.huggingface_transformers.mimi import Mimi +from speechbrain.utils.superpowers import run_shell +from speechbrain.utils.fetching import fetch +from torch import nn +import logging +import shlex +import yaml + +logger = logging.getLogger(__name__) base_dir = os.path.abspath( os.path.join(os.path.dirname(__file__), "..") @@ -513,3 +523,109 @@ def get_pretrained_embeddings( raise ValueError( "SQCodec does not have any trainable quantizer or embedding since it uses scalar quantization." 
) + + +DEFAULT_ESPNET_REPO = "https://github.com/espnet/espnet" + + +class ESPNetEncodecInterface(BaseTokenizer, nn.Module): + """An interface for pretrained ESPNet Encodec implementations""" + + def __init__( + self, + source, + model_ckpt, + model_config, + save_path, + sample_rate=24000, + n_codebook=32, + espnet_repo=DEFAULT_ESPNET_REPO, + espnet_commit=None, + ): + super().__init__() + self.source = source + self.model_ckpt = model_ckpt + self.model_config = model_config + self.save_path = Path(save_path) + self.sample_rate = sample_rate + self.n_codebook = n_codebook + self.espnet_repo = espnet_repo + self.espnet_commit = espnet_commit + self._load() + + def _load(self): + self._load_espnet() + ckpt_file_name = fetch( + filename=self.model_ckpt, + source=self.source, + savedir=str(self.save_path), + save_filename=str(Path(self.model_ckpt).name) + ) + config_file_name = fetch( + filename=self.model_config, + source=self.source, + savedir=str(self.save_path), + save_filename="config.yaml" + ) + with open(config_file_name) as config_file: + config = yaml.safe_load(config_file) + from espnet2.gan_codec.encodec.encodec import Encodec as ESPNetEncodec + self.encodec = ESPNetEncodec(**config["codec_conf"]) + device = next(iter(self.encodec.parameters())).device + state_dict = torch.load(ckpt_file_name, map_location=device) + state_dict = { + re.sub("^codec.", "", key): value + for key, value in state_dict.items() + } + self.encodec.load_state_dict(state_dict) + + def _load_espnet(self): + try: + import espnet2 + except ModuleNotFoundError: + self._download_espnet() + + def _download_espnet(self): + logger.info("espnet is not installed, installing") + espnet_path = self.save_path / "espnet" + if not espnet_path.exists(): + logger.info("Cloining %s into %s", self.espnet_repo, espnet_path) + cmd = shlex.join(["git", "clone", self.espnet_repo, str(espnet_path)]) + run_shell(cmd) + else: + logger.info("%s already exists", espnet_path) + if self.espnet_commit: + logger.info("Checking out %s", self.espnet_commit) + cmd = shlex.join(["git", "-C", str(espnet_path), "checkout", self.espnet_commit]) + run_shell(cmd) + logger.info("Installing") + cmd = shlex.join(["pip", "install", "-e", str(espnet_path)]) + run_shell(cmd) + logger.info("Installation completed") + + @torch.no_grad() + def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): + self.encodec.eval() + if signal.dim() < 3: + signal = signal.unsqueeze(1) + tokens = self.encodec.encode(signal) + return tokens.permute(1, 2, 0)[:, :, :self.n_codebook] + + @torch.no_grad() + def tokens_to_sig(self, tokens, **kwargs): + self.encodec.eval() + tokens = tokens.permute(2, 0, 1) + signal = self.encodec.decode(tokens, **kwargs) + return signal.squeeze(1) + + @torch.no_grad() + def get_pretrained_embeddings( + self, vocab_size=None, num_codebooks=None, **kwargs + ): + """ + This method is not implemented for ESPNet Encodec, as it uses scalar quantization + and does not have any trainable quantizer or embedding. + """ + raise ValueError( + "ESPNet Encodec does not have any trainable quantizer or embedding since it uses scalar quantization." 
+ ) \ No newline at end of file From be174df994a2cede4dea7230ce37b6a9228d60d6 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Feb 2025 00:20:32 -0500 Subject: [PATCH 157/270] DASB: Inference mode, remove an unused evaluator --- benchmarks/DASB/utils/eval.py | 111 ++-------------------------------- 1 file changed, 6 insertions(+), 105 deletions(-) diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 9d5e8642f..76f2a6c2f 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -7,7 +7,6 @@ """ from speechbrain.inference.interfaces import Pretrained -from speechbrain.inference.ASR import EncoderDecoderASR from speechbrain.lobes.models.huggingface_transformers import Whisper from speechbrain.lobes.models.huggingface_transformers.wav2vec2 import Wav2Vec2 from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset @@ -351,105 +350,6 @@ def _replace_blanks(self, preds): return [" " if item == "" else item for item in preds] -class EncoderDecoderASRSpeechEvaluator(ASRSpeechEvaluator): - """A speech evaluator implementation based on ASR. - Computes the Word Error Rate (WER), Character Error Rate (CER) - and a few other metrics - - Arguments - --------- - sample_rate : int - The audio sample rate this evaluator expects - """ - - def __init__(self, source, sample_rate=None, *args, **kwargs): - super().__init__(sample_rate=sample_rate) - self.asr = EncoderDecoderASR.from_hparams(source, *args, **kwargs) - self.device = next(self.asr.mods.parameters()).device - - def evaluate_samples(self, wavs, length, text, sample_rate): - wavs = self.resample(wavs, sample_rate) - if text is None: - raise ValueError("This evaluator requires ground-truth text") - predicted_words, scores, log_probs = self.transcribe_batch_with_details( - wavs, length - ) - ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = init_asr_metrics() - wer_metric.append(ids, predicted_words, text) - cer_metric.append(ids, predicted_words, text) - wer = torch.tensor( - [score["WER"] for score in wer_metric.scores], device=wavs.device - ) - cer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=wavs.device - ) - prob_mean = log_probs.exp().mean(dim=-1) - return { - "wer": wer, - "cer": cer, - "beam_score": scores, - "prob_mean": prob_mean, - "pred": predicted_words, - "target": text, - } - - def transcribe_batch_with_details(self, wavs, wav_lens): - """Transcribes the input audio into a sequence of words - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. 
- - Arguments - --------- - predicted_words : list - The raw ASR predictions, fully decoded - best_scores : list - The best scores (from beam search) - best_log_probs : list - The best predicted log-probabilities (from beam search) - - - Returns - ------- - predicted_words : list - The predictions - - best_scores : torch.Tensor - The best scores (from beam search) - - best_log_probs : torch.Tensor - The best log-probabilities - - """ - with torch.no_grad(): - wav_lens = wav_lens.to(self.device) - encoder_out = self.asr.encode_batch(wavs, wav_lens) - ( - hyps, - best_lens, - best_scores, - best_log_probs, - ) = self.asr.mods.decoder(encoder_out, wav_lens) - predicted_words = [ - self.asr.tokenizer.decode_ids(token_seq) for token_seq in hyps - ] - return predicted_words, best_scores, best_log_probs - - def to(self, device): - """Transfers this module to the spcieifed device - - Arguments - --------- - device : str | torch.Device - the target device - """ - self.asr = self.asr.to(device) - return self - - class WhisperASRSpeechEvaluator(ASRSpeechEvaluator): """A speech evaluator implementation based on Whisper ASR @@ -995,11 +895,12 @@ def evaluate( length_cat_abs.int() ).long() # 0 for masked tokens # Forward - embs = self.model( - input_values=audio, - attention_mask=attention_mask, - output_attentions=False, - ).embeddings + with torch.inference_mode(): + embs = self.model( + input_values=audio, + attention_mask=attention_mask, + output_attentions=False, + ).embeddings hyp_embs, ref_embs = embs.split([len(wavs), len(wavs_ref)]) scores = torch.nn.functional.cosine_similarity( hyp_embs, ref_embs, dim=-1 From 750f3a4ff6c8b161f39a777ba492811f3356ee84 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Feb 2025 01:15:13 -0500 Subject: [PATCH 158/270] DASB: Add customization for the validation batch size --- benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml | 3 ++- .../DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 3 ++- .../DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml | 3 ++- .../LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml | 3 ++- benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml | 3 ++- .../LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml | 3 ++- .../DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml | 3 ++- 7 files changed, 14 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 1a55d1c02..01c818370 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -56,6 +56,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -143,7 +144,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index efcde8c58..61dabef41 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -95,6 +95,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref 
batch_size_guided: 2 extract_features_batch_size: 32 grad_accumulation_factor: 1 @@ -202,7 +203,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index e85d37ff8..e45794171 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -69,6 +69,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -159,7 +160,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml index e85d37ff8..e45794171 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml @@ -69,6 +69,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -159,7 +160,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index 4f0772f47..156e05b02 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -68,6 +68,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -153,7 +154,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 937db0812..ffb68f2a5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -60,6 +60,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -146,7 +147,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml index 3c06d761f..f4f745716 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ 
b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -68,6 +68,7 @@ input: text number_of_epochs: 1000 reset_annealing_epoch: null batch_size: 16 +valid_batch_size: !ref extract_features_batch_size: 32 grad_accumulation_factor: 1 max_grad_norm: 0.01 @@ -153,7 +154,7 @@ train_dataloader_opts: value: !ref valid_dataloader_opts: - batch_size: !ref + batch_size: !ref num_workers: !ref collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: From 55fc383319d819c1ca137dd222f7a3c322b0d8ec Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Feb 2025 09:25:51 -0500 Subject: [PATCH 159/270] DASB: VALL-E: Add ESPNET Encodec --- .../valle/hparams/train_espnet_encodec.yaml | 238 +++++++++++++++++ .../valle/hparams/train_espnet_encodec.yaml | 251 ++++++++++++++++++ 2 files changed, 489 insertions(+) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..ad486d493 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,238 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/espnet-encodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
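The valid_batch_size keys introduced above default to the training batch size through a HyperPyYAML !ref, so existing configs keep their behaviour while the validation dataloader can now be shrunk independently when evaluation is more memory-hungry than training. A small sketch of how such a reference resolves and how it can be overridden; the key names follow the hparams files above, the reference target (batch_size) is the presumed one, and hyperpyyaml is the package SpeechBrain already depends on:

    from hyperpyyaml import load_hyperpyyaml

    yaml_string = """
    batch_size: 16
    valid_batch_size: !ref <batch_size>  # defaults to the training batch size
    valid_dataloader_opts:
        batch_size: !ref <valid_batch_size>
    """

    hparams = load_hyperpyyaml(yaml_string)
    print(hparams["valid_dataloader_opts"]["batch_size"])  # 16

    # Roughly what a `--valid_batch_size 4` command-line override does:
    hparams = load_hyperpyyaml(yaml_string, overrides={"valid_batch_size": 4})
    print(hparams["valid_dataloader_opts"]["batch_size"])  # 4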
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 24000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_top_k: 20 +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: 
!ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + top_k: !ref + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml new file mode 100644 index 000000000..112b526c4 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -0,0 +1,251 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: tokotron/encodec + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
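Several keys in these VALL-E hparams files (token_list_file, input_num_tokens, model_vocab_size, audio_token_shift) are switched on the single input: text|phonemes setting through !apply:speechbrain.utils.hparams.choice, which effectively picks an entry from a mapping at load time. A short sketch of that selection; only the value=/choices= arguments and the token-list values are taken from the YAML above, the rest is illustrative:

    from speechbrain.utils.hparams import choice

    input_kind = "phonemes"  # what the YAML `input` key would hold

    token_list_file = choice(
        value=input_kind,
        choices={"text": "char_en.txt", "phonemes": "arpabet.txt"},
    )
    input_num_tokens = choice(value=input_kind, choices={"text": 39, "phonemes": 52})

    print(token_list_file, input_num_tokens)  # arpabet.txt 52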
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +batch_size: 16 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 2300 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !ref // + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 +num_layers_nar: 12 +dropout: 0.2 +vocab_size: 1024 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: 
!ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 + +# Model Settings +model_hub: facebook/encodec_24khz +espnet_repo: https://github.com/espnet/espnet +espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef +model_hub: espnet/libritts_encodec_24k +model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth +model_config: exp/codec_encodec_ss4_24k/config.yaml + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref From 07302546e5fcbd05d41dc3a88bc615829f2438b6 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Feb 2025 15:48:05 -0500 Subject: [PATCH 160/270] DASB: Add the ability to skip resampling --- .../DASB/LibriTTS/extraction/extract.py | 3 ++- .../DASB/LibriTTS/extraction/hparams/dac.yaml | 1 + .../extraction/hparams/discrete_ssl.yaml | 2 ++ .../LibriTTS/extraction/hparams/encodec.yaml | 1 + .../extraction/hparams/espnet_encodec.yaml | 1 + .../LibriTTS/extraction/hparams/mimi.yaml | 1 + .../extraction/hparams/speech_tokenizer.yaml | 1 + .../LibriTTS/extraction/hparams/sqcodec.yaml | 1 + .../extraction/hparams/wavtokenizer.yaml | 1 + benchmarks/DASB/LibriTTS/libritts_prepare.py | 22 ++++++++++++------- 10 files changed, 25 insertions(+), 9 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/extraction/extract.py b/benchmarks/DASB/LibriTTS/extraction/extract.py index 00799eeb4..328fbe868 100644 --- a/benchmarks/DASB/LibriTTS/extraction/extract.py +++ b/benchmarks/DASB/LibriTTS/extraction/extract.py @@ -49,7 +49,8 @@ "save_json_test": hparams["test_json"], "sample_rate": hparams["sample_rate"], "skip_prep": hparams["skip_prep"], - "max_valid_size": None + "max_valid_size": None, + "skip_resample": hparams["skip_resample"], }, ) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml index 76870e279..836503717 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml +++ 
b/benchmarks/DASB/LibriTTS/extraction/hparams/dac.yaml @@ -48,6 +48,7 @@ sample_rate: 24000 # Feature parameters encoder_dim: 1024 save_embedding: False +skip_resample: False tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml index 2b57a7edf..6ae14c87c 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/discrete_ssl.yaml @@ -52,6 +52,8 @@ freeze_ssl: True freeze_feature_extractor: True vocab_size: 1000 save_embedding: False +skip_resample: False + ### Config for Tokenizer # Layer number should be among the supported layers for discrete SSL models(kmenas model should be available for that layer) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml index b7ae76969..188b38a6d 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/encodec.yaml @@ -44,6 +44,7 @@ num_codebooks: 32 vocab_size: 1024 sample_rate: 24000 save_embedding: False +skip_resample: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml index a6630188f..a0542b189 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/espnet_encodec.yaml @@ -48,6 +48,7 @@ num_codebooks: 32 vocab_size: 1024 sample_rate: 24000 save_embedding: False +skip_resample: False tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface source: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml index 9e64347c7..acddcd93b 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml @@ -42,6 +42,7 @@ sample_rate: 24000 encoder_dim: 1024 freeze_embedding: False save_embedding: False +skip_resample: False tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 85148db9d..2b96a749b 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -39,6 +39,7 @@ sample_rate: 16000 encoder_dim: 1024 freeze_embedding: False save_embedding: False +skip_resample: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml index cf46b3f5a..68dc9df49 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml @@ -40,6 +40,7 @@ sample_rate: 16000 save_embedding: False num_codebooks: 4 save_path: /home/ubuntu/sq-codec/SQ-Codec +skip_resample: False # SQCodec model diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml 
b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml index c7581bbe7..56c13508c 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml @@ -41,6 +41,7 @@ sample_rate: 24000 save_embedding: False num_codebooks: 1 vocab_size: 4096 +skip_resample: False # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py index 6ec1a3a96..52594eaf9 100644 --- a/benchmarks/DASB/LibriTTS/libritts_prepare.py +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -42,6 +42,7 @@ def prepare_libritts( max_valid_size=500, alignments_folder=None, skip_prep=False, + skip_resample=False, ): """ Prepares the json files for the LibriTTS dataset. @@ -82,6 +83,8 @@ def prepare_libritts( The path to alignments files skip_prep: Bool If True, skip preparation. + skip_resample: bool + If True, audio will not be resampled Returns ------- @@ -106,16 +109,16 @@ def prepare_libritts( # If specific splits are provided, creates data manifest files accordingly if train_split: wav_list = prepare_split(data_folder, train_split) - create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name) + create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name, skip_resample) if valid_split: wav_list = prepare_split(data_folder, valid_split) # TODO add better way to speedup evaluation if max_valid_size is not None and len(wav_list) > max_valid_size: wav_list = random.sample(wav_list, max_valid_size) - create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name) + create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name, skip_resample) if test_split: wav_list = prepare_split(data_folder, test_split) - create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name) + create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name, skip_resample) if skip(save_json_train, save_json_valid, save_json_test): logger.info("Preparation completed.") @@ -129,12 +132,12 @@ def prepare_libritts( data_split = split_sets(wav_list, split_ratio) # Creating json files create_json( - data_split["train"], save_json_train, sample_rate, alignments_folder, model_name + data_split["train"], save_json_train, sample_rate, alignments_folder, model_name, skip_resample ) create_json( - data_split["valid"], save_json_valid, sample_rate, alignments_folder, model_name + data_split["valid"], save_json_valid, sample_rate, alignments_folder, model_name, skip_resample ) - create_json(data_split["test"], save_json_test, sample_rate, alignments_folder, model_name) + create_json(data_split["test"], save_json_test, sample_rate, alignments_folder, model_name, skip_resample) def prepare_split(data_folder, split_list): @@ -177,7 +180,7 @@ def prepare_split(data_folder, split_list): return wav_list -def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None): +def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None, skip_resample=False): """ Creates the json file given a list of wav files. 
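The new skip_resample flag is threaded from the extraction hparams down to create_json so that corpora already stored at the target rate are not rewritten, avoiding the large temporary storage the resampled copies would otherwise need. The guard it controls (it appears further down in this patch) boils down to the following; a self-contained sketch with a synthetic signal in place of a LibriTTS file:

    import torch
    import torchaudio

    sample_rate = 24000    # target rate from the extraction hparams
    skip_resample = False  # the new flag

    signal = torch.randn(1, 22050)  # pretend this was loaded from disk
    sig_sr = 22050                  # native rate of the loaded file

    if sig_sr != sample_rate and not skip_resample:
        signal = torchaudio.functional.resample(signal, sig_sr, sample_rate)

    print(signal.shape)  # torch.Size([1, 24000]); with skip_resample=True the file is kept as-is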
Arguments @@ -194,6 +197,9 @@ def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder The path to LibriTTS alignments model_name : str Model name (used to prepare additional model specific data) + skip_resample : int + Skips resampling - useful when large temporary storage + is absent. """ # Downloads and initializes the G2P model to compute the phonemes if data is being prepared for Tacotron2 experiments @@ -240,7 +246,7 @@ def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder continue # Resamples the audio file if required - if sig_sr != sample_rate: + if sig_sr != sample_rate and not skip_resample: resampled_signal = torchaudio.functional.resample( signal, sig_sr, sample_rate ) From 41afc01a027f7f33061b46cb9c8f86df90d2d98e Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Feb 2025 00:20:32 -0500 Subject: [PATCH 161/270] DASB: Add the switch for LM head training --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 ++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 73af26870..9ab344ff6 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -258,6 +258,7 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True + self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -268,6 +269,7 @@ def apply_curriculum(self): and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): self.train_ar = False + self.modules.model.lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index bde1deb8a..395113edb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -257,6 +257,7 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True + self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -266,7 +267,10 @@ def apply_curriculum(self): self.hparams.number_of_epochs_nar is not None and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): + # NOTE: Avoid the AR head being "taken by surprise" self.train_ar = False + self.modules.model.lm_head.requires_grad_(False) + def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed From f529e621b211189a4b3aee47db0b2ad2e5e342a0 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Feb 2025 11:36:18 -0500 Subject: [PATCH 162/270] DASB: Undo the gradient change - it did not help --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 -- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 9ab344ff6..73af26870 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -258,7 +258,6 @@ def 
apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True - self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -269,7 +268,6 @@ def apply_curriculum(self): and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): self.train_ar = False - self.modules.model.lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 395113edb..e786d3495 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -257,7 +257,6 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True - self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -269,7 +268,6 @@ def apply_curriculum(self): ): # NOTE: Avoid the AR head being "taken by surprise" self.train_ar = False - self.modules.model.lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): From 554e52ac3e19c01af7392ee522691dc86d6c9f88 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Feb 2025 19:43:55 -0500 Subject: [PATCH 163/270] DASB: VALL-E: Add the ability to disable fixed batches, add the ability to limit the validation set to be run on every epoch --- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 8 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 8 +- .../TTS/valle/hparams/train_encodec.yaml | 8 +- .../valle/hparams/train_espnet_encodec.yaml | 8 +- .../TTS/valle/hparams/train_mimi.yaml | 8 +- .../valle/hparams/train_speech_tokenizer.yaml | 8 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 8 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 127 ++++++++++++++++++ 8 files changed, 176 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 588fd6b55..86d3c7ce3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -66,7 +66,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -118,7 +120,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 161c911e6..a62288062 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -84,7 +84,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref 
grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -159,7 +161,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index ad79fb3de..0db811710 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -65,7 +65,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -117,7 +119,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 112b526c4..af6e9cbef 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -65,7 +65,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -117,7 +119,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 2b3b5d5b3..cee2de622 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -66,7 +66,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -118,7 +120,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index eae14615f..41c7cad68 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -65,7 +65,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 
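Two training-loop knobs are being added in this patch: epoch_fixed decides whether the train loader runs nominal epochs of a fixed number of batches (epoch_size // batch_size, looping over the data) or ordinary epochs over the whole split, and valid_inter_data_count caps how many validation utterances are scored while training is still in progress. The boolean hparams.choice above simply maps True/False to a looped_nominal_epoch value or null; written out in Python (key names mirror the YAML, and SpeechBrain's make_dataloader is what consumes looped_nominal_epoch):

    hparams = {"epoch_fixed": True, "epoch_size": 50000, "batch_size": 16}

    # Equivalent of the !apply:speechbrain.utils.hparams.choice block above
    looped_nominal_epoch = (
        hparams["epoch_size"] // hparams["batch_size"] if hparams["epoch_fixed"] else None
    )

    train_dataloader_opts = {
        "batch_size": hparams["batch_size"],
        "shuffle": True,
        "looped_nominal_epoch": looped_nominal_epoch,  # None -> ordinary epochs
    }
    print(train_dataloader_opts["looped_nominal_epoch"])  # 3125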
valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -117,7 +119,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 4120d37f5..0ccaf4727 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -65,7 +65,9 @@ number_of_epochs: 100 number_of_epochs_ar: null number_of_epochs_nar: null epoch_size: 50000 +epoch_fixed: False batch_size: 16 +valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 max_grad_norm: 1.0 @@ -117,7 +119,11 @@ train_dataloader_opts: batch_size: !ref shuffle: True num_workers: !ref - looped_nominal_epoch: !ref // + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null collate_fn: !name:speechbrain.dataio.batch.PaddedBatch padding_kwargs: value: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index e786d3495..e17e93e74 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -26,6 +26,7 @@ from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right +from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset from functools import partial import re import string @@ -522,6 +523,104 @@ def fit_batch(self, batch): if self.hparams.lr_annealing_mode == "step": self.hparams.lr_annealing(self.optimizer) return loss + + def fit( + self, + epoch_counter, + train_set, + valid_set=None, + progressbar=None, + train_loader_kwargs={}, + valid_loader_kwargs={}, + ): + """Iterate epochs and datasets to improve objective. + + Relies on the existence of multiple functions that can (or should) be + overridden. The following methods are used and expected to have a + certain behavior: + + * ``fit_batch()`` + * ``evaluate_batch()`` + * ``update_average()`` + + If the initialization was done with distributed_count > 0 and the + distributed_backend is ddp, this will generally handle multiprocess + logic, like splitting the training data into subsets for each device and + only saving a checkpoint on the main process. + + Arguments + --------- + epoch_counter : iterable + Each call should return an integer indicating the epoch count. + train_set : Dataset, DataLoader + A set of data to use for training. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + valid_set : Dataset, DataLoader + A set of data to use for validation. If a Dataset is given, a + DataLoader is automatically created. If a DataLoader is given, it is + used directly. + progressbar : bool + Whether to display the progress of each epoch in a progressbar. + train_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the train_loader + (if train_set is a Dataset, not DataLoader). + E.G. batch_size, num_workers. + DataLoader kwargs are all valid. 
+ valid_loader_kwargs : dict + Kwargs passed to `make_dataloader()` for making the valid_loader + (if valid_set is a Dataset, not DataLoader). + E.g., batch_size, num_workers. + DataLoader kwargs are all valid. + + Returns + ------- + None + """ + if self.test_only: + logger.info( + "Test only mode, skipping training and validation stages." + ) + return + + self.on_fit_start() + train_set = self.make_dataloader( + train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs + ) + epoch = self.hparams.epoch_counter.current + if epoch < self.hparams.number_of_epochs: + valid_set = sample_dataset( + dataset=valid_set, + count=self.hparams.valid_inter_data_count, + seed=self.hparams.seed + ) + + valid_set = self.make_dataloader( + valid_set, + stage=sb.Stage.VALID, + ckpt_prefix=None, + **valid_loader_kwargs, + ) + + if progressbar is None: + progressbar = not self.noprogressbar + + # Only show progressbar if requested and main_process + enable = progressbar and sb.utils.distributed.if_main_process() + + # Iterate epochs + for epoch in epoch_counter: + self._fit_train(train_set=train_set, epoch=epoch, enable=enable) + self._fit_valid(valid_set=valid_set, epoch=epoch, enable=enable) + + # Debug mode only runs a few epochs + if ( + self.debug + and epoch == self.debug_epochs + or self._optimizer_step_limit_exceeded + ): + break + INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} @@ -712,6 +811,34 @@ def sig_pipeline(wav): return datasets +def sample_dataset(dataset, count, seed): + """Selects a sample of the specified dataset in a + stable manner, returning the same sample on each call + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset + count : int + The number of items to select + seed : int + The seed to be used + """ + if len(dataset) < count: + return dataset + generator = torch.Generator() + generator.manual_seed(seed) + indexes = torch.randperm(len(dataset)).tolist()[:count] + data_ids = [ + dataset.data_ids[idx] + for idx in indexes + ] + return FilteredSortedDynamicItemDataset( + dataset, + data_ids, + ) + + def get_offsets(vocab_size, tracks): """Adds offsets to each track to treat the tokens as distinct From e2d74404b2ff8424d5bef4a0fcded61fe9fa3b5a Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Feb 2025 23:11:48 -0500 Subject: [PATCH 164/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index e17e93e74..db4c8e5b0 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -383,6 +383,7 @@ def on_stage_end(self, stage, stage_loss, epoch): if stage == sb.Stage.TRAIN: self.train_stats = stage_stats + eval_summary_stats = {} # End evaluation and report stats if stage != sb.Stage.TRAIN and self.is_evaluating: self.evaluation_metric.on_evaluation_end() From d7fc323d8d45d6c0726735d79dffbc2aec7785ed Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 16 Feb 2025 02:27:00 -0500 Subject: [PATCH 165/270] DASB: Update wav2vec2 --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index afdac42b7..0a18b2f60 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -47,7 +47,7 @@ token_model_src: !apply:speechbrain.utils.hparams.choice choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self + wav2vec2: facebook/wav2vec2-large g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 1fd2aa3aa..85fc660a9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -45,7 +45,7 @@ token_model_src: !apply:speechbrain.utils.hparams.choice choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self + wav2vec2: facebook/wav2vec2-large g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 61dabef41..4efa9f75c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -60,7 +60,7 @@ token_model_src: !apply:speechbrain.utils.hparams.choice choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self + wav2vec2: facebook/wav2vec2-large g2p_src: flexthink/soundchoice-g2p kmeans_cache_dir: !ref /kmeans_checkpoint kmeans_dataset: LibriSpeech diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index a62288062..2800cfd56 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -52,7 +52,7 @@ token_model_src: !apply:speechbrain.utils.hparams.choice choices: wavlm: microsoft/wavlm-large hubert: facebook/hubert-large-ll60k - wav2vec2: facebook/wav2vec2-large-960h-lv60-self + wav2vec2: facebook/wav2vec2-large g2p_src: speechbrain/soundchoice-g2p token_model_kmeans_src: poonehmousavi/SSL_Quantization kmeans_cache_dir: !ref /kmeans_checkpoint From 8a9e8739e7264a287b390dd75bf8ef9f47f7851d Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 16 Feb 2025 23:05:04 -0500 Subject: [PATCH 166/270] DASB: Add back LM head freezing (with a toggle) --- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 1 + .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 + .../TTS/valle/hparams/train_encodec.yaml | 1 + .../valle/hparams/train_espnet_encodec.yaml | 3 +- .../TTS/valle/hparams/train_mimi.yaml | 2 + .../valle/hparams/train_speech_tokenizer.yaml | 2 + .../TTS/valle/hparams/train_wavtokenizer.yaml | 1 + benchmarks/DASB/LibriTTS/TTS/valle/train.py | 42 +++++++++++++++---- 8 files changed, 45 insertions(+), 9 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 86d3c7ce3..9dd038b11 100644 --- 
a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -150,6 +150,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref +freeze_lm_head: False ####################### Model parameters ########################### # Transformer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 2800cfd56..9397ac6be 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -227,6 +227,8 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice audio_tokens_per_step: 6 +freeze_lm_head: False + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 0db811710..56783935f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -149,6 +149,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref +freeze_lm_head: False ####################### Model parameters ########################### # Transformer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index af6e9cbef..0f1f51672 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -185,13 +185,14 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice audio_tokens_per_step: 8 # Model Settings -model_hub: facebook/encodec_24khz espnet_repo: https://github.com/espnet/espnet espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef model_hub: espnet/libritts_encodec_24k model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth model_config: exp/codec_encodec_ss4_24k/config.yaml +freeze_lm_head: True + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index cee2de622..8a095f441 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -188,6 +188,8 @@ audio_tokens_per_step: 8 # Model Settings model_hub: kyutai/mimi +freeze_lm_head: False + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 41c7cad68..bc88e091f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -187,6 +187,8 @@ audio_tokens_per_step: 8 # Model Settings model_hub: fnlp/SpeechTokenizer +freeze_lm_head: False + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 0ccaf4727..5a628e536 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -189,6 +189,7 @@ model_hub: novateur/WavTokenizer-medium-music-audio-75token config: wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml checkpoint: wavtokenizer_medium_music_audio_320_24k_v2.ckpt +freeze_lm_head: False ############################## models ################################ diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index db4c8e5b0..d08942cd8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -258,18 +258,19 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True + self.modules.model.lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: - self.train_nar = False + self.train_nar = False elif ( self.hparams.number_of_epochs_nar is not None and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): - # NOTE: Avoid the AR head being "taken by surprise" self.train_ar = False - + if self.hparams.freeze_lm_head: + self.modules.model.lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed @@ -455,11 +456,8 @@ def inference(self, batch): ) for prefix_item in prefix_items ] - inferred_tokens = [ result[0][0] - if result[0] - else torch.zeros( - 1000, self.hparams.audio_tokens_per_step, device=self.device - ) + inferred_tokens = [ + self._pad_inferred_sample(result) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) @@ -467,6 +465,34 @@ def inference(self, batch): audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) return audio, audio_length + def _pad_inferred_sample(self, result): + """Applies length padding to an inference result + + Arguments + --------- + result : list + The VALL-E Inference output + + Returns + ------- + sample : torch.Tensor + A sample, padded if needed + """ + if result[0]: + sample = result[0][0] + else: + sample = torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) + min_length = getattr(self.hparams, "infer_min_length", 10) + sample_length, tracks = sample.shape + if sample_length < min_length: + sample = pad_right_to( + (min_length, tracks), + sample + ) + return sample + def _get_inference_opts(self): idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ None, : From a1f5e94faa5de53d2f9386aaefa9def865bd4ae4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 00:26:07 -0500 Subject: [PATCH 167/270] DASB: Fix for data parallel --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d08942cd8..12c291be1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -258,7 +258,12 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = 
self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True - self.modules.model.lm_head.requires_grad_(True) + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -270,7 +275,7 @@ def apply_curriculum(self): ): self.train_ar = False if self.hparams.freeze_lm_head: - self.modules.model.lm_head.requires_grad_(False) + lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed From e75214612259abeab8fe4891af36c9a9909d7542 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 00:55:26 -0500 Subject: [PATCH 168/270] DASB: Fix padding --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 12c291be1..0654b23e5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -493,9 +493,9 @@ def _pad_inferred_sample(self, result): sample_length, tracks = sample.shape if sample_length < min_length: sample = pad_right_to( + sample, (min_length, tracks), - sample - ) + )[0] return sample def _get_inference_opts(self): From c6d58831be2d1e2ff5d1ce1cfbe79059c703a19d Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 11:51:27 -0500 Subject: [PATCH 169/270] DASB: VALL-E: Fix a crash --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 0654b23e5..1fc2f1e68 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -815,6 +815,10 @@ def sig_pipeline(wav): spk_samplers=spk_samplers, ) resample_fn[dataset](epoch=0) + if hparams["input"] == "phonemes": + dynamic_dataset = dynamic_dataset.filtered_sorted( + key_test={"has_alignments": lambda value: value} + ) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False From 99588e396d6380d0f38a6ad1f19cd492af6073bd Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 14:55:31 -0500 Subject: [PATCH 170/270] DASB: VALL-E: Add LM head freezing --- .../LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 2 ++ .../DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 2 ++ .../LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml | 2 ++ .../DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml | 3 +++ .../LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml | 2 ++ benchmarks/DASB/LJSpeech/TTS/valle/train.py | 8 ++++++++ 6 files changed, 19 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 85fc660a9..f7df72d39 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -206,6 +206,8 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice audio_tokens_per_step: 6 +freeze_lm_head: False + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git 
a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index fd1fea7cc..cfe293610 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -139,6 +139,8 @@ sample_dataloader_opts: token_model_kwargs: SSL_layers: !ref +freeze_lm_head: False + ####################### Model parameters ########################### # Transformer d_model: 1024 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index ad486d493..5583680d5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -177,6 +177,8 @@ model_hub: espnet/libritts_encodec_24k model_ckpt: exp/codec_encodec_ss4_24k/120epoch.pth model_config: exp/codec_encodec_ss4_24k/config.yaml +freeze_lm_head: True + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index eb1605f4e..05c506a85 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -175,6 +175,9 @@ audio_tokens_per_step: 8 bandwidth: 6 +freeze_lm_head: False + + ############################## models ################################ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index dd9d60798..bcdb7e2eb 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -142,6 +142,8 @@ sample_dataloader_opts: token_model_kwargs: SSL_layers: !ref +freeze_lm_head: False + ####################### Model parameters ########################### # Transformer d_model: 1024 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 73af26870..7e46f77c5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -258,6 +258,12 @@ def apply_curriculum(self): only the non-autoregressive part""" epoch = self.hparams.epoch_counter.current self.train_ar, self.train_nar = True, True + lm_head = ( + self.modules.model.module.lm_head + if hasattr(self.modules.model, "module") + else self.modules.model.lm_head + ) + lm_head.requires_grad_(True) if self.hparams.audio_tokens_per_step == 1: # NOTE: If there is only one track it's autoregressive self.train_nar = False @@ -268,6 +274,8 @@ def apply_curriculum(self): and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) ): self.train_ar = False + if self.hparams.freeze_lm_head: + lm_head.requires_grad_(False) def is_eval_epoch(self, epoch): """Determines whether or not evaluation should be performed From dad02cb95eb8309b685990a0efb689538d555c14 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Feb 2025 22:08:03 -0500 Subject: [PATCH 171/270] DASB: Vall-E: Fix data-parallel --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 16 +++++++++++----- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 15 ++++++++++----- 2 
files changed, 21 insertions(+), 10 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 7e46f77c5..b25a60d4f 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -70,12 +70,18 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device - if hasattr(self.modules.tokenizer, "codec_vocoder"): - self.modules.tokenizer.codec_vocoder.to(self.device) - self.modules.tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) + tokenizer = ( + self.modules.tokenizer.module + if hasattr(self.modules.tokenizer, "module") + else self.modules.tokenizer + ) + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) + wav = wav.to(self.device) return wav def compute_forward(self, batch, stage): diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 1fc2f1e68..da6cd7083 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -72,11 +72,16 @@ def create_waveform(self, audio, length): ------- wav : torch.Tensor """ - self.modules.tokenizer.device = self.device - if hasattr(self.modules.tokenizer, "codec_vocoder"): - self.modules.tokenizer.codec_vocoder.to(self.device) - self.modules.tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) + tokenizer = ( + self.modules.tokenizer.module + if hasattr(self.modules.tokenizer, "module") + else self.modules.tokenizer + ) + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) wav = wav.to(self.device) return wav From 63e9972c98eb28982fcf2097c1bc00eb6222e331 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 18 Feb 2025 01:30:52 -0500 Subject: [PATCH 172/270] DASB: VALL-E: Update hyperparameters --- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 6 +++--- .../DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 4 ++-- .../LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml | 6 +++--- benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml | 6 +++--- .../DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml | 6 +++--- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 6 +++--- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 6 +++--- .../DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 6 +++--- .../LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 6 +++--- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 6 +++--- .../DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml | 6 +++--- 11 files changed, 32 insertions(+), 32 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index f7df72d39..45b4f6e5e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -175,12 +175,12 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 1024 
+d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1000 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index cfe293610..fbcb5db5a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -147,8 +147,8 @@ d_model: 1024 share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index 5583680d5..0249b0c64 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -141,12 +141,12 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index 05c506a85..6063d692c 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -142,12 +142,12 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 2048 text_num_tokens: 39 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index bcdb7e2eb..fd21dc4bd 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -146,12 +146,12 @@ freeze_lm_head: False ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 text_num_tokens: 39 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 9dd038b11..c811e1c7f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -154,12 +154,12 @@ freeze_lm_head: False ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 9397ac6be..bd50151af 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -192,12 +192,12 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1000 audio_emb_size: 1024 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 56783935f..54c357b52 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -153,12 +153,12 @@ freeze_lm_head: False ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 0f1f51672..120208cd8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -152,12 +152,12 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 8a095f441..8383cf0f7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -153,12 +153,12 @@ sample_dataloader_opts: ####################### Model parameters 
########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 5a628e536..012f61e86 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -152,12 +152,12 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False From bacc9f93c77b0909493c91612cf75a5a281d501c Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 18 Feb 2025 12:03:52 -0500 Subject: [PATCH 173/270] DASB: VALL-E: Add data scaling support --- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 1 + .../LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml | 1 + benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml | 1 + .../DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml | 1 + benchmarks/DASB/LJSpeech/TTS/valle/train.py | 6 ++++++ 6 files changed, 11 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 45b4f6e5e..140b85a84 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -87,6 +87,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index fbcb5db5a..2c22f57a4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -71,6 +71,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index 0249b0c64..74654e590 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -71,6 +71,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index 6063d692c..b528660f5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ 
b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -72,6 +72,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index fd21dc4bd..c2ff765f4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -74,6 +74,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index b25a60d4f..30bfbe3b1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -686,6 +686,12 @@ def sig_pipeline(wav): raise NotImplementedError( "sorting must be random, ascending or descending" ) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count + ) return datasets From b1e270a5e81fde18342f933de51a05405fd27aed Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 18 Feb 2025 14:44:44 -0500 Subject: [PATCH 174/270] DASB: Tokotron: Add scaling + selection based on dWER (for comparison) --- .../TTS/tokotron/hparams/train_dac.yaml | 8 +++-- .../tokotron/hparams/train_discrete_ssl.yaml | 6 ++++ .../TTS/tokotron/hparams/train_encodec.yaml | 8 +++-- .../TTS/tokotron/hparams/train_mimi.yaml | 8 +++-- .../hparams/train_speech_tokenizer.yaml | 8 +++-- .../TTS/tokotron/hparams/train_sqcodec.yaml | 8 +++-- .../tokotron/hparams/train_wavtokenizer.yaml | 10 +++--- .../DASB/LJSpeech/TTS/tokotron/train.py | 36 +++++++++++++++---- 8 files changed, 72 insertions(+), 20 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml index f94d25d74..d49afdf29 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_dac.yaml @@ -39,6 +39,7 @@ progress_meta: !ref /meta.yaml num_audio_samples: 32 samples_interval: 5 + tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. 
tokens_loader: !new:utils.tokens.TokensLoader @@ -49,8 +50,11 @@ token_model_kwargs: splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml index 0a18b2f60..af723f6c9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -66,6 +66,11 @@ spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -80,6 +85,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml index d16403558..1c54128b7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_encodec.yaml @@ -44,8 +44,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -60,6 +63,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index b99ac7980..b38a07434 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -47,8 +47,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -64,6 +67,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 4b2fb6553..0cb2012ed 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -47,8 +47,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -63,6 +66,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 +data_scale: null # index diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index d101e1d85..4ea1ba387 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -49,7 +49,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] split_ratio: [90, 5, 5] - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -64,7 +68,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml index 81bcee2ca..d3bf9c770 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -48,9 +48,11 @@ tokens_loader: !new:utils.tokens.TokensLoader splits: ["train", "valid", "test"] -split_ratio: [90, 5, 5] - - +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: null +test_key_kind: min ckpt_interval_minutes: 30 # save checkpoint every N min # Training parameters @@ -65,7 +67,7 @@ skip_prep: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 - +data_scale: null # index pad_index: 0 diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 86e0efc26..3e6b356d4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -306,6 +306,7 @@ def on_stage_end(self, stage, stage_loss, epoch): self.train_stats = stage_stats # End evaluation and report stats + eval_summary_stats = {} if stage != sb.Stage.TRAIN and self.is_eval_epoch(epoch): self.evaluator.on_evaluate_end() eval_summary_stats = self.get_summary_stats() @@ -329,9 +330,14 @@ def on_stage_end(self, stage, stage_loss, epoch): valid_stats=stage_stats, ) - # Save the current checkpoint and delete previous checkpoints. + # Save the current checkpoint and delete previous checkpoints. 
+ ckpt_kwargs = { + f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], + } self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, min_keys=["loss"], + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], + **ckpt_kwargs ) def get_summary_stats(self): @@ -667,6 +673,12 @@ def audio_pipeline(id): raise NotImplementedError( "sorting must be random, ascending or descending" ) + data_scale = hparams.get("data_scale") + if data_scale: + scaled_data_count = int(len(datasets["train"]) * data_scale) + datasets["train"] = datasets["train"].filtered_sorted( + select_n=scaled_data_count + ) return datasets, silence_padding @@ -918,10 +930,22 @@ def apply_overfit_test(hparams, dataset): if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: - tts_brain.evaluate( - test_set=datasets["test"], - test_loader_kwargs=hparams["test_dataloader_opts"], - ) + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + if test_key: + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + tts_brain.evaluate( + test_set=datasets["test"], + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) + # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") From ef35a2f17d4fe9e8b153ce2171e9475f29f018c8 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 18 Feb 2025 23:08:22 -0500 Subject: [PATCH 175/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index da6cd7083..c2aa32f30 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -81,7 +81,7 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = tokenizer.tokens_to_sig(audio) + wav = self.modules.tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) wav = wav.to(self.device) return wav @@ -427,7 +427,8 @@ def on_stage_end(self, stage, stage_loss, epoch): } # Save the current checkpoint and delete previous checkpoints. 
self.checkpointer.save_and_keep_only( - meta={"loss": stage_stats["loss"]}, + meta={"loss": stage_stats["loss"], **eval_summary_stats}, + num_to_keep=hparams["ckpt_keep"], **ckpt_kwargs ) elif stage == sb.Stage.TEST: From a6073f50c538ccfdd878de3cdc76e716f796771f Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 19 Feb 2025 02:07:09 -0500 Subject: [PATCH 176/270] DASB: Add support for test set filtering --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 9164d31e0..df31c4c69 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -1019,7 +1019,13 @@ def apply_overfit_test(hparams, dataset): if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], - ) \ No newline at end of file + **eval_kwargs + ) From 1be28c74349de19960a0d1ae1951dd6329fb7b58 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 19 Feb 2025 03:20:36 -0500 Subject: [PATCH 177/270] DASB: Add support for test set filtering --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 35 ++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index c2aa32f30..03038f020 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1097,6 +1097,37 @@ def apply_overfit_test(hparams, dataset): return result +def select_eval_subset(dataset, hparams, key="eval_subset"): + """Selects a subset of the dataset provided, if specified. + The selection is controlled by a hyperparameter named + eval_subset, which is expected to list the IDs of the + data items on which evaluation will take place, one per line + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset + hparams : dict + A hyperparameters file + + Returns + ------- + subset : dataset + The dataset, filtered down if applicable + """ + eval_subset_path = hparams.get(key) + if eval_subset_path is not None: + eval_subset_path = Path(eval_subset_path) + if not eval_subset_path.exists(): + raise ValueError(f"eval_subset {eval_subset_path} does not exist") + with open(eval_subset_path) as eval_subset_file: + eval_subset_ids = [line.strip() for line in eval_subset_file] + subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) + else: + subset = dataset + return subset + + def undo_padding_tensor(batch, lengths): """Produces Python lists given a batch of sentences with their corresponding relative lengths. @@ -1238,8 +1269,10 @@ def undo_padding_tensor(batch, lengths): eval_kwargs = { f"{test_key_kind}_key": test_key } + eval_dataset = datasets["test"] + eval_dataset = select_eval_subset(eval_dataset, hparams) tts_brain.evaluate( - test_set=datasets["test"], + test_set=eval_dataset, test_loader_kwargs=hparams["test_dataloader_opts"], **eval_kwargs ) From 4e5f4ebfae89bd8230b73f1949217a90b93dfffd Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 19 Feb 2025 03:37:40 -0500 Subject: [PATCH 178/270] DASB: Add filtering (useful when some samples aren't present, e.g. 
when using alignments or G2P) --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 03038f020..81435e4a6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1122,6 +1122,10 @@ def select_eval_subset(dataset, hparams, key="eval_subset"): raise ValueError(f"eval_subset {eval_subset_path} does not exist") with open(eval_subset_path) as eval_subset_file: eval_subset_ids = [line.strip() for line in eval_subset_file] + existing_ids = dataset.data_ids + eval_subset_ids = [uttid for uttid in eval_subset_ids if uttid in existing_ids] + if not eval_subset_ids: + raise ValueError(f"{eval_subset_path}: no items found in the dataset") subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) else: subset = dataset From 8dadf968e753073c4ed4a864a29e70f0cb8db1d2 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 19 Feb 2025 18:46:38 -0500 Subject: [PATCH 179/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 81435e4a6..308c5452d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -433,7 +433,7 @@ def on_stage_end(self, stage, stage_loss, epoch): ) elif stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch, "lr": lr}, + stats_meta={"epoch": epoch}, train_stats=self.train_stats, valid_stats=stage_stats, ) From 5272a73c2eadb916bec71644312370e80210c410 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 20 Feb 2025 04:13:13 -0500 Subject: [PATCH 180/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 308c5452d..3965e4283 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -81,7 +81,7 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = self.modules.tokenizer.tokens_to_sig(audio) + wav = tokenizer.tokens_to_sig(audio) clean_padding_(wav, length) wav = wav.to(self.device) return wav From b0df9ac99c8d1b6b4471b817473294479a03f0f7 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 21 Feb 2025 11:47:50 -0500 Subject: [PATCH 181/270] DASB: VALL-E: Fixes for WavTokenizer (AR-only) --- .../TTS/valle/hparams/train_wavtokenizer.yaml | 2 +- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 4 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 2 +- benchmarks/DASB/model/valle.py | 99 ++++++++++--------- 4 files changed, 56 insertions(+), 51 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index c2ff765f4..af0222d90 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -154,7 +154,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 -vocab_size: 1024 +vocab_size: 4096 
text_num_tokens: 39 phn_num_tokens: 52 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 30bfbe3b1..a07c6c53d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -19,7 +19,7 @@ from pathlib import Path from hyperpyyaml import load_hyperpyyaml from speechbrain.dataio.dataio import ( - clean_padding_, + clean_padding, length_to_mask, write_audio, ) @@ -80,7 +80,7 @@ def create_waveform(self, audio, length): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device wav = tokenizer.tokens_to_sig(audio) - clean_padding_(wav, length) + wav = clean_padding(wav, length) wav = wav.to(self.device) return wav diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 012f61e86..17cbc987d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -159,7 +159,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 -vocab_size: 1024 +vocab_size: 4096 audio_emb_freeze: False audio_emb_pretrained: False text_num_tokens: 39 diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 5805cb061..3abcf057f 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -102,16 +102,17 @@ def __init__( qk_norm=qk_norm, dropout=dropout, ) - - self.nar_decoder = ValleNARDecoder( - n_level=nq - 1, - n_ctx=n_ctx, - n_state=att_unit, - n_head=head, - n_layer=nar_layer, - qk_norm=qk_norm, - dropout=dropout, - ) + if nq > 1: + # NOTE: An NAR encoder is not needed if there is only one track + self.nar_decoder = ValleNARDecoder( + n_level=nq - 1, + n_ctx=n_ctx, + n_state=att_unit, + n_head=head, + n_layer=nar_layer, + qk_norm=qk_norm, + dropout=dropout, + ) self.nq = nq self.n_ctx = n_ctx @@ -301,7 +302,7 @@ def inference( nq_level=0, ) # [B, 1, 1] -> [B, 1] - gen_tok, gen_score = gen_tok.squeeze(2), gen_tok.squeeze(2) + gen_tok, gen_score = gen_tok.squeeze(2), gen_score.squeeze(2) generated["token"].append(gen_tok) generated["score"].append(gen_score) @@ -397,42 +398,46 @@ def inference( vocab_mask = torch.cat(mask_cache, dim=1) # (4.2) NAR loop - for step in range(1, opts.nq): - h_nar = self.nar_decoder( - prev_emb, ones * step - 1, mask=mask - ) # [B, T, D] - logits = self.lm_head(h_nar) - gen_tok, gen_score = logits_to_tokens( - logits.unsqueeze(2), - opts, - vocab_mask, - search_algo="greedy_search", - allow_eos=False, - nq_level=step, - ) - gen_tok, gen_score = ( - gen_tok.squeeze(2), - gen_score.squeeze(2), - ) # [B, T] - - generated["token"].append(gen_tok[:, prefix.size(1) :]) - generated["score"].append(gen_score[:, prefix.size(1) :]) - - if opts.search_algo == "teacher_force": - prev_tok = suffix[:, :, step] - else: - prev_tok = generated["token"][-1] - prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] - prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb - - # (5) combine AR and NAR results - gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] - gen_scores_nar = torch.stack(generated["score"], dim=2) - - gen_tokens = torch.cat( - [gen_tokens_ar, gen_tokens_nar], dim=2 - ) # [B, T, nq] - gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + if self.nq > 1: + for step 
in range(1, opts.nq): + h_nar = self.nar_decoder( + prev_emb, ones * step - 1, mask=mask + ) # [B, T, D] + logits = self.lm_head(h_nar) + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + vocab_mask, + search_algo="greedy_search", + allow_eos=False, + nq_level=step, + ) + gen_tok, gen_score = ( + gen_tok.squeeze(2), + gen_score.squeeze(2), + ) # [B, T] + + generated["token"].append(gen_tok[:, prefix.size(1) :]) + generated["score"].append(gen_score[:, prefix.size(1) :]) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, step] + else: + prev_tok = generated["token"][-1] + prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] + prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb + + # (5) combine AR and NAR results + gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] + gen_scores_nar = torch.stack(generated["score"], dim=2) + + gen_tokens = torch.cat( + [gen_tokens_ar, gen_tokens_nar], dim=2 + ) # [B, T, nq] + gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + else: + gen_tokens = gen_tokens_ar + gen_scores = gen_scores_ar gen_tokens_list, gen_scores_list = [], [] for b in range(len(valid_idx)): From cf24b23ab022d800a9807cd8b474dd0eb3ab6ca2 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 16:31:02 -0500 Subject: [PATCH 182/270] DASB: VALL-E: Update/add test stage logging --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 5 +++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 5 ++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index a07c6c53d..eeb3a9d6b 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -433,6 +433,11 @@ def on_stage_end(self, stage, stage_loss, epoch): num_to_keep=hparams["ckpt_keep"], **ckpt_kwargs ) + elif stage == sb.Stage.TEST: + self.hparams.train_logger.log_stats( + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, + ) def inference(self, batch): """Runs TTS inference diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 3965e4283..a28eabb66 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -433,9 +433,8 @@ def on_stage_end(self, stage, stage_loss, epoch): ) elif stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( - stats_meta={"epoch": epoch}, - train_stats=self.train_stats, - valid_stats=stage_stats, + stats_meta={"Epoch loaded": self.hparams.epoch_counter.current}, + test_stats=stage_stats, ) def inference(self, batch): From b6224d6a0f641b12af58ba719b5367953655f859 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 17:27:59 -0500 Subject: [PATCH 183/270] DASB: Fix extraction for clusters with no internet connection on compute nodes --- benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml | 3 ++- .../DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml | 3 ++- benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml | 3 ++- benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml index acddcd93b..dc026cc55 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/mimi.yaml @@ -9,6 
+9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/speech_tokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -46,7 +47,7 @@ skip_resample: False tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref - save_path: !ref + save_path: !ref num_codebooks: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml index 2b96a749b..8d3a9aa27 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/speech_tokenizer.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/speech_tokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -44,7 +45,7 @@ skip_resample: False # EnCodec model (see https://huggingface.co/docs/transformers/v4.31.0/en/model_doc/encodec) tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: fnlp/SpeechTokenizer # Only the 24kHz version supports mono audio - save_path: !ref + save_path: !ref tokens_extractor: !new:utils.tokens.TokensExtractor tokenizer: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml index 68dc9df49..3d9792bbb 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/sqcodec.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/sqcodec save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -45,7 +46,7 @@ skip_resample: False # SQCodec model tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref diff --git a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml index 56c13508c..bfd802740 100644 --- a/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/extraction/hparams/wavtokenizer.yaml @@ -9,6 +9,7 @@ seed: 1986 __set_seed: !apply:torch.manual_seed [!ref ] output_folder: !ref results/speech_tokenizer save_folder: !ref /save +pretrained_model_save_folder: !ref train_log: !ref /extraction_log.txt # Data files @@ -46,7 +47,7 @@ skip_resample: False # wavtokenizer model tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref - save_path: !ref + save_path: !ref checkpoint: !ref config: !ref freeze: True From d0900e0d24f9820f82a4999ef7c1946b4509142f Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 18:01:30 -0500 Subject: [PATCH 184/270] DASB: VALL-E: Add layer selection, hpopt updates --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 3 +- .../valle/hparams/train_speech_tokenizer.yaml | 6 ++-- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 28 ++++++++++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index bd50151af..5920c4be3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -63,7 +63,8 @@ vocoder_repo_id: 
!apply:speechbrain.utils.hparams.choice hubert: speechbrain/hifigan-hubert-k1000-LibriTTS wavlm: speechbrain/hifigan-wavlm-k1000-LibriTTS wav2vec2: speechbrain/hifigan-wav2vec2-k1000-LibriTTS -speech_model_layers: [1, 3, 7, 12, 18, 23] +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref flip_layers: False # Speaker Embeddings diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index bc88e091f..5d0908c0e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -152,12 +152,12 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer -d_model: 1024 +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" share_emb: False qk_norm: True nhead: 16 -num_layers_ar: 12 -num_layers_nar: 12 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 vocab_size: 1024 audio_emb_freeze: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index a28eabb66..6127644c2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -703,6 +703,16 @@ def dataio_prepare(hparams): tokens_loader = hparams.get("tokens_loader") spk_prompt_length = hparams["spk_prompt_length"] + layer_idx = None + if "speech_model_layers" in hparams: + layer_idx = get_selected_layer_indexes(hparams) + + if layer_idx is not None: + num_codebooks = layer_idx + else: + num_codebooks = hparams["audio_tokens_per_step"] + + @sb.utils.data_pipeline.takes("label") @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") def text_pipeline(label): @@ -722,7 +732,7 @@ def spk_prompt(uttid, spk_sample): # Sample a speaker-matched embedding selected_uttid = spk_sample[uttid] audio = tokens_loader.tokens_by_uttid( - selected_uttid, num_codebooks=hparams["audio_tokens_per_step"] + selected_uttid, num_codebooks=num_codebooks ) if audio.size(0) > spk_prompt_length: offset = torch.randint(0, audio.size(0), (1,)).item() @@ -1003,6 +1013,22 @@ def init_sequence_encoder(hparams): return encoder +def get_selected_layer_indexes(hparams): + """Finds the layers of selected layers + + Arguments + --------- + hparams : dict + Hyperparameters + """ + selected_layers = hparams.get("speech_model_layers") + available_layers = hparams.get("available_speech_model_layers") + if not (selected_layers and available_layers): + return None + layer_idx = [available_layers.index(layer) for layer in selected_layers] + return layer_idx + + def read_token_list(file_name): """Reads a simple text file with tokens (e.g. 
characters or phonemes) listed one per line From c5a3f3af8e1e521a9d26a44f8040c47337d9beee Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 18:57:45 -0500 Subject: [PATCH 185/270] DASB: Add support for eval_run flags --- benchmarks/DASB/run_hparam_optimization.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 3029a3678..9be6a3c64 100755 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -207,7 +207,17 @@ while [[ $# -gt 0 ]]; do ;; -*|--*) - additional_flags+="$1 $2 " # store additional flags + name=$1 + value=$2 + if [[ "$name" =~ ^--eval_run_ ]]; then + name=$(echo $name | sed s/^--eval_run_/--/) + eval_run_additional_flags+="$name $value " + else + if [[ ! "$eval_run_additional_flags" =~ "$name " ]]; then + eval_run_additional_flags+="$name $value " + fi + additional_flags+="$name $value " # store additional flags + fi shift # past argument ;; @@ -415,6 +425,6 @@ scp $best_yaml_file $final_yaml_file ./run_experiments.sh --hparams $final_yaml_file --data_folder $data_folder --cached_data_folder $cached_data_folder \ --output_folder $output_folder/best --task $task --dataset $dataset --seed $seed\ --nruns $nruns_eval --eval_metric $eval_metric --eval_set test \ - --rnd_dir False --testing True $additional_flags + --rnd_dir False --testing True $eval_run_additional_flags echo "The test performance with best hparams is available at $output_folder/best" From 3ddbc57177afc10d1d963c0c61f55b03aa5ac8f3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 22:59:13 -0500 Subject: [PATCH 186/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 30 ++++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 6127644c2..5e8949501 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -242,6 +242,14 @@ def on_stage_start(self, stage, epoch): self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) + if hasattr(hparams, "speech_model_layers"): + self.layer_idx = get_selected_layer_indexes( + hparams.available_speech_model_layers, + hparams.speech_model_layers + ) + else: + self.layer_idx = None + self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.compute_loss_stats, batch_eval=True, ) @@ -705,7 +713,10 @@ def dataio_prepare(hparams): layer_idx = None if "speech_model_layers" in hparams: - layer_idx = get_selected_layer_indexes(hparams) + layer_idx = get_selected_layer_indexes( + hparams["available_speech_model_layers"], + hparams["speech_model_layers"], + ) if layer_idx is not None: num_codebooks = layer_idx @@ -751,7 +762,7 @@ def spk_prompt(uttid, spk_sample): ) def prompt_pipeline(id, tokens, spk_prompt): audio = tokens_loader.tokens_by_uttid( - id, num_codebooks=hparams["audio_tokens_per_step"] + id, num_codebooks=num_codebooks ) if hparams["flip_layers"]: audio = audio.flip(-1) @@ -1013,16 +1024,21 @@ def init_sequence_encoder(hparams): return encoder -def get_selected_layer_indexes(hparams): +def get_selected_layer_indexes(available_layers, selected_layers): """Finds the layers of selected layers Arguments --------- - hparams : dict - Hyperparameters + available_layers : list + The available layers + selected_layers : list + The selected layers + + 
Returns + ------- + layer_idx : list + The layer indexes """ - selected_layers = hparams.get("speech_model_layers") - available_layers = hparams.get("available_speech_model_layers") if not (selected_layers and available_layers): return None layer_idx = [available_layers.index(layer) for layer in selected_layers] From e1bfb7e87823e8ec32d834713db22d7bbf919d8c Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 24 Feb 2025 23:05:18 -0500 Subject: [PATCH 187/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 5e8949501..1f36093d9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -81,7 +81,9 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = tokenizer.tokens_to_sig(audio) + wav = tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) clean_padding_(wav, length) wav = wav.to(self.device) return wav @@ -265,6 +267,9 @@ def on_stage_start(self, stage, epoch): elif stage == sb.Stage.TEST: self.evaluation_metric.on_evaluation_start() self.is_evaluating = True + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) def apply_curriculum(self): """Applies curriculum settings, if specified, training only the autoregressive part - or From 851bd7d0e2e743007960e38ab7ed735b0622704a Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 01:03:05 -0500 Subject: [PATCH 188/270] DASB: VALL-E: Update max length --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 2 +- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 4 ++-- .../LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index c811e1c7f..f9d07b443 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -97,7 +97,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 5920c4be3..9d9e65b85 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -116,7 +116,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 16000 -max_audio_length: 2000 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 54c357b52..a4a19ae6b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 120208cd8..714cf91b5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 8383cf0f7..c1e3f1e3a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -97,7 +97,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + @@ -160,7 +160,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 -vocab_size: 1024 +vocab_size: 2048 audio_emb_freeze: False audio_emb_pretrained: False text_num_tokens: 39 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 5d0908c0e..24c494a98 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 17cbc987d..e98056db3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 model_sample_rate: 24000 -max_audio_length: 2300 +max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 n_ctx: !ref + + From 7463474f4ceb00028992bb22b96183e38c9e446d Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 01:53:11 -0500 Subject: [PATCH 189/270] DASB: Fix WavTokenizer --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 1f36093d9..92df5c1a1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -106,9 +106,11 @@ def compute_forward(self, batch, stage): batch = batch.to(self.device) prompt, prompt_length = batch.prompt batch_size, prompt_max_len, num_tracks = prompt.shape - nar_track = torch.randint( - 1, num_tracks, (batch_size,), device=self.device - ) + nar_track = None + if self.train_nar: + 
nar_track = torch.randint( + 1, num_tracks, (batch_size,), device=self.device + ) logits_ar, logits_nar = self.modules.model( dec_seq=batch.prompt.data, dec_seq_lengths=batch.prompt.lengths, From 05f80142f3b734154642830da8874581ff7f9c4e Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 03:01:59 -0500 Subject: [PATCH 190/270] DASB: VALL-E: Add speaker prompt resampling --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 92df5c1a1..6524ee6a5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -272,6 +272,8 @@ def on_stage_start(self, stage, epoch): self.token_model_kwargs = getattr( self.hparams, "token_model_kwargs", {} ) + dataset = stage.name.lower() + self.resample_fn[dataset](epoch=epoch or 0) def apply_curriculum(self): """Applies curriculum settings, if specified, training only the autoregressive part - or @@ -877,7 +879,7 @@ def sig_pipeline(wav): "sorting must be random, ascending or descending" ) - return datasets + return datasets, resample_fn def sample_dataset(dataset, count, seed): @@ -1283,7 +1285,7 @@ def undo_padding_tensor(batch, lengths): ) # We can now directly create the datasets for training, valid, and test - datasets = dataio_prepare(hparams) + datasets, resample_fn = dataio_prepare(hparams) # Apply overfit test settings datasets = apply_overfit_test(hparams, datasets) @@ -1298,6 +1300,8 @@ def undo_padding_tensor(batch, lengths): checkpointer=hparams["checkpointer"], ) + tts_brain.resample_fn = resample_fn + # The `fit()` method iterates the training loop, calling the methods # necessary to update the parameters of the model. Since all objects # with changing state are managed by the Checkpointer, training can be From f94c61b0c9e5275c7b36f9acc7cba3a9e8cb0b0b Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 11:34:50 -0500 Subject: [PATCH 191/270] DASB: VALL-E: Add SQCodec --- .../TTS/valle/hparams/train_sqcodec.yaml | 234 ++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..337754bf5 --- /dev/null +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,234 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/sqcodec +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER + +# Model Type +output_folder: !ref results/valle/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +prepare_archive_path: null +prepare_skip_ignore_folders: False +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 +tokens_folder: !PLACEHOLDER +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +freeze_token_model: True +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec +g2p_src: speechbrain/soundchoice-g2p +token_model_kmeans_src: poonehmousavi/SSL_Quantization +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +flip_layers: False +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +use_spk_emb: False +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 50 +number_of_epochs_ar: null +number_of_epochs_nar: null +batch_size: 16 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 +data_scale: null + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +special_tokens: ["", "", ""] +special_num_tokens: 4 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step + +# Feature parameters +sample_rate: 22050 +model_sample_rate: 16000 +max_audio_length: 1000 +text_max_length: 500 +n_ctx: !ref + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Token model (pretrained) +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 2048 
+text_num_tokens: 39 +phn_num_tokens: 52 + +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 4 +freeze_lm_head: False + + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + +modules: + model: !ref + tokenizer: !ref + +opt_class: !name:torch.optim.Adam + lr: !ref + +compute_cost: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref From 398304e2f6f5daa1e85e9ed8fb99399d135d3a65 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 25 Feb 2025 16:35:54 -0500 Subject: [PATCH 192/270] DASB: Tokotron: Update SQ-Codec ternary coding --- benchmarks/DASB/model/Tokotron.py | 309 ++++++------------------------ 1 file changed, 58 insertions(+), 251 deletions(-) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index d86d52273..122b80584 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -25,7 +25,7 @@ from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear -from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, bce_loss +from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, nll_loss from speechbrain.dataio.dataio import length_to_mask from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler @@ -418,16 +418,21 @@ class TernaryPredictionHead(nn.Module): num_positions : int the number of positions """ - def __init__(self, d_model, num_positions): + def __init__(self, d_model, num_positions, d_hidden=512): super().__init__() self.num_positions = num_positions self.d_model = d_model self.num_positions = num_positions - self.lin_p = Linear( + self.lin_hidden = Linear( input_size=d_model, - n_neurons=num_positions * 2 + n_neurons=d_hidden, + ) + self.act = nn.LeakyReLU() + self.lin_p = Linear( + input_size=d_hidden, + n_neurons=num_positions * 3, + bias=False ) - self.sigmoid = nn.Sigmoid() def forward(self, x): """Computes the forward pass @@ -440,13 +445,18 @@ def forward(self, x): Returns ------- p : torch.Tensor - A tensor of shape (Batch x Length x num_positions x 2) where - p[:, :, :, 0] -> the probability of the ternary 
digit being at least 0 - p[:, :, :, 0] -> the probability of the ternary digit being at least 1 + A tensor of shape (Batch x Length x num_positions x ternary digit) + The values are logits (unnormalized probabilities) + + p[:, :, :, 0] corresponds to -1 + p[:, :, :, 1] corresponds to 0 + p[:, :, :, 2] corresponds to 1 """ batch_size, max_len, _ = x.shape - p = self.sigmoid(self.lin_p(x)) - p = p.reshape(batch_size, max_len, self.num_positions, 2) + x = self.lin_hidden(x) + x = self.act(x) + x = self.lin_p(x) + p = x.reshape(batch_size, max_len, self.num_positions, 3) return p @@ -1960,183 +1970,6 @@ def all_weights(self): return torch.stack([emb.weight for emb in self.emb]) -class DACFeatureExtractor(nn.Module): - """An adapter for feature extraction - - Arguments - --------- - dac : DAC - a DAC model - """ - - def __init__(self, dac, n_quantizers): - super().__init__() - self.dac = dac - self.dac.eval() - self.n_quantizers = n_quantizers - - def encode(self, inputs, length): - """Encodes a raw audio sample using DAC - - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : torch.Tensor - A tensor of relative lengths - - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers - - """ - if inputs.dim() < 3: - inputs = inputs.unsqueeze(1) - emb, codes, _, _, _ = self.dac.encode( - inputs, n_quantizers=self.n_quantizers - ) - emb.transpose_(1, 2) - codes.transpose_(1, 2) - max_len = emb.size(1) - mask = length_to_mask( - length * max_len, max_len, device=inputs.device - ).unsqueeze(-1) - return codes * mask, emb * mask - - def forward(self, inputs, length): - """Encodes a raw audio sample using DAC - - Arguments - --------- - inputs : torch.Tensor - A (Batch x Samples) or (Batch x Channel x Samples) - tensor of audio - length : torch.Tensor - A tensor of relative lengths - - Returns - ------- - tokens : torch.Tensor - A (Batch x Tokens x Heads) tensor of audio tokens - emb : torch.Tensor - Raw vector embeddings from the model's - quantizers - - """ - return self.encode(inputs, length) - - def embeddings(self, tokens): - """Converts token indexes to vector embeddings - - Arguments - --------- - tokens : torch.Tensor - a (Batch x Length x Heads) tensor of token indexes - - Returns - ------- - emb : torch.Tensor - a (Batch x Length x Heads x Embedding) tensor - of raw vector embeddings from the model's - quantizer codebooks - """ - emb, _, _ = self.dac.quantizer.from_codes(tokens.transpose(1, 2).int()) - return emb.transpose(1, 2) - - -class SpeechTokenizerFeatureExtractor(nn.Module): - """This lobe enables the integration of HuggingFace and SpeechBrain - pretrained SpeechTokenizer. - - Please, install speechtokenizer: - pip install speechtokenizer - - Source paper: https://arxiv.org/abs/2308.16692 - - - The model can be used as a fixed Discrete feature extractor or can be finetuned. It - will download automatically the model from HuggingFace or use a local path. 
- - Arguments - --------- - speech_tokenizer : speechbrain.lobes.models.discrete.speechtokenizer_interface.SpeechTokenizer_interface - The speech tokenizer interface - codebooks : int, optional - The number of codebooks to use - if omitted, - """ - - def __init__(self, speech_tokenizer, codebooks=None): - super().__init__() - self.speech_tokenizer = speech_tokenizer - self.codebooks = codebooks - - def forward(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. - - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. - - Returns - ------- - tokens : torch.Tensor - A tensor of audio tokens - Shape: (N_q x Batch x Time) by default - (Batch x Time x N_q) if shape == compat - - """ - return self.encode(wav, wav_lens) - - def encode(self, wav, wav_lens=None): - """Takes an input waveform and return its corresponding wav2vec encoding. - - Arguments - --------- - wav : torch.Tensor (signal) - A batch of audio signals to transform to features. - wav_lens : torch.Tensor - The relative length of the wav given in SpeechBrain format. - - Returns - ------- - tokens : torch.Tensor - A (Batch x Seq, N_q) tensor of audio tokens - - """ - # Extract discrete codes from SpeechTokenizer - codes = self.speech_tokenizer.encode( - wav.unsqueeze(1), wav_lens - ) # codes: (n_q, B, T) - if self.codebooks is not None: - codes = codes[: self.codebooks] - codes = codes.permute(1, 2, 0) - return codes - - def decode(self, codes): - """Takes an input waveform and return its corresponding wav2vec encoding. - - Arguments - --------- - tokens : torch.Tensor - A (N_q, Batch x Seq) tensor of audio tokens - - Returns - ------- - wav : torch.Tensor (signal) - A batch of reconstructed audio signals. - """ - codes = codes.permute(2, 0, 1) - return self.speech_tokenizer.decode(codes) - - def get_silence_token( model, sample_length=100000, @@ -2335,67 +2168,22 @@ def use_silence_padding(dataloader_opts, silence_token, token_keys): } -def ternary_matrix_to_decimal(matrix): - """ - Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. - - Arguments - --------- - matrix : numpy.ndarray - A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number - of ternary digits, and N is the number of ternary numbers in each batch. - - Returns - ------- - numpy.ndarray - A 2D numpy array of shape (B, N), where each value represents the decimal - equivalent of the corresponding ternary number in the input matrix. 
- """ - ( - B, - D, - N, - ) = ( - matrix.shape - ) # B is the batch size, D is the number of digits, N is the number of ternary numbers - powers_of_three = 3 **torch.arange(D) # [3^0, 3^1, ..., 3^(D-1)] - - # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] - powers_of_three = powers_of_three[:, None] # Shape [D, 1] - - # Compute dot product using broadcasting: matrix * powers_of_three along D axis - decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis - - return decimals - - def logits_to_ternary(logits): """Converts a tensor with two logits to a ternary matrix Arguments --------- logits : torch.Tensor - The logits (Batch x Length x num_positions x 2) + The logits (Batch x Length x num_positions x 3) Returns ------- result : torch.Tensor The corresponding ternary matrix """ - gte0 = logits[..., 0] >= 0.5 - gte1 = logits[..., 1] >= 0.5 - val_minus_1 = torch.tensor(-1, device=logits.device) - val_zero = torch.tensor(0, device=logits.device) - val_plus_1 = torch.tensor(1, device=logits.device) - return torch.where( - gte0, - torch.where( - gte1, - val_plus_1, - val_zero - ), - val_minus_1 - ) + ternary = logits.argmax(-1) - 1 + return ternary + def ternary_matrix_to_decimal(matrix): """ @@ -2433,7 +2221,7 @@ def ternary_matrix_to_decimal(matrix): def ternary_to_decimal(ternary, n_codebook=4): """Converts ternary digits to their decimal equivalent - + Arguments --------- ternary : torch.Tensor @@ -2479,7 +2267,9 @@ def tokens_to_ternary(tokens): Returns ------- - result : t""" + result : torch.Tensor + A (Batch x Length x Ternary Positions) tensor + with values of (-1, 0, 1)""" batch_size = tokens.size(0) n_codebook = tokens.size(2) tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() @@ -2491,19 +2281,36 @@ def tokens_to_ternary(tokens): def ternary_loss(predictions, targets, length=None, reduction="mean"): - tgt_gte0 = targets >= 0. - tgt_gte1 = targets >= 1. 
- loss_gte0 = bce_loss( - predictions[:, :, :, 0], - tgt_gte0, - length=length, - reduction=reduction, + batch_size, max_len, positions = targets.shape + predictions_reshaped = ( + predictions + .permute(2, 0, 1, 3) + .reshape(batch_size * positions, max_len, 3) ) - loss_gte1 = bce_loss( - predictions[:, :, :, 0], - tgt_gte1, - length=length, - reduction=reduction, + targets_cat = targets + 1 + targets_cat_reshaped = ( + targets_cat + .permute(2, 0, 1) + .reshape(batch_size * positions, max_len) + ) + length_reshaped = ( + length.unsqueeze(-1) + .expand(batch_size, positions) + .permute(1, 0) + .reshape(batch_size * positions) ) - loss = loss_gte0 + loss_gte1 + loss = nll_loss( + log_probabilities=predictions_reshaped, + targets=targets_cat_reshaped, + length=length_reshaped, + reduction=reduction + ) + if reduction == "batch": + loss = ( + loss + .reshape(positions, batch_size) + .permute(1, 0) + .mean(1) + ) + return loss \ No newline at end of file From c90037c388477239dfbf13032aad1d96f622383b Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 26 Feb 2025 13:26:47 -0500 Subject: [PATCH 193/270] DASB: Add the ability to disable test runs --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 34 ++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 6524ee6a5..5cbca2493 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1315,20 +1315,20 @@ def undo_padding_tensor(batch, lengths): ) # Load best checkpoint for evaluation - - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" - if test_summary_file.exists(): - logging.info("Test run already completed: %s", test_summary_file) - else: - test_key_kind = hparams["test_key_kind"] - test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } - eval_dataset = datasets["test"] - eval_dataset = select_eval_subset(eval_dataset, hparams) - tts_brain.evaluate( - test_set=eval_dataset, - test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs - ) + if hparams["testing"]: + test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + if test_summary_file.exists(): + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + eval_dataset = datasets["test"] + eval_dataset = select_eval_subset(eval_dataset, hparams) + tts_brain.evaluate( + test_set=eval_dataset, + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) From 131eea389f90900b77bdee272567a860244016f8 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 27 Feb 2025 11:35:49 -0500 Subject: [PATCH 194/270] DASB: Tokotron: Update ternary loss aggregation --- benchmarks/DASB/model/Tokotron.py | 43 ++++++++++--------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 122b80584..764c4e34e 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -2282,35 +2282,20 @@ def tokens_to_ternary(tokens): def ternary_loss(predictions, targets, length=None, reduction="mean"): batch_size, max_len, positions = targets.shape - predictions_reshaped = ( - predictions - .permute(2, 0, 1, 3) - .reshape(batch_size * positions, max_len, 3) - ) 
targets_cat = targets + 1 - targets_cat_reshaped = ( - targets_cat - .permute(2, 0, 1) - .reshape(batch_size * positions, max_len) - ) - length_reshaped = ( - length.unsqueeze(-1) - .expand(batch_size, positions) - .permute(1, 0) - .reshape(batch_size * positions) - ) - loss = nll_loss( - log_probabilities=predictions_reshaped, - targets=targets_cat_reshaped, - length=length_reshaped, - reduction=reduction + predictions_loss = predictions.permute(0, 3, 1, 2) + loss = nn.functional.nll_loss( + predictions_loss, + targets_cat, + reduction="none" ) - if reduction == "batch": - loss = ( - loss - .reshape(positions, batch_size) - .permute(1, 0) - .mean(1) - ) - + mask = length_to_mask( + length * max_len, + max_len + ).unsqueeze(-1) + loss = loss * mask + if reduction == "mean": + loss = loss.sum(2).mean(1).mean(0) / 3.0 + elif reduction == "batch": + loss = loss.sum(2).mean(1) / 3.0 return loss \ No newline at end of file From 7c5e82f4605a23413203baa56ea001b1e8eee3a3 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 27 Feb 2025 13:25:36 -0500 Subject: [PATCH 195/270] DASB: Fix an issue with contiguous tensors --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 2 +- benchmarks/DASB/model/Tokotron.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 3e6b356d4..4c5d12b94 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -229,7 +229,7 @@ def compute_objectives(self, predictions, batch, stage): input_length=batch.tokens.lengths, reduction="batch", ) - return loss_details.loss + return loss_details.loss.contiguous() def on_stage_start(self, stage, epoch): """Gets called at the beginning of each epoch. diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 764c4e34e..7d250e4cc 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -2283,7 +2283,7 @@ def tokens_to_ternary(tokens): def ternary_loss(predictions, targets, length=None, reduction="mean"): batch_size, max_len, positions = targets.shape targets_cat = targets + 1 - predictions_loss = predictions.permute(0, 3, 1, 2) + predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() loss = nn.functional.nll_loss( predictions_loss, targets_cat, @@ -2298,4 +2298,4 @@ def ternary_loss(predictions, targets, length=None, reduction="mean"): loss = loss.sum(2).mean(1).mean(0) / 3.0 elif reduction == "batch": loss = loss.sum(2).mean(1) / 3.0 - return loss \ No newline at end of file + return loss From 7046db00b85d6ab643e7ec6f1ca5b57a2de9d8c7 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 28 Feb 2025 13:03:46 -0500 Subject: [PATCH 196/270] DASB: Tokotron: SQ-Codec Add the ability to bypass additional ternary projections --- .../TTS/tokotron/hparams/train_sqcodec.yaml | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 4ea1ba387..6b9782eeb 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -17,7 +17,7 @@ testing: True # If set to True, the test evlaution is done, otherwise skipped. 
config: config.yaml checkpoint: ckpt_00190000.pth -sq_codec_save_path: !ref /sq-codec +sq_codec_save_path: !ref /sq-codec g2p_src: flexthink/soundchoice-g2p # Model type @@ -110,6 +110,8 @@ gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp gamma: !ref max_weight: !ref +ternary_input_mode: embedding + silence_padding: !ref # Token model (pretrained) @@ -156,7 +158,11 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU audio_num_tokens: 19683 -audio_emb_size: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + embedding: 36 + projection: 1024 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False @@ -210,9 +216,13 @@ inference: !new:model.Tokotron.TokotronTransformerAutoregressiveInference transform_audio: !name:model.Tokotron.tokens_to_ternary feed_audio: !name:model.Tokotron.ternary_logits_to_tokens -audio_emb: !new:model.Tokotron.TernaryInput - emb_size: !ref - num_positions: !ref +audio_emb: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + projection: !new:model.Tokotron.TernaryInput + emb_size: !ref + num_positions: !ref + embedding: !new:torch.nn.Identity out_proj: !new:model.Tokotron.TernaryPredictionHead d_model: !ref From ebe181172ca065c12769fb6b2da19345e2607a9e Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 28 Feb 2025 23:38:58 -0500 Subject: [PATCH 197/270] DASB: Tokotron: Fixes --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 4c5d12b94..ec1845d36 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -934,6 +934,7 @@ def apply_overfit_test(hparams, dataset): if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: + eval_kwargs = {} test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] if test_key: From dae8bcb82ef7be7cc838f1cf82823bd486d23646 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 1 Mar 2025 00:02:38 -0500 Subject: [PATCH 198/270] DASB: Fixes: SQ-Codec refactoring (decouple from Tokotron, simplify) --- .../TTS/tokotron/hparams/train_sqcodec.yaml | 26 +-- benchmarks/DASB/model/Tokotron.py | 210 +----------------- benchmarks/DASB/model/custom_model.py | 55 +++++ benchmarks/DASB/model/sq_codec.py | 147 ++++++++++++ 4 files changed, 211 insertions(+), 227 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index 6b9782eeb..f0ab3d9c1 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -110,8 +110,6 @@ gate_offset: !apply:model.Tokotron.distance_diff_loss_ramp gamma: !ref max_weight: !ref -ternary_input_mode: embedding - silence_padding: !ref # Token model (pretrained) @@ -145,7 +143,7 @@ sample_dataloader_opts: padding_kwargs: value: !ref -transform_audio: !name:model.Tokotron.tokens_to_ternary +transform_audio: !name:model.sq_codec.tokens_to_ternary ####################### Model parameters ########################### # Transformer @@ -158,11 +156,7 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU audio_num_tokens: 19683 -audio_emb_size: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - embedding: 36 - projection: 1024 
+audio_emb_size: 36 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False @@ -213,18 +207,12 @@ inference: !new:model.Tokotron.TokotronTransformerAutoregressiveInference audio_token_shift: 0 max_steps: !ref representation_mode: !ref - transform_audio: !name:model.Tokotron.tokens_to_ternary - feed_audio: !name:model.Tokotron.ternary_logits_to_tokens + transform_audio: !name:model.sq_codec.tokens_to_ternary + feed_audio: !name:model.sq_codec.ternary_logits_to_tokens -audio_emb: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - projection: !new:model.Tokotron.TernaryInput - emb_size: !ref - num_positions: !ref - embedding: !new:torch.nn.Identity +audio_emb: !new:torch.nn.Identity -out_proj: !new:model.Tokotron.TernaryPredictionHead +out_proj: !new:model.custom_model.TernaryPredictionHead d_model: !ref num_positions: !ref @@ -249,7 +237,7 @@ compute_cost: !new:model.Tokotron.TokotronLoss gate_gamma: !ref gate_max_weight: !ref silence_padding: !ref - seq_cost: !name:model.Tokotron.ternary_loss + seq_cost: !name:model.sq_codec.ternary_loss multihead_output: False lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 7d250e4cc..6a2de5859 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -29,7 +29,6 @@ from speechbrain.dataio.dataio import length_to_mask from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler -from model.sq_codec import decimal_to_ternary_matrix from enum import Enum from collections import namedtuple @@ -407,78 +406,6 @@ def init_audio_emb(self, emb): self.audio_emb.initialize(emb) -class TernaryPredictionHead(nn.Module): - """An alternative prediction head that predicts a fixed number of ternary digits - for each position (as used in SQ-Codec) - - Arguments - --------- - d_model : int - The model dimension - num_positions : int - the number of positions - """ - def __init__(self, d_model, num_positions, d_hidden=512): - super().__init__() - self.num_positions = num_positions - self.d_model = d_model - self.num_positions = num_positions - self.lin_hidden = Linear( - input_size=d_model, - n_neurons=d_hidden, - ) - self.act = nn.LeakyReLU() - self.lin_p = Linear( - input_size=d_hidden, - n_neurons=num_positions * 3, - bias=False - ) - - def forward(self, x): - """Computes the forward pass - - Arguments - --------- - x : torch.Tensor - The decoder output (Batch x Length x d_model) - - Returns - ------- - p : torch.Tensor - A tensor of shape (Batch x Length x num_positions x ternary digit) - The values are logits (unnormalized probabilities) - - p[:, :, :, 0] corresponds to -1 - p[:, :, :, 1] corresponds to 0 - p[:, :, :, 2] corresponds to 1 - """ - batch_size, max_len, _ = x.shape - x = self.lin_hidden(x) - x = self.act(x) - x = self.lin_p(x) - p = x.reshape(batch_size, max_len, self.num_positions, 3) - return p - - -class TernaryInput(nn.Module): - def __init__(self, emb_size, num_positions): - super().__init__() - self.num_positions = num_positions - self.in_proj = Linear( - input_size=num_positions * 3, - n_neurons=emb_size, - ) - - def forward(self, x): - batch_size, max_len = x.shape[:2] - x_onehot = torch.nn.functional.one_hot( - (x + 1).long(), - 3 - ).reshape(batch_size, max_len, self.num_positions * 3) - in_proj = self.in_proj(x_onehot.float()) - return in_proj - - class TokotronTransformerAutoregressiveInference(nn.Module): """A greedy 
autoregressive inference implementation @@ -2076,7 +2003,7 @@ def feature_pad_to(tensor, length, padding=None): def batch_feature_pad(tensors, padding=None): - """Similar to batch_pad_right but pads with the specified padding, whcih + """Similar to batch_pad_right but pads with the specified padding, which can be a vector or a tensor Arguments @@ -2165,137 +2092,4 @@ def use_silence_padding(dataloader_opts, silence_token, token_keys): "collate_fn": partial( token_collate_fn, silence_token=silence_token, token_keys=token_keys ), - } - - -def logits_to_ternary(logits): - """Converts a tensor with two logits to a ternary matrix - - Arguments - --------- - logits : torch.Tensor - The logits (Batch x Length x num_positions x 3) - - Returns - ------- - result : torch.Tensor - The corresponding ternary matrix - """ - ternary = logits.argmax(-1) - 1 - return ternary - - -def ternary_matrix_to_decimal(matrix): - """ - Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. - - Arguments - --------- - matrix : numpy.ndarray - A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number - of ternary digits, and N is the number of ternary numbers in each batch. - - Returns - ------- - numpy.ndarray - A 2D numpy array of shape (B, N), where each value represents the decimal - equivalent of the corresponding ternary number in the input matrix. - """ - ( - B, - D, - N, - ) = ( - matrix.shape - ) # B is the batch size, D is the number of digits, N is the number of ternary numbers - powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] - - # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] - powers_of_three = powers_of_three[:, None] # Shape [D, 1] - - # Compute dot product using broadcasting: matrix * powers_of_three along D axis - decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis - - return decimals - - -def ternary_to_decimal(ternary, n_codebook=4): - """Converts ternary digits to their decimal equivalent - - Arguments - --------- - ternary : torch.Tensor - (Batch x Length x num_positions) - ternary digits - n_codebooks : torch.Tensor - The number of coedbooks""" - chunks = ternary.chunk(n_codebook, dim=1) - codec_ls = [] - # TODO: Vectorize - for i, chunk in enumerate(chunks): - chunk = chunk + 1 - tmp_codec = ternary_matrix_to_decimal(chunk) - codec_ls.append(tmp_codec) - codec_ls = torch.stack(codec_ls) - return codec_ls.permute(1, 2, 0) - - -def ternary_logits_to_tokens(logits): - """Converts ternary logits to tokens (as used for SQ-Codec) - - Arguments - --------- - logits : torch.Tensor - The logits - - Returns - ------- - tokens : torch.Tensor - Token IDs - """ - ternary_matrix = logits_to_ternary(logits) - tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2)) - return tokens - - -def tokens_to_ternary(tokens): - """Converts a sequence of tokens to a ternary matrix - - Arguments - --------- - tokens : torch.Tensor - A (Batch x Length x Codebooks) tensor of tokens - - Returns - ------- - result : torch.Tensor - A (Batch x Length x Ternary Positions) tensor - with values of (-1, 0, 1)""" - batch_size = tokens.size(0) - n_codebook = tokens.size(2) - tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() - ternary_matrix = torch.cat([ - decimal_to_ternary_matrix(item, D=9) - 1 - for item in tokens - ], dim=1) - return ternary_matrix.transpose(1, 2) - - -def ternary_loss(predictions, targets, length=None, reduction="mean"): - batch_size, max_len, positions = 
targets.shape - targets_cat = targets + 1 - predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() - loss = nn.functional.nll_loss( - predictions_loss, - targets_cat, - reduction="none" - ) - mask = length_to_mask( - length * max_len, - max_len - ).unsqueeze(-1) - loss = loss * mask - if reduction == "mean": - loss = loss.sum(2).mean(1).mean(0) / 3.0 - elif reduction == "batch": - loss = loss.sum(2).mean(1) / 3.0 - return loss + } \ No newline at end of file diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 972d35c66..e5a9db761 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,4 +1,5 @@ import torch +from speechbrain.nnet.linear import Linear class AttentionMLP(torch.nn.Module): @@ -109,3 +110,57 @@ def forward(self, in_tokens): if self.proj_layer is not None: in_embs = self.proj_layer(in_embs) return in_embs + + +class TernaryPredictionHead(torch.nn.Module): + """An alternative prediction head that predicts a fixed number of ternary digits + for each position (as used in SQ-Codec) + + Arguments + --------- + d_model : int + The model dimension + num_positions : int + the number of positions + """ + def __init__(self, d_model, num_positions, d_hidden=512): + super().__init__() + self.num_positions = num_positions + self.d_model = d_model + self.num_positions = num_positions + self.lin_hidden = Linear( + input_size=d_model, + n_neurons=d_hidden, + ) + self.act = torch.nn.LeakyReLU() + self.lin_p = Linear( + input_size=d_hidden, + n_neurons=num_positions * 3, + bias=False + ) + + def forward(self, x): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The decoder output (Batch x Length x d_model) + + Returns + ------- + p : torch.Tensor + A tensor of shape (Batch x Length x num_positions x ternary digit) + The values are logits (unnormalized probabilities) + + p[:, :, :, 0] corresponds to -1 + p[:, :, :, 1] corresponds to 0 + p[:, :, :, 2] corresponds to 1 + """ + batch_size, max_len, _ = x.shape + x = self.lin_hidden(x) + x = self.act(x) + x = self.lin_p(x) + p = x.reshape(batch_size, max_len, self.num_positions, 3) + return p + diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 7901675e1..d0b850056 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -21,6 +21,8 @@ from torch.autograd import Function from torch.nn.utils import remove_weight_norm, weight_norm +from speechbrain.dataio.dataio import length_to_mask + class SQCodec(nn.Module): """ @@ -1342,6 +1344,41 @@ def ternary_matrix_to_decimal(matrix): return decimals +def ternary_matrix_to_decimal_torch(matrix): + """ + Convert a B*D*N ternary matrix to a 2D array of decimal numbers for each batch. + + Arguments + --------- + matrix : numpy.ndarray + A 3D numpy array of shape (B, D, N), where B is the batch size, D is the number + of ternary digits, and N is the number of ternary numbers in each batch. + + Returns + ------- + numpy.ndarray + A 2D numpy array of shape (B, N), where each value represents the decimal + equivalent of the corresponding ternary number in the input matrix. 
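+
+    Example
+    -------
+    A small worked illustration (illustrative values only), with B=1 batch,
+    D=2 ternary digits and N=2 numbers per batch:
+
+    >>> matrix = torch.tensor([[[1, 0], [2, 1]]])
+    >>> ternary_matrix_to_decimal_torch(matrix)
+    tensor([[7, 3]])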
+ """ + ( + B, + D, + N, + ) = ( + matrix.shape + ) # B is the batch size, D is the number of digits, N is the number of ternary numbers + powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] + + # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] + powers_of_three = powers_of_three[:, None] # Shape [D, 1] + + # Compute dot product using broadcasting: matrix * powers_of_three along D axis + decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + + return decimals + + + def get_padding(kernel_size, dilation=1): """ Computes the padding size for a given kernel size and dilation. @@ -1359,3 +1396,113 @@ def get_padding(kernel_size, dilation=1): Calculated padding size. """ return int((kernel_size * dilation - dilation) / 2) + + +def ternary_to_decimal(ternary, n_codebook=4): + """Converts ternary digits to their decimal equivalent + + Arguments + --------- + ternary : torch.Tensor + (Batch x Length x num_positions) - ternary digits + n_codebooks : torch.Tensor + The number of codebooks + + Returns + ------- + result: torch.Tensor + the result (Batch x Length x codebooks) + """ + chunks = ternary.chunk(n_codebook, dim=1) + codec_ls = [] + # TODO: Vectorize + for i, chunk in enumerate(chunks): + chunk = chunk + 1 + tmp_codec = ternary_matrix_to_decimal_torch(chunk) + codec_ls.append(tmp_codec) + codec_ls = torch.stack(codec_ls) + return codec_ls.permute(1, 2, 0) + + +def ternary_logits_to_tokens(logits): + """Converts ternary logits to tokens (as used for SQ-Codec) + + Arguments + --------- + logits : torch.Tensor + The logits + + Returns + ------- + tokens : torch.Tensor + Token IDs + """ + ternary_matrix = logits_to_ternary(logits) + tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2)) + return tokens + + +def tokens_to_ternary(tokens): + """Converts a sequence of tokens to a ternary matrix + + Arguments + --------- + tokens : torch.Tensor + A (Batch x Length x Codebooks) tensor of tokens + + Returns + ------- + result : torch.Tensor + A (Batch x Length x Ternary Positions) tensor + with values of (-1, 0, 1)""" + has_batch = tokens.dim() > 2 + if not has_batch: + tokens = tokens.unsqueeze(0) + batch_size = tokens.size(0) + n_codebook = tokens.size(2) + tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() + ternary_matrix = torch.cat([ + decimal_to_ternary_matrix(item, D=9) - 1 + for item in tokens + ], dim=1) + ternary_matrix = ternary_matrix.transpose(1, 2) + if not has_batch: + ternary_matrix = ternary_matrix[0] + return ternary_matrix + + +def logits_to_ternary(logits): + """Converts a tensor with two logits to a ternary matrix + + Arguments + --------- + logits : torch.Tensor + The logits (Batch x Length x num_positions x 3) + + Returns + ------- + result : torch.Tensor + The corresponding ternary matrix + """ + ternary = logits.argmax(-1) - 1 + return ternary + +def ternary_loss(predictions, targets, length=None, reduction="mean"): + batch_size, max_len, positions = targets.shape + targets_cat = targets + 1 + predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() + loss = nn.functional.nll_loss( + predictions_loss, + targets_cat, + reduction="none" + ) + mask = length_to_mask( + length * max_len, + max_len + ).unsqueeze(-1) + loss = loss * mask + if reduction == "mean": + loss = loss.sum(2).mean(1).mean(0) / 3.0 + elif reduction == "batch": + loss = loss.sum(2).mean(1) / 3.0 + return loss From 9b09d2049f5ee292574bdbf1b38347ea1da862f4 Mon Sep 17 00:00:00 2001 From: flexthink Date: 
Tue, 4 Mar 2025 11:09:50 -0500 Subject: [PATCH 199/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 5cbca2493..695937f36 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -19,7 +19,7 @@ from pathlib import Path from hyperpyyaml import load_hyperpyyaml from speechbrain.dataio.dataio import ( - clean_padding_, + clean_padding, length_to_mask, write_audio, ) @@ -84,7 +84,7 @@ def create_waveform(self, audio, length): wav = tokenizer.tokens_to_sig( audio, **self.token_model_kwargs ) - clean_padding_(wav, length) + wav = clean_padding(wav, length) wav = wav.to(self.device) return wav From 4c4663db840be8e2b84b3e56ca1e216d06ec01db Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 15:13:17 -0500 Subject: [PATCH 200/270] DASB: Update VALL-E for SQCodec --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 12 +++--- benchmarks/DASB/model/sq_codec.py | 44 ++++++++++++++++----- benchmarks/DASB/model/valle.py | 15 +++++-- 3 files changed, 52 insertions(+), 19 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index eeb3a9d6b..771d4b14a 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -159,7 +159,7 @@ def compute_objectives(self, predictions, batch, stage): logits_ar_sm = self.hparams.log_softmax(logits_ar) targets_ar = prompt[:, 1:, 0] loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask + logits_ar_sm, targets=targets_ar, mask=mask ) loss_components.append(loss_ar) else: @@ -168,7 +168,7 @@ def compute_objectives(self, predictions, batch, stage): logits_nar_sm = self.hparams.log_softmax(logits_nar) targets_nar = prompt[batch_idx, 1:, nar_track] loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, + logits_nar_sm, targets=targets_nar, mask=mask, ) loss_components.append(loss_nar) else: @@ -218,12 +218,12 @@ def compute_loss_stats( stats = {} if self.train_ar: stats["loss_ar"] = self.hparams.compute_cost( - log_probabilities=logits_ar, targets=targets_ar, mask=mask, + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, ) if self.train_nar: stats["loss_nar"] = self.hparams.compute_cost( - log_probabilities=logits_nar, targets=targets_nar, mask=mask, + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, ) return stats @@ -258,6 +258,7 @@ def on_stage_start(self, stage, epoch): elif stage == sb.Stage.TEST: self.evaluation_metric.on_evaluation_start() self.is_evaluating = True + self.transform_audio = getattr(self.hparams, "transform_audio", None) def apply_curriculum(self): """Applies curriculum settings, if specified, training only the autoregressive part - or @@ -572,7 +573,7 @@ def dataio_prepare(hparams): "valid": hparams["valid_json"], "test": hparams["test_json"], } - + label_encoder = hparams["label_encoder"] input_feature = INPUT_FEATURE_MAP[hparams["input"]] offsets = get_offsets( @@ -606,7 +607,6 @@ def prompt_pipeline(id, tokens): audio = tokens_loader.tokens_by_uttid( id, num_codebooks=hparams["audio_tokens_per_step"] ) - if hparams["flip_layers"]: audio = audio.flip(-1) yield audio diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index d0b850056..ce8764af7 
100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1283,6 +1283,23 @@ def forward(self, x): return x +class TernaryEmbedding(nn.Module): + """A module wrapper for tokens-to-ternary conversion + + Arguments + --------- + tokens : torch.Tensor + the tokens""" + def forward(self, tokens): + if tokens.dim() < 3: + tokens = tokens.unsqueeze(-1) + batch_size, max_len, tracks = tokens.shape + emb = tokens_to_ternary(tokens).float() + positions = emb.size(-1) + emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) + return emb + + def decimal_to_ternary_matrix(decimals, D): """ Convert a tensor of decimal numbers to a D*T ternary matrix for each batch. @@ -1378,7 +1395,6 @@ def ternary_matrix_to_decimal_torch(matrix): return decimals - def get_padding(kernel_size, dilation=1): """ Computes the padding size for a given kernel size and dilation. @@ -1444,12 +1460,12 @@ def ternary_logits_to_tokens(logits): def tokens_to_ternary(tokens): """Converts a sequence of tokens to a ternary matrix - + Arguments --------- tokens : torch.Tensor A (Batch x Length x Codebooks) tensor of tokens - + Returns ------- result : torch.Tensor @@ -1487,7 +1503,12 @@ def logits_to_ternary(logits): ternary = logits.argmax(-1) - 1 return ternary -def ternary_loss(predictions, targets, length=None, reduction="mean"): + +def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", reduction="mean"): + if targets.dim() < 3: + targets = targets.unsqueeze(-1) + if targets_type == "tokens": + targets = tokens_to_ternary(targets.unsqueeze(-1)) batch_size, max_len, positions = targets.shape targets_cat = targets + 1 predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() @@ -1496,13 +1517,16 @@ def ternary_loss(predictions, targets, length=None, reduction="mean"): targets_cat, reduction="none" ) - mask = length_to_mask( - length * max_len, - max_len - ).unsqueeze(-1) - loss = loss * mask + mask = None + if length is not None: + mask = length_to_mask( + length * max_len, + max_len + ).unsqueeze(-1) + if mask is not None: + loss = loss * mask if reduction == "mean": loss = loss.sum(2).mean(1).mean(0) / 3.0 elif reduction == "batch": loss = loss.sum(2).mean(1) / 3.0 - return loss + return loss \ No newline at end of file diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 3abcf057f..4cc155c2d 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -71,6 +71,10 @@ class ValleLM(nn.Module): Number of layers in NAR Transformer. n_ctx : int maximum context length of AR & NAR Transformer. 
+ lm_head : torch.nn.Module, optional + an alternative LM head implementation head, an alternative + to the default Linear, useful for non-trivial codecs, + such as SQ-Codec """ def __init__( @@ -86,11 +90,16 @@ def __init__( ar_layer=4, nar_layer=4, n_ctx=3000, + emb=None, + lm_head=None, ): super().__init__() - - self.emb = torch.nn.Embedding(vocab_size, att_unit) - self.lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) + if emb is None: + emb = torch.nn.Embedding(vocab_size, att_unit) + self.emb = emb + if lm_head is None: + lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) + self.lm_head = lm_head if share_emb: self.lm_head.weight = self.emb.weight From 6af2d83c28d3de453ac36e8011b50fac700baebe Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 15:14:25 -0500 Subject: [PATCH 201/270] DASB: Fixes / clean-up --- benchmarks/DASB/LJSpeech/ljspeech_prepare.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py index 08d7297e5..416c63010 100644 --- a/benchmarks/DASB/LJSpeech/ljspeech_prepare.py +++ b/benchmarks/DASB/LJSpeech/ljspeech_prepare.py @@ -197,7 +197,6 @@ def prepare_ljspeech( model_name, data_split["train"], save_json_train, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -217,7 +216,6 @@ def prepare_ljspeech( model_name, data_split["valid"], save_json_valid, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -237,7 +235,6 @@ def prepare_ljspeech( model_name, data_split["test"], save_json_test, - data_folder, wavs_folder, meta_csv, phoneme_alignments_folder, @@ -391,7 +388,6 @@ def prepare_json( model_name, seg_lst, json_file, - data_folder, wavs_folder, csv_reader, phoneme_alignments_folder, @@ -437,14 +433,8 @@ def prepare_json( Max f0 for pitch computation use_custom_cleaner : bool If True, uses custom cleaner defined for this recipe - extract_features : list, optional - If specified, feature extraction will be performed - extract_features_context : types.SimpleNamespace, optional - Context for feature extraction (pretrained models, etc) - extract_features_folder : path-like, optional - The folder where extracted features will be saved - extract_features_opts : dict, optional - Options for feature extraction + extract_phonemes : bool + Whether to extract phonemes g2p_src : str The name of the HuggingFace Hub to use for the Grapheme-to-Phoneme model or the path to it From 8c6a886876445a716cdf4eab61fef3ea69bbd83f Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 16:38:59 -0500 Subject: [PATCH 202/270] DASB: SQ-Codec: Make the special loss optional --- .../TTS/valle/hparams/train_sqcodec.yaml | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 337754bf5..d1d584bcc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -175,6 +175,8 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 4 +ternary_num_digits: 9 +pred_mode: ternary freeze_lm_head: False @@ -191,6 +193,8 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length dropout: !ref share_emb: !ref qk_norm: !ref + emb: !ref + lm_head: !ref inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref 
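For context on the ternary branch this patch makes optional: as configured in this file, the `emb` override always feeds VALL-E ternary digits rather than a learned token embedding (a `TernaryEmbedding` followed by a linear projection to `d_model`), while `pred_mode` selects the output side. `pred_mode: ternary` uses a `TernaryPredictionHead` (three logits per ternary digit) trained with `model.sq_codec.ternary_loss`; `pred_mode: tokens` falls back to the model's default linear head and `model.valle.masked_nll_loss`. The mapping between SQ-Codec token ids and ternary digits is a plain base-3 decomposition with digits shifted to {-1, 0, 1}; a minimal sketch of that mapping follows (the helper names are illustrative only, the actual implementation lives in `model/sq_codec.py`):

def token_to_ternary_digits(token_id, n_digits=9):
    # Base-3 digits shifted from {0, 1, 2} to {-1, 0, 1},
    # least-significant digit first (the 3**i weighting used by
    # ternary_matrix_to_decimal in model/sq_codec.py).
    digits = []
    for _ in range(n_digits):
        digits.append(token_id % 3 - 1)
        token_id //= 3
    return digits


def ternary_digits_to_token(digits):
    # Inverse mapping: shift back to {0, 1, 2} and re-weight by 3**i.
    return sum((d + 1) * 3 ** i for i, d in enumerate(digits))


# Spot-check the round trip over the 3**9 = 19683-code space used by SQ-Codec.
for token in (0, 1, 12345, 19682):
    assert ternary_digits_to_token(token_to_ternary_digits(token)) == token
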
@@ -199,6 +203,20 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions maxlenratio: !ref nq: !ref +lm_head: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + num_positions: !ref + tokens: null + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer save_path: !ref checkpoint: !ref @@ -211,7 +229,12 @@ modules: opt_class: !name:torch.optim.Adam lr: !ref -compute_cost: !name:model.valle.masked_nll_loss +compute_cost: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !name:model.sq_codec.ternary_loss + targets_type: tokens + tokens: !name:model.valle.masked_nll_loss log_softmax: !new:speechbrain.nnet.activations.Softmax apply_log: True From 583f42a847c7ae1d474e721a292d4c110806c039 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 19:37:57 -0500 Subject: [PATCH 203/270] DASB: SQ Codec: Fixes --- benchmarks/DASB/model/sq_codec.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index ce8764af7..d8d52924f 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1284,19 +1284,25 @@ def forward(self, x): class TernaryEmbedding(nn.Module): - """A module wrapper for tokens-to-ternary conversion - - Arguments - --------- - tokens : torch.Tensor - the tokens""" + """A module wrapper for tokens-to-ternary conversion""" def forward(self, tokens): + """Computes the forward pass + + Arguments + --------- + tokens : torch.Tensor + the tokens + """ + squeeze = False if tokens.dim() < 3: + squeeze = True tokens = tokens.unsqueeze(-1) batch_size, max_len, tracks = tokens.shape emb = tokens_to_ternary(tokens).float() positions = emb.size(-1) emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) + if squeeze: + emb = emb.squeeze(-2) return emb From 7a011eb5bcdb2b30d302ef74a0fc99bd377a1992 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 5 Mar 2025 22:00:07 -0500 Subject: [PATCH 204/270] DASB: SQCodec: Fixes --- .../DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 2 +- benchmarks/DASB/model/valle.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index d1d584bcc..cf99298a4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -152,7 +152,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 -vocab_size: 2048 +vocab_size: 19683 text_num_tokens: 39 phn_num_tokens: 52 diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 4cc155c2d..2ce0d5806 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -391,7 +391,10 @@ def inference( prev_tok = suffix[:, :, 0] else: prev_tok = gen_tokens_ar[:, :, 0] - start_emb = self.emb.weight[opts.start].tile( + start_token = torch.tensor( + [opts.start], device=prefix.device + )[None, None, :] + start_emb = self.emb(start_token).squeeze().tile( len(valid_idx), 1, 1 ) # [B, 1, D] prev_emb = 
torch.cat( From 24a401440e7b0735c2b064f8f0617c836f71f063 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Mar 2025 01:23:27 -0500 Subject: [PATCH 205/270] DASB: VALL-E: SQ-Codec updates --- .../TTS/valle/hparams/train_sqcodec.yaml | 3 +- .../DASB/LibriTTS/TTS/tokotron/train.py | 2 +- .../TTS/valle/hparams/train_sqcodec.yaml | 277 ++++++++++++++++++ benchmarks/DASB/model/sq_codec.py | 19 +- 4 files changed, 295 insertions(+), 6 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index cf99298a4..9d5596d41 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -175,7 +175,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 4 -ternary_num_digits: 9 +ternary_num_digits: 10 pred_mode: ternary freeze_lm_head: False @@ -213,6 +213,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref linear: !new:speechbrain.nnet.linear.Linear input_size: !ref n_neurons: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index df31c4c69..7d99c5c7d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -811,7 +811,7 @@ def init_sequence_encoder(hparams): def get_selected_layer_indexes(hparams): - """Finds the layers of selected layers + """Finds the indexes of selected layers Arguments --------- diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml new file mode 100644 index 000000000..377e0d7a3 --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -0,0 +1,277 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ + +experiment_name: valle/speech_tokenizer + +# Seed needs to be set at top of yaml, before objects with parameters are made +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results// +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. 
+ + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +alignments_folder: null +prepare_save_folder: !ref +data_folder_alignments: null # e.g., /path/to/LibriSpeech +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +progress_meta: !ref /meta.yaml +num_audio_samples: 32 +samples_interval: 5 + +g2p_src: flexthink/soundchoice-g2p +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref +flip_layers: False +splits: ["train", "valid", "test"] + +ckpt_key: dwer +ckpt_key_kind: min +ckpt_keep: 2 +test_key: dwer +test_key_kind: min +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 100 +number_of_epochs_ar: null +number_of_epochs_nar: null +epoch_size: 50000 +epoch_fixed: False +batch_size: 16 +valid_inter_data_count: 50 +valid_batch_size: !ref +grad_accumulation_factor: 1 +max_grad_norm: 1.0 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + +# index +pad_index: 0 +bos_index: 1 +eos_index: 2 +eot_index: 3 +eop_index: 4 +special_tokens: ["", "", "", ""] +special_num_tokens: 5 + +# stages related parameters +lr: 0.001 # @orion_step1: --lr~"loguniform(0.00001,0.1)" +lr_warmup_steps: 70000 +lr_annealing_mode: step +betas: [0.9, 0.95] + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 24000 +max_audio_length: 4000 +text_max_length: 500 +spk_prompt_length: 150 +n_ctx: !ref + + +infer_max_audio_length: !ref +max_length_ratio: 10.0 +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + looped_nominal_epoch: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + True: !ref // + False: null + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + + +####################### Model parameters ########################### +# Transformer +d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +share_emb: False +qk_norm: True +nhead: 16 +num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" +num_layers_nar: 
12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" +dropout: 0.2 +vocab_size: 19683 +audio_emb_freeze: False +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +model_vocab_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + ( * ) + + phonemes: !ref + ( * ) + + +audio_token_shift: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + + phonemes: !ref + + +audio_tokens_per_step: 8 +ternary_num_digits: 10 +pred_mode: ternary + +# Model Settings +config: config.yaml +checkpoint: ckpt_00190000.pth +sq_codec_save_path: !ref /sq-codec + +freeze_lm_head: False + +############################## models ################################ + +model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length + vocab_size: !ref + nq: !ref + att_unit: !ref + head: !ref + ar_layer: !ref + nar_layer: !ref + n_ctx: !ref + dropout: !ref + share_emb: !ref + qk_norm: !ref + lm_head: !ref + emb: !ref + +inference_opts: !name:model.valle.SpeechLMInferenceOptions + start: !ref + eos: !ref + minlenratio: 1.0 + maxlenratio: !ref + nq: !ref + +lm_head: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryPredictionHead + d_model: !ref + num_positions: !ref + tokens: null + +emb: !new:speechbrain.nnet.containers.Sequential + ternary: !new:model.sq_codec.TernaryEmbedding + num_digits: !ref + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref + n_neurons: !ref + +tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer + save_path: !ref + checkpoint: !ref + config: !ref + + +modules: + model: !ref + tokenizer: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.AdamW + lr: !ref + betas: !ref + +compute_cost: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !name:model.sq_codec.ternary_loss + targets_type: tokens + tokens: !name:model.valle.masked_nll_loss + +log_softmax: !new:speechbrain.nnet.activations.Softmax + apply_log: True + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index d8d52924f..6e0daed80 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1284,7 +1284,16 @@ def forward(self, x): class TernaryEmbedding(nn.Module): - """A module wrapper for tokens-to-ternary conversion""" + """A module wrapper for tokens-to-ternary conversion + + Arguments + --------- + num_digits : int + The number of ternary digits""" + def __init__(self, num_digits): + super().__init__() + self.num_digits = num_digits + def forward(self, tokens): """Computes the forward pass @@ -1298,7 +1307,7 @@ def forward(self, tokens): squeeze = True tokens = tokens.unsqueeze(-1) batch_size, max_len, tracks = tokens.shape - emb = tokens_to_ternary(tokens).float() + emb = tokens_to_ternary(tokens, D=self.num_digits).float() positions = 
emb.size(-1) emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) if squeeze: @@ -1464,13 +1473,15 @@ def ternary_logits_to_tokens(logits): return tokens -def tokens_to_ternary(tokens): +def tokens_to_ternary(tokens, D=9): """Converts a sequence of tokens to a ternary matrix Arguments --------- tokens : torch.Tensor A (Batch x Length x Codebooks) tensor of tokens + D : int + The number of ternary digits Returns ------- @@ -1484,7 +1495,7 @@ def tokens_to_ternary(tokens): n_codebook = tokens.size(2) tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() ternary_matrix = torch.cat([ - decimal_to_ternary_matrix(item, D=9) - 1 + decimal_to_ternary_matrix(item, D=D) - 1 for item in tokens ], dim=1) ternary_matrix = ternary_matrix.transpose(1, 2) From 7e5d15d5f532667b056704433977675b01676ff5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Mar 2025 11:30:35 -0500 Subject: [PATCH 206/270] DASB: SQCodec: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 377e0d7a3..2a3daf66a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -182,7 +182,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice text: !ref + phonemes: !ref + -audio_tokens_per_step: 8 +audio_tokens_per_step: 4 ternary_num_digits: 10 pred_mode: ternary From 0f14a23ce399e5944a9c09fa3bcf3f898980a1da Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 6 Mar 2025 15:08:29 -0500 Subject: [PATCH 207/270] DASB: SQ-Codec: Fully implement ternary mode --- .../TTS/valle/hparams/train_sqcodec.yaml | 13 +++++++- .../TTS/valle/hparams/train_sqcodec.yaml | 12 ++++++- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 ++--- benchmarks/DASB/model/custom_model.py | 31 +++++++++++++++++++ benchmarks/DASB/model/sq_codec.py | 4 +-- benchmarks/DASB/model/valle.py | 14 ++++++--- 6 files changed, 70 insertions(+), 12 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 9d5596d41..0919133ad 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -175,7 +175,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 4 -ternary_num_digits: 10 +ternary_num_digits: 11 pred_mode: ternary freeze_lm_head: False @@ -195,6 +195,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length qk_norm: !ref emb: !ref lm_head: !ref + logits_to_probs: !ref inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref @@ -211,6 +212,15 @@ lm_head: !apply:speechbrain.utils.hparams.choice num_positions: !ref tokens: null +logits_to_probs: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + tokens: !new:torch.nn.Identity + + emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref @@ -235,6 +245,7 @@ compute_cost: !apply:speechbrain.utils.hparams.choice choices: ternary: !name:model.sq_codec.ternary_loss targets_type: tokens + num_positions: !ref tokens: !name:model.valle.masked_nll_loss log_softmax: 
!new:speechbrain.nnet.activations.Softmax diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 2a3daf66a..4b015c203 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -183,7 +183,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 4 -ternary_num_digits: 10 +ternary_num_digits: 11 pred_mode: ternary # Model Settings @@ -208,6 +208,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length qk_norm: !ref lm_head: !ref emb: !ref + logits_to_probs: !ref inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref @@ -224,6 +225,14 @@ lm_head: !apply:speechbrain.utils.hparams.choice num_positions: !ref tokens: null +logits_to_probs: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + ternary: !new:model.custom_model.TernaryLogitTokenizer + num_tokens: !ref + num_positions: !ref + tokens: !new:torch.nn.Identity + emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref @@ -251,6 +260,7 @@ compute_cost: !apply:speechbrain.utils.hparams.choice choices: ternary: !name:model.sq_codec.ternary_loss targets_type: tokens + num_positions: !ref tokens: !name:model.valle.masked_nll_loss log_softmax: !new:speechbrain.nnet.activations.Softmax diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 695937f36..2df9405ca 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -162,7 +162,7 @@ def compute_objectives(self, predictions, batch, stage): logits_ar_sm = self.hparams.log_softmax(logits_ar) targets_ar = prompt[:, 1:, 0] loss_ar = self.hparams.compute_cost( - log_probabilities=logits_ar_sm, targets=targets_ar, mask=mask + logits_ar_sm, targets=targets_ar, mask=mask ) loss_components.append(loss_ar) else: @@ -171,7 +171,7 @@ def compute_objectives(self, predictions, batch, stage): logits_nar_sm = self.hparams.log_softmax(logits_nar) targets_nar = prompt[batch_idx, 1:, nar_track] loss_nar = self.hparams.compute_cost( - log_probabilities=logits_nar_sm, targets=targets_nar, mask=mask, + logits_nar_sm, targets=targets_nar, mask=mask, ) loss_components.append(loss_nar) else: @@ -221,12 +221,12 @@ def compute_loss_stats( stats = {} if self.train_ar: stats["loss_ar"] = self.hparams.compute_cost( - log_probabilities=logits_ar, targets=targets_ar, mask=mask, + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, ) if self.train_nar: stats["loss_nar"] = self.hparams.compute_cost( - log_probabilities=logits_nar, targets=targets_nar, mask=mask, + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, ) return stats diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index e5a9db761..b389a9473 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,5 +1,6 @@ import torch from speechbrain.nnet.linear import Linear +from model.sq_codec import tokens_to_ternary class AttentionMLP(torch.nn.Module): @@ -163,4 +164,34 @@ def forward(self, x): x = self.lin_p(x) p = x.reshape(batch_size, max_len, self.num_positions, 3) return p + +class TernaryLogitTokenizer(torch.nn.Module): + """Converts ternary logits to probabilities + + Arguments + --------- + num_positions : int + The number of 
ternary digits/positions + num_tokens : int + The number of tokens + """ + def __init__(self, num_positions, num_tokens=None): + super().__init__() + self.num_positions = num_positions + if num_tokens is None: + num_tokens = 3 ** num_positions + self.num_tokens = num_tokens + self.register_buffer("vocab", torch.arange(num_tokens)) + self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) + self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) + + def forward(self, logits): + logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) + token_logits_raw = torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + logits_unsq, + 1 - logits_unsq + ).prod(-1).prod(-1) + token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) + return (token_logits_raw / token_logits_raw_sum).squeeze(2) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 6e0daed80..e8119a9e9 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1521,11 +1521,11 @@ def logits_to_ternary(logits): return ternary -def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", reduction="mean"): +def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", num_positions=9, reduction="mean"): if targets.dim() < 3: targets = targets.unsqueeze(-1) if targets_type == "tokens": - targets = tokens_to_ternary(targets.unsqueeze(-1)) + targets = tokens_to_ternary(targets.unsqueeze(-1), D=num_positions) batch_size, max_len, positions = targets.shape targets_cat = targets + 1 predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 2ce0d5806..8baf8e562 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -22,8 +22,7 @@ from torch.nn import functional as F from dataclasses import dataclass -from speechbrain.nnet.losses import reduce_loss -from speechbrain.nnet.losses import truncate +from speechbrain.nnet.losses import reduce_loss, truncate @dataclass @@ -75,6 +74,9 @@ class ValleLM(nn.Module): an alternative LM head implementation head, an alternative to the default Linear, useful for non-trivial codecs, such as SQ-Codec + logits_to_probs : callable, optional + A module or a function that converts logits to token probabilities to + support top-K sampling """ def __init__( @@ -92,6 +94,7 @@ def __init__( n_ctx=3000, emb=None, lm_head=None, + logits_to_probs=None, ): super().__init__() if emb is None: @@ -100,6 +103,9 @@ def __init__( if lm_head is None: lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) self.lm_head = lm_head + if logits_to_probs is None: + logits_to_probs = nn.Identity() + self.logits_to_probs = logits_to_probs if share_emb: self.lm_head.weight = self.emb.weight @@ -302,7 +308,7 @@ def inference( # (3.2) AR loop prev_emb = self.emb(prev_tok) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) - logits = self.lm_head(h_ar) # [B, 1, V] + logits = self.logits_to_probs(self.lm_head(h_ar)) # [B, 1, V] gen_tok, gen_score = logits_to_tokens( logits.unsqueeze(2), opts, @@ -415,7 +421,7 @@ def inference( h_nar = self.nar_decoder( prev_emb, ones * step - 1, mask=mask ) # [B, T, D] - logits = self.lm_head(h_nar) + logits = self.logits_to_probs(self.lm_head(h_nar)) gen_tok, gen_score = logits_to_tokens( logits.unsqueeze(2), opts, From 10f8fdb8126c95194eb0aa0147ba338113eb3247 Mon Sep 17 
00:00:00 2001 From: flexthink Date: Thu, 6 Mar 2025 16:14:54 -0500 Subject: [PATCH 208/270] DASB: Fix SpeechTokenizer --- .../LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 24c494a98..24be494a5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/speech_tokenizer +experiment_name: valle/speech_tokenizer # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -95,7 +95,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 -model_sample_rate: 24000 +model_sample_rate: 16000 max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 From c00962e36451a3f5898abf14140594aac374cf86 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Mar 2025 13:10:04 -0500 Subject: [PATCH 209/270] Fixes for SQCodec: Make offsets optional, align the shift with ternary --- .../LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 15 ++++++--------- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 15 +++++++++++++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 0919133ad..f61b2e56e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -48,6 +48,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: False token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False @@ -165,17 +166,13 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice model_vocab_size: !apply:speechbrain.utils.hparams.choice value: !ref choices: - text: !ref + ( * ) + - phonemes: !ref + ( * ) + - -audio_token_shift: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref + - phonemes: !ref + + text: !ref * 2 + phonemes: !ref * 2 +audio_token_shift: !ref 3**( - 1) + audio_tokens_per_step: 4 -ternary_num_digits: 11 +ternary_num_digits: 10 pred_mode: ternary freeze_lm_head: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 771d4b14a..d26b27cca 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -242,6 +242,8 @@ def on_stage_start(self, stage, epoch): self.offsets = get_offsets( self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) self.loss_metric = sb.utils.metric_stats.MultiMetricStats( metric=self.compute_loss_stats, batch_eval=True, @@ -489,9 +491,10 @@ def _get_inference_opts(self): tracks = torch.arange( self.hparams.audio_tokens_per_step, device=self.device )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.text_num_tokens - + self.hparams.special_num_tokens + self.hparams.audio_token_shift + tracks * 
self.hparams.vocab_size ) if self.hparams.flip_layers: @@ -501,6 +504,12 @@ def _get_inference_opts(self): ((idx >= track_start) & (idx < track_end)) | (idx == self.hparams.bos_index) ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True return self.hparams.inference_opts( masks={self.hparams.bos_index: mask}, device=self.device, ) @@ -579,6 +588,8 @@ def dataio_prepare(hparams): offsets = get_offsets( hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) if hparams["flip_layers"]: offsets = offsets.flip(-1) From 50ef659417b60e3c8e01012b9579056e17779807 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 7 Mar 2025 15:53:48 -0500 Subject: [PATCH 210/270] DASB: SQ-Codec: Add chunking to avoid OOM --- benchmarks/DASB/model/custom_model.py | 28 +++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index b389a9473..84007b7cd 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,3 +1,4 @@ +import math import torch from speechbrain.nnet.linear import Linear from model.sq_codec import tokens_to_ternary @@ -175,23 +176,34 @@ class TernaryLogitTokenizer(torch.nn.Module): The number of ternary digits/positions num_tokens : int The number of tokens + chunk_size : int + The size of the chunk (to prevent OOM) """ - def __init__(self, num_positions, num_tokens=None): + def __init__(self, num_positions, num_tokens=None, chunk_size=10): super().__init__() self.num_positions = num_positions if num_tokens is None: num_tokens = 3 ** num_positions self.num_tokens = num_tokens + self.chunk_size = chunk_size self.register_buffer("vocab", torch.arange(num_tokens)) self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) def forward(self, logits): logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) - token_logits_raw = torch.where( - self.vocab_ternary[:, None, None, :, :, None] == self.idx, - logits_unsq, - 1 - logits_unsq - ).prod(-1).prod(-1) - token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) - return (token_logits_raw / token_logits_raw_sum).squeeze(2) + chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) + token_logits_chunks = [] + for chunk in chunks: + token_logits_raw = torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + chunk, + 1 - chunk + ).prod(-1).prod(-1) + token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) + token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) + token_logits = torch.cat( + token_logits_chunks, + dim=1 + ) + return token_logits From 08b14ff0da4df7730119193329bb3ad3c67e2178 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 8 Mar 2025 00:48:34 -0500 Subject: [PATCH 211/270] DASB: SQ-Codec: Update LibriTTS --- .../LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 8 ++------ .../LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 15 +++------------ 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index f61b2e56e..21530b199 100644 --- 
a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -163,13 +163,9 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref -model_vocab_size: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref * 2 - phonemes: !ref * 2 +model_vocab_size: !ref * 2 -audio_token_shift: !ref 3**( - 1) +audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 4b015c203..fca222b27 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -170,20 +170,11 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref -model_vocab_size: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref + ( * ) + - phonemes: !ref + ( * ) + - -audio_token_shift: !apply:speechbrain.utils.hparams.choice - value: !ref - choices: - text: !ref + - phonemes: !ref + +model_vocab_size: !ref * 2 +audio_token_shift: 19683 audio_tokens_per_step: 4 -ternary_num_digits: 11 +ternary_num_digits: 10 pred_mode: ternary # Model Settings From d1ce08a2e627d39c90bd43b12db25f519a449c4c Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 8 Mar 2025 14:12:34 -0500 Subject: [PATCH 212/270] DASB: Add a mulltitrack ternary language model head (a separate projection learned for each layer, independently) --- .../TTS/valle/hparams/train_sqcodec.yaml | 4 +- benchmarks/DASB/model/custom_model.py | 70 ++++++++++++++++++- benchmarks/DASB/model/valle.py | 25 +++++-- 3 files changed, 92 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 21530b199..10a1403fc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -98,6 +98,7 @@ text_max_length: 500 n_ctx: !ref + infer_max_audio_length: !ref max_length_ratio: 10.0 +top_k: 1 debug_infer_max_audio_length: 10 # Label encoder @@ -196,11 +197,12 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + top_k: !ref lm_head: !apply:speechbrain.utils.hparams.choice value: !ref choices: - ternary: !new:model.custom_model.TernaryPredictionHead + ternary: !new:model.custom_model.MultitrackTernaryPredictionHead d_model: !ref num_positions: !ref tokens: null diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 84007b7cd..11a5a9dac 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -141,7 +141,7 @@ def __init__(self, d_model, num_positions, d_hidden=512): bias=False ) - def forward(self, x): + def forward(self, x, track=None): """Computes the forward pass Arguments @@ -149,6 +149,9 @@ def forward(self, x): x : torch.Tensor The decoder output (Batch x Length x d_model) + track : int + The track index (if applicable) + Returns ------- p : torch.Tensor @@ -165,7 +168,70 @@ def forward(self, x): x = self.lin_p(x) p = x.reshape(batch_size, max_len, self.num_positions, 3) return p - + + +class MultitrackTernaryPredictionHead(torch.nn.Module): + """An alternative prediction head that predicts a fixed number of ternary digits + for each position (as used in 
SQ-Codec) + + Arguments + --------- + d_model : int + The model dimension + num_positions : int + the number of positions + """ + def __init__(self, d_model, num_positions, d_hidden=512, num_tracks=1): + super().__init__() + self.num_positions = num_positions + self.d_model = d_model + self.num_positions = num_positions + self.lin_hidden = torch.nn.ModuleList( + [ + Linear( + input_size=d_model, + n_neurons=d_hidden, + ) + ] * num_tracks + ) + self.act = torch.nn.LeakyReLU() + self.lin_p = torch.nn.ModuleList( + [ + Linear( + input_size=d_hidden, + n_neurons=num_positions * 3, + ) + ] * num_tracks + ) + + def forward(self, x, track=0): + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The decoder output (Batch x Length x d_model) + + track : int + The track index (if applicable) + + Returns + ------- + p : torch.Tensor + A tensor of shape (Batch x Length x num_positions x ternary digit) + The values are logits (unnormalized probabilities) + + p[:, :, :, 0] corresponds to -1 + p[:, :, :, 1] corresponds to 0 + p[:, :, :, 2] corresponds to 1 + """ + batch_size, max_len, _ = x.shape + x = self.lin_hidden[track](x) + x = self.act(x) + x = self.lin_p[track](x) + p = x.reshape(batch_size, max_len, self.num_positions, 3) + return p + class TernaryLogitTokenizer(torch.nn.Module): """Converts ternary logits to probabilities diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 8baf8e562..828b33898 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -94,6 +94,7 @@ def __init__( n_ctx=3000, emb=None, lm_head=None, + lm_head_multitrack=False, logits_to_probs=None, ): super().__init__() @@ -103,6 +104,7 @@ def __init__( if lm_head is None: lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) self.lm_head = lm_head + self.lm_head_multitrack = lm_head_multitrack if logits_to_probs is None: logits_to_probs = nn.Identity() self.logits_to_probs = logits_to_probs @@ -204,9 +206,9 @@ def forward( # Logits logits_ar, logits_nar = None, None if predict_ar: - logits_ar = self.lm_head(h_ar) + logits_ar = self.apply_lm_head(h_ar, 0) if predict_nar: - logits_nar = self.lm_head(h_nar) + logits_nar = self.apply_lm_head(h_nar, nar_level_idx + 1) return logits_ar, logits_nar @@ -308,7 +310,7 @@ def inference( # (3.2) AR loop prev_emb = self.emb(prev_tok) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) - logits = self.logits_to_probs(self.lm_head(h_ar)) # [B, 1, V] + logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] gen_tok, gen_score = logits_to_tokens( logits.unsqueeze(2), opts, @@ -421,7 +423,9 @@ def inference( h_nar = self.nar_decoder( prev_emb, ones * step - 1, mask=mask ) # [B, T, D] - logits = self.logits_to_probs(self.lm_head(h_nar)) + + logits = self.apply_lm_head(h_nar, step) + logits = self.logits_to_probs(logits) gen_tok, gen_score = logits_to_tokens( logits.unsqueeze(2), opts, @@ -463,6 +467,19 @@ def inference( gen_scores_list.append(gen_scores[b][: finish_idx[b]]) return gen_tokens_list, gen_scores_list + + def apply_lm_head(self, x, track): + """Applies the language model head + + Arguments + --------- + """ + + if self.lm_head_multitrack: + result = self.lm_head(x, track) + else: + result = self.lm_head(x) + return result def _initialize(self): for m in self.modules(): From 15f096c69c6f023c3b9c5dbdf1885f23c41280ae Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 8 Mar 2025 15:21:02 -0500 Subject: [PATCH 213/270] DASB: Vall-E: Multitrack fixes --- 
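One detail worth flagging in the multitrack head above: because Python list multiplication repeats object references, ModuleList([Linear(...)] * num_tracks) registers the same Linear instance for every track, so the per-track projections share one set of weights rather than being learned independently as the commit message describes; that is worth keeping in mind when comparing this head against the single-track one. A small sketch of the difference (dimensions are illustrative, not taken from the recipe):

    import torch
    from speechbrain.nnet.linear import Linear

    d_model, d_hidden, num_tracks = 1024, 512, 4

    shared = torch.nn.ModuleList(
        [Linear(input_size=d_model, n_neurons=d_hidden)] * num_tracks
    )
    assert shared[0] is shared[1]  # one module, one set of weights

    per_track = torch.nn.ModuleList(
        [Linear(input_size=d_model, n_neurons=d_hidden) for _ in range(num_tracks)]
    )
    assert per_track[0] is not per_track[1]  # independent parameters per track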
.../DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 1 + .../DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 5 ++++- benchmarks/DASB/model/valle.py | 5 +++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 10a1403fc..9e887e4c2 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -205,6 +205,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.MultitrackTernaryPredictionHead d_model: !ref num_positions: !ref + num_tracks: !ref tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index fca222b27..928aaa094 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -102,6 +102,7 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +top_k: 1 debug_infer_max_audio_length: 10 # Label encoder @@ -207,13 +208,15 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + top_k: !ref lm_head: !apply:speechbrain.utils.hparams.choice value: !ref choices: - ternary: !new:model.custom_model.TernaryPredictionHead + ternary: !new:model.custom_model.MultitrackTernaryPredictionHead d_model: !ref num_positions: !ref + num_tracks: !ref tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 828b33898..523bade2b 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -14,6 +14,7 @@ import logging import torch +import inspect from typing import Tuple, Optional from speechbrain.dataio.dataio import length_to_mask @@ -94,7 +95,6 @@ def __init__( n_ctx=3000, emb=None, lm_head=None, - lm_head_multitrack=False, logits_to_probs=None, ): super().__init__() @@ -104,7 +104,8 @@ def __init__( if lm_head is None: lm_head = torch.nn.Linear(att_unit, vocab_size, bias=False) self.lm_head = lm_head - self.lm_head_multitrack = lm_head_multitrack + spec = inspect.getfullargspec(lm_head.forward) + self.lm_head_multitrack = "track" in spec.args if logits_to_probs is None: logits_to_probs = nn.Identity() self.logits_to_probs = logits_to_probs From 981fe9366166d77148cfa966cca703aa3c50a700 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 9 Mar 2025 09:32:12 -0400 Subject: [PATCH 214/270] DASB: SQ-Codec: Fixes --- benchmarks/DASB/model/custom_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 11a5a9dac..64d84c522 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -226,6 +226,8 @@ def forward(self, x, track=0): p[:, :, :, 2] corresponds to 1 """ batch_size, max_len, _ = x.shape + if torch.is_tensor(track): + track = track.int().item() x = self.lin_hidden[track](x) x = self.act(x) x = self.lin_p[track](x) From de4aaaa690e756f9333706b378d11354d2c9e89e Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 15:51:40 -0400 Subject: [PATCH 215/270] DASB: SQ-Codec: Remove the multi-track ternary head (it did not help) --- .../TTS/valle/hparams/train_sqcodec.yaml | 3 
+- .../TTS/valle/hparams/train_sqcodec.yaml | 3 +- benchmarks/DASB/model/custom_model.py | 107 ------------------ 3 files changed, 2 insertions(+), 111 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 9e887e4c2..6e4e7d4f6 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -202,10 +202,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions lm_head: !apply:speechbrain.utils.hparams.choice value: !ref choices: - ternary: !new:model.custom_model.MultitrackTernaryPredictionHead + ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref num_positions: !ref - num_tracks: !ref tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 928aaa094..942f85cd5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -213,10 +213,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions lm_head: !apply:speechbrain.utils.hparams.choice value: !ref choices: - ternary: !new:model.custom_model.MultitrackTernaryPredictionHead + ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref num_positions: !ref - num_tracks: !ref tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 64d84c522..f02598b57 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -168,110 +168,3 @@ def forward(self, x, track=None): x = self.lin_p(x) p = x.reshape(batch_size, max_len, self.num_positions, 3) return p - - -class MultitrackTernaryPredictionHead(torch.nn.Module): - """An alternative prediction head that predicts a fixed number of ternary digits - for each position (as used in SQ-Codec) - - Arguments - --------- - d_model : int - The model dimension - num_positions : int - the number of positions - """ - def __init__(self, d_model, num_positions, d_hidden=512, num_tracks=1): - super().__init__() - self.num_positions = num_positions - self.d_model = d_model - self.num_positions = num_positions - self.lin_hidden = torch.nn.ModuleList( - [ - Linear( - input_size=d_model, - n_neurons=d_hidden, - ) - ] * num_tracks - ) - self.act = torch.nn.LeakyReLU() - self.lin_p = torch.nn.ModuleList( - [ - Linear( - input_size=d_hidden, - n_neurons=num_positions * 3, - ) - ] * num_tracks - ) - - def forward(self, x, track=0): - """Computes the forward pass - - Arguments - --------- - x : torch.Tensor - The decoder output (Batch x Length x d_model) - - track : int - The track index (if applicable) - - Returns - ------- - p : torch.Tensor - A tensor of shape (Batch x Length x num_positions x ternary digit) - The values are logits (unnormalized probabilities) - - p[:, :, :, 0] corresponds to -1 - p[:, :, :, 1] corresponds to 0 - p[:, :, :, 2] corresponds to 1 - """ - batch_size, max_len, _ = x.shape - if torch.is_tensor(track): - track = track.int().item() - x = self.lin_hidden[track](x) - x = self.act(x) - x = self.lin_p[track](x) - p = x.reshape(batch_size, max_len, self.num_positions, 3) - return p - - -class TernaryLogitTokenizer(torch.nn.Module): - """Converts ternary logits to probabilities - - Arguments - 
--------- - num_positions : int - The number of ternary digits/positions - num_tokens : int - The number of tokens - chunk_size : int - The size of the chunk (to prevent OOM) - """ - def __init__(self, num_positions, num_tokens=None, chunk_size=10): - super().__init__() - self.num_positions = num_positions - if num_tokens is None: - num_tokens = 3 ** num_positions - self.num_tokens = num_tokens - self.chunk_size = chunk_size - self.register_buffer("vocab", torch.arange(num_tokens)) - self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) - self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) - - def forward(self, logits): - logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) - chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) - token_logits_chunks = [] - for chunk in chunks: - token_logits_raw = torch.where( - self.vocab_ternary[:, None, None, :, :, None] == self.idx, - chunk, - 1 - chunk - ).prod(-1).prod(-1) - token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) - token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) - token_logits = torch.cat( - token_logits_chunks, - dim=1 - ) - return token_logits From e8af8994008aa39dc0e251eee4d7cadff2e60cc5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 16:06:48 -0400 Subject: [PATCH 216/270] DASB: VALL-E Fix ternary loss masking --- benchmarks/DASB/model/sq_codec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index e8119a9e9..29a483456 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1534,12 +1534,12 @@ def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ter targets_cat, reduction="none" ) - mask = None if length is not None: mask = length_to_mask( length * max_len, max_len - ).unsqueeze(-1) + ) + mask = mask.unsqueeze(-1) if mask is not None: loss = loss * mask if reduction == "mean": From 851eb845526231990b3bfb737bbd0742661e7613 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 16:15:46 -0400 Subject: [PATCH 217/270] DASB: SQCodec: Fixes --- benchmarks/DASB/model/custom_model.py | 42 +++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index f02598b57..c88d7b536 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -168,3 +168,45 @@ def forward(self, x, track=None): x = self.lin_p(x) p = x.reshape(batch_size, max_len, self.num_positions, 3) return p + + +class TernaryLogitTokenizer(torch.nn.Module): + """Converts ternary logits to probabilities + + Arguments + --------- + num_positions : int + The number of ternary digits/positions + num_tokens : int + The number of tokens + chunk_size : int + The size of the chunk (to prevent OOM) + """ + def __init__(self, num_positions, num_tokens=None, chunk_size=10): + super().__init__() + self.num_positions = num_positions + if num_tokens is None: + num_tokens = 3 ** num_positions + self.num_tokens = num_tokens + self.chunk_size = chunk_size + self.register_buffer("vocab", torch.arange(num_tokens)) + self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) + self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) + + def forward(self, logits): + 
logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) + chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) + token_logits_chunks = [] + for chunk in chunks: + token_logits_raw = torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + chunk, + 1 - chunk + ).prod(-1).prod(-1) + token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) + token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) + token_logits = torch.cat( + token_logits_chunks, + dim=1 + ) + return token_logits From 38ed4324e75c190e58d421e30de40d6486d4b48c Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 19:52:13 -0400 Subject: [PATCH 218/270] DASB: Add the ability to filter priors --- benchmarks/DASB/run_hparam_optimization.sh | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 9be6a3c64..058bbcb18 100755 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -63,6 +63,7 @@ orion_db_type="PickledDB" exp_max_trials=50 store_all=True compress_exp=True +hparam_filter="" # Function to print argument descriptions and exit print_argument_descriptions() { @@ -202,6 +203,12 @@ while [[ $# -gt 0 ]]; do shift ;; + --hparam_filter) + hparam_filter="$2" + shift + shift + ;; + --help) print_argument_descriptions ;; @@ -281,6 +288,11 @@ echo "-------------------------------------" get_flag() { local file_path="$1" local pattern="$2" + local filter="$3" + + if [[ -z "$filter" ]]; then + filter=".*" + fi # Check if the file exists if [ ! -f "$file_path" ]; then @@ -289,7 +301,7 @@ get_flag() { fi # Use grep to find all lines containing the pattern and then extract the flags using sed - grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | tr -d '\n' + grep -o "$pattern.*" "$file_path" | sed "s/$pattern//" | grep $filter | tr -d '\n' } @@ -333,7 +345,9 @@ function extract_best_params() { step_id=1 hparams_step=$hparams pattern="@orion_step1:" -opt_flags=$(get_flag "$hparams_step" "$pattern") +opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") +echo ">>> OPT FLAGS: $opt_flags" +exit # Check if the string is empty and exit with an error if it is if [ -z "$opt_flags" ]; then @@ -409,7 +423,7 @@ while [ -n "$opt_flags" ]; do pattern="@orion_step$step_id:" # update optimization flags pattern - opt_flags=$(get_flag "$hparams_step" "$pattern") + opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") done echo From d5aea40c93fc2eb2080bcdce9b494bfec65754a1 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 10 Mar 2025 20:48:54 -0400 Subject: [PATCH 219/270] DASB: Removed debugging code --- benchmarks/DASB/run_hparam_optimization.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh index 058bbcb18..c0b06b09a 100755 --- a/benchmarks/DASB/run_hparam_optimization.sh +++ b/benchmarks/DASB/run_hparam_optimization.sh @@ -346,8 +346,6 @@ step_id=1 hparams_step=$hparams pattern="@orion_step1:" opt_flags=$(get_flag "$hparams_step" "$pattern" "$hparam_filter") -echo ">>> OPT FLAGS: $opt_flags" -exit # Check if the string is empty and exit with an error if it is if [ -z "$opt_flags" ]; then From 6cef5492752fe8069832bf4d2910872ec8452008 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 11 Mar 2025 01:41:02 -0400 Subject: [PATCH 220/270] DASB: VALL-E: SQ-Codec fixes 
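The TernaryLogitTokenizer completed above exists so that inference can run top-k sampling over whole tokens even though the ternary head only emits independent per-digit logits: for each candidate token it multiplies the probabilities of that token's digits and renormalizes over the candidate set, chunking the sequence dimension so the intermediate table does not exhaust memory. A tiny unvectorized sketch of the same idea (shapes and digit ordering simplified, not the module's exact indexing):

    from itertools import product

    def token_distribution(digit_probs):
        # digit_probs: D rows, each a probability triple over {-1, 0, 1}
        probs = []
        for digits in product(range(3), repeat=len(digit_probs)):
            p = 1.0
            for pos, d in enumerate(digits):
                p *= digit_probs[pos][d]
            probs.append(p)
        total = sum(probs)
        return [p / total for p in probs]

    # Two digits -> 3**2 = 9 candidate tokens
    dist = token_distribution([[0.7, 0.2, 0.1], [0.1, 0.1, 0.8]])
    assert abs(sum(dist) - 1.0) < 1e-6

With the real 10-digit vocabulary there are tens of thousands of candidates per step, which is what the chunk_size argument is there to bound.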
--- .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_encodec.yaml | 1 + .../TTS/valle/hparams/train_espnet_encodec.yaml | 1 + .../LJSpeech/TTS/valle/hparams/train_mimi.yaml | 1 + .../TTS/valle/hparams/train_wavtokenizer.yaml | 1 + .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 3 ++- .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_encodec.yaml | 3 ++- .../TTS/valle/hparams/train_espnet_encodec.yaml | 3 ++- .../LibriTTS/TTS/valle/hparams/train_mimi.yaml | 3 ++- .../valle/hparams/train_speech_tokenizer.yaml | 1 + .../TTS/valle/hparams/train_sqcodec.yaml | 8 +++++++- .../TTS/valle/hparams/train_wavtokenizer.yaml | 3 ++- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 17 ++++++++++++++--- 14 files changed, 38 insertions(+), 9 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 140b85a84..715a2d199 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -60,6 +60,7 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 2c22f57a4..747e6626e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -45,6 +45,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index 74654e590..c3874b6a7 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -45,6 +45,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index b528660f5..b5747d763 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -46,6 +46,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index af0222d90..110839413 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -48,6 +48,7 @@ kmeans_dataset: LibriSpeech available_speech_model_layers: 
[1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True token_offset: 1 spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index f9d07b443..3052dc76b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/dac +experiment_name: valle/dac # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -51,6 +51,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 9d9e65b85..4c61228a2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -66,6 +66,7 @@ vocoder_repo_id: !apply:speechbrain.utils.hparams.choice available_speech_model_layers: [1, 3, 7, 12, 18, 23] speech_model_layers: !ref flip_layers: False +use_token_offsets: True # Speaker Embeddings spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index a4a19ae6b..6596858b2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/encodec +experiment_name: valle/encodec # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 714cf91b5..a23789f15 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/encodec +experiment_name: valle/encodec # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index c1e3f1e3a..7b61a18a7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml 
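The use_token_offsets flag being threaded through these recipes controls whether each codebook track is shifted into its own slice of the shared vocabulary before embedding and masking; the SQ-Codec recipes leave it False because a single ternary embedding is reused for all tracks. A rough sketch of the convention, with a hypothetical stand-in for the get_offsets helper called in train.py (the helper itself is not shown in this patch):

    import torch

    def get_offsets(vocab_size, num_tracks):
        # hypothetical equivalent of the recipes' get_offsets helper
        return torch.arange(num_tracks) * vocab_size

    offsets = get_offsets(vocab_size=1024, num_tracks=2)   # tensor([   0, 1024])
    tokens = torch.tensor([[[3, 7]]])                      # [batch, time, track]
    with_offsets = tokens + offsets[None, None, :]         # use_token_offsets: True
    without = tokens + torch.zeros_like(offsets)           # use_token_offsets: False, as in train.py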
+++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/mimi +experiment_name: valle/mimi # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -51,6 +51,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 24be494a5..3b9bd8214 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 942f85cd5..4ac5e039a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: False splits: ["train", "valid", "test"] ckpt_key: dwer @@ -176,6 +177,7 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 +ternary_emb_hidden_size: 512 pred_mode: ternary # Model Settings @@ -229,8 +231,12 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref - linear: !new:speechbrain.nnet.linear.Linear + hidden: !new:speechbrain.nnet.linear.Linear input_size: !ref + n_neurons: !ref + act: !new:torch.nn.LeakyReLU + linear: !new:speechbrain.nnet.linear.Linear + input_size: !ref n_neurons: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index e98056db3..4e4d13c27 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -3,7 +3,7 @@ # Authors: Artem Ploujnikov # ############################################################################ -experiment_name: tokotron/wavtokenizer +experiment_name: valle/wavtokenizer # Seed needs to be set at top of yaml, before objects with parameters are made seed: 74443 @@ -50,6 +50,7 @@ tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are sav tokens_loader: !new:utils.tokens.TokensLoader data_path: !ref flip_layers: False +use_token_offsets: True splits: ["train", "valid", "test"] ckpt_key: dwer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 2df9405ca..2d3091654 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -245,6 
+245,8 @@ def on_stage_start(self, stage, epoch): self.offsets = get_offsets( self.hparams.vocab_size, self.hparams.audio_tokens_per_step, )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) if hasattr(hparams, "speech_model_layers"): self.layer_idx = get_selected_layer_indexes( @@ -527,9 +529,10 @@ def _get_inference_opts(self): tracks = torch.arange( self.hparams.audio_tokens_per_step, device=self.device )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.text_num_tokens - + self.hparams.special_num_tokens + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size ) if self.hparams.flip_layers: @@ -537,8 +540,14 @@ def _get_inference_opts(self): track_end = track_start + self.hparams.vocab_size mask = ( ((idx >= track_start) & (idx < track_end)) - | (idx == self.hparams.eos_index) + | (idx == self.hparams.bos_index) ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True return self.hparams.inference_opts( masks={self.hparams.bos_index: mask}, device=self.device, ) @@ -714,6 +723,8 @@ def dataio_prepare(hparams): offsets = get_offsets( hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) + if not hparams["use_token_offsets"]: + offsets = torch.zeros_like(offsets) if hparams["flip_layers"]: offsets = offsets.flip(-1) From 9fe48e4c268bf706142e1eacb33e4652ed0188d8 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 11 Mar 2025 14:09:19 -0400 Subject: [PATCH 221/270] DASB: SQ-Codec: Fix the sample rate --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 4ac5e039a..a69b3db3d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -96,7 +96,7 @@ betas: [0.9, 0.95] # Feature parameters sample_rate: 24000 -model_sample_rate: 24000 +model_sample_rate: 16000 max_audio_length: 4000 text_max_length: 500 spk_prompt_length: 150 From 263f8b501fdcf81a35052a8e28119fa2f9f9a0ae Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 11 Mar 2025 15:41:17 -0400 Subject: [PATCH 222/270] VALL-E: SQ-Codec: Add target dropout (optional, disabled by default) --- .../TTS/valle/hparams/train_sqcodec.yaml | 2 + benchmarks/DASB/model/valle.py | 56 +++++++++++-------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 6e4e7d4f6..579c860aa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -154,6 +154,7 @@ nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 +target_dropout: 0.5 vocab_size: 19683 text_num_tokens: 39 phn_num_tokens: 52 @@ -185,6 +186,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length nar_layer: !ref n_ctx: !ref dropout: !ref + target_dropout: !ref share_emb: !ref qk_norm: !ref emb: !ref diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 
523bade2b..fbaa45b30 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -61,6 +61,9 @@ class ValleLM(nn.Module): If true, apply LayerNorm to q and k in attention. dropout : float dropout rate for attention layers. + target_dropout : float + a separate dropout applied to targets only (may be + useful to mitigate autoregressive prediction instability) att_unit: int Dimension of Transformer attention. head : int @@ -88,6 +91,7 @@ def __init__( share_emb=True, qk_norm=False, dropout=0.0, + target_dropout=0.0, att_unit=256, head=2, ar_layer=4, @@ -119,6 +123,7 @@ def __init__( n_layer=ar_layer, qk_norm=qk_norm, dropout=dropout, + target_dropout=target_dropout ) if nq > 1: # NOTE: An NAR encoder is not needed if there is only one track @@ -575,6 +580,30 @@ def forward( class TransformerDecoder(nn.Module): + """A custom transformer decoder implementation for VALL-E + + Arguments + --------- + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Whether to normalize queries and keys + dropout : float + The dropout probability + target_dropout : float + The target dropout probability + layer_class : type + The layer type to be used + """ def __init__( self, n_ctx, @@ -584,30 +613,10 @@ def __init__( causal=True, qk_norm=False, dropout=0.0, + target_dropout=0.0, layer_class=ResidualAttentionBlock, ): - """A custom transformer decoder implementation for VALL-E - Arguments - --------- - n_ctx : int - The context length - n_state : int - The number of states - n_head : int - The number of heads - n_layer : int - The number of layers - causal : bool - Whether to operate in causal mode (i.e.
avoid attending - to future steps) - qk_norm : bool - Whether to normalize queries and keys - dropout : float - The dropout probability - layer_class : type - The layer type to be used - """ super().__init__() self.pos_emb = nn.Embedding(n_ctx, n_state) @@ -626,6 +635,7 @@ def __init__( ] ) self.ln = LayerNorm(n_state) + self.target_dropout = nn.Dropout(target_dropout) self.causal = causal self.kv_cache = None @@ -654,9 +664,11 @@ def forward( offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 x = x + self.pos_emb.weight[offset : offset + x.shape[1]].unsqueeze(0) + tgt = self.target_dropout(x) for block in self.blocks: - x = block(x, mask=mask, kv_cache=kv_cache) + x = block(x, tgt, mask=mask, kv_cache=kv_cache) + tgt = x x = self.ln(x) return x From fb2d573a2597c34d7d2d91990cc463ee88e25112 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Mar 2025 01:15:00 -0400 Subject: [PATCH 223/270] DASB: SQ-Codec updates --- .../LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 14 +++++++------- benchmarks/DASB/model/custom_model.py | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index a69b3db3d..337a69f15 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -71,7 +71,7 @@ batch_size: 16 valid_inter_data_count: 50 valid_batch_size: !ref grad_accumulation_factor: 1 -max_grad_norm: 1.0 +max_grad_norm: 0.01 sorting: random num_workers: 4 skip_prep: False @@ -155,12 +155,14 @@ sample_dataloader_opts: ####################### Model parameters ########################### # Transformer d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 128 share_emb: False qk_norm: True nhead: 16 num_layers_ar: 12 # @orion_step1: --num_layers_ar~"choices([6, 9, 12])" num_layers_nar: 12 # @orion_step1: --num_layers_nar~"choices([6, 9, 12])" dropout: 0.2 +target_dropout: 0.2 vocab_size: 19683 audio_emb_freeze: False audio_emb_pretrained: False @@ -177,7 +179,6 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 -ternary_emb_hidden_size: 512 pred_mode: ternary # Model Settings @@ -198,6 +199,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length nar_layer: !ref n_ctx: !ref dropout: !ref + target_dropout: !ref share_emb: !ref qk_norm: !ref lm_head: !ref @@ -217,7 +219,9 @@ lm_head: !apply:speechbrain.utils.hparams.choice choices: ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref + d_hidden: !ref num_positions: !ref + norm: False tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice @@ -231,12 +235,8 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref - hidden: !new:speechbrain.nnet.linear.Linear - input_size: !ref - n_neurons: !ref - act: !new:torch.nn.LeakyReLU linear: !new:speechbrain.nnet.linear.Linear - input_size: !ref + input_size: !ref n_neurons: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index c88d7b536..fec745d96 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -125,11 +125,12 @@ class TernaryPredictionHead(torch.nn.Module): num_positions : int 
the number of positions """ - def __init__(self, d_model, num_positions, d_hidden=512): + def __init__(self, d_model, num_positions, d_hidden=512, norm=False): super().__init__() self.num_positions = num_positions self.d_model = d_model self.num_positions = num_positions + self.norm = torch.nn.LayerNorm(d_model) if norm else torch.nn.Identity() self.lin_hidden = Linear( input_size=d_model, n_neurons=d_hidden, @@ -163,6 +164,7 @@ def forward(self, x, track=None): p[:, :, :, 2] corresponds to 1 """ batch_size, max_len, _ = x.shape + x = self.norm(x) x = self.lin_hidden(x) x = self.act(x) x = self.lin_p(x) From 51438b9d45cfd48b374d994485a240506ef70db1 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Mar 2025 14:25:21 -0400 Subject: [PATCH 224/270] DASB: SQ-Codec: Add argmax mode --- benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 2 ++ benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 579c860aa..0993a052e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -171,6 +171,7 @@ audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 +ternary_tokenizer_mode: argmax pred_mode: ternary freeze_lm_head: False @@ -215,6 +216,7 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryLogitTokenizer num_tokens: !ref num_positions: !ref + mode: !ref tokens: !new:torch.nn.Identity diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 337a69f15..1d596c3fa 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -179,6 +179,7 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 ternary_num_digits: 10 +ternary_tokenizer_mode: argmax pred_mode: ternary # Model Settings @@ -230,6 +231,7 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryLogitTokenizer num_tokens: !ref num_positions: !ref + mode: !ref tokens: !new:torch.nn.Identity emb: !new:speechbrain.nnet.containers.Sequential From acbcfcfa2c6e7e6f6b4a956b92b5eea5f7081105 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Mar 2025 14:25:49 -0400 Subject: [PATCH 225/270] DASB: SQ-Codec: Add argmax mode --- benchmarks/DASB/model/custom_model.py | 17 +++++++++++++++-- benchmarks/DASB/model/sq_codec.py | 4 ++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index fec745d96..23b138688 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -1,7 +1,7 @@ import math import torch from speechbrain.nnet.linear import Linear -from model.sq_codec import tokens_to_ternary +from model.sq_codec import tokens_to_ternary, ternary_logits_to_tokens class AttentionMLP(torch.nn.Module): @@ -183,19 +183,27 @@ class TernaryLogitTokenizer(torch.nn.Module): The number of tokens chunk_size : int The size of the chunk (to prevent OOM) + mode : str + "probability" : treats the outputs as a probability distribution + "argmax" : "hard" mode, only the top probability is used. 
Cannot be used with + top_k sampling with k > 1 + """ - def __init__(self, num_positions, num_tokens=None, chunk_size=10): + def __init__(self, num_positions, num_tokens=None, chunk_size=10, mode="probability"): super().__init__() self.num_positions = num_positions if num_tokens is None: num_tokens = 3 ** num_positions self.num_tokens = num_tokens self.chunk_size = chunk_size + self.mode = mode self.register_buffer("vocab", torch.arange(num_tokens)) self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) def forward(self, logits): + if self.mode == "argmax": + return self._probs_argmax(logits) logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) token_logits_chunks = [] @@ -212,3 +220,8 @@ def forward(self, logits): dim=1 ) return token_logits + + def _probs_argmax(self, logits): + logit_tokens = ternary_logits_to_tokens(logits, n_codebook=1) + probs = (logit_tokens == self.vocab[None, None, :]).float() + return probs diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 29a483456..d213308e7 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1455,7 +1455,7 @@ def ternary_to_decimal(ternary, n_codebook=4): return codec_ls.permute(1, 2, 0) -def ternary_logits_to_tokens(logits): +def ternary_logits_to_tokens(logits, n_codebook=4): """Converts ternary logits to tokens (as used for SQ-Codec) Arguments @@ -1469,7 +1469,7 @@ def ternary_logits_to_tokens(logits): Token IDs """ ternary_matrix = logits_to_ternary(logits) - tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2)) + tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2), n_codebook=n_codebook) return tokens From b38c1ccac91fd578c54d2b99cb71a077af495af0 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 12 Mar 2025 14:31:16 -0400 Subject: [PATCH 226/270] DASB: Fixes --- benchmarks/DASB/model/sq_codec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index d213308e7..307daaeeb 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -126,7 +126,7 @@ def build_codec_model(self, config): exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) device = next(iter(scalar_codec.parameters())).device - parameter_dict = torch.load(self.ckpt_path, map_location=device) + parameter_dict = torch.load(self.ckpt_path, map_location=device, weights_only=False) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec @@ -1543,7 +1543,7 @@ def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ter if mask is not None: loss = loss * mask if reduction == "mean": - loss = loss.sum(2).mean(1).mean(0) / 3.0 + loss = loss.sum(2).sum(1).sum(0) / mask.sum() elif reduction == "batch": - loss = loss.sum(2).mean(1) / 3.0 + loss = loss.sum(2).sum(1) / mask.sum(-1).sum(-1) return loss \ No newline at end of file From 44e93fd2daf55f5be14d687c18e921722526de6d Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Mar 2025 14:02:04 -0400 Subject: [PATCH 227/270] SQCodec: Fixes --- benchmarks/DASB/model/custom_model.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py 
b/benchmarks/DASB/model/custom_model.py index 23b138688..31ce24d42 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -125,11 +125,10 @@ class TernaryPredictionHead(torch.nn.Module): num_positions : int the number of positions """ - def __init__(self, d_model, num_positions, d_hidden=512, norm=False): + def __init__(self, d_model, num_positions, d_hidden=512, norm=True): super().__init__() self.num_positions = num_positions self.d_model = d_model - self.num_positions = num_positions self.norm = torch.nn.LayerNorm(d_model) if norm else torch.nn.Identity() self.lin_hidden = Linear( input_size=d_model, @@ -165,10 +164,16 @@ def forward(self, x, track=None): """ batch_size, max_len, _ = x.shape x = self.norm(x) + if self.use_emb: + positions = torch.arange( + self.num_positions, + device=x.device + )[None, None, :] + x = x[:, :, None, :] + self.emb(positions) x = self.lin_hidden(x) x = self.act(x) - x = self.lin_p(x) - p = x.reshape(batch_size, max_len, self.num_positions, 3) + p = self.lin_p(x) + p = p.reshape(batch_size, max_len, self.num_positions, 3) return p @@ -211,8 +216,8 @@ def forward(self, logits): token_logits_raw = torch.where( self.vocab_ternary[:, None, None, :, :, None] == self.idx, chunk, - 1 - chunk - ).prod(-1).prod(-1) + 1. + ).prod(-1).log().sum(-1).exp() token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) token_logits = torch.cat( From f875cd9c04691736eafe827f7ee19e3202a75dee Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Mar 2025 14:14:28 -0400 Subject: [PATCH 228/270] DASB: SQCodec: Fixes --- benchmarks/DASB/model/custom_model.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 31ce24d42..3bf24f4d3 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -164,12 +164,6 @@ def forward(self, x, track=None): """ batch_size, max_len, _ = x.shape x = self.norm(x) - if self.use_emb: - positions = torch.arange( - self.num_positions, - device=x.device - )[None, None, :] - x = x[:, :, None, :] + self.emb(positions) x = self.lin_hidden(x) x = self.act(x) p = self.lin_p(x) From 69b346b687e377cceaba81d22d6453acb3c0985a Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 14 Mar 2025 20:04:51 -0400 Subject: [PATCH 229/270] DASB: SQCodec: Update to predict everything autoregressively --- .../TTS/valle/hparams/train_sqcodec.yaml | 14 +++--- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 5 ++- benchmarks/DASB/model/custom_model.py | 22 +++++---- benchmarks/DASB/model/sq_codec.py | 10 +++-- benchmarks/DASB/model/valle.py | 45 +++++++++++-------- 5 files changed, 55 insertions(+), 41 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 0993a052e..7d3cb278e 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -98,7 +98,7 @@ text_max_length: 500 n_ctx: !ref + infer_max_audio_length: !ref max_length_ratio: 10.0 -top_k: 1 +top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -170,8 +170,8 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 +flatten: true ternary_num_digits: 10 -ternary_tokenizer_mode: argmax pred_mode: ternary freeze_lm_head: False @@ -180,7 +180,7 @@ freeze_lm_head: False 
model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length vocab_size: !ref - nq: !ref + nq: 1 att_unit: !ref head: !ref ar_layer: !ref @@ -199,7 +199,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions eos: !ref minlenratio: 1.0 maxlenratio: !ref - nq: !ref + nq: 1 top_k: !ref lm_head: !apply:speechbrain.utils.hparams.choice @@ -207,7 +207,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice choices: ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref - num_positions: !ref + num_positions: !ref * tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice @@ -216,15 +216,15 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryLogitTokenizer num_tokens: !ref num_positions: !ref - mode: !ref tokens: !new:torch.nn.Identity emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref + flat: True linear: !new:speechbrain.nnet.linear.Linear - input_size: !ref + input_size: !ref * n_neurons: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index d26b27cca..97e319eb9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -157,7 +157,10 @@ def compute_objectives(self, predictions, batch, stage): if self.train_ar: logits_ar_sm = self.hparams.log_softmax(logits_ar) - targets_ar = prompt[:, 1:, 0] + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] loss_ar = self.hparams.compute_cost( logits_ar_sm, targets=targets_ar, mask=mask ) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 3bf24f4d3..cdfcd5ced 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -188,29 +188,32 @@ class TernaryLogitTokenizer(torch.nn.Module): top_k sampling with k > 1 """ - def __init__(self, num_positions, num_tokens=None, chunk_size=10, mode="probability"): + def __init__(self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10): super().__init__() self.num_positions = num_positions if num_tokens is None: num_tokens = 3 ** num_positions self.num_tokens = num_tokens + self.num_tracks = num_tracks self.chunk_size = chunk_size - self.mode = mode self.register_buffer("vocab", torch.arange(num_tokens)) self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) def forward(self, logits): - if self.mode == "argmax": - return self._probs_argmax(logits) - logits_unsq = logits.softmax(-1).unsqueeze(-3).unsqueeze(-3) - chunks = logits_unsq.chunk(dim=1, chunks=math.ceil(logits_unsq.size(1) / self.chunk_size)) + batch_size, max_len, num_positions, _ = logits.shape + logits = logits.softmax(-1) + logits = logits.reshape(batch_size, max_len, self.num_tracks, 1, num_positions // self.num_tracks, 3) + chunks = logits.chunk( + dim=1, + chunks=math.ceil(logits.size(1) / self.chunk_size) + ) token_logits_chunks = [] for chunk in chunks: token_logits_raw = torch.where( self.vocab_ternary[:, None, None, :, :, None] == self.idx, chunk, - 1. 
+ torch.ones_like(chunk) ).prod(-1).log().sum(-1).exp() token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) @@ -219,8 +222,3 @@ def forward(self, logits): dim=1 ) return token_logits - - def _probs_argmax(self, logits): - logit_tokens = ternary_logits_to_tokens(logits, n_codebook=1) - probs = (logit_tokens == self.vocab[None, None, :]).float() - return probs diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 307daaeeb..2c52ee8ac 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -1290,9 +1290,10 @@ class TernaryEmbedding(nn.Module): --------- num_digits : int The number of ternary digits""" - def __init__(self, num_digits): + def __init__(self, num_digits, emb_size=512, flat=False): super().__init__() self.num_digits = num_digits + self.flat = flat def forward(self, tokens): """Computes the forward pass @@ -1309,7 +1310,10 @@ def forward(self, tokens): batch_size, max_len, tracks = tokens.shape emb = tokens_to_ternary(tokens, D=self.num_digits).float() positions = emb.size(-1) - emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) + if self.flat: + emb = emb.unsqueeze(-2) + else: + emb = emb.reshape(batch_size, max_len, tracks, positions // tracks) if squeeze: emb = emb.squeeze(-2) return emb @@ -1546,4 +1550,4 @@ def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ter loss = loss.sum(2).sum(1).sum(0) / mask.sum() elif reduction == "batch": loss = loss.sum(2).sum(1) / mask.sum(-1).sum(-1) - return loss \ No newline at end of file + return loss diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index fbaa45b30..619e555ca 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -307,6 +307,9 @@ def inference( ) modality_index = prev_tok.flatten() mask = modality_index_to_mask(modality_index, opts) + tracks = prefix.size(-1) + if opts.nq == 1 and tracks > 1: + prev_tok = prev_tok.unsqueeze(-1).expand(1, 1, tracks) mask_cache = [] modality_tokens = torch.tensor( list(opts.masks.keys()), device=prefix.device @@ -314,11 +317,13 @@ def inference( for step in range(maxlen): # (3.2) AR loop - prev_emb = self.emb(prev_tok) # [B, 1, D] + prev_emb = self.emb(prev_tok).squeeze(2) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] + if logits.dim() < 4: + logits = logits.unsqueeze(-2) gen_tok, gen_score = logits_to_tokens( - logits.unsqueeze(2), + logits, opts, mask, allow_eos=step >= minlen, @@ -408,23 +413,24 @@ def inference( start_token = torch.tensor( [opts.start], device=prefix.device )[None, None, :] - start_emb = self.emb(start_token).squeeze().tile( - len(valid_idx), 1, 1 - ) # [B, 1, D] - prev_emb = torch.cat( - [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 - ) # [B, T, D] - - ones = torch.ones_like(valid_idx) - mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() - mask = mask.unsqueeze(1).unsqueeze(1) - generated = {"token": [], "score": []} - - mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache - vocab_mask = torch.cat(mask_cache, dim=1) # (4.2) NAR loop if self.nq > 1: + start_emb = self.emb(start_token).squeeze().tile( + len(valid_idx), 1, 1 + ) # [B, 1, D] + prev_emb = torch.cat( + [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 + ) # [B, T, D] + + ones = torch.ones_like(valid_idx) + mask = 
length_to_mask(prefix.size(1) + finish_idx + 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) + generated = {"token": [], "score": []} + + mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache + vocab_mask = torch.cat(mask_cache, dim=1) + for step in range(1, opts.nq): h_nar = self.nar_decoder( prev_emb, ones * step - 1, mask=mask @@ -469,8 +475,11 @@ def inference( gen_tokens_list, gen_scores_list = [], [] for b in range(len(valid_idx)): - gen_tokens_list.append(gen_tokens[b][: finish_idx[b]]) - gen_scores_list.append(gen_scores[b][: finish_idx[b]]) + item_finish_idx = finish_idx[b] + if len(item_finish_idx) > 1: + item_finish_idx = item_finish_idx[0] + gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) + gen_scores_list.append(gen_scores[b][:item_finish_idx]) return gen_tokens_list, gen_scores_list From f51b3a8f0d345794a55b165e232e0a7f46581425 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Mar 2025 01:15:41 -0400 Subject: [PATCH 230/270] DASB: VALL-E: Fixes --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 97e319eb9..872a190c5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -276,7 +276,7 @@ def apply_curriculum(self): else self.modules.model.lm_head ) lm_head.requires_grad_(True) - if self.hparams.audio_tokens_per_step == 1: + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: # NOTE: If there is only one track it's autoregressive self.train_nar = False elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: From 1f05e76c95ce33dd2f5ac0bc42bc5175e6a24717 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Mar 2025 23:05:12 -0400 Subject: [PATCH 231/270] DASB: SQCodec: Fixes, add LibriTTS --- .../DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml | 1 + .../TTS/tokotron/hparams/train_speech_tokenizer.yaml | 1 + .../LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml | 1 + .../LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml | 1 + .../DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml | 1 + .../DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 1 + .../LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 1 + .../LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 1 + .../TTS/valle/hparams/train_speech_tokenizer.yaml | 1 + .../DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 2 +- .../LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml | 1 + benchmarks/DASB/LibriTTS/TTS/valle/train.py | 9 ++++++--- benchmarks/DASB/model/valle.py | 6 +++++- 15 files changed, 24 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index b38a07434..505460dfa 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -166,6 +166,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 +flatten: false attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml 
b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 0cb2012ed..0ff172529 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -165,6 +165,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 +flatten: false bandwidth: 1.5 attention_type: regularMHA diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index 715a2d199..b2a5f37dc 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -207,6 +207,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 6 +flatten: false freeze_lm_head: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index 747e6626e..cae286efd 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -175,6 +175,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false bandwidth: 6 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index c3874b6a7..5aae5e0db 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -173,6 +173,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false espnet_repo: https://github.com/espnet/espnet espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef model_hub: espnet/libritts_encodec_24k diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index b5747d763..edae05d51 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -174,6 +174,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false bandwidth: 6 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 3052dc76b..b7579f092 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -186,6 +186,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 2 +flatten: false # Model Settings model_type: 24khz diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 4c61228a2..7e4b5e0be 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -228,6 +228,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 6 +flatten: false freeze_lm_head: False diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 6596858b2..c35aaa4f9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -185,6 +185,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false # Model Settings model_hub: facebook/encodec_24khz diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index a23789f15..efd408469 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -184,6 +184,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false # Model Settings espnet_repo: https://github.com/espnet/espnet diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 3b9bd8214..b6f699cf9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -184,6 +184,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false # Model Settings model_hub: fnlp/SpeechTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 1d596c3fa..5ea73a123 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -178,6 +178,7 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 +flatten: false ternary_num_digits: 10 ternary_tokenizer_mode: argmax pred_mode: ternary @@ -222,7 +223,6 @@ lm_head: !apply:speechbrain.utils.hparams.choice d_model: !ref d_hidden: !ref num_positions: !ref - norm: False tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 4e4d13c27..b63fe0d24 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -184,6 +184,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 1 +flatten: false # Model Settings model_hub: novateur/WavTokenizer-medium-music-audio-75token diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 2d3091654..07539443e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -160,7 +160,10 @@ def compute_objectives(self, predictions, batch, stage): if self.train_ar: logits_ar_sm = self.hparams.log_softmax(logits_ar) - targets_ar = prompt[:, 1:, 0] + if self.hparams.flatten: + targets_ar = prompt[:, 1:] + else: + targets_ar = prompt[:, 1:, 0] loss_ar = self.hparams.compute_cost( logits_ar_sm, targets=targets_ar, mask=mask ) @@ -288,11 +291,11 @@ def apply_curriculum(self): else self.modules.model.lm_head ) lm_head.requires_grad_(True) - if self.hparams.audio_tokens_per_step == 1: + if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: 
# NOTE: If there is only one track it's autoregressive self.train_nar = False elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: - self.train_nar = False + self.train_nar = False elif ( self.hparams.number_of_epochs_nar is not None and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 619e555ca..2223c0991 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -350,7 +350,11 @@ def inference( if torch.any(modality_change_mask): modality_index = torch.where( modality_change_mask, prev_tok[:, 0], modality_index, - ) + ).flatten().squeeze() + if modality_index.dim() == 0: + modality_index = modality_index.unsqueeze(0) + if modality_index.size(0) > 1: + modality_index = modality_index[0:1] mask = modality_index_to_mask(modality_index, opts) logging.warning( f"Step {step}: change modality index {modality_index}" From 9011781577f3c11c184dabaaa6c90ade0a7d8915 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 15 Mar 2025 23:06:04 -0400 Subject: [PATCH 232/270] DASB: SQCodec updates --- .../LJSpeech/TTS/valle/hparams/train_sqcodec.yaml | 2 ++ .../LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 13 ++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index 7d3cb278e..fb1ca4d33 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -148,6 +148,7 @@ token_model_kwargs: ####################### Model parameters ########################### # Transformer d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" +ternary_d_hidden: 512 share_emb: False qk_norm: True nhead: 16 @@ -207,6 +208,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice choices: ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref + d_hidden: !ref num_positions: !ref * tokens: null diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 5ea73a123..ec95ebaf6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -178,9 +178,8 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 -flatten: false +flatten: true ternary_num_digits: 10 -ternary_tokenizer_mode: argmax pred_mode: ternary # Model Settings @@ -194,7 +193,7 @@ freeze_lm_head: False model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length vocab_size: !ref - nq: !ref + nq: 1 att_unit: !ref head: !ref ar_layer: !ref @@ -213,7 +212,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions eos: !ref minlenratio: 1.0 maxlenratio: !ref - nq: !ref + nq: 1 top_k: !ref lm_head: !apply:speechbrain.utils.hparams.choice @@ -222,7 +221,7 @@ lm_head: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryPredictionHead d_model: !ref d_hidden: !ref - num_positions: !ref + num_positions: !ref * tokens: null logits_to_probs: !apply:speechbrain.utils.hparams.choice @@ -231,14 +230,14 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice ternary: !new:model.custom_model.TernaryLogitTokenizer num_tokens: !ref num_positions: !ref - mode: !ref tokens: !new:torch.nn.Identity emb: 
!new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding num_digits: !ref + flat: True linear: !new:speechbrain.nnet.linear.Linear - input_size: !ref + input_size: !ref * n_neurons: !ref tokenizer: !new:utils.tokenizer_interface.SQCodecTokenizer From 9a7565212560fed07611e6fa9fa43f6048eebe86 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 16 Mar 2025 19:41:39 -0400 Subject: [PATCH 233/270] DASB: VALL-E fixes --- benchmarks/DASB/model/valle.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 2223c0991..110e4ca3d 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -308,7 +308,8 @@ def inference( modality_index = prev_tok.flatten() mask = modality_index_to_mask(modality_index, opts) tracks = prefix.size(-1) - if opts.nq == 1 and tracks > 1: + is_flattened = opts.nq == 1 and tracks > 1 + if is_flattened: prev_tok = prev_tok.unsqueeze(-1).expand(1, 1, tracks) mask_cache = [] modality_tokens = torch.tensor( @@ -342,7 +343,10 @@ def inference( # (3.3) detect modality swtich mask_cache.append(mask.clone()) - modality_change_mask = torch.isin(prev_tok[:, 0], modality_tokens) + mod_tok = prev_tok[:, 0] + if is_flattened: + mod_tok = mod_tok[:, 0] + modality_change_mask = torch.isin(mod_tok, modality_tokens) # Note: The ESPNET VALL-E had # modality_change_mask = torch.logical_and( # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, @@ -484,7 +488,9 @@ def inference( item_finish_idx = item_finish_idx[0] gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) gen_scores_list.append(gen_scores[b][:item_finish_idx]) - + if is_flattened: + gen_tokens_list = [item.squeeze(-2) for item in gen_tokens_list] + gen_scores_list = [item.squeeze(-2) for item in gen_scores_list] return gen_tokens_list, gen_scores_list def apply_lm_head(self, x, track): From add349ad8639af90f704d811009c744d5498c8c5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Mar 2025 11:55:31 -0400 Subject: [PATCH 234/270] DASB: Fixes --- benchmarks/DASB/model/valle.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 110e4ca3d..61fd5bb08 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -310,7 +310,7 @@ def inference( tracks = prefix.size(-1) is_flattened = opts.nq == 1 and tracks > 1 if is_flattened: - prev_tok = prev_tok.unsqueeze(-1).expand(1, 1, tracks) + prev_tok = prev_tok.expand(1, tracks) mask_cache = [] modality_tokens = torch.tensor( list(opts.masks.keys()), device=prefix.device @@ -318,6 +318,8 @@ def inference( for step in range(maxlen): # (3.2) AR loop + if is_flattened: + prev_tok = prev_tok.unsqueeze(1) prev_emb = self.emb(prev_tok).squeeze(2) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] @@ -331,7 +333,7 @@ def inference( nq_level=0, ) # [B, 1, 1] -> [B, 1] - gen_tok, gen_score = gen_tok.squeeze(2), gen_score.squeeze(2) + gen_tok, gen_score = gen_tok.squeeze(1), gen_score.squeeze(1) generated["token"].append(gen_tok) generated["score"].append(gen_score) @@ -343,10 +345,7 @@ def inference( # (3.3) detect modality swtich mask_cache.append(mask.clone()) - mod_tok = prev_tok[:, 0] - if is_flattened: - mod_tok = mod_tok[:, 0] - modality_change_mask = torch.isin(mod_tok, modality_tokens) + modality_change_mask = torch.isin(prev_tok[:, 0], 
modality_tokens) # Note: The ESPNET VALL-E had # modality_change_mask = torch.logical_and( # prev_tok[:, 0] >= 32, prev_tok[:, 0] < 64, @@ -484,13 +483,8 @@ def inference( gen_tokens_list, gen_scores_list = [], [] for b in range(len(valid_idx)): item_finish_idx = finish_idx[b] - if len(item_finish_idx) > 1: - item_finish_idx = item_finish_idx[0] gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) gen_scores_list.append(gen_scores[b][:item_finish_idx]) - if is_flattened: - gen_tokens_list = [item.squeeze(-2) for item in gen_tokens_list] - gen_scores_list = [item.squeeze(-2) for item in gen_scores_list] return gen_tokens_list, gen_scores_list def apply_lm_head(self, x, track): From 331bad099ffd989a2d98b6e1a19653333e92505f Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 17 Mar 2025 21:38:40 -0400 Subject: [PATCH 235/270] DASB: Train dataset data loader fix --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 07539443e..d0208518b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -23,11 +23,13 @@ length_to_mask, write_audio, ) +from speechbrain.dataio.dataloader import LoopedLoader from speechbrain.utils.data_utils import pad_right_to from speechbrain.utils.distributed import run_on_main from speechbrain.utils.data_utils import batch_pad_right from speechbrain.dataio.dataset import FilteredSortedDynamicItemDataset from functools import partial +from torch.utils.data import DataLoader import re import string @@ -648,11 +650,14 @@ def fit( "Test only mode, skipping training and validation stages." ) return - + if not ( + isinstance(train_set, DataLoader) + or isinstance(train_set, LoopedLoader) + ): + train_set = self.make_dataloader( + train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs + ) self.on_fit_start() - train_set = self.make_dataloader( - train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs - ) epoch = self.hparams.epoch_counter.current if epoch < self.hparams.number_of_epochs: valid_set = sample_dataset( @@ -892,7 +897,6 @@ def sig_pipeline(wav): raise NotImplementedError( "sorting must be random, ascending or descending" ) - return datasets, resample_fn From 17ebf5d4ffef409b5fd5a719d6bbd27e84358f84 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 28 Mar 2025 02:29:51 -0400 Subject: [PATCH 236/270] DASB: Add a fallback for hparams files --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d0208518b..94dd6c746 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1251,6 +1251,8 @@ def undo_padding_tensor(batch, lengths): # Load evaluation hyperparameters eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" if eval_hparams_file.exists(): logger.info( "Using evaluation hyperparameters from %s", eval_hparams_file From 47744ab075cd76831503542f29a8de9aaad9edff Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 28 Mar 2025 02:46:43 -0400 Subject: [PATCH 237/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 7b61a18a7..9359a2b24 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -185,6 +185,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 +flatten: false # Model Settings model_hub: kyutai/mimi From fa87f1dae5752d3fdcbbf8235d6303edf50f4722 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 16 Apr 2025 16:21:48 -0400 Subject: [PATCH 238/270] DASB: Fix the summary.json check --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 7d99c5c7d..71ca5b37b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -1015,7 +1015,7 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + test_summary_file = Path(hparams["output_folder"]).glob("eval/test/*/summary.json") if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: From bb08a3ae1c9394506e06c71cf88d890988d037aa Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 16 Apr 2025 16:30:21 -0400 Subject: [PATCH 239/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 71ca5b37b..323134e90 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -1015,8 +1015,8 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]).glob("eval/test/*/summary.json") - if test_summary_file.exists(): + test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + if test_summary_file is not None: logging.info("Test run already completed: %s", test_summary_file) else: test_key_kind = hparams["test_key_kind"] From 33daea82a17dc57a465d3673d3d42550324a20e0 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 20 Apr 2025 08:55:51 -0400 Subject: [PATCH 240/270] DASB: Fixes --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 94dd6c746..d0abffc47 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1336,8 +1336,8 @@ def undo_padding_tensor(batch, lengths): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" - if test_summary_file.exists(): + test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + if test_summary_file is not None: logging.info("Test run already completed: %s", test_summary_file) else: test_key_kind = hparams["test_key_kind"] From d27a9ef32ea3f44ea0a7960acae1fc073ca36b03 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 21 Apr 2025 15:23:00 -0400 Subject: [PATCH 
241/270] DASB: Add memory fraction (to share a large GPU) --- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 17 +++++++++++++++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 872a190c5..c932fc872 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -13,6 +13,7 @@ import logging import speechbrain as sb +import os import torch import sys import shutil @@ -727,6 +728,19 @@ def get_offsets(vocab_size, tracks): return torch.arange(tracks) * vocab_size +def apply_mem_fraction(): + """Applies the memory fraction, based on environment variables, useful for cases where + multiple experiments share a large GPU""" + if not torch.cuda.is_available(): + return + mem_fraction = os.environ.get("SB_CUDA_MEM_FRACTION") + if mem_fraction: + fraction, device = mem_fraction.split(":") + fraction, device = float(fraction), int(device) + logger.info("Using %f of GPU %f", fraction, device) + torch.cuda.set_per_process_memory_fraction(fraction, device) + + def init_sequence_encoder(hparams): """Initialize a sequence encoder @@ -895,6 +909,9 @@ def undo_padding_tensor(batch, lengths): # Initialize ddp (useful only for multi-GPU DDP training) sb.utils.distributed.ddp_init_group(run_opts) + # Applies the memory fraction for a shared GPU + apply_mem_fraction() + # Load hyperparameters file with command-line overrides with open(hparams_file) as fin: yaml = fin.read() diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index d0abffc47..738ac2a3f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -31,6 +31,7 @@ from functools import partial from torch.utils.data import DataLoader import re +import os import string base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) @@ -941,6 +942,19 @@ def get_offsets(vocab_size, tracks): return torch.arange(tracks) * vocab_size +def apply_mem_fraction(): + """Applies the memory fraction, based on environment variables, useful for cases where + multiple experiments share a large GPU""" + if not torch.cuda.is_available(): + return + mem_fraction = os.environ.get("SB_CUDA_MEM_FRACTION") + if mem_fraction: + fraction, device = mem_fraction.split(":") + fraction, device = float(fraction), int(device) + logger.info("Using %f of GPU %f", fraction, device) + torch.cuda.set_per_process_memory_fraction(fraction, device) + + def group_by_speaker(dataset, hparams): """Groups utterance IDs in a dataset by speaker, for selection. 
The selection is stable based on the seed - calling this method multiple times will always @@ -1245,6 +1259,9 @@ def undo_padding_tensor(batch, lengths): # Initialize ddp (useful only for multi-GPU DDP training) sb.utils.distributed.ddp_init_group(run_opts) + # Applies the memory fraction for a shared GPU + apply_mem_fraction() + # Load hyperparameters file with command-line overrides with open(hparams_file) as fin: yaml = fin.read() From 4ae6e86dff07b87325b54e6a191678b205fbf214 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 21 Apr 2025 18:23:15 -0400 Subject: [PATCH 242/270] DASB: Fix kmeans path conflicts --- .../DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index b2a5f37dc..bba258f8d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -233,7 +233,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref + save_path: !ref / ssl_model: !ref vocoder_repo_id: !ref kmeans_dataset: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 7e4b5e0be..e7e4657aa 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -254,7 +254,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer - save_path: !ref + save_path: !ref / ssl_model: !ref vocoder_repo_id: !ref kmeans_dataset: !ref From 27a460875fe948b67c109c870acf3bcc402ba851 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 26 Apr 2025 22:55:01 -0400 Subject: [PATCH 243/270] DASB: Mimi fix --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 9359a2b24..40d3f03f6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -209,7 +209,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref eos: !ref - minlenratio: 1.0 + minlenratio: 0.0 maxlenratio: !ref nq: !ref From 6de5acbd32962bd97b9e069128cd9c077b2cae38 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 13 May 2025 20:23:11 -0400 Subject: [PATCH 244/270] DASB: WER/CER fix --- benchmarks/DASB/utils/eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 76f2a6c2f..da5d71ddb 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -483,8 +483,10 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): predicted_words = [self.normalize(text) for text in predicted_words] ids = range(1, len(wavs) + 1) wer_metric, cer_metric = init_asr_metrics() - wer_metric.append(ids, predicted_words, text) - cer_metric.append(ids, predicted_words, text) + 
predicted_words_split = [item.split(" ") for item in predicted_words] + text_split = [item.split(" ") for item in text] + wer_metric.append(ids, predicted_words_split, text_split) + cer_metric.append(ids, predicted_words_split, text_split) wer = torch.tensor( [score["WER"] for score in wer_metric.scores], device=wavs.device ) From 7210b3c7ccbf39a1dd1cef1be2f7edde46c557a6 Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 14 May 2025 09:21:09 -0400 Subject: [PATCH 245/270] WER/CER fixes --- benchmarks/DASB/utils/eval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index da5d71ddb..0bd3cac30 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -330,6 +330,8 @@ def compute_diff_rate(self, details, device): wer_metric, cer_metric = init_asr_metrics() pred = self._replace_blanks(details["pred"]) pred_ref = self._replace_blanks(details["pred_ref"]) + pred = [item.split(" ") for item in pred] + pred_ref = [item.split(" ") for item in pred_ref] wer_metric.append(ids, pred, pred_ref) cer_metric.append(ids, pred, pred_ref) dwer = torch.tensor( From 4c5dba53d52c61cf893b3bb2ae48dc41817cc6f4 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 16 May 2025 11:23:57 -0400 Subject: [PATCH 246/270] DASB: VALL-E: Added an option to do preparation only without training --- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 1 + .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_encodec.yaml | 1 + .../valle/hparams/train_espnet_encodec.yaml | 1 + .../TTS/valle/hparams/train_mimi.yaml | 1 + .../valle/hparams/train_speech_tokenizer.yaml | 1 + .../TTS/valle/hparams/train_sqcodec.yaml | 1 + .../TTS/valle/hparams/train_wavtokenizer.yaml | 1 + benchmarks/DASB/LibriTTS/TTS/valle/train.py | 93 ++++++++++--------- 9 files changed, 55 insertions(+), 46 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index b7579f092..c3cc4a750 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -76,6 +76,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index e7e4657aa..4ef2d230c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -95,6 +95,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index c35aaa4f9..df8513cb8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -75,6 +75,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 
efd408469..3587b3b23 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -75,6 +75,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 40d3f03f6..16348d9e7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -76,6 +76,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index b6f699cf9..ac8172585 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -75,6 +75,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index ec95ebaf6..c49c6d88c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -75,6 +75,7 @@ max_grad_norm: 0.01 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index b63fe0d24..22b8c19f3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -75,6 +75,7 @@ max_grad_norm: 1.0 sorting: random num_workers: 4 skip_prep: False +prep_only: False overfit_test: False overfit_test_sample_count: !ref overfit_test_epoch_data_count: 1000 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 738ac2a3f..7014131cc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1321,51 +1321,52 @@ def undo_padding_tensor(batch, lengths): }, ) - # We can now directly create the datasets for training, valid, and test - datasets, resample_fn = dataio_prepare(hparams) - - # Apply overfit test settings - datasets = apply_overfit_test(hparams, datasets) - audio_keys = ["audio_tokens"] - - # Trainer initialization - tts_brain = VALLEBrain( - modules=hparams["modules"], - opt_class=hparams["opt_class"], - hparams=hparams, - run_opts=run_opts, - checkpointer=hparams["checkpointer"], - ) + if not hparams.get("prep_only"): + # We can now directly create the datasets for training, valid, and test + datasets, resample_fn = dataio_prepare(hparams) + + # Apply overfit test settings + datasets = apply_overfit_test(hparams, datasets) + audio_keys = ["audio_tokens"] + + # Trainer initialization + tts_brain = VALLEBrain( + modules=hparams["modules"], + 
opt_class=hparams["opt_class"], + hparams=hparams, + run_opts=run_opts, + checkpointer=hparams["checkpointer"], + ) - tts_brain.resample_fn = resample_fn - - # The `fit()` method iterates the training loop, calling the methods - # necessary to update the parameters of the model. Since all objects - # with changing state are managed by the Checkpointer, training can be - # stopped at any point, and will be resumed on next call. - tts_brain.fit( - tts_brain.hparams.epoch_counter, - datasets["train"], - datasets["valid"], - train_loader_kwargs=hparams["train_dataloader_opts"], - valid_loader_kwargs=hparams["valid_dataloader_opts"], - ) + tts_brain.resample_fn = resample_fn + + # The `fit()` method iterates the training loop, calling the methods + # necessary to update the parameters of the model. Since all objects + # with changing state are managed by the Checkpointer, training can be + # stopped at any point, and will be resumed on next call. + tts_brain.fit( + tts_brain.hparams.epoch_counter, + datasets["train"], + datasets["valid"], + train_loader_kwargs=hparams["train_dataloader_opts"], + valid_loader_kwargs=hparams["valid_dataloader_opts"], + ) - # Load best checkpoint for evaluation - if hparams["testing"]: - test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) - if test_summary_file is not None: - logging.info("Test run already completed: %s", test_summary_file) - else: - test_key_kind = hparams["test_key_kind"] - test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } - eval_dataset = datasets["test"] - eval_dataset = select_eval_subset(eval_dataset, hparams) - tts_brain.evaluate( - test_set=eval_dataset, - test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs - ) + # Load best checkpoint for evaluation + if hparams["testing"]: + test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + if test_summary_file is not None: + logging.info("Test run already completed: %s", test_summary_file) + else: + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + eval_kwargs = { + f"{test_key_kind}_key": test_key + } + eval_dataset = datasets["test"] + eval_dataset = select_eval_subset(eval_dataset, hparams) + tts_brain.evaluate( + test_set=eval_dataset, + test_loader_kwargs=hparams["test_dataloader_opts"], + **eval_kwargs + ) From 7fec49f8bda36ef35412271e8e8f6c582996dded Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 17 May 2025 00:08:13 -0400 Subject: [PATCH 247/270] DASB: VALL-E: Add a duration filter --- .../DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 5 +++++ .../TTS/valle/hparams/train_discrete_ssl.yaml | 5 +++++ .../LibriTTS/TTS/valle/hparams/train_encodec.yaml | 5 +++++ .../TTS/valle/hparams/train_espnet_encodec.yaml | 6 ++++++ .../DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 5 +++++ .../TTS/valle/hparams/train_speech_tokenizer.yaml | 5 +++++ .../LibriTTS/TTS/valle/hparams/train_sqcodec.yaml | 5 +++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 5 +++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 13 +++++++++++++ 9 files changed, 54 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index c3cc4a750..cc52722b1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -54,6 +54,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", 
"test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 4ef2d230c..7bf2be75f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -73,6 +73,11 @@ spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec use_spk_emb: False splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index df8513cb8..0114bdcb1 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -53,6 +53,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 3587b3b23..f33998f26 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -53,6 +53,12 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 16348d9e7..daf37c2a7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -54,6 +54,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index ac8172585..407562365 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -53,6 +53,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index c49c6d88c..66fb3535a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -53,6 +53,11 @@ flip_layers: False use_token_offsets: False splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 22b8c19f3..7780a3fc3 100644 --- 
a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -53,6 +53,11 @@ flip_layers: False use_token_offsets: True splits: ["train", "valid", "test"] +# Duration Filter +duration_min: null +duration_max: null + + ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 7014131cc..26758e076 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -874,6 +874,19 @@ def sig_pipeline(wav): dynamic_dataset = dynamic_dataset.filtered_sorted( key_test={"has_alignments": lambda value: value} ) + duration_min = hparams.get("duration_min") + duration_max = hparams.get("duration_max") + if duration_min or duration_max: + key_min_value = None + key_max_value = None + if duration_min: + key_min_value = {"duration": duration_min} + if duration_max: + key_max_value = {"duration": duration_max} + dynamic_dataset = dynamic_dataset.filtered_sorted( + key_min_value=key_min_value, + key_max_value=key_max_value, + ) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False From 3d67a55d711b4ac0ec7addc73be33b6c2bc148f5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 17 May 2025 16:58:02 -0400 Subject: [PATCH 248/270] DASB: A fix for broken annotations --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 26758e076..9acb98831 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -887,6 +887,14 @@ def sig_pipeline(wav): key_min_value=key_min_value, key_max_value=key_max_value, ) + dynamic_dataset = dynamic_dataset.filtered_sorted( + key_test={ + "wrd": lambda wrd: not any( + "{" in item + for item in wrd + ) + } + ) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False From bd47e648d17a5ebd3ae0699048e055777715d584 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 20 May 2025 18:48:42 -0400 Subject: [PATCH 249/270] DASB: Minor fix for backward compatibility --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index ec1845d36..5b9082da5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -935,8 +935,8 @@ def apply_overfit_test(hparams, dataset): logging.info("Test run already completed: %s", test_summary_file) else: eval_kwargs = {} - test_key_kind = hparams["test_key_kind"] - test_key = hparams["test_key"] + test_key_kind = hparams.get("test_key_kind", "min") + test_key = hparams.get("test_key") if test_key: eval_kwargs = { f"{test_key_kind}_key": test_key From 42ecf13826e57f3f405ca5fc4292667361470710 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 22 May 2025 15:10:00 -0400 Subject: [PATCH 250/270] DASB: Add inference grid search and micro dWER --- .../DASB/LJSpeech/TTS/valle/evaluation.py | 2 + .../DASB/LibriTTS/TTS/valle/evaluation.py | 5 + .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 24 +- .../valle/hparams/train_espnet_encodec.yaml | 2 +- .../DASB/LibriTTS/TTS/valle/inference_fit.py | 348 ++++++++++++++++++ benchmarks/DASB/utils/eval.py | 70 +++- 6 files changed, 
441 insertions(+), 10 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py index 6c2dd1c8d..d5aaa649d 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -75,6 +75,8 @@ def on_evaluation_start(self, output_folder="eval"): self.read_reports() self.create_reports() self.item_ids = [] + for evaluator_key in self.enabled_evaluators: + self.evaluators[evaluator_key].on_evaluation_start() def on_evaluation_end(self): """Invoked at the beginning of the evaluation cycle. The default diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py index 9fd6da808..ebb619757 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -241,6 +241,11 @@ def summarize(self, field=None): items=self.details[evaluator_key], key=metric_key, ).items() } + for evaluator_key in self.enabled_evaluators: + result.update({ + f"{evaluator_key}_{stat_key}": value + for stat_key, value in + self.evaluators[evaluator_key].global_metrics().items()}) if field is not None: result = result[field] return result diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index 129cf9337..c58c1d49b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -8,6 +8,7 @@ eval_subset: null eval_asr_beam_size: 66 eval_asr_type: encoder_decoder eval_asr_source: openai/whisper-small +eval_asr_metric_mode: micro eval_spk_sim_source: microsoft/wavlm-base-sv evaluations: utmos,asr,spk_sim tmp_folder: null @@ -19,11 +20,17 @@ eval_utmos_domain_id: null eval_utmos_judge_id: null eval_perf: False +# Inference Fit +inference_fit_top_k: [20, 30] +inference_fit_sampling_temperature: [0.7, 0.8, 1.0, 1.2] +inference_fit_key_metric: dwer +inference_fit_key_metric_kind: min eval_asr: !name:utils.eval.WhisperASRSpeechEvaluator source: !ref sample_rate: !ref savedir: !ref + metric_mode: !ref eval_utmos: !name:utils.eval.UTMOSSpeechEvaluator source: !ref @@ -51,7 +58,22 @@ eval_summary: spk_sim: descriptive: ["score"] +dwer_metric_key: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + macro: asr_dwer_median + micro: asr_dwer_micro + eval_summary_log: utmos: utmos_utmos_mean - dwer: asr_dwer_median + dwer: !ref + spk_sim: spk_sim_score_mean + +inference_fit_space: + top_k: !ref + sampling_temperature: !ref + +inference_fit_metrics: + utmos: utmos_utmos_mean + dwer: !ref spk_sim: spk_sim_score_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index f33998f26..bc485eaa9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -264,4 +264,4 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref + seed: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py new file mode 100644 index 000000000..e2f02e95a --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py 
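Note on the new file below: inference_fit.py runs a small grid search over the values listed in inference_fit_space (top_k x sampling_temperature from eval.yaml), scores each combination on the evaluation subset, and keeps the setting that optimizes inference_fit_key_metric. As a rough standalone sketch of the enumeration it performs (illustration only, not part of the diff; the parameter names and values are assumed from eval.yaml):

    # Illustration: cartesian enumeration of an inference-fit search space,
    # equivalent in spirit to enumerate_space() defined in inference_fit.py below.
    from itertools import product

    space = {"top_k": [20, 30], "sampling_temperature": [0.7, 0.8, 1.0, 1.2]}

    def enumerate_space(space):
        keys = list(space.keys())
        return [dict(zip(keys, vals)) for vals in product(*space.values())]

    for params in enumerate_space(space):
        print(params)  # e.g. {'top_k': 20, 'sampling_temperature': 0.7}
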
@@ -0,0 +1,348 @@ +"""Inference fit grid search for VALL-E + +Curriculum inspired by Lifeiteng's VALL-E +https://github.com/lifeiteng/vall-e + +Authors + * Artem Ploujnikov 2024 +""" + +import speechbrain as sb +import sys +import csv +import torch +import operator +import yaml + +from hyperpyyaml import load_hyperpyyaml +from pathlib import Path +from torch import nn +from tqdm.auto import tqdm +from types import SimpleNamespace +from speechbrain.dataio.dataio import clean_padding +from speechbrain.utils.logger import get_logger +from speechbrain.utils.data_utils import batch_pad_right, pad_right_to + +base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) +sys.path.append(base_dir) + +from evaluation import SpeechEvaluationMetricStats # noqa: E402 +from train import undo_padding_tensor, get_offsets # noqa: E402 + +logger = get_logger(__name__) + +class InferenceFit: + """An inference fit wrapper""" + def __init__(self, hparams, run_opts): + device = run_opts.get("device", "cpu") + self.hparams = SimpleNamespace(**hparams) + self.modules = nn.ModuleDict(self.hparams.modules).to(device) + self.device = device + self.space = self.hparams.inference_fit_space + self.result = None + self.evaluation_metric = SpeechEvaluationMetricStats( + self.hparams, self.device + ) + self.offsets = get_offsets( + self.hparams.vocab_size, self.hparams.audio_tokens_per_step, + )[None, None, :].to(self.device) + if not self.hparams.use_token_offsets: + self.offsets = torch.zeros_like(self.offsets) + self.output_folder_rel = "eval/inference_fit" + self.output_folder = Path(self.hparams.output_folder) / self.output_folder_rel + self.token_model_kwargs = getattr( + self.hparams, "token_model_kwargs", {} + ) + + def fit(self, dataset): + """Performs infernece fitting + + Arguments + --------- + dataset: DynamicItemDataset + a dataset + + Returns + ------- + result: dict + the fit result + """ + self.result = [] + self.recover() + logger.info("Parameter Space: %s", format_space(self.space)) + evaluations = self.enumerate_param_space() + for idx, params in enumerate(tqdm(evaluations, desc="Parameter space")): + eval_result = self.evaluate(dataset, params) + self.result.append({"idx": idx, **params, **eval_result}) + self.best = self.find_best() + return self.result, self.best + + def find_best(self): + best = self.result[0] + op = ( + operator.lt + if self.hparams.inference_fit_key_metric_kind == "min" + else operator.gt + ) + for item in self.result[1:]: + value = item[self.hparams.inference_fit_key_metric] + if op(value, best[self.hparams.inference_fit_key_metric]): + best = item + return best + + def enumerate_param_space(self): + return enumerate_space(self.space) + + def evaluate(self, dataset, params): + dataloader = sb.dataio.dataloader.make_dataloader(dataset) + params_str = format_params(params) + logger.info("Starting evaluation of %s", params_str) + folder_name = params_to_folder_name(params) + self.evaluation_metric.on_evaluation_start(f"{self.output_folder_rel}/{folder_name}") + for batch in tqdm(dataloader, desc="Evaluation run", total=len(dataset)): + self.evaluate_batch(batch, params) + logger.info("Finished evaluation of %s", params_str) + self.evaluation_metric.on_evaluation_end() + summary = self.evaluation_metric.summarize() + metrics = { + key: summary.get(value, 0.0) + for key, value in self.hparams.inference_fit_metrics.items() + } + return metrics + + def evaluate_batch(self, batch, params): + audio_tokens, audio_length = self.inference(batch, params) + wav = 
self.create_waveform(audio_tokens, audio_length) + wav = wav.squeeze(1) + self.evaluation_metric.append( + ids=batch.uttid, + wav=wav, + text=batch.label_norm_eval, + length=audio_length, + wav_ref=batch.sig.data, + length_ref=batch.sig.lengths, + ) + + def write_report(self): + if self.result is None: + logger.warning("Nothing to report") + return + + report_file_name = self.output_folder / "results.csv" + report_file_name.parent.mkdir(parents=True, exist_ok=True) + with open(report_file_name, "w") as report_file: + columns = next(iter(self.result)).keys() + writer = csv.DictWriter(report_file, columns) + writer.writeheader() + for result in self.result: + writer.writerow(result) + best_file_name = self.output_folder / "best.yaml" + with open(best_file_name, "w") as best_file: + yaml.dump(self.best, best_file) + + def inference(self, batch, params): + """Runs TTS inference + + Arguments + --------- + batch : PaddedBatch + A batch + + Returns + ------- + audio : torch.Tensor + A padded tensor of audio + audio_length : torch.Tensor + Relative lengths + """ + prefix, prefix_length = batch.prefix + # NOTE: ESPNET VALL-E does not support batched inference + prefix_items = undo_padding_tensor(prefix.int(), prefix_length) + inference = self.modules.model.inference + inference_results = [ + inference( + prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts(params) + ) + for prefix_item in prefix_items + ] + inferred_tokens = [ + self._pad_inferred_sample(result) + for result in inference_results + ] + audio, audio_length = batch_pad_right(inferred_tokens) + audio_length = audio_length.to(self.device) + audio = (audio - hparams["audio_token_shift"] - self.offsets).clip(0) + return audio, audio_length + + # TODO: Duplicated in train, consider refactoring + def _pad_inferred_sample(self, result): + """Applies length padding to an inference result + + Arguments + --------- + result : list + The VALL-E Inference output + + Returns + ------- + sample : torch.Tensor + A sample, padded if needed + """ + if result[0]: + sample = result[0][0] + else: + sample = torch.zeros( + 1000, self.hparams.audio_tokens_per_step, device=self.device + ) + min_length = getattr(self.hparams, "infer_min_length", 10) + sample_length, tracks = sample.shape + if sample_length < min_length: + sample = pad_right_to( + sample, + (min_length, tracks), + )[0] + return sample + + def create_waveform(self, audio, length): + """Creates a waveform from a discrete or continuous audio + representation + + Arguments + --------- + audio : torch.Tensor + An audio tensor (Batch x Length x Heads or Batch x Length x Heads x Features) + lengths : torch.Tensor + A 1-D tensor + + Returns + ------- + wav : torch.Tensor + """ + tokenizer = self.modules.tokenizer + tokenizer.device = self.device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(self.device) + tokenizer.codec_vocoder.device = self.device + wav = tokenizer.tokens_to_sig( + audio, **self.token_model_kwargs + ) + wav = clean_padding(wav, length) + wav = wav.to(self.device) + return wav + + def _get_inference_opts(self, params): + idx = torch.arange(self.hparams.model_vocab_size, device=self.device)[ + None, : + ] + tracks = torch.arange( + self.hparams.audio_tokens_per_step, device=self.device + )[:, None] + if not self.hparams.use_token_offsets: + tracks = torch.zeros_like(tracks) + track_start = ( + self.hparams.audio_token_shift + + tracks * self.hparams.vocab_size + ) + if self.hparams.flip_layers: + track_start = track_start.flip(0) + track_end = 
track_start + self.hparams.vocab_size + mask = ( + ((idx >= track_start) & (idx < track_end)) + | (idx == self.hparams.bos_index) + ).logical_not() + mask[ + ( + (idx >= self.hparams.special_num_tokens) + & (idx <= self.hparams.audio_token_shift) + ).expand_as(mask) + ] = True + return self.hparams.inference_opts( + masks={self.hparams.bos_index: mask}, + **params, + device=self.device, + ) + + def recover(self): + test_key_kind = hparams["test_key_kind"] + test_key = hparams["test_key"] + kwargs = { + f"{test_key_kind}_key": test_key + } + logger.info("Revovering a checkpoint") + ckpt = self.hparams.checkpointer.recover_if_possible(**kwargs) + if not ckpt: + logger.error("Checkpoint not found - cannot evaluate") + raise ValueError("No checkpoint available") + logger.info("Checkpoint recovered: %s", ckpt) + + +def enumerate_space(space, entry=None, points=None): + if points is None: + points = [] + if not space: + points.append(entry) + return points + if entry is None: + entry = {} + key, values = next(iter(space.items())) + rest = dict(space) + del rest[key] + for value in values: + enumerate_space(rest, {**entry, key: value}, points) + return points + + +def format_space(space): + return ", ".join( + f"{parameter}: {values}" + for parameter, values in space.items() + ) + + +def format_params(params): + return ", ".join( + f"{key}={value}" + for key, value in params.items() + ) + + +def params_to_folder_name(params): + params_str = "-".join( + f"{key}-{value}" + for key, value in params.items() + ) + return f"eval-{params_str}" + + +if __name__ == "__main__": + # Reading command line arguments + hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:]) + + # Load hyperparameters file with command-line overrides + with open(hparams_file) as fin: + yaml_content = fin.read() + + # Load evaluation hyperparameters + eval_hparams_file = Path(hparams_file).parent / "eval.yaml" + if not eval_hparams_file.exists(): + eval_hparams_file = Path(__file__).parent / "hparams" / "eval.yaml" + if eval_hparams_file.exists(): + logger.info( + "Using evaluation hyperparameters from %s", eval_hparams_file + ) + with open(eval_hparams_file) as eval_hparams: + hparams_yaml = eval_hparams.read() + yaml_content = "\n".join([yaml_content, hparams_yaml]) + else: + logger.info( + "%s not found - not using evaluation hyperparameters", + eval_hparams_file, + ) + hparams = load_hyperpyyaml(yaml_content, overrides, overrides_must_match=True) + from train import dataio_prepare + datasets, _ = dataio_prepare(hparams) + dataset = datasets["valid"] + + inference_fit = InferenceFit(hparams, run_opts) + inference_fit.fit(dataset) + inference_fit.write_report() diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 0bd3cac30..6f4d4f808 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -204,6 +204,17 @@ def resample(self, audio, sample_rate=None): ) return audio + def on_evaluation_start(self): + """Invoked when evaluation starts""" + pass + + def on_evaluation_end(self): + """Invoked when evaluation ends""" + pass + + def global_metrics(self): + return {} + def _unbatchify(value): """Removes the batch dimension from the tensor. 
If a single @@ -238,7 +249,24 @@ def __call__(self, wavs, length): class ASRSpeechEvaluator(SpeechEvaluator): - """A superclass for ASR speech evaluators""" + """A superclass for ASR speech evaluators + + Arguments + --------- + sample_rate : int + The sample rate used by the underlying ASR system + metric_mode : str + macro = metrics are evaluated per utterance and aggregated + micro = metrics are evaluated globally + """ + + def __init__(self, sample_rate=16000, metric_mode="macro"): + super().__init__(sample_rate=sample_rate) + self.metric_mode = metric_mode + self.metrics = {} + + def on_evaluation_start(self): + self.metrics = {} def evaluate( self, @@ -327,21 +355,32 @@ def compute_diff_rate(self, details, device): """ ids = range(1, len(details["pred"]) + 1) - wer_metric, cer_metric = init_asr_metrics() + wer_metric, cer_metric = self.get_asr_metrics("diff") pred = self._replace_blanks(details["pred"]) pred_ref = self._replace_blanks(details["pred_ref"]) pred = [item.split(" ") for item in pred] pred_ref = [item.split(" ") for item in pred_ref] wer_metric.append(ids, pred, pred_ref) cer_metric.append(ids, pred, pred_ref) + count = len(ids) dwer = torch.tensor( - [score["WER"] for score in wer_metric.scores], device=device + [score["WER"] for score in wer_metric.scores[-count:]], device=device ) dcer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=device + [score["WER"] for score in cer_metric.scores[-count:]], device=device ) return {"dwer": dwer, "dcer": dcer} + def get_asr_metrics(self, kind="regular"): + if self.metric_mode == "micro": + if kind not in self.metrics: + metrics = init_asr_metrics() + self.metrics[kind] = metrics + metrics = self.metrics[kind] + else: + metrics = init_asr_metrics() + return metrics + def _replace_blanks(self, preds): """Replaces blanks with single spaces, preventing an exception in the case of an unintelligible sample @@ -351,6 +390,19 @@ def _replace_blanks(self, preds): """ return [" " if item == "" else item for item in preds] + def global_metrics(self): + global_metrics = {} + if self.metric_mode == "micro": + wer_metric, cer_metric = self.get_asr_metrics("diff") + if wer_metric.scores: + global_metrics["wer_micro"] = wer_metric.summarize("WER") + global_metrics["cer_micro"] = cer_metric.summarize("WER") + dwer_metric, dcer_metric = self.get_asr_metrics("diff") + if dwer_metric.scores: + global_metrics["dwer_micro"] = dwer_metric.summarize("WER") + global_metrics["dcer_micro"] = dcer_metric.summarize("WER") + return global_metrics + class WhisperASRSpeechEvaluator(ASRSpeechEvaluator): """A speech evaluator implementation based on Whisper ASR @@ -383,12 +435,13 @@ def __init__( source, savedir=None, sample_rate=22050, + metric_mode="macro", min_decode_ratio=0.0, max_decode_ratio=1.0, run_opts=None, unbatch=True, ): - super().__init__(sample_rate=sample_rate) + super().__init__(sample_rate=sample_rate, metric_mode=metric_mode) if run_opts is None: run_opts = {} if savedir is None: @@ -484,16 +537,17 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): ) predicted_words = [self.normalize(text) for text in predicted_words] ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = init_asr_metrics() + wer_metric, cer_metric = self.get_asr_metrics() predicted_words_split = [item.split(" ") for item in predicted_words] text_split = [item.split(" ") for item in text] wer_metric.append(ids, predicted_words_split, text_split) cer_metric.append(ids, predicted_words_split, text_split) + count = len(ids) wer = torch.tensor( 
- [score["WER"] for score in wer_metric.scores], device=wavs.device + [score["WER"] for score in wer_metric.scores[-count:]], device=wavs.device ) cer = torch.tensor( - [score["WER"] for score in cer_metric.scores], device=wavs.device + [score["WER"] for score in cer_metric.scores[-count:]], device=wavs.device ) return { "wer": wer, From d6150c310c3e51a3bc1c5e1d615b3c42e1d8009b Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 23 May 2025 20:07:46 -0400 Subject: [PATCH 251/270] DASB: Add ASR-based selection + minor updates --- .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 3 +- .../valle/hparams/train_espnet_encodec.yaml | 22 +- .../DASB/LibriTTS/TTS/valle/inference_fit.py | 3 +- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 48 ++++- benchmarks/DASB/model/valle.py | 189 +++++++++++++++++- benchmarks/DASB/utils/data.py | 33 +++ 6 files changed, 282 insertions(+), 16 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index c58c1d49b..8fc302473 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -1,5 +1,6 @@ -eval_dataset: valid +eval_dataset: test eval_suffix: "" +eval_folder: null eval_sample_rate: 16000 eval_spk_sim_sample_rate: 16000 eval_samples: null diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index bc485eaa9..59c31bb99 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -110,6 +110,7 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 debug_infer_max_audio_length: 10 # Label encoder @@ -157,6 +158,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + ####################### Model parameters ########################### # Transformer @@ -222,6 +229,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface source: !ref @@ -264,4 +272,16 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger save_file: !ref spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler - seed: !ref \ No newline at end of file + seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index e2f02e95a..79fcea5d6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -339,9 +339,10 @@ def params_to_folder_name(params): eval_hparams_file, ) hparams = load_hyperpyyaml(yaml_content, overrides, overrides_must_match=True) - from train import dataio_prepare + from train import dataio_prepare, select_eval_subset # noqa datasets, _ = dataio_prepare(hparams) dataset = datasets["valid"] + dataset = 
select_eval_subset(dataset, hparams) inference_fit = InferenceFit(hparams, run_opts) inference_fit.fit(dataset) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 9acb98831..1030c1b9f 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -38,6 +38,7 @@ sys.path.append(base_dir) from evaluation import SpeechEvaluationMetricStats # noqa: E402 +from model.valle import DefaultSampleSelector logger = logging.getLogger(__name__) @@ -273,7 +274,7 @@ def on_stage_start(self, stage, epoch): self.evaluation_metric.on_evaluation_start() self.is_evaluating = True else: - logger.info("No evaluation on epoch %d", epoch) + logger.info("No evaluation on epoch %d", epoch) elif stage == sb.Stage.TEST: self.evaluation_metric.on_evaluation_start() self.is_evaluating = True @@ -282,6 +283,22 @@ def on_stage_start(self, stage, epoch): ) dataset = stage.name.lower() self.resample_fn[dataset](epoch=epoch or 0) + self.init_sample_selector(stage) + + def init_sample_selector(self, stage): + """Initializes the sample selector""" + if stage == sb.Stage.TRAIN: + self.sample_selector = None + else: + sample_selector = getattr( + self.hparams, "sample_selector", None + ) + if not sample_selector: + sample_selector = DefaultSampleSelector + self.sample_selector = sample_selector( + token_shift=self.hparams.audio_token_shift, + offsets=self.offsets + ) def apply_curriculum(self): """Applies curriculum settings, if specified, training only the autoregressive part - or @@ -484,13 +501,23 @@ def inference(self, batch): self.modules.model.module.inference if hasattr(self.modules.model, "module") else self.modules.model.inference - ) + ) + logger.info("Running inference") inference_results = [ inference( prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts() ) for prefix_item in prefix_items ] + logger.info("Running selection") + inference_results = [ + self.sample_selector.select( + tokens, + scores, + label + ) + for (tokens, scores), label in zip(inference_results, batch.label_norm_eval) + ] inferred_tokens = [ self._pad_inferred_sample(result) for result in inference_results @@ -513,8 +540,8 @@ def _pad_inferred_sample(self, result): sample : torch.Tensor A sample, padded if needed """ - if result[0]: - sample = result[0][0] + if result is not None: + sample = result else: sample = torch.zeros( 1000, self.hparams.audio_tokens_per_step, device=self.device @@ -579,8 +606,15 @@ def save_eval(self, stage): def _get_eval_output_folder(self, stage): epoch = self.hparams.epoch_counter.current + eval_folder_name = None + if stage == sb.Stage.TEST and self.hparams.eval_folder: + eval_folder_name = self.hparams.eval_folder + if not eval_folder_name: + eval_folder_name = stage.name.lower() + if self.hparams.eval_suffix: + eval_folder_name += self.hparams.eval_suffix output_folder = ( - Path(self.hparams.output_folder) / "eval" / stage.name.lower() + Path(self.hparams.output_folder) / "eval" / eval_folder_name ) if epoch is not None: output_folder = output_folder / str(epoch) @@ -1384,7 +1418,9 @@ def undo_padding_tensor(batch, lengths): eval_kwargs = { f"{test_key_kind}_key": test_key } - eval_dataset = datasets["test"] + eval_dataset_key = hparams["eval_dataset"] + logger.info("Performing final evaluation on the %s dataset", eval_dataset_key) + eval_dataset = datasets[eval_dataset_key] eval_dataset = select_eval_subset(eval_dataset, hparams) tts_brain.evaluate( test_set=eval_dataset, diff --git 
a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 61fd5bb08..665a5a570 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -12,11 +12,16 @@ # Implementation of Vall-E: https://arxiv.org/abs/2301.02111 +from io import StringIO import logging +import re +import string import torch import inspect +import torchaudio from typing import Tuple, Optional from speechbrain.dataio.dataio import length_to_mask +from speechbrain.utils.metric_stats import ErrorRateStats from torch import Tensor from torch import nn @@ -24,6 +29,13 @@ from dataclasses import dataclass from speechbrain.nnet.losses import reduce_loss, truncate +from speechbrain.lobes.models.huggingface_transformers import Whisper +from speechbrain.decoders.seq2seq import S2SWhisperGreedySearcher +from speechbrain.utils.data_utils import batch_pad_right +from speechbrain.utils.logger import get_logger +from utils.data import undo_padding_tensor + +logger = get_logger(__name__) @dataclass @@ -353,11 +365,13 @@ def inference( if torch.any(modality_change_mask): modality_index = torch.where( modality_change_mask, prev_tok[:, 0], modality_index, - ).flatten().squeeze() - if modality_index.dim() == 0: - modality_index = modality_index.unsqueeze(0) - if modality_index.size(0) > 1: - modality_index = modality_index[0:1] + ) + if is_flattened: + modality_index = modality_index.flatten().squeeze() + if modality_index.dim() == 0: + modality_index = modality_index.unsqueeze(0) + if modality_index.size(0) > 1: + modality_index = modality_index[0:1] mask = modality_index_to_mask(modality_index, opts) logging.warning( f"Step {step}: change modality index {modality_index}" @@ -486,10 +500,10 @@ def inference( gen_tokens_list.append(gen_tokens[b][:item_finish_idx]) gen_scores_list.append(gen_scores[b][:item_finish_idx]) return gen_tokens_list, gen_scores_list - + def apply_lm_head(self, x, track): """Applies the language model head - + Arguments --------- """ @@ -1228,3 +1242,164 @@ def masked_nll_loss( loss *= mask loss = reduce_loss(loss, mask, reduction, 0.0, log_probabilities, targets) return loss + + +class SampleSelector: + """A base class for sample selectors""" + + def select(self, tokens, scores, label): + """Performs selection + + Arguments + --------- + tokens : list + The generated tokens + + scores : list + The scores + + label : str + The label for the sample + """ + raise NotImplementedError() + + +class DefaultSampleSelector(SampleSelector): + def __init__(self, **kwargs): + pass + + def select(self, tokens, scores, text): + return tokens[0] + + +RE_PUNCTUATION = re.compile( + "|".join(re.escape(char) for char in string.punctuation) +) + + +class WhisperASRSampleSelector(SampleSelector): + """A selector implemented using Whisper + + Arguments + --------- + tokenizer: BaseTokenizer + A tokenizer interface + source : str + The source for the Whisper model + savedir : str + The path where the Whisper model will be saved + model : Whisper + Alternatively, a pre-initialized Whisper model instance + sample_rate : int + The sample rate of the underlying Whisper model + tokenizer_sample_rate : int + The sample rate of the tokenizer provided + min_decode_ratio : float + The minimum decode ratio for ASR + max_decode_ratio : float + The maximum decode ratio for ASR + language : str + The ASR language + debug : bool + Whether debug mode is enabled. 
This will trigger + more verbose logging, including a WER report + """ + def __init__( + self, + tokenizer, + source=None, + savedir=None, + model=None, + sample_rate=16000, + tokenizer_sample_rate=16000, + min_decode_ratio=0.0, + max_decode_ratio=1.0, + language="english", + token_shift=0, + offsets=None, + debug=False + ): + self.tokenizer = tokenizer + self.sample_rate = sample_rate + self.tokenizer_sample_rate = tokenizer_sample_rate + if model is not None: + self.model = model + else: + self.model = Whisper( + source, savedir, sample_rate, freeze=True, freeze_encoder=True, + ) + self.model.tokenizer.set_prefix_tokens(language, "transcribe", False) + self.searcher = S2SWhisperGreedySearcher( + self.model, + min_decode_ratio=min_decode_ratio, + max_decode_ratio=max_decode_ratio, + ) + self.token_shift = token_shift + self.offsets = offsets + self.debug = debug + + def select(self, tokens, scores, text): + tokens, length = batch_pad_right(tokens) + tokens_shift = tokens - self.token_shift + if self.offsets is not None: + tokens_shift = tokens_shift - self.offsets + tokens_shift = tokens_shift.clip(0) + wav = self.tokenizer.tokens_to_sig(tokens_shift) + if self.sample_rate != self.tokenizer_sample_rate: + wav = torchaudio.functional.resample( + wav, + orig_freq=self.tokenizer_sample_rate, + new_freq=self.sample_rate + ) + wav = undo_padding_tensor(wav, length) + metric = ErrorRateStats() + text = text.split(" ") + ids = range(len(wav)) + preds = [self.predict(wav_item).split(" ") for wav_item in wav] + metric.append(ids, preds, [text] * len(wav)) + sample_scores = [score["WER"] for score in metric.scores] + idx = torch.argmin(torch.tensor(sample_scores)).item() + logger.info( + "Ground truth text: %s, sample scores: %s, best: #%d", + text, + sample_scores, + idx + ) + if self.debug: + sio = StringIO() + metric.write_stats(sio) + logger.info("%s", sio.getvalue()) + return tokens[idx] + + def predict(self, wav): + if wav.dim() < 2: + wav = wav.unsqueeze(0) + wav = self.model.pad_or_trim(wav) + mels = self.model.log_mel_spectrogram(wav) + enc_out = self.model.forward_encoder(mels) + pred, _, _, _ = self.searcher(enc_out.detach(), torch.tensor(1., device=wav.device)) + pred = self.model.tokenizer.batch_decode( + pred, skip_special_tokens=True + )[0] + pred = self.normalize(pred) + return pred + + def normalize(self, text): + """Performs text normalization (uppercase, remove whitespace, + remove punctuation) + + Arguments + --------- + text : str + Unnormalized text + + Returns + ------- + text : str + Normalized text + """ + text = text.upper() + text = text.strip() + text = RE_PUNCTUATION.sub("", text) + return text + diff --git a/benchmarks/DASB/utils/data.py b/benchmarks/DASB/utils/data.py index 6c68358f5..3ad31419a 100644 --- a/benchmarks/DASB/utils/data.py +++ b/benchmarks/DASB/utils/data.py @@ -89,3 +89,36 @@ def _undo_padding(batch, lengths): def as_dict(batch): """Converts a batch to a dictionary""" return {key: getattr(batch, key) for key in batch._PaddedBatch__keys} + + +def undo_padding_tensor(batch, lengths): + """Produces Python lists given a batch of sentences with + their corresponding relative lengths. + + Arguments + --------- + batch : torch.Tensor + Batch of sentences gathered in a batch. + lengths : torch.Tensor + Relative length of each sentence in the batch. + + Returns + ------- + as_list : list + A python list of the corresponding input tensor. 
+ + Example + ------- + >>> batch=torch.rand([4,100]) + >>> lengths=torch.tensor([0.5,0.6,0.7,1.0]) + >>> snt_list=undo_padding(batch, lengths) + >>> len(snt_list) + 4 + """ + batch_max_len = batch.shape[1] + as_list = [] + for seq, seq_length in zip(batch, lengths): + actual_size = int(torch.round(seq_length * batch_max_len)) + seq_true = seq.narrow(0, 0, actual_size) + as_list.append(seq_true) + return as_list From 5804bbae7ae65db1e0be2e15b5a28300f586e427 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 23 May 2025 23:55:56 -0400 Subject: [PATCH 252/270] DASB: Fix the max validation set size --- benchmarks/DASB/LibriTTS/TTS/valle/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 1030c1b9f..5aae00e3a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -1373,6 +1373,7 @@ def undo_padding_tensor(batch, lengths): "seed": hparams["seed"], "alignments_folder": hparams.get("alignments_folder"), "model_name": hparams["model"].__class__.__name__, + "max_valid_size": hparams.get("max_valid_size", 10000) }, ) From dd65c62459f247c9d20f6d8820caac306266d69c Mon Sep 17 00:00:00 2001 From: flexthink Date: Sat, 24 May 2025 16:24:48 -0400 Subject: [PATCH 253/270] DASB: Evaluations and fit fixes --- benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py | 4 ++++ benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 4 ++-- benchmarks/DASB/utils/eval.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py index ebb619757..8ee32cb9d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -73,6 +73,8 @@ def on_evaluation_start(self, output_folder="eval"): self.read_reports() self.create_reports() self.item_ids = [] + for evaluator_key in self.enabled_evaluators: + self.evaluators[evaluator_key].on_evaluation_start() def on_evaluation_end(self): """Invoked at the beginning of the evaluation cycle. 
The default @@ -80,6 +82,8 @@ def on_evaluation_end(self): """ logger.info("Ending evaluation") self.write_summary() + for evaluator_key in self.enabled_evaluators: + self.evaluators[evaluator_key].on_evaluation_end() def create_reports(self): """Creates report files and report writers""" diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index 8fc302473..1e41dd473 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -22,8 +22,8 @@ eval_utmos_judge_id: null eval_perf: False # Inference Fit -inference_fit_top_k: [20, 30] -inference_fit_sampling_temperature: [0.7, 0.8, 1.0, 1.2] +inference_fit_top_k: [10, 20, 30] +inference_fit_sampling_temperature: [0.7, 0.8, 1.0, 1.2, 1.3] inference_fit_key_metric: dwer inference_fit_key_metric_kind: min diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 6f4d4f808..9e99bcce0 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -393,7 +393,7 @@ def _replace_blanks(self, preds): def global_metrics(self): global_metrics = {} if self.metric_mode == "micro": - wer_metric, cer_metric = self.get_asr_metrics("diff") + wer_metric, cer_metric = self.get_asr_metrics("regular") if wer_metric.scores: global_metrics["wer_micro"] = wer_metric.summarize("WER") global_metrics["cer_micro"] = cer_metric.summarize("WER") From 427be64c4089550031be523fef33907fe29d5740 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 25 May 2025 09:21:31 -0400 Subject: [PATCH 254/270] DASB: Add sampling temperature --- .../DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 59c31bb99..876235ffc 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -111,6 +111,7 @@ n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -230,6 +231,8 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions maxlenratio: !ref nq: !ref nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface source: !ref From 1a9aed4b3a03ab9227341aa78bee9fa5681bd121 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 26 May 2025 19:25:31 -0400 Subject: [PATCH 255/270] DASB: Fix the WER calculation bug --- benchmarks/DASB/utils/eval.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 9e99bcce0..1694355ec 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -321,6 +321,7 @@ def evaluate( length=length_ref, text=text, sample_rate=sample_rate_ref, + metric_key="ref", ) details.update( {f"{key}_ref": value for key, value in details_ref.items()} @@ -459,7 +460,7 @@ def __init__( self.unbatch = unbatch self.to(device) - def evaluate_samples(self, wavs, length, text, sample_rate): + def evaluate_samples(self, wavs, length, text, sample_rate, metric_key="regular"): """Evaluates a batch of samples Arguments @@ -472,6 +473,8 @@ def evaluate_samples(self, wavs, length, text, sample_rate): Text labels corresponding to the waveforms 
sample_rate : int The sample rate of the waveforms + metric_key : str + The key for metrics Returns ------- @@ -487,24 +490,25 @@ def evaluate_samples(self, wavs, length, text, sample_rate): torch.ones(1, device=wavs.device), text[idx : idx + 1], sample_rate, + metric_key, ) for idx in range(batch_size) ] result = { + "pred": [result["pred"][0] for result in results], + "target": text, "wer": torch.stack( [result["wer"] for result in results] ).squeeze(-1), "cer": torch.stack( [result["cer"] for result in results] ).squeeze(-1), - "pred": [result["pred"][0] for result in results], - "target": text, } return result else: return self._evaluate_samples(wavs, length, text, sample_rate) - def _evaluate_samples(self, wavs, length, text, sample_rate): + def _evaluate_samples(self, wavs, length, text, sample_rate, metric_key): """Evaluates a batch of samples. This function is meant to be used internally. evaluate_samples will call it multiple times if unbatch is enabled. @@ -519,6 +523,8 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): Text labels corresponding to the waveforms sample_rate : int The sample rate of the waveforms + metric_key : bool + Whether to compute the metrics Returns ------- @@ -537,7 +543,7 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): ) predicted_words = [self.normalize(text) for text in predicted_words] ids = range(1, len(wavs) + 1) - wer_metric, cer_metric = self.get_asr_metrics() + wer_metric, cer_metric = self.get_asr_metrics(metric_key) predicted_words_split = [item.split(" ") for item in predicted_words] text_split = [item.split(" ") for item in text] wer_metric.append(ids, predicted_words_split, text_split) @@ -549,12 +555,13 @@ def _evaluate_samples(self, wavs, length, text, sample_rate): cer = torch.tensor( [score["WER"] for score in cer_metric.scores[-count:]], device=wavs.device ) - return { + result = { "wer": wer, "cer": cer, "pred": predicted_words, "target": text, } + return result def normalize(self, text): """Performs text normalization (uppercase, remove whitespace, From 8e95fd1be00e3126d97323003d8dd9b1553c9864 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 27 May 2025 14:40:49 -0400 Subject: [PATCH 256/270] DASB: VALL-E: Add sample selection to other tokenizers --- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 23 ++++++++++++++++++ .../TTS/valle/hparams/train_discrete_ssl.yaml | 24 +++++++++++++++++++ .../TTS/valle/hparams/train_encodec.yaml | 22 +++++++++++++++++ .../TTS/valle/hparams/train_mimi.yaml | 23 ++++++++++++++++++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 23 ++++++++++++++++++ 5 files changed, 115 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index cc52722b1..58007632c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -110,6 +110,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -157,6 +159,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + freeze_lm_head: False ####################### Model parameters ########################### @@ -218,6 +226,9 @@ inference_opts: 
!name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.DACTokenizer model_type: !ref @@ -260,3 +271,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 7bf2be75f..c43c01244 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -129,6 +129,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -198,6 +200,13 @@ sample_dataloader_opts: token_model_kwargs: SSL_layers: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + + ####################### Model parameters ########################### # Transformer d_model: 1024 # @orion_step1: --d_model~"choices([512, 768, 1024])" @@ -258,6 +267,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer save_path: !ref / @@ -301,3 +313,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 0114bdcb1..2407a9e0e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -109,6 +109,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -156,6 +158,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + freeze_lm_head: False ####################### Model parameters ########################### @@ -217,6 +225,8 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: !ref @@ -260,3 +270,15 @@ train_logger: 
!new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index daf37c2a7..b242f353e 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -110,6 +110,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -157,6 +159,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + ####################### Model parameters ########################### # Transformer @@ -218,6 +226,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 0.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.MimiTokenizer source: !ref @@ -257,3 +268,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 7780a3fc3..53df3ebeb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -109,6 +109,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -156,6 +158,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + ####################### Model parameters ########################### # Transformer @@ -219,6 +227,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref @@ -260,3 +271,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + 
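# Illustrative usage note: `!apply:speechbrain.utils.hparams.choice` resolves `value`
# against the `choices` mapping at load time, so ASR-guided candidate selection can be
# enabled by overriding just a couple of keys (values shown are examples), e.g.
#   sample_selection: asr
#   inference_nbest: 5
# while the defaults above (`default` selector, nbest of 1) keep the previous
# single-sample behaviour.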
tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file From 6f60f5ae73d8f79b0910693db1f262968bbc8e04 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 27 May 2025 15:04:34 -0400 Subject: [PATCH 257/270] DASB: VALL-E: Add sample selection --- .../valle/hparams/train_speech_tokenizer.yaml | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 407562365..7f6a4bef6 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -109,6 +109,8 @@ spk_prompt_length: 150 n_ctx: !ref + + infer_max_audio_length: !ref max_length_ratio: 10.0 +inference_nbest: 1 +inference_sampling_temperature: 1.0 debug_infer_max_audio_length: 10 # Label encoder @@ -156,6 +158,12 @@ sample_dataloader_opts: padding_kwargs: value: !ref +# Sample Selector +sample_selection: default +sample_selector_source: openai/whisper-small +sample_selector_sample_rate: 16000 +sample_selector_debug: False + ####################### Model parameters ########################### # Transformer @@ -217,6 +225,9 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions minlenratio: 1.0 maxlenratio: !ref nq: !ref + nbest: !ref + sampling_temperature: !ref + tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper source: !ref # Only the 24kHz version supports mono audio @@ -256,3 +267,15 @@ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler seed: !ref + +sample_selector: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + default: !name:model.valle.DefaultSampleSelector + asr: !name:model.valle.WhisperASRSampleSelector + tokenizer: !ref + source: !ref + sample_rate: !ref + tokenizer_sample_rate: !ref + savedir: !ref + debug: !ref \ No newline at end of file From 1c7e41fb272141c3f2cdc3d66032019fdf7e0075 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 27 May 2025 23:05:50 -0400 Subject: [PATCH 258/270] DASB: VALL-E: Device fixes --- benchmarks/DASB/model/valle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 665a5a570..e0d8c08ba 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -1327,7 +1327,8 @@ def __init__( else: self.model = Whisper( source, savedir, sample_rate, freeze=True, freeze_encoder=True, - ) + ).to("cuda") + self.model.device = "cuda" self.model.tokenizer.set_prefix_tokens(language, "transcribe", False) self.searcher = S2SWhisperGreedySearcher( self.model, From c7d8866404960b4f36972fb581d296520baa7256 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 27 May 2025 23:24:18 -0400 Subject: [PATCH 259/270] DASB: Device fixes --- benchmarks/DASB/model/valle.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index e0d8c08ba..1f23e7cd0 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -1317,18 +1317,20 @@ def __init__( language="english", token_shift=0, offsets=None, - debug=False + debug=False, + device="cuda" ): self.tokenizer = tokenizer self.sample_rate = sample_rate self.tokenizer_sample_rate = tokenizer_sample_rate + # TODO: Pass the device if model is not None: 
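        # The selector's role, sketched: decode each candidate token sequence to a
        # waveform, transcribe it with the frozen Whisper model, and keep the candidate
        # whose transcript is closest to the target text. A minimal illustration of that
        # criterion (plain word error rate + argmin; names and shapes are illustrative,
        # the real class works on batched tensors and SpeechBrain metric objects):
        #
        #     def word_error_rate(hyp, ref):
        #         h, r = hyp.split(), ref.split()
        #         d = [[i + j if i * j == 0 else 0 for j in range(len(h) + 1)]
        #              for i in range(len(r) + 1)]
        #         for i in range(1, len(r) + 1):
        #             for j in range(1, len(h) + 1):
        #                 sub = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
        #                 d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, sub)
        #         return d[-1][-1] / max(len(r), 1)
        #
        #     def pick_best(transcripts, target_text):
        #         errors = [word_error_rate(t, target_text) for t in transcripts]
        #         return min(range(len(errors)), key=errors.__getitem__)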
self.model = model else: self.model = Whisper( source, savedir, sample_rate, freeze=True, freeze_encoder=True, - ).to("cuda") - self.model.device = "cuda" + ).to(device) + self.model.device = device self.model.tokenizer.set_prefix_tokens(language, "transcribe", False) self.searcher = S2SWhisperGreedySearcher( self.model, @@ -1338,6 +1340,11 @@ def __init__( self.token_shift = token_shift self.offsets = offsets self.debug = debug + tokenizer.device = device + if hasattr(tokenizer, "codec_vocoder"): + tokenizer.codec_vocoder.to(device) + tokenizer.codec_vocoder.device = device + def select(self, tokens, scores, text): tokens, length = batch_pad_right(tokens) From d3e94a01d9f9d13a18a29e7462fbd52358063083 Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 2 Jun 2025 22:53:29 -0400 Subject: [PATCH 260/270] DASB: Inference Fit: Device Fix --- benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index 79fcea5d6..e90fd62d7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -111,6 +111,7 @@ def evaluate(self, dataset, params): return metrics def evaluate_batch(self, batch, params): + batch = batch.to(self.device) audio_tokens, audio_length = self.inference(batch, params) wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) From fc1b0cec64f852a10dccda65a8aebe7fc79fbf2d Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 4 Jun 2025 23:09:02 -0400 Subject: [PATCH 261/270] DASB: add resume logic --- .../DASB/LibriTTS/TTS/valle/inference_fit.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index e90fd62d7..cc9bef811 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -7,6 +7,7 @@ * Artem Ploujnikov 2024 """ +import json import speechbrain as sb import sys import csv @@ -72,11 +73,32 @@ def fit(self, dataset): logger.info("Parameter Space: %s", format_space(self.space)) evaluations = self.enumerate_param_space() for idx, params in enumerate(tqdm(evaluations, desc="Parameter space")): - eval_result = self.evaluate(dataset, params) + if self.is_completed(params): + eval_result = self.get_result(params) + else: + eval_result = self.evaluate(dataset, params) self.result.append({"idx": idx, **params, **eval_result}) self.best = self.find_best() return self.result, self.best - + + def is_completed(self, params): + folder_name = params_to_folder_name(params) + path = self.output_folder / folder_name / "summary.json" + return path.exists() + + def get_result(self, params): + params_str = format_params(params) + logger.info("Retrieving params for completed run %s", params_str) + folder_name = params_to_folder_name(params) + path = self.output_folder / folder_name / "summary.json" + with open(path) as summary_file: + summary = json.load(summary_file) + result = { + key: summary.get(value, 0.0) + for key, value in self.hparams.inference_fit_metrics.items() + } + return result + def find_best(self): best = self.result[0] op = ( From 0de5ad2b03a156ea2ccb174ed93d01239b415a5c Mon Sep 17 00:00:00 2001 From: flexthink Date: Wed, 4 Jun 2025 23:34:04 -0400 Subject: [PATCH 262/270] DASB: Add top_k customization --- 
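Note: `inference_top_k` restricts sampling at each decoding step to the k highest-scoring
tokens; together with `inference_sampling_temperature` it controls how adventurous the AR
decoder is. Roughly (a self-contained sketch with illustrative names and shapes, not the
exact code in model/valle.py):

    import torch

    def sample_top_k(logits, top_k=20, temperature=1.0):
        # Keep the top_k largest logits, mask the rest out
        values, _ = logits.topk(top_k, dim=-1)
        cutoff = values[..., -1, None]
        filtered = logits.masked_fill(logits < cutoff, float("-inf"))
        # Temperature-scaled softmax, then draw one token per sequence
        probs = torch.softmax(filtered / temperature, dim=-1)
        return torch.multinomial(probs, num_samples=1).squeeze(-1)

Smaller top_k and lower temperature make generation more conservative; larger values add
diversity at the cost of more errors, which the nbest sample selector can then filter.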
benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 3 +++ .../DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml | 2 ++ benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml | 2 ++ .../DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml | 2 ++ benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml | 2 ++ .../LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml | 2 ++ .../DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml | 3 ++- 7 files changed, 15 insertions(+), 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 58007632c..3c59482b8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -112,6 +112,8 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -228,6 +230,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.DACTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index c43c01244..d8ea29110 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -131,6 +131,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -269,6 +270,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.DiscreteSSLTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 2407a9e0e..b7b19bb0d 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -111,6 +111,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -227,6 +228,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.EncodecTokenizer source: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 876235ffc..04f0dceff 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -112,6 +112,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -232,6 +233,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 
b242f353e..91249b59b 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -112,6 +112,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -228,6 +229,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.MimiTokenizer diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index 7f6a4bef6..e306f9802 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -111,6 +111,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -227,6 +228,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.SpeechTokenizerWrapper diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 53df3ebeb..9b1733257 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -111,6 +111,7 @@ infer_max_audio_length: !ref max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 +inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder @@ -229,7 +230,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref - + top_k: !ref tokenizer: !new:utils.tokenizer_interface.WavTokenizerWrapper source: !ref From b59d0e4b3b2e8c2ec30be4753d29d1c89eaa0070 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 5 Jun 2025 11:01:20 -0400 Subject: [PATCH 263/270] DASB: Remove a duplicate setting --- benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index 3c59482b8..db6104f1c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -113,7 +113,6 @@ max_length_ratio: 10.0 inference_nbest: 1 inference_sampling_temperature: 1.0 inference_top_k: 20 -inference_top_k: 20 debug_infer_max_audio_length: 10 # Label encoder From 62be9d2e1ebde3dc510a2e539302461f94b0df3e Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 10 Jun 2025 21:56:10 -0400 Subject: [PATCH 264/270] DASB: Add a generator saver/loader for better reproducibility when interrupted --- .../TTS/tokotron/hparams/train_dac.yaml | 3 + .../tokotron/hparams/train_discrete_ssl.yaml | 3 + .../TTS/tokotron/hparams/train_encodec.yaml | 3 + .../hparams/train_espnet_encodec.yaml | 3 + .../hparams/train_fairseq_hubert.yaml | 314 ++++++++++++++++++ .../TTS/tokotron/hparams/train_mimi.yaml | 3 + .../hparams/train_speech_tokenizer.yaml | 3 + .../tokotron/hparams/train_wavtokenizer.yaml | 3 + .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 3 + .../TTS/valle/hparams/train_discrete_ssl.yaml | 3 + 
.../TTS/valle/hparams/train_encodec.yaml | 3 + .../valle/hparams/train_espnet_encodec.yaml | 3 + .../TTS/valle/hparams/train_mimi.yaml | 3 + benchmarks/DASB/model/custom_model.py | 65 ++++ 14 files changed, 415 insertions(+) create mode 100644 benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml index 01c818370..3fa047b31 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_dac.yaml @@ -275,12 +275,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml index 4efa9f75c..1fe2ebca9 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_discrete_ssl.yaml @@ -324,12 +324,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml index e45794171..3820c8407 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_encodec.yaml @@ -277,12 +277,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml index e45794171..3820c8407 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_espnet_encodec.yaml @@ -277,12 +277,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml new file mode 100644 index 000000000..2b18c0657 --- /dev/null +++ 
b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml @@ -0,0 +1,314 @@ +# ############################################################################ +# Model: Tokenized TTS (WhisperSpeech-inspired) +# Authors: Artem Ploujnikov +# ############################################################################ +# Seed needs to be set at top of yaml, before objects with parameters are made + +seed: 74443 +__set_seed: !apply:torch.manual_seed [!ref ] +run_name: !PLACEHOLDER +output_folder: !ref results/transformer/ +save_folder: !ref /save +train_log: !ref /train_log.txt +testing: True # If set to True, the test evlaution is done, otherwise skipped. + +# Data files +data_folder: !PLACEHOLDER +cached_data_folder: !PLACEHOLDER +data_folder_alignments: null # e.g., /path/to/LibriSpeech +prepare_save_folder: !ref +pretrained_model_save_folder: !ref +representation_mode: discrete +prepare_archive_path: null +prepare_skip_ignore_folders: False +data_mode: lite +train_json: !ref /train.json +valid_json: !ref /valid.json +test_json: !ref /test.json +train_split: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + lite: ["train-clean-100"] + clean: ["train-clean-100", "train-clean-360"] + full: ["train-clean-100", "train-clean-360", "train-other-500"] +valid_split: ["dev-clean"] +test_split: ["test-clean"] +frozen_split_path: null +sample_path: null +progress_folder: !ref /progress +progress_current: !ref /current +init_from: null +num_audio_samples: 32 +samples_interval: 5 + +tokens_folder: !PLACEHOLDER # Path to the folder where extracted tokens are saved. + +tokens_loader: !new:utils.tokens.TokensLoader + data_path: !ref + +# Position shift +use_position_shift: True +max_position_shift: 1000 +position_shift_seed: 42 +position_shift_probability: 1.0 + +freeze_token_model: True + +g2p_src: flexthink/soundchoice-g2p +kmeans_cache_dir: !ref /kmeans_checkpoint +kmeans_dataset: LibriSpeech +model_path: !ref /fairseq-hubert +feature_extractor_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher.pt +kmeans_path: !ref /mhubert_base_25hz_cp_mls_cv_sp_fisher_L11_km500.bin +vocoder_dense_model_name: "mhubert-base-25hz" +vocoder_quantizer_model_name: "kmeans" +vocoder_vocab_size: 500 + +available_speech_model_layers: [1, 3, 7, 12, 18, 23] +speech_model_layers: !ref +select_layers: null +token_offset: 1 +spk_emb_src: speechbrain/spkrec-ecapa-voxceleb-mel-spec +spk_emb_shuffle: True + +splits: ["train", "valid", "test"] +split_ratio: [90, 5, 5] + + +ckpt_interval_minutes: 30 # save checkpoint every N min + +# Training parameters +input: text +number_of_epochs: 1000 +reset_annealing_epoch: null +batch_size: 16 +valid_batch_size: !ref +batch_size_guided: 2 +extract_features_batch_size: 32 +grad_accumulation_factor: 1 +max_grad_norm: 0.01 +sorting: random +num_workers: 4 +skip_prep: False +overfit_test: False +overfit_test_sample_count: !ref +overfit_test_epoch_data_count: 1000 + + +# index +pad_index: 0 +bos_index: 0 +bos_width: 1 +eos_index: 0 +eos_width: 1 +audio_token_shift: 0 + +# stages related parameters +lr: 0.0005 # @orion_step1: --lr~"loguniform(0.00001,0.005)" +lr_warmup_steps: 10000 +lr_annealing_mode: step +guided_attention_weight: 50.0 +guided_attention_sigma: 0.5 +gate_loss_weight: 1.0 +gate_threshold: 0.5 +gate_loss_beta: 0.2 +gate_loss_gamma: 0.01 +gate_loss_max_weight: 1. 
+ +# Inference parameters +eos_mode: gate +scale_factor: 4 + +# Embedding Injection +spk_emb_injection: null + +# Beam Search-specific parameters +min_decode_ratio: 1.0 +max_decode_ratio: 10.0 +beam_size: 5 + + +# Feature parameters +sample_rate: 24000 +model_sample_rate: 16000 +max_audio_length: 2000 +infer_max_audio_length: !ref +debug_infer_max_audio_length: 10 + +# Label encoder +label_encoder: !new:speechbrain.dataio.encoder.TextEncoder +token_list_file_text: char_en.txt +token_list_file_phn: arpabet.txt +token_list_file: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref + +# Gate offset +gate_offset: !apply:Tokotron.distance_diff_loss_ramp + beta: !ref + gamma: !ref + max_weight: !ref + +silence_padding: !ref +use_silence_padding: True + + +spk_emb_model: !name:speechbrain.inference.encoders.MelSpectrogramEncoder.from_hparams + source: !ref + savedir: !ref /ecapa + +# Dataloader options +train_dataloader_opts: + batch_size: !ref + shuffle: True + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +valid_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +test_dataloader_opts: + batch_size: 1 + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +sample_dataloader_opts: + batch_size: !ref + num_workers: !ref + collate_fn: !name:speechbrain.dataio.batch.PaddedBatch + padding_kwargs: + value: !ref + +token_model_kwargs: + SSL_layers: !ref + +####################### Model parameters ########################### +# Transformer +d_model: 512 +nhead: 4 +enc_num_layers: 6 # @orion_step1: --enc_num_layers~"choices([3, 6, 12])" +dec_num_layers: 12 # @orion_step1: --dec_num_layers~"choices([3, 6, 12])" +layerwise_renorm: True +d_ffn: 2048 +transformer_dropout: 0.2 +target_dropout: 0.2 +activation: !name:torch.nn.GELU +vocab_size: 1000 +audio_dim: 1024 +audio_emb_size: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + discrete: 1024 + continuous: 128 +audio_emb_freeze: False +audio_emb_lr: 0.00001 +audio_emb_weight_decay: 0.001 +audio_emb_pretrained: False +text_num_tokens: 39 +phn_num_tokens: 52 +input_num_tokens: !apply:speechbrain.utils.hparams.choice + value: !ref + choices: + text: !ref + phonemes: !ref +audio_tokens_per_step: 6 +attention_type: regularMHA +############################## models ################################ +emb: + spk: + kind: "pretrained" + dim: 192 + injection: !ref + +model: !new:Tokotron.TokotronTransformerModel + input_num_tokens: !ref # yamllint disable-line rule:line-length + audio_num_tokens: !ref + audio_tokens_per_step: !ref + d_model: !ref + d_ffn: !ref + nhead: !ref + enc_num_layers: !ref + dec_num_layers: !ref + dropout: !ref + target_dropout: !ref + activation: !ref + attention_type: !ref + gate_threshold: !ref + gate_offset: !ref + audio_emb_size: !ref + audio_emb_freeze: !ref + max_audio_length: !ref + eos_mode: !ref + infer_max_audio_length: !ref + audio_token_shift: !ref + scale_factor: !ref + representation_mode: !ref + emb: !ref + +vocoder: !apply:textless.vocoders.hifigan.vocoder.CodeHiFiGANVocoder.by_name + dense_model_name: !ref #"mhubert-base-25hz" + quantizer_model_name: !ref # "kmeans", + vocab_size: !ref #500 + +tokenizer: !new:utils.tokenizer_interface.FairseqHuBERTTokenizer + feat_extractor_path: !ref + km_path: !ref + layer: !ref + vocoder: !ref + +modules: + model: !ref + 
tokenizer: !ref + compute_cost: !ref + +# define two optimizers here for two-stage training +opt_class: !name:torch.optim.Adam + lr: !ref +compute_cost: !new:Tokotron.TokotronLoss + guided_attention_weight: !ref + guided_attention_sigma: !ref + gate_weight: !ref + gate_beta: !ref + gate_gamma: !ref + gate_max_weight: !ref + silence_padding: !ref + eos_mode: !ref + eos_index: !ref + eos_width: !ref + audio_tokens_per_step: !ref + audio_token_shift: !ref + representation_mode: !ref + + +lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler + lr_initial: !ref + n_warmup_steps: !ref + +generator: !new:model.custom_model.SaveableGenerator + +checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer + checkpoints_dir: !ref + recoverables: + model: !ref + lr_scheduler: !ref + counter: !ref + generator: !ref + +epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter + limit: !ref + +train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger + save_file: !ref + +spk_sampler: !name:speechbrain.dataio.sampler.ReproducibleRandomSampler + seed: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml index 156e05b02..8d74e195c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_mimi.yaml @@ -264,12 +264,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml index ffb68f2a5..1dfc9a1d7 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -273,12 +273,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml index f4f745716..78975b1a0 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_wavtokenizer.yaml @@ -264,12 +264,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index db6104f1c..e437d7007 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ 
b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -258,12 +258,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index d8ea29110..82ab4d736 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -300,12 +300,15 @@ lr_annealing: !new:model.Tokotron.TargetedNoamScheduler n_warmup_steps: !ref param_group: 0 +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index b7b19bb0d..23d6ff2b5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -257,12 +257,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index 04f0dceff..e9b28ac7c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -263,12 +263,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 91249b59b..7c1f269ba 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -255,12 +255,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index cdfcd5ced..0a619e43b 100644 --- a/benchmarks/DASB/model/custom_model.py 
+++ b/benchmarks/DASB/model/custom_model.py @@ -1,7 +1,14 @@ import math +import re +import speechbrain as sb import torch + from speechbrain.nnet.linear import Linear from model.sq_codec import tokens_to_ternary, ternary_logits_to_tokens +from speechbrain.utils.logger import get_logger + + +logger = get_logger(__name__) class AttentionMLP(torch.nn.Module): @@ -222,3 +229,61 @@ def forward(self, logits): dim=1 ) return token_logits + + +@sb.utils.checkpoints.register_checkpoint_hooks +class SaveableGenerator: + """A wrapper that can be used to store the state of + the random number generator in a checkpoint. It helps + with reproducibility in long-running experiments. + + Currently, this only supports CPU and Cuda devices + natively. If you need training on other architectures, + consider implementing a custom generator. + + Running it on an unsupported device not using the Torch + generator interface will simply fail to restore the + state but will not cause an error. + + Arguments + --------- + generators : list, optional + A list of generator objects. If not provided, + """ + + def __init__(self, generators=None): + if generators is None: + generators = { + "default": torch.default_generator + } + if torch.cuda.is_available(): + for idx, generator in torch.cuda.default_generators: + generators[f"cuda:{idx}"] = generator + self.generators = generators + + @sb.utils.checkpoints.mark_as_saver + def _save(self, path): + save_dict = { + key: generator.get_state() + for key, generator in self.generators.items() + } + torch.save(save_dict, path) + + @sb.utils.checkpoints.mark_as_loader + def _recover(self, path, end_of_epoch): + del end_of_epoch + save_dict = torch.load(path) + for key, state in save_dict.items(): + if key == "default": + torch.default_generator.set_state(state) + continue + match = re.match(r"cuda:(\d+)", key) + if match: + if not torch.cuda.is_available(): + logger.warn("Unable to restore RNG for %s, CUDA unavailable", key) + continue + idx = match.group(1) + if idx > torch.cuda.device_count() - 1: + logger.warn("Unable to restore RNG for %s, device not found", key) + continue + torch.cuda.default_generators[idx].set_state(state) From 13f13458ff2cf3d906cf2007cb46e54080580b83 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 12 Jun 2025 15:59:07 -0400 Subject: [PATCH 265/270] DASB: Fixed the saveable generator wrapper to account for CUDA deprecations --- benchmarks/DASB/model/custom_model.py | 43 ++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 0a619e43b..19f1fa3ab 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -257,8 +257,9 @@ def __init__(self, generators=None): "default": torch.default_generator } if torch.cuda.is_available(): - for idx, generator in torch.cuda.default_generators: - generators[f"cuda:{idx}"] = generator + for idx in range(torch.cuda.device_count()): + generators[f"cuda:{idx}"] = _CudaDefaultGeneratorWrapper(idx) + self.generators = generators @sb.utils.checkpoints.mark_as_saver @@ -282,8 +283,42 @@ def _recover(self, path, end_of_epoch): if not torch.cuda.is_available(): logger.warn("Unable to restore RNG for %s, CUDA unavailable", key) continue - idx = match.group(1) + idx = int(match.group(1)) if idx > torch.cuda.device_count() - 1: logger.warn("Unable to restore RNG for %s, device not found", key) continue - torch.cuda.default_generators[idx].set_state(state) + 
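    # The wrapper boils down to saving and restoring torch's RNG state per device. A
    # minimal standalone equivalent using only public torch APIs (an assumed
    # simplification; the class here additionally plugs into the SpeechBrain
    # checkpointer hooks):
    #
    #     import torch
    #
    #     def save_rng(path):
    #         state = {"cpu": torch.get_rng_state()}
    #         for idx in range(torch.cuda.device_count()):
    #             state[f"cuda:{idx}"] = torch.cuda.get_rng_state(idx)
    #         torch.save(state, path)
    #
    #     def restore_rng(path):
    #         state = torch.load(path)
    #         torch.set_rng_state(state["cpu"])
    #         for idx in range(torch.cuda.device_count()):
    #             key = f"cuda:{idx}"
    #             if key in state:
    #                 torch.cuda.set_rng_state(state[key], idx)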
self.generators[key].set_state(state) + + +class _CudaDefaultGeneratorWrapper: + """A generator wrapper for default generators - because torch no longer + exposes default_generators + + This class should not be used outside of SaveableGenerator + + Arguments + --------- + device : int|str + The device index or identifier""" + def __init__(self, device): + self.device = device + + def get_state(self): + """Returns the generator state + + Returns + ------- + result : torch.Tensor + The generator state + """ + return torch.cuda.get_rng_state(self.device) + + def set_state(self, new_state): + """"Sets the generator state + + Arguments + --------- + new_state : dict + The new state + """ + torch.cuda.set_rng_state(new_state, self.device) From cf90559334ea71c6b32dc64956354748d153a497 Mon Sep 17 00:00:00 2001 From: flexthink Date: Sun, 15 Jun 2025 23:31:28 -0400 Subject: [PATCH 266/270] DASB: Fix an issue with Discrete SSL + generators --- .../TTS/valle/hparams/train_discrete_ssl.yaml | 1 + .../TTS/valle/hparams/train_speech_tokenizer.yaml | 3 +++ .../TTS/valle/hparams/train_wavtokenizer.yaml | 3 +++ benchmarks/DASB/model/valle.py | 12 +++++++++--- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 82ab4d736..1950f2886 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -329,4 +329,5 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref + token_model_kwargs: !ref debug: !ref \ No newline at end of file diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index e306f9802..b201aaba4 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -254,12 +254,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index 9b1733257..ae2ce2d95 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -257,12 +257,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 1f23e7cd0..1e098463a 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -1303,6 +1303,9 @@ class WhisperASRSampleSelector(SampleSelector): debug : bool Whether debug mode is enabled. 
This will trigger more verbose logging, including a WER report + token_model_kwargs : dict + Additional arguments for the tokenizer + decoding function """ def __init__( self, @@ -1318,7 +1321,8 @@ def __init__( token_shift=0, offsets=None, debug=False, - device="cuda" + token_model_kwargs=None, + device="cuda", ): self.tokenizer = tokenizer self.sample_rate = sample_rate @@ -1340,19 +1344,21 @@ def __init__( self.token_shift = token_shift self.offsets = offsets self.debug = debug + if token_model_kwargs is None: + token_model_kwargs = {} + self.token_model_kwargs = token_model_kwargs tokenizer.device = device if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(device) tokenizer.codec_vocoder.device = device - def select(self, tokens, scores, text): tokens, length = batch_pad_right(tokens) tokens_shift = tokens - self.token_shift if self.offsets is not None: tokens_shift = tokens_shift - self.offsets tokens_shift = tokens_shift.clip(0) - wav = self.tokenizer.tokens_to_sig(tokens_shift) + wav = self.tokenizer.tokens_to_sig(tokens_shift, **self.token_model_kwargs) if self.sample_rate != self.tokenizer_sample_rate: wav = torchaudio.functional.resample( wav, From b9488e4f051de424024470dff3b8770deb0b99ee Mon Sep 17 00:00:00 2001 From: flexthink Date: Mon, 7 Jul 2025 16:10:00 -0400 Subject: [PATCH 267/270] DASB: Cosmetic changes --- .../TTS/tokotron/hparams/train_mimi.yaml | 4 +- .../hparams/train_speech_tokenizer.yaml | 2 +- .../TTS/tokotron/hparams/train_sqcodec.yaml | 2 +- .../DASB/LJSpeech/TTS/tokotron/train.py | 68 +++-- .../TTS/valle/hparams/train_discrete_ssl.yaml | 2 +- .../TTS/valle/hparams/train_encodec.yaml | 2 +- .../valle/hparams/train_espnet_encodec.yaml | 16 +- .../TTS/valle/hparams/train_mimi.yaml | 2 +- .../TTS/valle/hparams/train_sqcodec.yaml | 4 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 2 +- benchmarks/DASB/LJSpeech/TTS/valle/train.py | 50 ++-- .../hparams/train_fairseq_hubert.yaml | 16 +- .../DASB/LibriTTS/TTS/tokotron/train.py | 72 +++-- .../DASB/LibriTTS/TTS/valle/evaluation.py | 12 +- .../DASB/LibriTTS/TTS/valle/hparams/eval.yaml | 4 +- .../LibriTTS/TTS/valle/hparams/train_dac.yaml | 6 +- .../TTS/valle/hparams/train_discrete_ssl.yaml | 4 +- .../TTS/valle/hparams/train_encodec.yaml | 4 +- .../valle/hparams/train_espnet_encodec.yaml | 19 +- .../TTS/valle/hparams/train_mimi.yaml | 4 +- .../valle/hparams/train_speech_tokenizer.yaml | 4 +- .../TTS/valle/hparams/train_sqcodec.yaml | 9 +- .../TTS/valle/hparams/train_wavtokenizer.yaml | 4 +- .../DASB/LibriTTS/TTS/valle/inference_fit.py | 60 ++-- .../LibriTTS/TTS/valle/tokenizer_prepare.py | 108 +++++++ benchmarks/DASB/LibriTTS/TTS/valle/train.py | 215 ++++++++------ benchmarks/DASB/LibriTTS/libritts_prepare.py | 81 ++++- benchmarks/DASB/model/Tokotron.py | 21 +- benchmarks/DASB/model/custom_model.py | 81 +++-- benchmarks/DASB/model/sq_codec.py | 49 +-- benchmarks/DASB/model/valle.py | 278 +++++++++++------- benchmarks/DASB/utils/eval.py | 20 +- benchmarks/DASB/utils/tokenizer_interface.py | 21 +- 33 files changed, 811 insertions(+), 435 deletions(-) create mode 100644 benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml index 505460dfa..3842caa8f 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_mimi.yaml @@ -154,7 +154,7 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: 
!name:torch.nn.GELU audio_num_tokens: 2048 -audio_emb_size: 1024 +audio_emb_size: 1024 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False @@ -166,7 +166,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 -flatten: false +flatten: False attention_type: regularMHA ############################## models ################################ diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml index 0ff172529..cb420591f 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_speech_tokenizer.yaml @@ -165,7 +165,7 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice text: !ref phonemes: !ref audio_tokens_per_step: 2 -flatten: false +flatten: False bandwidth: 1.5 attention_type: regularMHA diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml index f0ab3d9c1..6e87dedfe 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/hparams/train_sqcodec.yaml @@ -156,7 +156,7 @@ transformer_dropout: 0.2 target_dropout: 0.2 activation: !name:torch.nn.GELU audio_num_tokens: 19683 -audio_emb_size: 36 +audio_emb_size: 36 audio_emb_freeze: False audio_emb_pretrained: False audio_token_offsets: False diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 5b9082da5..229d645fe 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -283,7 +283,9 @@ def on_stage_start(self, stage, epoch): self.hparams, "token_model_kwargs", {} ) - self.transform_audio = getattr(self.hparams, "transform_audio", torch.nn.Identity()) + self.transform_audio = getattr( + self.hparams, "transform_audio", torch.nn.Identity() + ) def on_stage_end(self, stage, stage_loss, epoch): """Gets called at the end of an epoch. @@ -330,14 +332,14 @@ def on_stage_end(self, stage, stage_loss, epoch): valid_stats=stage_stats, ) - # Save the current checkpoint and delete previous checkpoints. + # Save the current checkpoint and delete previous checkpoints. ckpt_kwargs = { f"{self.hparams.ckpt_key_kind}_keys": [self.hparams.ckpt_key], } self.checkpointer.save_and_keep_only( meta={"loss": stage_stats["loss"], **eval_summary_stats}, num_to_keep=hparams["ckpt_keep"], - **ckpt_kwargs + **ckpt_kwargs, ) def get_summary_stats(self): @@ -578,7 +580,7 @@ def audio_ref_pipeline(wav): hparams["speech_model_layers"] if "speech_model_layers" in hparams else audio_tokens_per_step - ) + ), ) if silence_token.dim() == 2: silence_token = silence_token.squeeze(-1) @@ -652,6 +654,22 @@ def audio_pipeline(id): datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + sort_datasets(datasets, hparams) + apply_data_scale(datasets, hparams) + + return datasets, silence_padding + + +def sort_datasets(datasets, hparams): + """Sorts datasets according to hyperparameters + + Arguments + --------- + datasets : dict + a key -> value dictionary of datasets (the keys are "train", "valid" and "test") + hparams : dict + a dictionary of hyperparameters + """ # Sorting training data with ascending order makes the code much # faster because we minimize zero-padding. 
In most of the cases, this # does not harm the performance. @@ -666,13 +684,25 @@ def audio_pipeline(id): hparams["train_dataloader_opts"]["shuffle"] = False elif hparams["sorting"] == "random": - hparams["train_dataloader_opts"]["shuffle"] = True - pass - + if not hparams["overfit_test"]: + hparams["train_dataloader_opts"]["shuffle"] = True else: raise NotImplementedError( "sorting must be random, ascending or descending" ) + + +def apply_data_scale(datasets, hparams): + """Selects a fractional dataset if the corresponding parameter is specified, + using random sampling + + Arguments + --------- + datasets : dict + a dictionary of datasets + hparams : dict + parsed hyperparameters + """ data_scale = hparams.get("data_scale") if data_scale: scaled_data_count = int(len(datasets["train"]) * data_scale) @@ -680,8 +710,6 @@ def audio_pipeline(id): select_n=scaled_data_count ) - return datasets, silence_padding - def init_sequence_encoder(hparams): """Initialize a sequence encoder @@ -926,27 +954,33 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + test_summary_file = ( + Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + ) if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + test_summary_file = ( + Path(hparams["output_folder"]) + / "eval" + / "test" + / "summary.json" + ) if test_summary_file.exists(): - logging.info("Test run already completed: %s", test_summary_file) + logging.info( + "Test run already completed: %s", test_summary_file + ) else: eval_kwargs = {} test_key_kind = hparams.get("test_key_kind", "min") test_key = hparams.get("test_key") if test_key: - eval_kwargs = { - f"{test_key_kind}_key": test_key - } + eval_kwargs = {f"{test_key_kind}_key": test_key} tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs + **eval_kwargs, ) - # Save final checkpoint (fixed name) tts_brain.checkpointer.save_checkpoint(name="latest") diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml index bba258f8d..541fc2917 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_discrete_ssl.yaml @@ -207,7 +207,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 6 -flatten: false +flatten: False freeze_lm_head: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml index cae286efd..bfc8c58e4 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_encodec.yaml @@ -175,7 +175,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False bandwidth: 6 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml index 5aae5e0db..8a1e65e2c 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_espnet_encodec.yaml @@ -173,7 
+173,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False espnet_repo: https://github.com/espnet/espnet espnet_commit: 1974048563d7c57e11f670d24bac8fb4b5aba4ef model_hub: espnet/libritts_encodec_24k @@ -205,13 +205,13 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions top_k: !ref tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface - source: !ref - model_config: !ref - n_codebook: !ref - save_path: !ref - sample_rate: !ref - model_ckpt: !ref - espnet_commit: !ref + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref modules: model: !ref diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml index edae05d51..21d95dbf9 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_mimi.yaml @@ -174,7 +174,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False bandwidth: 6 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml index fb1ca4d33..d36a0cff0 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_sqcodec.yaml @@ -169,9 +169,9 @@ input_num_tokens: !apply:speechbrain.utils.hparams.choice model_vocab_size: !ref * 2 audio_token_shift: 19683 - + audio_tokens_per_step: 4 -flatten: true +flatten: True ternary_num_digits: 10 pred_mode: ternary freeze_lm_head: False diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml index 110839413..730eb08a5 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LJSpeech/TTS/valle/hparams/train_wavtokenizer.yaml @@ -177,7 +177,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice text: !ref + phonemes: !ref + -audio_tokens_per_step: 1 +audio_tokens_per_step: 1 bandwidth: 6 diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index c932fc872..986eb9a7c 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -198,7 +198,7 @@ def compute_loss_stats( logits_nar, targets_nar, mask, - reduction="batch" + reduction="batch", ): """Computes an autoregressive/non-autoregressive loss breakdown, to be used for metrics/stats @@ -213,7 +213,7 @@ def compute_loss_stats( The non-autoregressive predictions targets_nar : torch.Tensor The targets for non-autoregressive prediction - + Returns ------- stats: dict @@ -222,13 +222,11 @@ def compute_loss_stats( stats = {} if self.train_ar: stats["loss_ar"] = self.hparams.compute_cost( - logits_ar, targets=targets_ar, mask=mask, - reduction=reduction, + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, ) if self.train_nar: stats["loss_nar"] = self.hparams.compute_cost( - logits_nar, targets=targets_nar, mask=mask, - reduction=reduction, + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, ) return stats @@ -280,11 +278,13 @@ def apply_curriculum(self): if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: # NOTE: If there is only one track it's autoregressive self.train_nar = 
False - elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: - self.train_nar = False elif ( - self.hparams.number_of_epochs_nar is not None - and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ): + self.train_nar = False + elif self.hparams.number_of_epochs_nar is not None and epoch <= ( + self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar ): self.train_ar = False if self.hparams.freeze_lm_head: @@ -367,7 +367,7 @@ def evaluate_batch(self, batch, stage): audio_tokens, audio_length = self.inference(batch) if self.hparams.flip_layers: audio_tokens = audio_tokens.flip(2) - wav = self.create_waveform(audio_tokens, audio_length) + wav = self.create_waveform(audio_tokens, audio_length) wav = wav.squeeze(1) self.save_samples( batch=batch, wav=wav, length=audio_length, stage=stage @@ -438,7 +438,7 @@ def on_stage_end(self, stage, stage_loss, epoch): self.checkpointer.save_and_keep_only( meta={"loss": stage_stats["loss"], **eval_summary_stats}, num_to_keep=hparams["ckpt_keep"], - **ckpt_kwargs + **ckpt_kwargs, ) elif stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( @@ -498,8 +498,7 @@ def _get_inference_opts(self): if not self.hparams.use_token_offsets: tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.audio_token_shift - + tracks * self.hparams.vocab_size + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size ) if self.hparams.flip_layers: track_start = track_start.flip(0) @@ -523,7 +522,9 @@ def save_samples(self, batch, wav, length, stage): samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): file_name = output_folder / f"pred_{uttid}.wav" - write_audio(file_name, sample.detach().cpu(), self.hparams.model_sample_rate) + write_audio( + file_name, sample.detach().cpu(), self.hparams.model_sample_rate + ) def save_eval(self, stage): """Saves evaluation results @@ -652,7 +653,12 @@ def sig_pipeline(wav): sig = sb.dataio.dataio.read_audio(wav) return sig - dynamic_items = [sig_pipeline, text_pipeline, tokens_pipeline, prompt_pipeline] + dynamic_items = [ + sig_pipeline, + text_pipeline, + tokens_pipeline, + prompt_pipeline, + ] init_sequence_encoder(hparams) use_spk_emb = hparams.get("use_spk_emb", False) @@ -761,7 +767,7 @@ def init_sequence_encoder(hparams): an encoder instance""" encoder = hparams["label_encoder"] token_list_file_name = hparams["token_list_file"] - tokens = read_token_list(token_list_file_name) + tokens = read_token_list(token_list_file_name) encoder.add_unk() for token in hparams["special_tokens"]: token_key = token.replace("<", "").replace(">", "") @@ -990,17 +996,17 @@ def undo_padding_tensor(batch, lengths): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + test_summary_file = ( + Path(hparams["output_folder"]) / "eval" / "test" / "summary.json" + ) if test_summary_file.exists(): logging.info("Test run already completed: %s", test_summary_file) else: test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } + eval_kwargs = {f"{test_key_kind}_key": test_key} tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs + **eval_kwargs, ) diff --git 
a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml index 2b18c0657..d30420925 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/hparams/train_fairseq_hubert.yaml @@ -254,17 +254,17 @@ model: !new:Tokotron.TokotronTransformerModel scale_factor: !ref representation_mode: !ref emb: !ref - + vocoder: !apply:textless.vocoders.hifigan.vocoder.CodeHiFiGANVocoder.by_name - dense_model_name: !ref #"mhubert-base-25hz" - quantizer_model_name: !ref # "kmeans", - vocab_size: !ref #500 + dense_model_name: !ref #"mhubert-base-25hz" + quantizer_model_name: !ref # "kmeans", + vocab_size: !ref #500 tokenizer: !new:utils.tokenizer_interface.FairseqHuBERTTokenizer - feat_extractor_path: !ref - km_path: !ref - layer: !ref - vocoder: !ref + feat_extractor_path: !ref + km_path: !ref + layer: !ref + vocoder: !ref modules: model: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 323134e90..7dc4c4ab2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -335,21 +335,33 @@ def on_fit_start(self): def check_init(self): init_from = getattr(self.hparams, "init_from", None) if init_from is not None: - logger.info("Initializing with pre-trained weights from %s", init_from) + logger.info( + "Initializing with pre-trained weights from %s", init_from + ) init_from_path = Path(init_from) model_path = init_from_path / "model.ckpt" with open(model_path, "rb") as model_file: - model_state_dict = torch.load(model_file, map_location=self.device) + model_state_dict = torch.load( + model_file, map_location=self.device + ) tgt_state_dict = self.modules.model.state_dict() ignore_keys = [] for k, v in model_state_dict.items(): - if k in tgt_state_dict and tgt_state_dict[k].shape != v.shape: + if ( + k in tgt_state_dict + and tgt_state_dict[k].shape != v.shape + ): logger.warning("Ignoring shape mismatch for %s", k) ignore_keys.append(k) for k in ignore_keys: del model_state_dict[k] - self.modules.model.load_state_dict(model_state_dict, strict=False) - logger.info("Successfully initialized with pre-trained weights from %s", init_from) + self.modules.model.load_state_dict( + model_state_dict, strict=False + ) + logger.info( + "Successfully initialized with pre-trained weights from %s", + init_from, + ) @torch.no_grad() def evaluate_batch(self, batch, stage): @@ -525,7 +537,7 @@ def tokens_pipeline(label): hparams["speech_model_layers"] if "speech_model_layers" in hparams else audio_tokens_per_step - ) + ), ) else: silence_padding = get_silence_repr(hparams["ssl_model"],) @@ -552,11 +564,9 @@ def tokens_pipeline(label): tokens_loader = hparams.get("tokens_loader") if layer_idx is not None: - tokens_loader_kwargs = { - "num_codebooks": layer_idx - } + tokens_loader_kwargs = {"num_codebooks": layer_idx} else: - tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} + tokens_loader_kwargs = {"num_codebooks": audio_tokens_per_step} @sb.utils.data_pipeline.takes("uttid") @sb.utils.data_pipeline.provides("audio_pad", "audio_bos") @@ -633,6 +643,28 @@ def spk_emb_random_match(uttid, dataset, spk_sample): ) resample_fn[dataset](epoch=0) + sort_datasets(datasets, hparams) + # Exclude samples without phonemes + if hparams["input"] == "phonemes": + for key in datasets: + datasets[key] = datasets[key].filtered_sorted( + key_test={"phn": 
lambda value: value} + ) + datasets["sample"] = select_sample(hparams, datasets) + return datasets, silence_padding, resample_fn + + +def sort_datasets(datasets, hparams): + """Sorts datasets according to hyperparameters + + Arguments + --------- + datasets : dict + a key -> value dictionary of datasets (the keys are "train", "valid" and "test") + hparams : dict + a dictionary of hyperparameters + """ + # Sorting training data with ascending order makes the code much # faster because we minimize zero-padding. In most of the cases, this # does not harm the performance. @@ -655,15 +687,6 @@ def spk_emb_random_match(uttid, dataset, spk_sample): "sorting must be random, ascending or descending" ) - # Exclude samples without phonemes - if hparams["input"] == "phonemes": - for key in datasets: - datasets[key] = datasets[key].filtered_sorted( - key_test={"phn": lambda value: value} - ) - datasets["sample"] = select_sample(hparams, datasets) - return datasets, silence_padding, resample_fn - def select_sample(hparams, datasets): """Selects a sample of files for sample generation, freezing the sample if @@ -1015,17 +1038,18 @@ def apply_overfit_test(hparams, dataset): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + test_summary_file = next( + Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), + None, + ) if test_summary_file is not None: logging.info("Test run already completed: %s", test_summary_file) else: test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } + eval_kwargs = {f"{test_key_kind}_key": test_key} tts_brain.evaluate( test_set=datasets["test"], test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs + **eval_kwargs, ) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py index 8ee32cb9d..58fdd5abb 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -246,10 +246,14 @@ def summarize(self, field=None): ).items() } for evaluator_key in self.enabled_evaluators: - result.update({ - f"{evaluator_key}_{stat_key}": value - for stat_key, value in - self.evaluators[evaluator_key].global_metrics().items()}) + result.update( + { + f"{evaluator_key}_{stat_key}": value + for stat_key, value in self.evaluators[evaluator_key] + .global_metrics() + .items() + } + ) if field is not None: result = result[field] return result diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml index 1e41dd473..f4e975175 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/eval.yaml @@ -71,8 +71,8 @@ eval_summary_log: spk_sim: spk_sim_score_mean inference_fit_space: - top_k: !ref - sampling_temperature: !ref + top_k: !ref + sampling_temperature: !ref inference_fit_metrics: utmos: utmos_utmos_mean diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml index e437d7007..85020bdff 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_dac.yaml @@ -201,7 +201,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 2 -flatten: false +flatten: False # Model Settings model_type: 
24khz @@ -229,7 +229,7 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions nq: !ref nbest: !ref sampling_temperature: !ref - top_k: !ref + top_k: !ref tokenizer: !new:utils.tokenizer_interface.DACTokenizer @@ -287,4 +287,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml index 1950f2886..24316c3d2 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_discrete_ssl.yaml @@ -244,7 +244,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 6 -flatten: false +flatten: False freeze_lm_head: False @@ -330,4 +330,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice tokenizer_sample_rate: !ref savedir: !ref token_model_kwargs: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml index 23d6ff2b5..e78119670 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_encodec.yaml @@ -200,7 +200,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False # Model Settings model_hub: facebook/encodec_24khz @@ -286,4 +286,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml index e9b28ac7c..31b425824 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_espnet_encodec.yaml @@ -58,7 +58,6 @@ duration_min: null duration_max: null - ckpt_key: dwer ckpt_key_kind: min ckpt_keep: 2 @@ -200,7 +199,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False # Model Settings espnet_repo: https://github.com/espnet/espnet @@ -237,13 +236,13 @@ inference_opts: !name:model.valle.SpeechLMInferenceOptions tokenizer: !new:utils.tokenizer_interface.ESPNetEncodecInterface - source: !ref - model_config: !ref - n_codebook: !ref - save_path: !ref - sample_rate: !ref - model_ckpt: !ref - espnet_commit: !ref + source: !ref + model_config: !ref + n_codebook: !ref + save_path: !ref + sample_rate: !ref + model_ckpt: !ref + espnet_commit: !ref modules: model: !ref @@ -292,4 +291,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml index 7c1f269ba..a0d19ce8c 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_mimi.yaml @@ -200,7 +200,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False 
# Model Settings model_hub: kyutai/mimi @@ -284,4 +284,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml index b201aaba4..52dec45c3 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_speech_tokenizer.yaml @@ -199,7 +199,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 8 -flatten: false +flatten: False # Model Settings model_hub: fnlp/SpeechTokenizer @@ -283,4 +283,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml index 66fb3535a..93b8bcd09 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_sqcodec.yaml @@ -184,7 +184,7 @@ model_vocab_size: !ref * 2 audio_token_shift: 19683 audio_tokens_per_step: 4 -flatten: true +flatten: True ternary_num_digits: 10 pred_mode: ternary @@ -211,7 +211,7 @@ model: !new:model.valle.ValleLM # yamllint disable-line rule:line-length qk_norm: !ref lm_head: !ref emb: !ref - logits_to_probs: !ref + logits_to_probs: !ref inference_opts: !name:model.valle.SpeechLMInferenceOptions start: !ref @@ -240,7 +240,7 @@ logits_to_probs: !apply:speechbrain.utils.hparams.choice emb: !new:speechbrain.nnet.containers.Sequential ternary: !new:model.sq_codec.TernaryEmbedding - num_digits: !ref + num_digits: !ref flat: True linear: !new:speechbrain.nnet.linear.Linear input_size: !ref * @@ -276,12 +276,15 @@ lr_annealing: !new:speechbrain.nnet.schedulers.NoamScheduler lr_initial: !ref n_warmup_steps: !ref +generator: !new:model.custom_model.SaveableGenerator + checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer checkpoints_dir: !ref recoverables: model: !ref lr_scheduler: !ref counter: !ref + generator: !ref epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter limit: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml index ae2ce2d95..38831c660 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml +++ b/benchmarks/DASB/LibriTTS/TTS/valle/hparams/train_wavtokenizer.yaml @@ -199,7 +199,7 @@ audio_token_shift: !apply:speechbrain.utils.hparams.choice phonemes: !ref + audio_tokens_per_step: 1 -flatten: false +flatten: False # Model Settings model_hub: novateur/WavTokenizer-medium-music-audio-75token @@ -286,4 +286,4 @@ sample_selector: !apply:speechbrain.utils.hparams.choice sample_rate: !ref tokenizer_sample_rate: !ref savedir: !ref - debug: !ref \ No newline at end of file + debug: !ref diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index cc9bef811..f81252b3a 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -32,8 +32,10 @@ logger = get_logger(__name__) + class InferenceFit: """An inference fit wrapper""" + def __init__(self, hparams, 
run_opts): device = run_opts.get("device", "cpu") self.hparams = SimpleNamespace(**hparams) @@ -50,7 +52,9 @@ def __init__(self, hparams, run_opts): if not self.hparams.use_token_offsets: self.offsets = torch.zeros_like(self.offsets) self.output_folder_rel = "eval/inference_fit" - self.output_folder = Path(self.hparams.output_folder) / self.output_folder_rel + self.output_folder = ( + Path(self.hparams.output_folder) / self.output_folder_rel + ) self.token_model_kwargs = getattr( self.hparams, "token_model_kwargs", {} ) @@ -120,8 +124,12 @@ def evaluate(self, dataset, params): params_str = format_params(params) logger.info("Starting evaluation of %s", params_str) folder_name = params_to_folder_name(params) - self.evaluation_metric.on_evaluation_start(f"{self.output_folder_rel}/{folder_name}") - for batch in tqdm(dataloader, desc="Evaluation run", total=len(dataset)): + self.evaluation_metric.on_evaluation_start( + f"{self.output_folder_rel}/{folder_name}" + ) + for batch in tqdm( + dataloader, desc="Evaluation run", total=len(dataset) + ): self.evaluate_batch(batch, params) logger.info("Finished evaluation of %s", params_str) self.evaluation_metric.on_evaluation_end() @@ -184,13 +192,13 @@ def inference(self, batch, params): inference = self.modules.model.inference inference_results = [ inference( - prefix=prefix_item.unsqueeze(0), opts=self._get_inference_opts(params) + prefix=prefix_item.unsqueeze(0), + opts=self._get_inference_opts(params), ) for prefix_item in prefix_items ] inferred_tokens = [ - self._pad_inferred_sample(result) - for result in inference_results + self._pad_inferred_sample(result) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) audio_length = audio_length.to(self.device) @@ -220,10 +228,7 @@ def _pad_inferred_sample(self, result): min_length = getattr(self.hparams, "infer_min_length", 10) sample_length, tracks = sample.shape if sample_length < min_length: - sample = pad_right_to( - sample, - (min_length, tracks), - )[0] + sample = pad_right_to(sample, (min_length, tracks),)[0] return sample def create_waveform(self, audio, length): @@ -246,9 +251,7 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = tokenizer.tokens_to_sig( - audio, **self.token_model_kwargs - ) + wav = tokenizer.tokens_to_sig(audio, **self.token_model_kwargs) wav = clean_padding(wav, length) wav = wav.to(self.device) return wav @@ -263,8 +266,7 @@ def _get_inference_opts(self, params): if not self.hparams.use_token_offsets: tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.audio_token_shift - + tracks * self.hparams.vocab_size + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size ) if self.hparams.flip_layers: track_start = track_start.flip(0) @@ -280,17 +282,13 @@ def _get_inference_opts(self, params): ).expand_as(mask) ] = True return self.hparams.inference_opts( - masks={self.hparams.bos_index: mask}, - **params, - device=self.device, + masks={self.hparams.bos_index: mask}, **params, device=self.device, ) def recover(self): test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] - kwargs = { - f"{test_key_kind}_key": test_key - } + kwargs = {f"{test_key_kind}_key": test_key} logger.info("Revovering a checkpoint") ckpt = self.hparams.checkpointer.recover_if_possible(**kwargs) if not ckpt: @@ -317,23 +315,16 @@ def enumerate_space(space, entry=None, points=None): def format_space(space): 
return ", ".join( - f"{parameter}: {values}" - for parameter, values in space.items() + f"{parameter}: {values}" for parameter, values in space.items() ) def format_params(params): - return ", ".join( - f"{key}={value}" - for key, value in params.items() - ) + return ", ".join(f"{key}={value}" for key, value in params.items()) def params_to_folder_name(params): - params_str = "-".join( - f"{key}-{value}" - for key, value in params.items() - ) + params_str = "-".join(f"{key}-{value}" for key, value in params.items()) return f"eval-{params_str}" @@ -361,8 +352,11 @@ def params_to_folder_name(params): "%s not found - not using evaluation hyperparameters", eval_hparams_file, ) - hparams = load_hyperpyyaml(yaml_content, overrides, overrides_must_match=True) - from train import dataio_prepare, select_eval_subset # noqa + hparams = load_hyperpyyaml( + yaml_content, overrides, overrides_must_match=True + ) + from train import dataio_prepare, select_eval_subset # noqa + datasets, _ = dataio_prepare(hparams) dataset = datasets["valid"] dataset = select_eval_subset(dataset, hparams) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py new file mode 100644 index 000000000..3fe83556b --- /dev/null +++ b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py @@ -0,0 +1,108 @@ +"""A script to prepare annotations for tokenizers + +""" + +import json +import os +import re +import speechbrain as sb + +from pathlib import Path +from speechbrain.lobes.models.g2p.dataio import build_token_char_map +from speechbrain.utils.logger import get_logger + + +logger = get_logger(__name__) +MULTI_SPACE = re.compile(r"\s{2,}") + + +def phn2txt(phn, phoneme_map): + """Encodes phonemes using a character map for use with SentencePiece + + Arguments + --------- + phn: list + a list of original phonemes (ARPABET) + phoneme_map: dict + the phoneme-to-character map + + Returns + ------- + value: str + the mapped string representation + """ + value = "".join(phoneme_map[phoneme] for phoneme in phn).strip() + value = MULTI_SPACE.sub(" ", value) + return value + + +def prepare_annotation(src, destination_file_name, phonemes): + """Prepares the annotation file + + Arguments + --------- + src: datasets.arrow_dataset.Dataset + the source dataset + destination_file_name: str + the path to the annotation file to be created + phonemes: list + the list of phonemes + """ + phoneme_map = build_token_char_map(phonemes) + annotation = { + key: { + "label": item["label"], + "phonemes": phn2txt(item["phn"], phoneme_map), + } + for key, item in src.items() + } + with open(destination_file_name, "w", encoding="utf-8") as dst_file: + json.dump(annotation, dst_file, indent=2) + + +DATA_SPLITS = ["train", "valid", "test"] + + +def prepare_tokenizer(splits, save_folder, input, phonemes): + """Prepares annotations for the tokenizer + + Arguments + --------- + datasets: list + the list of dataset splits + save_folder: str + the path to the folder where annotations will be saved + input : str + identifies what type of input will be used (text or phonemes) + phonemes: list + the list of phonemes + """ + save_folder = Path(save_folder) + if input == "text": + for key in splits: + src_file_name = save_folder / f"{key}.json" + destination_file_name = ( + save_folder / f"tokenizer_annotation_{key}.json" + ) + destination_file_name.symlink_to(src_file_name) + else: + for key in splits: + destination_file_name = ( + save_folder / f"tokenizer_annotation_{key}.json" + ) + if 
destination_file_name.exists(): + logger.info( + "Annotation file '%s' already exists", destination_file_name + ) + else: + logger.info( + "Creating tokenizer annotation '%s'", destination_file_name, + ) + data_file_name = save_folder / f"{key}.json" + with open(data_file_name) as data_file: + data = json.load(data_file) + prepare_annotation( + src=data, + destination_file_name=destination_file_name, + phonemes=phonemes, + ) diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 5aae00e3a..13efdaf26 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -38,10 +38,11 @@ sys.path.append(base_dir) from evaluation import SpeechEvaluationMetricStats # noqa: E402 -from model.valle import DefaultSampleSelector +from model.valle import DefaultSampleSelector # noqa: E402 logger = logging.getLogger(__name__) + # Brain class for speech recognition training class VALLEBrain(sb.Brain): """Class that manages the training loop. See speechbrain.core.Brain.""" @@ -85,9 +86,7 @@ def create_waveform(self, audio, length): if hasattr(tokenizer, "codec_vocoder"): tokenizer.codec_vocoder.to(self.device) tokenizer.codec_vocoder.device = self.device - wav = tokenizer.tokens_to_sig( - audio, **self.token_model_kwargs - ) + wav = tokenizer.tokens_to_sig(audio, **self.token_model_kwargs) wav = clean_padding(wav, length) wav = wav.to(self.device) return wav @@ -196,7 +195,7 @@ def compute_objectives(self, predictions, batch, stage): loss = torch.mean(torch.stack(loss_components)) return loss - + def compute_loss_stats( self, logits_ar, @@ -204,11 +203,11 @@ def compute_loss_stats( logits_nar, targets_nar, mask, - reduction="batch" + reduction="batch", ): """Computes an autoregressive/non-autoregressive loss breakdown, to be used for metrics/stats - + Arguments --------- logits_ar : torch.Tensor @@ -219,7 +218,7 @@ def compute_loss_stats( The non-autoregressive predictions targets_nar : torch.Tensor The targets for non-autoregressive prediction - + Returns ------- stats: dict @@ -228,13 +227,11 @@ def compute_loss_stats( stats = {} if self.train_ar: stats["loss_ar"] = self.hparams.compute_cost( - logits_ar, targets=targets_ar, mask=mask, - reduction=reduction, + logits_ar, targets=targets_ar, mask=mask, reduction=reduction, ) if self.train_nar: stats["loss_nar"] = self.hparams.compute_cost( - logits_nar, targets=targets_nar, mask=mask, - reduction=reduction, + logits_nar, targets=targets_nar, mask=mask, reduction=reduction, ) return stats @@ -258,7 +255,7 @@ def on_stage_start(self, stage, epoch): if hasattr(hparams, "speech_model_layers"): self.layer_idx = get_selected_layer_indexes( hparams.available_speech_model_layers, - hparams.speech_model_layers + hparams.speech_model_layers, ) else: self.layer_idx = None @@ -274,7 +271,7 @@ def on_stage_start(self, stage, epoch): self.evaluation_metric.on_evaluation_start() self.is_evaluating = True else: - logger.info("No evaluation on epoch %d", epoch) + logger.info("No evaluation on epoch %d", epoch) elif stage == sb.Stage.TEST: self.evaluation_metric.on_evaluation_start() self.is_evaluating = True @@ -290,14 +287,11 @@ def init_sample_selector(self, stage): if stage == sb.Stage.TRAIN: self.sample_selector = None else: - sample_selector = getattr( - self.hparams, "sample_selector", None - ) + sample_selector = getattr(self.hparams, "sample_selector", None) if not sample_selector: sample_selector = DefaultSampleSelector self.sample_selector = sample_selector( - 
token_shift=self.hparams.audio_token_shift, - offsets=self.offsets + token_shift=self.hparams.audio_token_shift, offsets=self.offsets ) def apply_curriculum(self): @@ -314,11 +308,13 @@ def apply_curriculum(self): if self.hparams.audio_tokens_per_step == 1 or self.hparams.flatten: # NOTE: If there is only one track it's autoregressive self.train_nar = False - elif self.hparams.number_of_epochs_ar is not None and epoch <= self.hparams.number_of_epochs_ar: - self.train_nar = False elif ( - self.hparams.number_of_epochs_nar is not None - and epoch <= (self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar) + self.hparams.number_of_epochs_ar is not None + and epoch <= self.hparams.number_of_epochs_ar + ): + self.train_nar = False + elif self.hparams.number_of_epochs_nar is not None and epoch <= ( + self.hparams.number_of_epochs_ar + self.hparams.number_of_epochs_nar ): self.train_ar = False if self.hparams.freeze_lm_head: @@ -471,7 +467,7 @@ def on_stage_end(self, stage, stage_loss, epoch): self.checkpointer.save_and_keep_only( meta={"loss": stage_stats["loss"], **eval_summary_stats}, num_to_keep=hparams["ckpt_keep"], - **ckpt_kwargs + **ckpt_kwargs, ) elif stage == sb.Stage.TEST: self.hparams.train_logger.log_stats( @@ -511,16 +507,13 @@ def inference(self, batch): ] logger.info("Running selection") inference_results = [ - self.sample_selector.select( - tokens, - scores, - label + self.sample_selector.select(tokens, scores, label) + for (tokens, scores), label in zip( + inference_results, batch.label_norm_eval ) - for (tokens, scores), label in zip(inference_results, batch.label_norm_eval) ] inferred_tokens = [ - self._pad_inferred_sample(result) - for result in inference_results + self._pad_inferred_sample(result) for result in inference_results ] audio, audio_length = batch_pad_right(inferred_tokens) audio_length = audio_length.to(self.device) @@ -549,10 +542,7 @@ def _pad_inferred_sample(self, result): min_length = getattr(self.hparams, "infer_min_length", 10) sample_length, tracks = sample.shape if sample_length < min_length: - sample = pad_right_to( - sample, - (min_length, tracks), - )[0] + sample = pad_right_to(sample, (min_length, tracks),)[0] return sample def _get_inference_opts(self): @@ -565,8 +555,7 @@ def _get_inference_opts(self): if not self.hparams.use_token_offsets: tracks = torch.zeros_like(tracks) track_start = ( - self.hparams.audio_token_shift - + tracks * self.hparams.vocab_size + self.hparams.audio_token_shift + tracks * self.hparams.vocab_size ) if self.hparams.flip_layers: track_start = track_start.flip(0) @@ -626,7 +615,7 @@ def fit_batch(self, batch): if self.hparams.lr_annealing_mode == "step": self.hparams.lr_annealing(self.optimizer) return loss - + def fit( self, epoch_counter, @@ -688,7 +677,7 @@ def fit( if not ( isinstance(train_set, DataLoader) or isinstance(train_set, LoopedLoader) - ): + ): train_set = self.make_dataloader( train_set, stage=sb.Stage.TRAIN, **train_loader_kwargs ) @@ -698,7 +687,7 @@ def fit( valid_set = sample_dataset( dataset=valid_set, count=self.hparams.valid_inter_data_count, - seed=self.hparams.seed + seed=self.hparams.seed, ) valid_set = self.make_dataloader( @@ -728,7 +717,6 @@ def fit( break - INPUT_FEATURE_MAP = {"text": "label_norm", "phonemes": "phn"} @@ -767,7 +755,7 @@ def dataio_prepare(hparams): hparams["vocab_size"], hparams["audio_tokens_per_step"] ).unsqueeze(0) if not hparams["use_token_offsets"]: - offsets = torch.zeros_like(offsets) + offsets = torch.zeros_like(offsets) if hparams["flip_layers"]: offsets 
= offsets.flip(-1) @@ -786,7 +774,6 @@ def dataio_prepare(hparams): else: num_codebooks = hparams["audio_tokens_per_step"] - @sb.utils.data_pipeline.takes("label") @sb.utils.data_pipeline.provides("label_norm", "label_norm_eval") def text_pipeline(label): @@ -824,9 +811,7 @@ def spk_prompt(uttid, spk_sample): "audio", "prefix", "prompt", "prefix_length", "length" ) def prompt_pipeline(id, tokens, spk_prompt): - audio = tokens_loader.tokens_by_uttid( - id, num_codebooks=num_codebooks - ) + audio = tokens_loader.tokens_by_uttid(id, num_codebooks=num_codebooks) if hparams["flip_layers"]: audio = audio.flip(-1) yield audio @@ -904,35 +889,80 @@ def sig_pipeline(wav): spk_samplers=spk_samplers, ) resample_fn[dataset](epoch=0) - if hparams["input"] == "phonemes": - dynamic_dataset = dynamic_dataset.filtered_sorted( - key_test={"has_alignments": lambda value: value} - ) - duration_min = hparams.get("duration_min") - duration_max = hparams.get("duration_max") - if duration_min or duration_max: - key_min_value = None - key_max_value = None - if duration_min: - key_min_value = {"duration": duration_min} - if duration_max: - key_max_value = {"duration": duration_max} - dynamic_dataset = dynamic_dataset.filtered_sorted( - key_min_value=key_min_value, - key_max_value=key_max_value, - ) - dynamic_dataset = dynamic_dataset.filtered_sorted( - key_test={ - "wrd": lambda wrd: not any( - "{" in item - for item in wrd - ) - } - ) + dataset = filter_alignments(dataset, hparams) + dataset = filter_duration(dataset, hparams) datasets[dataset] = dynamic_dataset hparams[f"{dataset}_dataloader_opts"]["shuffle"] = False + sort_datasets(datasets, hparams) + + return datasets, resample_fn + + +def filter_duration(dataset, hparams): + """Filters the dataset by sample duration + + Arguments + --------- + dataset: speechbrain.dataio.dataset.DynamicItemDataset + A dataset + hparams: dict + Hyperparameters + + Returns + ------- + result : speechbrain.dataio.dataset.DynamicItemDataset + A filtered dataset + """ + duration_min = hparams.get("duration_min") + duration_max = hparams.get("duration_max") + if duration_min or duration_max: + key_min_value = None + key_max_value = None + if duration_min: + key_min_value = {"duration": duration_min} + if duration_max: + key_max_value = {"duration": duration_max} + dataset = dataset.filtered_sorted( + key_min_value=key_min_value, key_max_value=key_max_value, + ) + return dataset + + +def filter_alignments(dataset, hparams): + """Filters the dataset by the presence of alignments if + phonemes are selected as a source + + Arguments + --------- + dataset: speechbrain.dataio.dataset.DynamicItemDataset + A dataset + hparams: dict + Hyperparameters + + Returns + ------- + result : speechbrain.dataio.dataset.DynamicItemDataset + A filtered dataset + """ + if hparams["input"] == "phonemes": + dataset = dataset.filtered_sorted( + key_test={"has_alignments": lambda value: value} + ) + return dataset + + +def sort_datasets(datasets, hparams): + """Sorts datasets according to hyperparameters + + Arguments + --------- + datasets : dict + a key -> value dictionary of datasets (the keys are "train", "valid" and "test") + hparams : dict + a dictionary of hyperparameters + """ # Sorting training data with ascending order makes the code much # faster because we minimize zero-padding. In most of the cases, this # does not harm the performance. 
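The `sort_datasets` helper extracted above keeps most of its body outside this hunk; the sketch below shows the dispatch it presumably wraps, based on the "random, ascending or descending" options named in the surrounding code. It assumes SpeechBrain's `DynamicItemDataset.filtered_sorted` API and the usual `sorting` / `train_dataloader_opts` hyperparameter keys, so treat it as an illustration rather than the exact patch code:

    def sort_datasets(datasets, hparams):
        """Sorts the training split according to hparams["sorting"]."""
        sorting = hparams["sorting"]
        if sorting == "ascending":
            # Ascending duration minimizes zero-padding inside each batch
            datasets["train"] = datasets["train"].filtered_sorted(sort_key="duration")
            hparams["train_dataloader_opts"]["shuffle"] = False
        elif sorting == "descending":
            datasets["train"] = datasets["train"].filtered_sorted(
                sort_key="duration", reverse=True
            )
            hparams["train_dataloader_opts"]["shuffle"] = False
        elif sorting == "random":
            pass  # keep the original order and let the dataloader shuffle
        else:
            raise NotImplementedError(
                "sorting must be random, ascending or descending"
            )
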
@@ -953,7 +983,6 @@ def sig_pipeline(wav): raise NotImplementedError( "sorting must be random, ascending or descending" ) - return datasets, resample_fn def sample_dataset(dataset, count, seed): @@ -974,14 +1003,8 @@ def sample_dataset(dataset, count, seed): generator = torch.Generator() generator.manual_seed(seed) indexes = torch.randperm(len(dataset)).tolist()[:count] - data_ids = [ - dataset.data_ids[idx] - for idx in indexes - ] - return FilteredSortedDynamicItemDataset( - dataset, - data_ids, - ) + data_ids = [dataset.data_ids[idx] for idx in indexes] + return FilteredSortedDynamicItemDataset(dataset, data_ids,) def get_offsets(vocab_size, tracks): @@ -1132,7 +1155,7 @@ def get_selected_layer_indexes(available_layers, selected_layers): Returns ------- - layer_idx : list + layer_idx : list The layer indexes """ if not (selected_layers and available_layers): @@ -1260,9 +1283,13 @@ def select_eval_subset(dataset, hparams, key="eval_subset"): with open(eval_subset_path) as eval_subset_file: eval_subset_ids = [line.strip() for line in eval_subset_file] existing_ids = dataset.data_ids - eval_subset_ids = [uttid for uttid in eval_subset_ids if uttid in existing_ids] + eval_subset_ids = [ + uttid for uttid in eval_subset_ids if uttid in existing_ids + ] if not eval_subset_ids: - raise ValueError("{eval_subset_path}: no items found in the dataset") + raise ValueError( + "{eval_subset_path}: no items found in the dataset" + ) subset = FilteredSortedDynamicItemDataset(dataset, eval_subset_ids) else: subset = dataset @@ -1373,7 +1400,7 @@ def undo_padding_tensor(batch, lengths): "seed": hparams["seed"], "alignments_folder": hparams.get("alignments_folder"), "model_name": hparams["model"].__class__.__name__, - "max_valid_size": hparams.get("max_valid_size", 10000) + "max_valid_size": hparams.get("max_valid_size", 10000), }, ) @@ -1410,21 +1437,27 @@ def undo_padding_tensor(batch, lengths): # Load best checkpoint for evaluation if hparams["testing"]: - test_summary_file = next(Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), None) + test_summary_file = next( + Path(hparams["output_folder"]).glob("eval/test/*/summary.json"), + None, + ) if test_summary_file is not None: - logging.info("Test run already completed: %s", test_summary_file) + logging.info( + "Test run already completed: %s", test_summary_file + ) else: test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] - eval_kwargs = { - f"{test_key_kind}_key": test_key - } + eval_kwargs = {f"{test_key_kind}_key": test_key} eval_dataset_key = hparams["eval_dataset"] - logger.info("Performing final evaluation on the %s dataset", eval_dataset_key) + logger.info( + "Performing final evaluation on the %s dataset", + eval_dataset_key, + ) eval_dataset = datasets[eval_dataset_key] eval_dataset = select_eval_subset(eval_dataset, hparams) tts_brain.evaluate( test_set=eval_dataset, test_loader_kwargs=hparams["test_dataloader_opts"], - **eval_kwargs + **eval_kwargs, ) diff --git a/benchmarks/DASB/LibriTTS/libritts_prepare.py b/benchmarks/DASB/LibriTTS/libritts_prepare.py index 52594eaf9..dda10826d 100644 --- a/benchmarks/DASB/LibriTTS/libritts_prepare.py +++ b/benchmarks/DASB/LibriTTS/libritts_prepare.py @@ -109,16 +109,40 @@ def prepare_libritts( # If specific splits are provided, creates data manifest files accordingly if train_split: wav_list = prepare_split(data_folder, train_split) - create_json(wav_list, save_json_train, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + create_json( + wav_list, + 
save_json_train, + sample_rate, + data_folder, + alignments_folder, + model_name, + skip_resample, + ) if valid_split: wav_list = prepare_split(data_folder, valid_split) # TODO add better way to speedup evaluation if max_valid_size is not None and len(wav_list) > max_valid_size: wav_list = random.sample(wav_list, max_valid_size) - create_json(wav_list, save_json_valid, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + create_json( + wav_list, + save_json_valid, + sample_rate, + data_folder, + alignments_folder, + model_name, + skip_resample, + ) if test_split: wav_list = prepare_split(data_folder, test_split) - create_json(wav_list, save_json_test, sample_rate, data_folder, alignments_folder, model_name, skip_resample) + create_json( + wav_list, + save_json_test, + sample_rate, + data_folder, + alignments_folder, + model_name, + skip_resample, + ) if skip(save_json_train, save_json_valid, save_json_test): logger.info("Preparation completed.") @@ -132,12 +156,29 @@ def prepare_libritts( data_split = split_sets(wav_list, split_ratio) # Creating json files create_json( - data_split["train"], save_json_train, sample_rate, alignments_folder, model_name, skip_resample + data_split["train"], + save_json_train, + sample_rate, + alignments_folder, + model_name, + skip_resample, + ) + create_json( + data_split["valid"], + save_json_valid, + sample_rate, + alignments_folder, + model_name, + skip_resample, ) create_json( - data_split["valid"], save_json_valid, sample_rate, alignments_folder, model_name, skip_resample + data_split["test"], + save_json_test, + sample_rate, + alignments_folder, + model_name, + skip_resample, ) - create_json(data_split["test"], save_json_test, sample_rate, alignments_folder, model_name, skip_resample) def prepare_split(data_folder, split_list): @@ -180,7 +221,15 @@ def prepare_split(data_folder, split_list): return wav_list -def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder=None, model_name=None, skip_resample=False): +def create_json( + wav_list, + json_file, + sample_rate, + data_folder, + alignments_folder=None, + model_name=None, + skip_resample=False, +): """ Creates the json file given a list of wav files. 
Arguments @@ -266,7 +315,9 @@ def create_json(wav_list, json_file, sample_rate, data_folder, alignments_folder "segment": True if "train" in json_file else False, } if alignments_folder is not None: - alignments_file_name = get_alignment_path(data_folder, alignments_folder, wav_file) + alignments_file_name = get_alignment_path( + data_folder, alignments_folder, wav_file + ) alignments = parse_alignments(alignments_file_name) json_dict[uttid].update(alignments) @@ -309,9 +360,16 @@ def get_alignment_path(data_folder, alignments_folder, file_name): file_name_rel = file_name.relative_to(data_folder) data_slice = file_name_rel.parts[0] - textgrid_folder = file_name_rel.relative_to(Path(data_slice) / "LibriTTS" / data_slice).parent.parent + textgrid_folder = file_name_rel.relative_to( + Path(data_slice) / "LibriTTS" / data_slice + ).parent.parent textgrid_file_name = f"{file_name_rel.stem}.TextGrid" - textgrid_path = Path(alignments_folder) / data_slice / textgrid_folder / textgrid_file_name + textgrid_path = ( + Path(alignments_folder) + / data_slice + / textgrid_folder + / textgrid_file_name + ) return textgrid_path @@ -382,6 +440,7 @@ def check_folders(*folders): return False return True + def parse_alignments(file_name): """Parses a given LibriSpeech-Alignments TextGrid file and converts the results to the desired format (to be used in JSON @@ -417,7 +476,7 @@ def parse_alignments(file_name): "wrd_start": [], "wrd_end": [], "wrd_count": 0, - "unk_count": None + "unk_count": None, } text_grid = textgrids.TextGrid() diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index 6a2de5859..bb414b0d6 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -25,7 +25,11 @@ from speechbrain.nnet.attention import RelPosEncXL from speechbrain.nnet.embedding import Embedding from speechbrain.nnet.linear import Linear -from speechbrain.nnet.losses import kldiv_loss, mse_loss, compute_masked_loss, nll_loss +from speechbrain.nnet.losses import ( + kldiv_loss, + mse_loss, + compute_masked_loss, +) from speechbrain.dataio.dataio import length_to_mask from speechbrain.utils.data_utils import concat_padded_features from speechbrain.nnet.schedulers import NoamScheduler @@ -446,7 +450,7 @@ def __init__( audio_dim=1024, show_inference_progress=True, transform_audio=None, - feed_audio=None + feed_audio=None, ): super().__init__() self.decoder = None @@ -722,7 +726,7 @@ def __init__( emb=None, audio_emb=None, out_proj=None, - multihead_input=True + multihead_input=True, ): super().__init__() self.in_emb = Embedding( @@ -1290,7 +1294,9 @@ def forward( max_len = out_len - 1 if self.multihead_output: out_reshaped = ( - out.transpose(1, 2).reshape(batch_size * heads, out_len, tok_dim) + out.transpose(1, 2).reshape( + batch_size * heads, out_len, tok_dim + ) )[:, :max_len] else: out_reshaped = out @@ -1329,14 +1335,14 @@ def forward( ) audio_reshaped = audio_reshaped[:, :max_len] - if self.multihead_output: + if self.multihead_output: lengths_reshaped = ( audio_length.unsqueeze(-1) .expand(batch_size, heads) .reshape(batch_size * heads) ) else: - lengths_reshaped = audio_length + lengths_reshaped = audio_length seq_loss = self.seq_cost( out_reshaped[:, :tok_len], audio_reshaped, @@ -1903,7 +1909,6 @@ def get_silence_token( unsqueeze=False, device=None, num_codebooks=None, - ): """Attempts to find out the silence tokens for a given model, if applicable @@ -2092,4 +2097,4 @@ def use_silence_padding(dataloader_opts, silence_token, token_keys): "collate_fn": 
partial( token_collate_fn, silence_token=silence_token, token_keys=token_keys ), - } \ No newline at end of file + } diff --git a/benchmarks/DASB/model/custom_model.py b/benchmarks/DASB/model/custom_model.py index 19f1fa3ab..31110cb58 100644 --- a/benchmarks/DASB/model/custom_model.py +++ b/benchmarks/DASB/model/custom_model.py @@ -4,7 +4,7 @@ import torch from speechbrain.nnet.linear import Linear -from model.sq_codec import tokens_to_ternary, ternary_logits_to_tokens +from model.sq_codec import tokens_to_ternary from speechbrain.utils.logger import get_logger @@ -132,20 +132,16 @@ class TernaryPredictionHead(torch.nn.Module): num_positions : int the number of positions """ + def __init__(self, d_model, num_positions, d_hidden=512, norm=True): super().__init__() self.num_positions = num_positions self.d_model = d_model self.norm = torch.nn.LayerNorm(d_model) if norm else torch.nn.Identity() - self.lin_hidden = Linear( - input_size=d_model, - n_neurons=d_hidden, - ) + self.lin_hidden = Linear(input_size=d_model, n_neurons=d_hidden,) self.act = torch.nn.LeakyReLU() self.lin_p = Linear( - input_size=d_hidden, - n_neurons=num_positions * 3, - bias=False + input_size=d_hidden, n_neurons=num_positions * 3, bias=False ) def forward(self, x, track=None): @@ -193,9 +189,12 @@ class TernaryLogitTokenizer(torch.nn.Module): "probability" : treats the outputs as a probability distribution "argmax" : "hard" mode, only the top probability is used. Cannot be used with top_k sampling with k > 1 - + """ - def __init__(self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10): + + def __init__( + self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10 + ): super().__init__() self.num_positions = num_positions if num_tokens is None: @@ -204,30 +203,45 @@ def __init__(self, num_positions, num_tokens=None, num_tracks=4, chunk_size=10): self.num_tracks = num_tracks self.chunk_size = chunk_size self.register_buffer("vocab", torch.arange(num_tokens)) - self.register_buffer("vocab_ternary", tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + 1) + self.register_buffer( + "vocab_ternary", + tokens_to_ternary(self.vocab[None, None, None, :], D=num_positions) + + 1, + ) self.register_buffer("idx", torch.arange(3)[None, None, None, None, :]) def forward(self, logits): batch_size, max_len, num_positions, _ = logits.shape logits = logits.softmax(-1) - logits = logits.reshape(batch_size, max_len, self.num_tracks, 1, num_positions // self.num_tracks, 3) + logits = logits.reshape( + batch_size, + max_len, + self.num_tracks, + 1, + num_positions // self.num_tracks, + 3, + ) chunks = logits.chunk( - dim=1, - chunks=math.ceil(logits.size(1) / self.chunk_size) + dim=1, chunks=math.ceil(logits.size(1) / self.chunk_size) ) token_logits_chunks = [] for chunk in chunks: - token_logits_raw = torch.where( - self.vocab_ternary[:, None, None, :, :, None] == self.idx, - chunk, - torch.ones_like(chunk) - ).prod(-1).log().sum(-1).exp() + token_logits_raw = ( + torch.where( + self.vocab_ternary[:, None, None, :, :, None] == self.idx, + chunk, + torch.ones_like(chunk), + ) + .prod(-1) + .log() + .sum(-1) + .exp() + ) token_logits_raw_sum = token_logits_raw.sum(-1, keepdim=True) - token_logits_chunks.append((token_logits_raw / token_logits_raw_sum).squeeze(2)) - token_logits = torch.cat( - token_logits_chunks, - dim=1 - ) + token_logits_chunks.append( + (token_logits_raw / token_logits_raw_sum).squeeze(2) + ) + token_logits = torch.cat(token_logits_chunks, dim=1) return token_logits @@ -248,17 +262,17 @@ 
class SaveableGenerator: Arguments --------- generators : list, optional - A list of generator objects. If not provided, + A list of generator objects. If not provided, """ def __init__(self, generators=None): if generators is None: - generators = { - "default": torch.default_generator - } + generators = {"default": torch.default_generator} if torch.cuda.is_available(): for idx in range(torch.cuda.device_count()): - generators[f"cuda:{idx}"] = _CudaDefaultGeneratorWrapper(idx) + generators[f"cuda:{idx}"] = _CudaDefaultGeneratorWrapper( + idx + ) self.generators = generators @@ -281,11 +295,15 @@ def _recover(self, path, end_of_epoch): match = re.match(r"cuda:(\d+)", key) if match: if not torch.cuda.is_available(): - logger.warn("Unable to restore RNG for %s, CUDA unavailable", key) + logger.warn( + "Unable to restore RNG for %s, CUDA unavailable", key + ) continue idx = int(match.group(1)) if idx > torch.cuda.device_count() - 1: - logger.warn("Unable to restore RNG for %s, device not found", key) + logger.warn( + "Unable to restore RNG for %s, device not found", key + ) continue self.generators[key].set_state(state) @@ -300,6 +318,7 @@ class _CudaDefaultGeneratorWrapper: --------- device : int|str The device index or identifier""" + def __init__(self, device): self.device = device diff --git a/benchmarks/DASB/model/sq_codec.py b/benchmarks/DASB/model/sq_codec.py index 2c52ee8ac..e5c1ea970 100644 --- a/benchmarks/DASB/model/sq_codec.py +++ b/benchmarks/DASB/model/sq_codec.py @@ -126,7 +126,9 @@ def build_codec_model(self, config): exp_model_config = OmegaConf.load(config) scalar_codec = ScalarModel(**exp_model_config.generator.config) device = next(iter(scalar_codec.parameters())).device - parameter_dict = torch.load(self.ckpt_path, map_location=device, weights_only=False) + parameter_dict = torch.load( + self.ckpt_path, map_location=device, weights_only=False + ) scalar_codec.load_state_dict(parameter_dict["codec_model"]) return scalar_codec @@ -1290,6 +1292,7 @@ class TernaryEmbedding(nn.Module): --------- num_digits : int The number of ternary digits""" + def __init__(self, num_digits, emb_size=512, flat=False): super().__init__() self.num_digits = num_digits @@ -1338,7 +1341,9 @@ def decimal_to_ternary_matrix(decimals, D): corresponds to a batch, and each column is represented as a ternary number. 
""" B, T = decimals.shape - ternary_matrix = torch.zeros((B, D, T), dtype=torch.long, device=decimals.device) + ternary_matrix = torch.zeros( + (B, D, T), dtype=torch.long, device=decimals.device + ) for pos in range(D): ternary_matrix[:, pos, :] = decimals % 3 # Modulo operation decimals //= 3 # Floor division for next ternary digit @@ -1403,13 +1408,17 @@ def ternary_matrix_to_decimal_torch(matrix): ) = ( matrix.shape ) # B is the batch size, D is the number of digits, N is the number of ternary numbers - powers_of_three = 3 ** torch.arange(D, device=matrix.device) # [3^0, 3^1, ..., 3^(D-1)] + powers_of_three = 3 ** torch.arange( + D, device=matrix.device + ) # [3^0, 3^1, ..., 3^(D-1)] # Reshape powers_of_three for broadcasting: [D] -> [1, D, 1] powers_of_three = powers_of_three[:, None] # Shape [D, 1] # Compute dot product using broadcasting: matrix * powers_of_three along D axis - decimals = torch.sum(matrix * powers_of_three, axis=1) # Sum along the D axis + decimals = torch.sum( + matrix * powers_of_three, axis=1 + ) # Sum along the D axis return decimals @@ -1442,7 +1451,7 @@ def ternary_to_decimal(ternary, n_codebook=4): (Batch x Length x num_positions) - ternary digits n_codebooks : torch.Tensor The number of codebooks - + Returns ------- result: torch.Tensor @@ -1473,7 +1482,9 @@ def ternary_logits_to_tokens(logits, n_codebook=4): Token IDs """ ternary_matrix = logits_to_ternary(logits) - tokens = ternary_to_decimal(ternary_matrix.transpose(-1, -2), n_codebook=n_codebook) + tokens = ternary_to_decimal( + ternary_matrix.transpose(-1, -2), n_codebook=n_codebook + ) return tokens @@ -1498,10 +1509,9 @@ def tokens_to_ternary(tokens, D=9): batch_size = tokens.size(0) n_codebook = tokens.size(2) tokens = tokens.view(batch_size, -1, n_codebook).permute(2, 0, 1).clone() - ternary_matrix = torch.cat([ - decimal_to_ternary_matrix(item, D=D) - 1 - for item in tokens - ], dim=1) + ternary_matrix = torch.cat( + [decimal_to_ternary_matrix(item, D=D) - 1 for item in tokens], dim=1 + ) ternary_matrix = ternary_matrix.transpose(1, 2) if not has_batch: ternary_matrix = ternary_matrix[0] @@ -1525,7 +1535,15 @@ def logits_to_ternary(logits): return ternary -def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ternary", num_positions=9, reduction="mean"): +def ternary_loss( + predictions, + targets, + length=None, + mask=None, + targets_type="ternary", + num_positions=9, + reduction="mean", +): if targets.dim() < 3: targets = targets.unsqueeze(-1) if targets_type == "tokens": @@ -1534,15 +1552,10 @@ def ternary_loss(predictions, targets, length=None, mask=None, targets_type="ter targets_cat = targets + 1 predictions_loss = predictions.permute(0, 3, 1, 2).contiguous() loss = nn.functional.nll_loss( - predictions_loss, - targets_cat, - reduction="none" + predictions_loss, targets_cat, reduction="none" ) if length is not None: - mask = length_to_mask( - length * max_len, - max_len - ) + mask = length_to_mask(length * max_len, max_len) mask = mask.unsqueeze(-1) if mask is not None: loss = loss * mask diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 1e098463a..340dcdc0f 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -135,7 +135,7 @@ def __init__( n_layer=ar_layer, qk_norm=qk_norm, dropout=dropout, - target_dropout=target_dropout + target_dropout=target_dropout, ) if nq > 1: # NOTE: An NAR encoder is not needed if there is only one track @@ -217,9 +217,13 @@ def forward( :, 1: ] # [B, T, V] max_len = 
dec_seq.size(1) - mask = length_to_mask(dec_seq_lengths * max_len - 1, max_len - 1).bool() + mask = length_to_mask( + dec_seq_lengths * max_len - 1, max_len - 1 + ).bool() mask = mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] - h_nar = self.nar_decoder(input_nar_emb, nar_level_idx - 1, mask=mask) + h_nar = self.nar_decoder( + input_nar_emb, nar_level_idx - 1, mask=mask + ) # Logits logits_ar, logits_nar = None, None @@ -255,32 +259,7 @@ def prepare_input(self, dec_seq_emb, prefix_len, level): mask = torch.logical_or(level_mask, prefix_mask) return dec_seq_emb.masked_fill(~mask, 0.0).sum(2) - @torch.no_grad() - def inference( - self, prefix, opts, enc_seq=None, suffix=None, - ): - """Vall-E Inference. - - Arguments - --------- - prefix : torch.Tensor - Prefix part of dec_seq (B, T, nq). - opts : SpeechLMInferenceOptions - inference options. - enc_seq : torch.Tensor - Encoder token sequence (B, T, nq). - suffix : torch.Tensor - suffix part of dec_seq (B, T, nq), - usually the target sequence for teacher-forcing. - - Returns - ------- - gen_tokens_list : list - Generated tokens - gen_scores_list : list - The scores associated with the generated tokens - """ - + def _init_inference(self, prefix, opts, enc_seq, suffix): # (1) initialization cache = self.ar_decoder.init() @@ -324,6 +303,59 @@ def inference( if is_flattened: prev_tok = prev_tok.expand(1, tracks) mask_cache = [] + return ( + prefix_emb, + generated, + finish_idx, + cache, + modality_index, + mask, + mask_cache, + prev_tok, + minlen, + maxlen, + is_flattened, + ) + + @torch.inference_mode() + def inference( + self, prefix, opts, enc_seq=None, suffix=None, + ): + """Vall-E Inference. + + Arguments + --------- + prefix : torch.Tensor + Prefix part of dec_seq (B, T, nq). + opts : SpeechLMInferenceOptions + inference options. + enc_seq : torch.Tensor + Encoder token sequence (B, T, nq). + suffix : torch.Tensor + suffix part of dec_seq (B, T, nq), + usually the target sequence for teacher-forcing. + + Returns + ------- + gen_tokens_list : list + Generated tokens + gen_scores_list : list + The scores associated with the generated tokens + """ + ( + prefix_emb, + generated, + finish_idx, + cache, + modality_index, + mask, + mask_cache, + prev_tok, + minlen, + maxlen, + is_flattened, + ) = self._init_inference(prefix, opts, enc_seq, suffix) + modality_tokens = torch.tensor( list(opts.masks.keys()), device=prefix.device ) @@ -334,15 +366,13 @@ def inference( prev_tok = prev_tok.unsqueeze(1) prev_emb = self.emb(prev_tok).squeeze(2) # [B, 1, D] h_ar = self.ar_decoder(prev_emb, kv_cache=cache) - logits = self.logits_to_probs(self.apply_lm_head(h_ar, 0)) # [B, 1, V] + logits = self.logits_to_probs( + self.apply_lm_head(h_ar, 0) + ) # [B, 1, V] if logits.dim() < 4: logits = logits.unsqueeze(-2) gen_tok, gen_score = logits_to_tokens( - logits, - opts, - mask, - allow_eos=step >= minlen, - nq_level=0, + logits, opts, mask, allow_eos=step >= minlen, nq_level=0, ) # [B, 1, 1] -> [B, 1] gen_tok, gen_score = gen_tok.squeeze(1), gen_score.squeeze(1) @@ -403,10 +433,12 @@ def inference( valid_idx = finish_idx.ne(-1).nonzero(as_tuple=True)[0] if len(valid_idx) == 0: self.ar_decoder.reset() - logging.warning(f"No valid examples. Return None") + logging.warning("No valid examples. 
Return None") return [], [] elif len(valid_idx) < prefix.size(0): - logging.info(f"Only {len(valid_idx)} of {prefix.size(0)} are valid") + logging.info( + "Only %d of %d are valid", len(valid_idx), prefix.size(0) + ) finish_idx = finish_idx[valid_idx] prefix_emb = prefix_emb[valid_idx] @@ -426,70 +458,18 @@ def inference( self.ar_decoder.reset() # (4) non-auto-regressive loop on the remained code layers - # (4.1) NAR initialization - if opts.search_algo == "teacher_force": - prev_tok = suffix[:, :, 0] - else: - prev_tok = gen_tokens_ar[:, :, 0] - start_token = torch.tensor( - [opts.start], device=prefix.device - )[None, None, :] - - # (4.2) NAR loop if self.nq > 1: - start_emb = self.emb(start_token).squeeze().tile( - len(valid_idx), 1, 1 - ) # [B, 1, D] - prev_emb = torch.cat( - [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 - ) # [B, T, D] - - ones = torch.ones_like(valid_idx) - mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() - mask = mask.unsqueeze(1).unsqueeze(1) - generated = {"token": [], "score": []} - - mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache - vocab_mask = torch.cat(mask_cache, dim=1) - - for step in range(1, opts.nq): - h_nar = self.nar_decoder( - prev_emb, ones * step - 1, mask=mask - ) # [B, T, D] - - logits = self.apply_lm_head(h_nar, step) - logits = self.logits_to_probs(logits) - gen_tok, gen_score = logits_to_tokens( - logits.unsqueeze(2), - opts, - vocab_mask, - search_algo="greedy_search", - allow_eos=False, - nq_level=step, - ) - gen_tok, gen_score = ( - gen_tok.squeeze(2), - gen_score.squeeze(2), - ) # [B, T] - - generated["token"].append(gen_tok[:, prefix.size(1) :]) - generated["score"].append(gen_score[:, prefix.size(1) :]) - - if opts.search_algo == "teacher_force": - prev_tok = suffix[:, :, step] - else: - prev_tok = generated["token"][-1] - prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] - prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb - - # (5) combine AR and NAR results - gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] - gen_scores_nar = torch.stack(generated["score"], dim=2) - - gen_tokens = torch.cat( - [gen_tokens_ar, gen_tokens_nar], dim=2 - ) # [B, T, nq] - gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + gen_tokens, gen_scores = self._nar_inference( + opts, + gen_tokens_ar, + gen_scores_ar, + valid_idx, + prefix_emb, + prefix, + suffix, + finish_idx, + mask_cache, + ) else: gen_tokens = gen_tokens_ar gen_scores = gen_scores_ar @@ -501,6 +481,83 @@ def inference( gen_scores_list.append(gen_scores[b][:item_finish_idx]) return gen_tokens_list, gen_scores_list + def _nar_inference( + self, + opts, + gen_tokens_ar, + gen_scores_ar, + valid_idx, + prefix_emb, + prefix, + suffix, + finish_idx, + mask_cache, + ): + # (4.1) NAR initialization + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, 0] + else: + prev_tok = gen_tokens_ar[:, :, 0] + start_token = torch.tensor([opts.start], device=prefix.device)[ + None, None, : + ] + + start_emb = ( + self.emb(start_token).squeeze().tile(len(valid_idx), 1, 1) + ) # [B, 1, D] + prev_emb = torch.cat( + [prefix_emb[:, 1:], start_emb, self.emb(prev_tok)], dim=1 + ) # [B, T, D] + + ones = torch.ones_like(valid_idx) + mask = length_to_mask(prefix.size(1) + finish_idx + 1).bool() + mask = mask.unsqueeze(1).unsqueeze(1) + generated = {"token": [], "score": []} + + mask_cache = [mask_cache[0]] * prefix.size(1) + mask_cache + vocab_mask = torch.cat(mask_cache, dim=1) + + # (4.2) NAR loop + for step in 
range(1, opts.nq): + h_nar = self.nar_decoder( + prev_emb, ones * step - 1, mask=mask + ) # [B, T, D] + + logits = self.apply_lm_head(h_nar, step) + logits = self.logits_to_probs(logits) + gen_tok, gen_score = logits_to_tokens( + logits.unsqueeze(2), + opts, + vocab_mask, + search_algo="greedy_search", + allow_eos=False, + nq_level=step, + ) + gen_tok, gen_score = ( + gen_tok.squeeze(2), + gen_score.squeeze(2), + ) # [B, T] + + generated["token"].append(gen_tok[:, prefix.size(1) :]) + generated["score"].append(gen_score[:, prefix.size(1) :]) + + if opts.search_algo == "teacher_force": + prev_tok = suffix[:, :, step] + else: + prev_tok = generated["token"][-1] + prev_emb[:, prefix.size(1) :] += self.emb(prev_tok) # [B, T, D] + prev_emb[:, prefix.size(1) - 1 : prefix.size(1)] += start_emb + + # (5) combine AR and NAR results + gen_tokens_nar = torch.stack(generated["token"], dim=2) # [B, T, nq] + gen_scores_nar = torch.stack(generated["score"], dim=2) + + gen_tokens = torch.cat( + [gen_tokens_ar, gen_tokens_nar], dim=2 + ) # [B, T, nq] + gen_scores = torch.cat([gen_scores_ar, gen_scores_nar], dim=2) + return gen_tokens, gen_scores + def apply_lm_head(self, x, track): """Applies the language model head @@ -630,7 +687,8 @@ class TransformerDecoder(nn.Module): The target dropout probability layer_class : type The layer type to be used - """ + """ + def __init__( self, n_ctx, @@ -1279,7 +1337,7 @@ def select(self, tokens, scores, text): class WhisperASRSampleSelector(SampleSelector): """A selector implemented using Whisper - + Arguments --------- tokenizer: BaseTokenizer @@ -1307,6 +1365,7 @@ class WhisperASRSampleSelector(SampleSelector): Additional arguments for the tokenizer decoding function """ + def __init__( self, tokenizer, @@ -1358,12 +1417,14 @@ def select(self, tokens, scores, text): if self.offsets is not None: tokens_shift = tokens_shift - self.offsets tokens_shift = tokens_shift.clip(0) - wav = self.tokenizer.tokens_to_sig(tokens_shift, **self.token_model_kwargs) + wav = self.tokenizer.tokens_to_sig( + tokens_shift, **self.token_model_kwargs + ) if self.sample_rate != self.tokenizer_sample_rate: wav = torchaudio.functional.resample( wav, orig_freq=self.tokenizer_sample_rate, - new_freq=self.sample_rate + new_freq=self.sample_rate, ) wav = undo_padding_tensor(wav, length) metric = ErrorRateStats() @@ -1377,7 +1438,7 @@ def select(self, tokens, scores, text): "Ground truth text: %s, sample scores: %s, best: #%d", text, sample_scores, - idx + idx, ) if self.debug: sio = StringIO() @@ -1391,13 +1452,15 @@ def predict(self, wav): wav = self.model.pad_or_trim(wav) mels = self.model.log_mel_spectrogram(wav) enc_out = self.model.forward_encoder(mels) - pred, _, _, _ = self.searcher(enc_out.detach(), torch.tensor(1., device=wav.device)) + pred, _, _, _ = self.searcher( + enc_out.detach(), torch.tensor(1.0, device=wav.device) + ) pred = self.model.tokenizer.batch_decode( pred, skip_special_tokens=True )[0] pred = self.normalize(pred) return pred - + def normalize(self, text): """Performs text normalization (uppercase, remove whitespace, remove punctuation) @@ -1416,4 +1479,3 @@ def normalize(self, text): text = text.strip() text = RE_PUNCTUATION.sub("", text) return text - diff --git a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 1694355ec..5d90069ef 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -250,7 +250,7 @@ def __call__(self, wavs, length): class ASRSpeechEvaluator(SpeechEvaluator): """A superclass for ASR speech evaluators - + 
Arguments --------- sample_rate : int @@ -365,10 +365,12 @@ def compute_diff_rate(self, details, device): cer_metric.append(ids, pred, pred_ref) count = len(ids) dwer = torch.tensor( - [score["WER"] for score in wer_metric.scores[-count:]], device=device + [score["WER"] for score in wer_metric.scores[-count:]], + device=device, ) dcer = torch.tensor( - [score["WER"] for score in cer_metric.scores[-count:]], device=device + [score["WER"] for score in cer_metric.scores[-count:]], + device=device, ) return {"dwer": dwer, "dcer": dcer} @@ -460,7 +462,9 @@ def __init__( self.unbatch = unbatch self.to(device) - def evaluate_samples(self, wavs, length, text, sample_rate, metric_key="regular"): + def evaluate_samples( + self, wavs, length, text, sample_rate, metric_key="regular" + ): """Evaluates a batch of samples Arguments @@ -524,7 +528,7 @@ def _evaluate_samples(self, wavs, length, text, sample_rate, metric_key): sample_rate : int The sample rate of the waveforms metric_key : bool - Whether to compute the metrics + Whether to compute the metrics Returns ------- @@ -550,10 +554,12 @@ def _evaluate_samples(self, wavs, length, text, sample_rate, metric_key): cer_metric.append(ids, predicted_words_split, text_split) count = len(ids) wer = torch.tensor( - [score["WER"] for score in wer_metric.scores[-count:]], device=wavs.device + [score["WER"] for score in wer_metric.scores[-count:]], + device=wavs.device, ) cer = torch.tensor( - [score["WER"] for score in cer_metric.scores[-count:]], device=wavs.device + [score["WER"] for score in cer_metric.scores[-count:]], + device=wavs.device, ) result = { "wer": wer, diff --git a/benchmarks/DASB/utils/tokenizer_interface.py b/benchmarks/DASB/utils/tokenizer_interface.py index 1ba9bc21a..bc2a43966 100644 --- a/benchmarks/DASB/utils/tokenizer_interface.py +++ b/benchmarks/DASB/utils/tokenizer_interface.py @@ -7,6 +7,8 @@ --------- * Pooneh Mousavi, 2024 """ + +import importlib import sys import os import torch @@ -559,17 +561,18 @@ def _load(self): filename=self.model_ckpt, source=self.source, savedir=str(self.save_path), - save_filename=str(Path(self.model_ckpt).name) + save_filename=str(Path(self.model_ckpt).name), ) config_file_name = fetch( filename=self.model_config, source=self.source, savedir=str(self.save_path), - save_filename="config.yaml" + save_filename="config.yaml", ) with open(config_file_name) as config_file: config = yaml.safe_load(config_file) from espnet2.gan_codec.encodec.encodec import Encodec as ESPNetEncodec + self.encodec = ESPNetEncodec(**config["codec_conf"]) device = next(iter(self.encodec.parameters())).device state_dict = torch.load(ckpt_file_name, map_location=device) @@ -581,7 +584,7 @@ def _load(self): def _load_espnet(self): try: - import espnet2 + importlib.import_module("espnet2") except ModuleNotFoundError: self._download_espnet() @@ -590,13 +593,17 @@ def _download_espnet(self): espnet_path = self.save_path / "espnet" if not espnet_path.exists(): logger.info("Cloining %s into %s", self.espnet_repo, espnet_path) - cmd = shlex.join(["git", "clone", self.espnet_repo, str(espnet_path)]) + cmd = shlex.join( + ["git", "clone", self.espnet_repo, str(espnet_path)] + ) run_shell(cmd) else: logger.info("%s already exists", espnet_path) if self.espnet_commit: logger.info("Checking out %s", self.espnet_commit) - cmd = shlex.join(["git", "-C", str(espnet_path), "checkout", self.espnet_commit]) + cmd = shlex.join( + ["git", "-C", str(espnet_path), "checkout", self.espnet_commit] + ) run_shell(cmd) logger.info("Installing") cmd = 
shlex.join(["pip", "install", "-e", str(espnet_path)]) @@ -609,7 +616,7 @@ def sig_to_tokens(self, signal, lengths=None, num_codebooks=None, **kwargs): if signal.dim() < 3: signal = signal.unsqueeze(1) tokens = self.encodec.encode(signal) - return tokens.permute(1, 2, 0)[:, :, :self.n_codebook] + return tokens.permute(1, 2, 0)[:, :, : self.n_codebook] @torch.no_grad() def tokens_to_sig(self, tokens, **kwargs): @@ -628,4 +635,4 @@ def get_pretrained_embeddings( """ raise ValueError( "ESPNet Encodec does not have any trainable quantizer or embedding since it uses scalar quantization." - ) \ No newline at end of file + ) From c6a20d85d8f956be8174450e9bcf10d352e3a2a7 Mon Sep 17 00:00:00 2001 From: flexthink Date: Tue, 8 Jul 2025 00:24:27 -0400 Subject: [PATCH 268/270] DASB: TTS: Fix docstrings --- .../LJSpeech/TTS/tokotron/audio_tokens.py | 1 - .../DASB/LJSpeech/TTS/valle/evaluation.py | 5 + benchmarks/DASB/LJSpeech/TTS/valle/train.py | 13 ++ .../DASB/LibriTTS/TTS/tokotron/evaluate.py | 53 ++++++ .../DASB/LibriTTS/TTS/tokotron/train.py | 20 +++ .../DASB/LibriTTS/TTS/valle/evaluation.py | 5 + .../DASB/LibriTTS/TTS/valle/inference_fit.py | 130 +++++++++++++- .../LibriTTS/TTS/valle/tokenizer_prepare.py | 2 - benchmarks/DASB/LibriTTS/TTS/valle/train.py | 26 +++ benchmarks/DASB/model/Tokotron.py | 26 +++ benchmarks/DASB/model/valle.py | 170 +++++++++++++++--- benchmarks/DASB/utils/eval.py | 28 +++ 12 files changed, 444 insertions(+), 35 deletions(-) delete mode 120000 benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py deleted file mode 120000 index e34e113e5..000000000 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/audio_tokens.py +++ /dev/null @@ -1 +0,0 @@ -../../../utils/audio_tokens.py \ No newline at end of file diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py index d5aaa649d..9700e8363 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/evaluation.py @@ -1,3 +1,8 @@ +"""TTS evaluation tools + +Authors + * Artem Ploujnikov 2024 +""" import json import torch import logging diff --git a/benchmarks/DASB/LJSpeech/TTS/valle/train.py b/benchmarks/DASB/LJSpeech/TTS/valle/train.py index 986eb9a7c..899c0f159 100644 --- a/benchmarks/DASB/LJSpeech/TTS/valle/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/valle/train.py @@ -518,6 +518,19 @@ def _get_inference_opts(self): ) def save_samples(self, batch, wav, length, stage): + """Saves audio samples + + Arguments + --------- + batch : PaddedBatch + An audio batch + wav : torch.Tensor + Generated audio + length : torch.Tensor + Relative lengths + stage : speechbrain.Stage + The training stage + """ output_folder = self._get_eval_output_folder(stage) samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py index aa7ee2c4b..9b2801ee8 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/evaluate.py @@ -169,6 +169,23 @@ def create_reports(self): self.perf_writer.writeheader() def infer(self, tokens, tokens_length, emb): + """Performs inference + + Arguments + --------- + tokens : torch.Tensor + A token sequence + tokens_length : torch.Tensor + Relative lengths + emb : dict + Embeddings for conditioning + + Returns + ------- + wav : 
torch.Tensor + The waveform + stats : dict + Statistics""" stats = {} if self.hparams.eval_perf: flop_counter = FlopCounterMode() @@ -190,6 +207,21 @@ def infer(self, tokens, tokens_length, emb): return infer_out, stats def vocoder(self, infer_out, emb): + """Runs the vocoder to create a waveform + + Arguments + --------- + infer_out : Tokotron.TokotronInfernceOutput + Inference output + emb : dict + Embeddings for conditioning + + Returns + ------- + wav : torch.Tensor + The waveform + stats : dict + Statistics""" stats = {} if self.hparams.eval_perf: flop_counter = FlopCounterMode() @@ -363,6 +395,14 @@ def write_summary(self): json.dump(summary, output_file, indent=4) def write_perf_stats(self, uttid, details): + """Outputs performance statistics + + Arguments + --------- + uttid : list + A list of utterance IDs + details : dict + Performance details""" self.perf_writer.writerow({"uttid": " ".join(uttid), **details}) self.perf_file.flush() @@ -408,6 +448,19 @@ def flatten(value): def ascii_only(values): + """Retains only ASCII characters from the values in a + dictionary + + Arguments + --------- + values : dict + a key/value dictionary + + Returns + ------- + result : dict + The same dictionary but with non-ASCII characters + """ return { key: RE_NON_ASCII.sub("", value) if isinstance(value, str) else value for key, value in values.items() diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py index 7dc4c4ab2..7926a3f04 100644 --- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py @@ -134,6 +134,26 @@ def compute_forward(self, batch, stage): return predictions, features def prepare_features(self, batch): + """Prepares Tokotron TTS features + + Arguments + --------- + batch : PaddedBatch + A batch of data + + Returns + ------- + audio_bos : torch.Tensor + Audio represnetations (discrete or continuous) with the BOS marker + audio_bos_length : torch.Tensor + Relative lengths of audio representations with the BOS marker + audio_tgt : torch.Tensor + Audio prediction targets + audio_tgt_length : torch.Tensor + Audio prediction targets - relative lengths + spk_emb : torch.Tensor + Speaker embeddings + """ if self.hparams.spk_emb_shuffle: wav, wav_length = batch.spk_emb_random_match else: diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py index 58fdd5abb..017a5c367 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/evaluation.py @@ -1,3 +1,8 @@ +"""TTS evaluation tools + +Authors + * Artem Ploujnikov 2024 +""" import json import torch import logging diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py index f81252b3a..759014220 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/inference_fit.py @@ -34,7 +34,15 @@ class InferenceFit: - """An inference fit wrapper""" + """A wrapper class for hyperparameter fitting + + Arguments + --------- + hparams : dict + Parsed hyperparameters + run_opts : dict + Parsed run options + """ def __init__(self, hparams, run_opts): device = run_opts.get("device", "cpu") @@ -64,8 +72,8 @@ def fit(self, dataset): Arguments --------- - dataset: DynamicItemDataset - a dataset + dataset: speechbrain.dataio.dataset.DynamicItemDataset + A dataset instance Returns ------- @@ -86,11 +94,35 @@ def fit(self, dataset): return self.result, 
self.best def is_completed(self, params): + """Determines whether the fitting run has been completed + + Arguments + --------- + params : torch.Tensor + the parameters to evaluate + + Returns + ------- + result : bool + Whether the run has been completed + """ folder_name = params_to_folder_name(params) path = self.output_folder / folder_name / "summary.json" return path.exists() def get_result(self, params): + """Retrieves the result for a completed run + + Arguments + --------- + params : torch.Tensor + A hyperparameter search entry + + Returns + ------- + result : dict + The result of the run + """ params_str = format_params(params) logger.info("Retrieving params for completed run %s", params_str) folder_name = params_to_folder_name(params) @@ -104,6 +136,13 @@ def get_result(self, params): return result def find_best(self): + """Finds the best run result based on the metric chosen + + Returns + ------- + result : dict + The best result + """ best = self.result[0] op = ( operator.lt @@ -117,9 +156,31 @@ def find_best(self): return best def enumerate_param_space(self): + """Enumerates the parameter space + + Returns + ------- + result : generator + The parameter space (each element is a dictionary of hyperparameters) + """ return enumerate_space(self.space) def evaluate(self, dataset, params): + """Performs evaluation at a particular point + in the hyperparameter space + + Arguments + --------- + dataset : speechbrain.dataio.dataset.DynamicItemDataset + A dataset instance + params : dict + The hyperparameter dictionary + + Returns + ------- + metrics : dictionary + a key/value dictionary with the metrics computed + """ dataloader = sb.dataio.dataloader.make_dataloader(dataset) params_str = format_params(params) logger.info("Starting evaluation of %s", params_str) @@ -141,6 +202,14 @@ def evaluate(self, dataset, params): return metrics def evaluate_batch(self, batch, params): + """Evaluates a single batch + + Arguments + --------- + batch : PaddedBatch + A single batch of data + params : dict + A set of hyperparameters to try""" batch = batch.to(self.device) audio_tokens, audio_length = self.inference(batch, params) wav = self.create_waveform(audio_tokens, audio_length) @@ -155,6 +224,7 @@ def evaluate_batch(self, batch, params): ) def write_report(self): + """Outputs the hyperparameter fitting report""" if self.result is None: logger.warning("Nothing to report") return @@ -286,6 +356,7 @@ def _get_inference_opts(self, params): ) def recover(self): + """Recovers a checkpoint according to the settings specified""" test_key_kind = hparams["test_key_kind"] test_key = hparams["test_key"] kwargs = {f"{test_key_kind}_key": test_key} @@ -298,6 +369,24 @@ def recover(self): def enumerate_space(space, entry=None, points=None): + """Enumerates the hyperparameter space for a full + grid search + + Arguments + --------- + space : dict + A key -> value dictionary with hyperparameter names as keys + and sets of values to try as values + entry : dict + The entry being constructed + points : list + The list of points being constructed + + Returns + ------- + result : list + All configurations to try + """ if points is None: points = [] if not space: @@ -314,16 +403,51 @@ def enumerate_space(space, entry=None, points=None): def format_space(space): + """Formats a hyperparameter space for display + + Arguments + --------- + space : dict + A space definition + + Returns + ------- + result : str + A formatted space for display""" return ", ".join( f"{parameter}: {values}" for parameter, values in 
space.items() ) def format_params(params): + """Formats a set of hyperparameters (a single point in the hyperparameter + space) for display + + Arguments + --------- + params : dict + A dictionary of hyperparameter values + + Returns + ------- + result : str + A formatted hyperparameter dictionary + """ return ", ".join(f"{key}={value}" for key, value in params.items()) def params_to_folder_name(params): + """Formats a dictionary of hyperparameters as a folder name (for ease of reading) + + Arguments + --------- + params : dict + A dictionary of hyperparameter values + + Returns + ------- + result : str + The corresponding folder name""" params_str = "-".join(f"{key}-{value}" for key, value in params.items()) return f"eval-{params_str}" diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py index 3fe83556b..896e6e4f5 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/tokenizer_prepare.py @@ -3,9 +3,7 @@ """ import json -import os import re -import speechbrain as sb from pathlib import Path from speechbrain.lobes.models.g2p.dataio import build_token_char_map diff --git a/benchmarks/DASB/LibriTTS/TTS/valle/train.py b/benchmarks/DASB/LibriTTS/TTS/valle/train.py index 13efdaf26..6cf1c7eca 100644 --- a/benchmarks/DASB/LibriTTS/TTS/valle/train.py +++ b/benchmarks/DASB/LibriTTS/TTS/valle/train.py @@ -575,6 +575,19 @@ def _get_inference_opts(self): ) def save_samples(self, batch, wav, length, stage): + """Saves audio samples + + Arguments + --------- + batch : PaddedBatch + An audio batch + wav : torch.Tensor + Generated audio + length : torch.Tensor + Relative lengths + stage : speechbrain.Stage + The training stage + """ output_folder = self._get_eval_output_folder(stage) samples = undo_padding_tensor(wav, length) for uttid, sample in zip(batch.uttid, samples): @@ -611,6 +624,19 @@ def _get_eval_output_folder(self, stage): return output_folder def fit_batch(self, batch): + """Fit one batch, using the default implementation with per-step + annealing + + Arguments + --------- + batch : list of torch.Tensors + Batch of data to use for training. Default implementation assumes + this batch has two elements: inputs and targets. + + Returns + ------- + detached loss + """ loss = super().fit_batch(batch) if self.hparams.lr_annealing_mode == "step": self.hparams.lr_annealing(self.optimizer) diff --git a/benchmarks/DASB/model/Tokotron.py b/benchmarks/DASB/model/Tokotron.py index bb414b0d6..833e98da4 100644 --- a/benchmarks/DASB/model/Tokotron.py +++ b/benchmarks/DASB/model/Tokotron.py @@ -78,16 +78,22 @@ class EosMode(Enum): + """The method of determining end-of-sequence""" + GATE = "gate" TOKEN = "token" class DecoderMode(Enum): + """The method of determining what type of decoder to use""" + AUTOREGRESSIVE = "autoregressive" FORWARD = "forward" class RepresentationMode(Enum): + """Inidcates the type of representations to use for audio (discrete or continuous)""" + DISCRETE = "discrete" CONTINUOUS = "continuous" @@ -1817,10 +1823,30 @@ def __call__(self, opt): class PositionalEncoding(TransformerPositionalEncoding): + """A wrapper for the positional encoding that does not try + to be loaded from state dictionaries""" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def load_state_dict(self, state_dict, strict=True, assign=False): + """Copy parameters and buffers from :attr:`state_dict` into this module and its descendants. 
+ + Arguments + --------- + state_dict : dict + A dict containing parameters and persistent buffers. + strict : (bool, optional) + Whether to strictly enforce that the keys + assign (bool, optional): whether to assign items in the state + dictionary to their corresponding keys in the module + + Returns + ------- + ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields: + * **missing_keys** is a list of str containing the missing keys + * **unexpected_keys** is a list of str containing the unexpected keys + """ pass diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index 340dcdc0f..d1e1c33bf 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -235,6 +235,23 @@ def forward( return logits_ar, logits_nar def prepare_input(self, dec_seq_emb, prefix_len, level): + """Prepares the input sequence by adding up + embeddings that are not masked + + Arguments + --------- + dec_seq_emb : torch.Tensor + The decoder sequence embedding + prefix_len : torch.Tensor + The prefix lengths + level : int | torch.Tensor + The level number or a level mask + + Returns + ------- + result : torch.Tensor + The combined embedding + """ # NOTE(Jinchuan): have to use "expand" here but maybe lead to extra memory usage. # This is because both prefix_mask and level_mask are broadcastable and will # trigger user warning. @@ -790,7 +807,23 @@ def forward(self, x): class Linear(nn.Linear): + """A linear layer wrapper that performs automatic + type conversions + """ + def forward(self, x: Tensor) -> Tensor: + """Computes the forward pass + + Arguments + --------- + x : torch.Tensor + The input data + + Returns + ------- + result : torch.Tensor + The result + """ return F.linear( x, self.weight.to(x.dtype), @@ -873,6 +906,31 @@ def forward( class ValleNARDecoder(TransformerDecoder): + """The VALL-E non-autoregressive decoder + + Arguments + --------- + n_level : int + The number of levels + n_ctx : int + The context length + n_state : int + The number of states + n_head : int + The number of attention heads + n_layer : int + The number of layers + causal : bool + Whether to operate in causal mode (i.e. avoid attending + to future steps) + qk_norm : bool + Queries/Keys Normalization + dropout : float + The dropout probability + layer_class : type + The layer class to use + """ + def __init__( self, n_level, @@ -885,30 +943,6 @@ def __init__( dropout=0.0, layer_class=ResidualAttentionBlockAdaLN, ): - """The VALL-E non-autoregressive decoder - - Arguments - --------- - n_level : int - The number of levels - n_ctx : int - The context length - n_state : int - The number of states - n_head : int - The number of attention heads - n_layer : int - The number of layers - causal : bool - Whether to operate in causal mode (i.e. 
avoid attending
-            to future steps)
-        qk_norm : bool
-            Queries/Keys Normalization
-        dropout : float
-            The dropout probability
-        layer_class : type
-            The layer class to use
-        """
         super().__init__(
             n_ctx=n_ctx,
             n_state=n_state,
@@ -1125,6 +1159,20 @@ def install_kv_cache_hook(model, cache):
     hooks = []

     def save_to_cache(module, _, output):
+        """Saves the output in the module cache
+
+        Arguments
+        ---------
+        module : torch.nn.Module
+            A module instance
+        output : torch.Tensor
+            The module output
+
+        Returns
+        -------
+        result : torch.Tensor
+            Concatenated outputs
+        """
         if module not in cache:
             # save as-is, for the first token or cross attention
             cache[module] = output
@@ -1132,8 +1180,15 @@ def save_to_cache(module, _, output):
             cache[module] = torch.cat([cache[module], output], dim=1).detach()
         return cache[module]

-    def install_hooks(layer: torch.nn.Module):
-        if isinstance(layer, MultiHeadAttention):
+    def install_hooks(layer):
+        """Installs the forward hooks
+
+        Arguments
+        ---------
+        layer : torch.nn.Module
+            A layer instance
+        """
+        if isinstance(layer, MultiHeadAttention):
             hooks.append(layer.key.register_forward_hook(save_to_cache))
             hooks.append(layer.value.register_forward_hook(save_to_cache))

@@ -1255,8 +1310,22 @@ def install_continuous_features(


 def modality_index_to_mask(
-    modality_index: torch.Tensor, inference_opts: SpeechLMInferenceOptions,
+    modality_index, inference_opts,
 ):
+    """Converts a modality index to a mask
+
+    Arguments
+    ---------
+    modality_index : torch.Tensor
+        The modality index
+    inference_opts : SpeechLMInferenceOptions
+        The inference options
+
+    Returns
+    -------
+    result : torch.Tensor
+        The modality mask
+    """
     assert modality_index.dim() == 1
     modality_index = modality_index.cpu().tolist()
     mask = torch.stack(
@@ -1305,7 +1374,7 @@ def masked_nll_loss(
 class SampleSelector:
     """A base class for sample selectors"""

-    def select(self, tokens, scores, label):
+    def select(self, tokens, scores, text):
         """Performs selection

         Arguments
         ---------
@@ -1316,17 +1385,33 @@ def select(self, tokens, scores, label):
         scores : list
             The scores

-        label : str
+        text : str
             The label for the sample
         """
         raise NotImplementedError()


 class DefaultSampleSelector(SampleSelector):
+    """A default no-op sample selector that simply selects the
+    first sample (useful only when nbest=1)"""
+
     def __init__(self, **kwargs):
         pass

     def select(self, tokens, scores, text):
+        """Performs selection
+
+        Arguments
+        ---------
+        tokens : list
+            The generated tokens
+
+        scores : list
+            The scores
+
+        text : str
+            The label for the sample
+        """
         return tokens[0]
@@ -1364,6 +1449,8 @@ class WhisperASRSampleSelector(SampleSelector):
     token_model_kwargs : dict
         Additional arguments for the tokenizer
         decoding function
+    device : str | torch.device
+        The target device
     """

     def __init__(
@@ -1412,6 +1499,19 @@ def __init__(
         tokenizer.codec_vocoder.device = device

     def select(self, tokens, scores, text):
+        """Performs selection
+
+        Arguments
+        ---------
+        tokens : list
+            The generated tokens
+
+        scores : list
+            The scores
+
+        text : str
+            The label for the sample
+        """
         tokens, length = batch_pad_right(tokens)
         tokens_shift = tokens - self.token_shift
         if self.offsets is not None:
@@ -1447,6 +1547,18 @@ def select(self, tokens, scores, text):
         return tokens[idx]

     def predict(self, wav):
+        """Makes an ASR prediction
+
+        Arguments
+        ---------
+        wav : torch.Tensor
+            A raw waveform
+
+        Returns
+        -------
+        text : str
+            The text predicted by the ASR
+        """
         if wav.dim() < 2:
             wav = wav.unsqueeze(0)
         wav = self.model.pad_or_trim(wav)
diff --git
a/benchmarks/DASB/utils/eval.py b/benchmarks/DASB/utils/eval.py index 5d90069ef..1e9c7d2ed 100644 --- a/benchmarks/DASB/utils/eval.py +++ b/benchmarks/DASB/utils/eval.py @@ -213,6 +213,12 @@ def on_evaluation_end(self): pass def global_metrics(self): + """Returns global metrics (not tied to a specific sample) + + Returns + ------- + metrics : dict + A dictionary of metrics""" return {} @@ -266,6 +272,7 @@ def __init__(self, sample_rate=16000, metric_mode="macro"): self.metrics = {} def on_evaluation_start(self): + """Invoked when evaluation starts""" self.metrics = {} def evaluate( @@ -375,6 +382,21 @@ def compute_diff_rate(self, details, device): return {"dwer": dwer, "dcer": dcer} def get_asr_metrics(self, kind="regular"): + """Returns the ASR metrics + + Arguments + --------- + kind : the kind of metrics to obtain + 'regular' - a new metric for each sample + 'micro' - a global shared metric + + Returns + ------- + wer_metric : ErrorRateStats + the Word Error Rate (WER) metric + cer_metric : ErrorRateStats + the Character Error Rate (CER) metric + """ if self.metric_mode == "micro": if kind not in self.metrics: metrics = init_asr_metrics() @@ -394,6 +416,12 @@ def _replace_blanks(self, preds): return [" " if item == "" else item for item in preds] def global_metrics(self): + """Returns global metrics (not tied to a specific sample) + + Returns + ------- + metrics : dict + A dictionary of metrics""" global_metrics = {} if self.metric_mode == "micro": wer_metric, cer_metric = self.get_asr_metrics("regular") From ac7d6d6a9908235b85b78dc194804df0a9982fa5 Mon Sep 17 00:00:00 2001 From: flexthink Date: Thu, 10 Jul 2025 17:46:45 -0400 Subject: [PATCH 269/270] DASB: Update a docstring --- benchmarks/DASB/model/valle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/DASB/model/valle.py b/benchmarks/DASB/model/valle.py index d1e1c33bf..98f7067a8 100644 --- a/benchmarks/DASB/model/valle.py +++ b/benchmarks/DASB/model/valle.py @@ -1,4 +1,4 @@ -"""An adaptation of ESPNET VALL-E +"""An adaptation of ESPNET VALL-E for SpeechBrain Originally by Jinchuan Tian https://github.com/espnet/espnet From 17bde9de272428a41d30dcb1cdaf0871316ea842 Mon Sep 17 00:00:00 2001 From: flexthink Date: Fri, 18 Jul 2025 16:20:05 -0400 Subject: [PATCH 270/270] DASB: Cosmetic changes to pass pre-commit --- benchmarks/DASB/LJSpeech/TTS/tokotron/train.py | 2 +- benchmarks/DASB/LJSpeech/extraction/extract.py | 2 +- benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py | 2 +- benchmarks/DASB/LibriSpeech/extraction/extract.py | 2 +- benchmarks/DASB/LibriTTS/TTS/tokotron/train.py | 4 ++-- benchmarks/DASB/run_hparam_optimization.sh | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py index 229d645fe..161a1fd93 100644 --- a/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py +++ b/benchmarks/DASB/LJSpeech/TTS/tokotron/train.py @@ -28,7 +28,7 @@ base_dir = str(Path(__file__).resolve().parent.parent.parent.parent) sys.path.append(base_dir) -from model.Tokotron import ( +from model.Tokotron import ( # noqa: E402 get_silence_token, use_silence_padding, feature_pad_to, diff --git a/benchmarks/DASB/LJSpeech/extraction/extract.py b/benchmarks/DASB/LJSpeech/extraction/extract.py index 556d8a9d0..bb25afa87 100644 --- a/benchmarks/DASB/LJSpeech/extraction/extract.py +++ b/benchmarks/DASB/LJSpeech/extraction/extract.py @@ -80,7 +80,7 @@ if hparams["save_embedding"]: save_folder = pl.Path(hparams["save_folder"]) - 
logger.info(f"Saving embeddings ...")
+    logger.info("Saving embeddings ...")
     tokens_extractor.save_pretrained_embeddings(
         (save_folder / "embeddings").as_posix(),
         vocab_size=hparams["vocab_size"],
diff --git a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py
index 938ce8b96..098986565 100644
--- a/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py
+++ b/benchmarks/DASB/LibriSpeech/ASR-on-the-fly/train.py
@@ -387,7 +387,7 @@ def text_pipeline(wrd):
     )
     hparams["train_logger"].log_stats(
         stats_meta={
-            f"Codec parameters/buffers (M)": f"{codec_params / 1e6:.2f}",
+            "Codec parameters/buffers (M)": f"{codec_params / 1e6:.2f}",
             "Model parameters/buffers (M)": f"{model_params / 1e6:.2f}",
         },
     )
diff --git a/benchmarks/DASB/LibriSpeech/extraction/extract.py b/benchmarks/DASB/LibriSpeech/extraction/extract.py
index 814d252be..3a649d24f 100644
--- a/benchmarks/DASB/LibriSpeech/extraction/extract.py
+++ b/benchmarks/DASB/LibriSpeech/extraction/extract.py
@@ -88,7 +88,7 @@ if hparams["save_embedding"]:

     save_folder = pl.Path(hparams["save_folder"])
-    logger.info(f"Saving embeddings ...")
+    logger.info("Saving embeddings ...")
     tokens_extractor.save_pretrained_embeddings(
         (save_folder / "embeddings").as_posix(),
         vocab_size=hparams["vocab_size"],
diff --git a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py
index 7926a3f04..abb2cda88 100644
--- a/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py
+++ b/benchmarks/DASB/LibriTTS/TTS/tokotron/train.py
@@ -30,13 +30,13 @@
 base_dir = str(Path(__file__).resolve().parent.parent.parent.parent)
 sys.path.append(base_dir)

-from model.Tokotron import (
+from model.Tokotron import (  # noqa: E402
     RepresentationMode,
     get_silence_repr,
     get_silence_token,
     use_silence_padding,
     feature_pad_to,
-)  # noqa: E402
+)
 from evaluate import TokotronEvaluator  # noqa: E402

 logger = logging.getLogger(__name__)
diff --git a/benchmarks/DASB/run_hparam_optimization.sh b/benchmarks/DASB/run_hparam_optimization.sh
index c0b06b09a..554ed10f0 100755
--- a/benchmarks/DASB/run_hparam_optimization.sh
+++ b/benchmarks/DASB/run_hparam_optimization.sh
@@ -224,7 +224,7 @@ while [[ $# -gt 0 ]]; do
         eval_run_additional_flags+="$name $value "
       fi
       additional_flags+="$name $value " # store additional flags
-      fi 
+      fi
       shift # past argument
       ;;