From 9ec46d7b0ecd97d4a0b2101bf58353cd1aa3066c Mon Sep 17 00:00:00 2001 From: zach wener Date: Thu, 29 Jun 2023 15:57:20 -0700 Subject: [PATCH 1/9] bugfix and reduce deps --- settings.ini | 5 +---- uberduck_ml_dev/data/data.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/settings.ini b/settings.ini index 067761b1..1ceddc46 100644 --- a/settings.ini +++ b/settings.ini @@ -24,10 +24,7 @@ license = apache2 # From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive status = 2 -# Optional. Same format as setuptools requirements. Torch version seems to effect random number generator (not 100% certain). -# TODO (Sam): our goal is to rely on as few 3rd party packages as possible. We should try to remove as many of these as possible and integrate torch code directly. -# NOTE (Sam): is it possible to specify no-deps here? -requirements = Cython pytest phonemizer inflect librosa>=0.8.0 matplotlib nltk>=3.6.5 numpy>=1.23.5 csvw clldutils pandas pydub scipy scikit-learn soundfile tensorboardX torch>=1.13.0 torchaudio>=0.9.0 unidecode seaborn mdutils wordcloud wordfreq Pillow einops g2p_en@git+https://github.com/uberduck-ai/g2p emoji text-unidecode gdown pre-commit lmdb ray[default] praat-parselmouth>=0.4.3 +requirements = librosa pandas numpy scipy scikit-learn soundfile torch torchaudio einops g2p_en@git+https://github.com/uberduck-ai/g2p # Optional. Same format as setuptools console_scripts # console_scripts = diff --git a/uberduck_ml_dev/data/data.py b/uberduck_ml_dev/data/data.py index 544d44a9..3b486598 100644 --- a/uberduck_ml_dev/data/data.py +++ b/uberduck_ml_dev/data/data.py @@ -529,7 +529,7 @@ def filter_by_duration_(self, dur_min, dur_max): ] def create_speaker_lookup_table(self, data): - speaker_ids = np.sort(np.unique([x["speaker"] for x in data])) + speaker_ids = np.sort(np.unique([int(x["speaker"]) for x in data])) d = {speaker_ids[i]: i for i in range(len(speaker_ids))} print("Number of speakers:", len(d)) print("Speaker IDS", d) From 7d6c13ebbe516999ee32582ed6826153d7eb8cf0 Mon Sep 17 00:00:00 2001 From: zach wener Date: Thu, 29 Jun 2023 22:14:06 -0700 Subject: [PATCH 2/9] some minor changes --- settings.ini | 2 +- uberduck_ml_dev/data/data.py | 16 ++++- uberduck_ml_dev/models/radtts.py | 17 +----- uberduck_ml_dev/models/vits.py | 77 ------------------------- uberduck_ml_dev/trainer/radtts/train.py | 21 +++++-- 5 files changed, 33 insertions(+), 100 deletions(-) diff --git a/settings.ini b/settings.ini index 1ceddc46..71ed7950 100644 --- a/settings.ini +++ b/settings.ini @@ -24,7 +24,7 @@ license = apache2 # From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive status = 2 -requirements = librosa pandas numpy scipy scikit-learn soundfile torch torchaudio einops g2p_en@git+https://github.com/uberduck-ai/g2p +requirements = librosa pandas numpy scipy scikit-learn soundfile torch torchaudio einops g2p_en@git+https://github.com/uberduck-ai/g2p # Optional. Same format as setuptools console_scripts # console_scripts = diff --git a/uberduck_ml_dev/data/data.py b/uberduck_ml_dev/data/data.py index 3b486598..66bfff1e 100644 --- a/uberduck_ml_dev/data/data.py +++ b/uberduck_ml_dev/data/data.py @@ -514,7 +514,7 @@ def load_data(self, datasets, split="|"): return dataset def filter_by_speakers_(self, speakers, include=True): - print("Include spaker {}: {}".format(speakers, include)) + print("Include speaker {}: {}".format(speakers, include)) if include: self.data = [x for x in self.data if x["speaker"] in speakers] else: @@ -609,6 +609,9 @@ def get_speaker_id(self, speaker): if self.speaker_map is not None and speaker in self.speaker_map: speaker = self.speaker_map[speaker] + if speaker not in self.speaker_ids and int(speaker) in self.speaker_ids: + speaker = int(speaker) + return torch.LongTensor([self.speaker_ids[speaker]]) def get_text(self, text): @@ -656,6 +659,17 @@ def __getitem__(self, index): distance_map[distance_map <= 0] = 0.0 f0 = f0 - distance_map + if not os.path.exists(mel_path): + _, audio = read(audiopath) + # sub_path = audiopath.split("resampled_unnormalized.wav")[0] + audio = np.asarray(audio / (np.abs(audio).max() * 2)) + audio_norm = torch.tensor(audio, dtype=torch.float32) + audio_norm = audio_norm.unsqueeze(0) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = torch.squeeze(melspec, 0) + melspec = (melspec + 5.5) / 2 + torch.save(melspec.detach(), mel_path) + mel = torch.load(mel_path) energy_avg = None diff --git a/uberduck_ml_dev/models/radtts.py b/uberduck_ml_dev/models/radtts.py index ed103881..fdf8717c 100644 --- a/uberduck_ml_dev/models/radtts.py +++ b/uberduck_ml_dev/models/radtts.py @@ -20,6 +20,7 @@ # DEALINGS IN THE SOFTWARE. from typing import Optional + import torch from torch import nn from .common import ( @@ -84,20 +85,6 @@ def forward(self, z, context, inverse=False, seq_lens=None): return z, log_det_W, log_s -# # NOTE (Sam): comment this out for GPU -# def get_mask_from_lengths(lengths): -# """Constructs binary mask from a 1D torch tensor of input lengths -# Args: -# lengths (torch.tensor): 1D tensor -# Returns: -# mask (torch.tensor): num_sequences x max_length x 1 binary tensor -# """ -# max_len = torch.max(lengths).item() -# ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)) -# mask = (ids < lengths.unsqueeze(1)).bool() -# return mask - - class RADTTS(torch.nn.Module): def __init__( self, @@ -1011,7 +998,7 @@ def remove_norms(self): "n_early_every": 2, "n_group_size": 2, "affine_model": "wavenet", - "include_modules": "decatndpmvpredapm", + "include_modules": ["dec", "atn", "dpm", "vpred", "apm"], "scaling_fn": "tanh", "matrix_decomposition": "LUS", "learn_alignments": True, diff --git a/uberduck_ml_dev/models/vits.py b/uberduck_ml_dev/models/vits.py index 6cc242df..03506f13 100644 --- a/uberduck_ml_dev/models/vits.py +++ b/uberduck_ml_dev/models/vits.py @@ -279,83 +279,6 @@ def forward(self, x, x_lengths, g=None, local_conditioning=None): return z, m, logs, x_mask -# class Generator(torch.nn.Module): -# def __init__( -# self, -# initial_channel, -# resblock, -# resblock_kernel_sizes, -# resblock_dilation_sizes, -# upsample_rates, -# upsample_initial_channel, -# upsample_kernel_sizes, -# gin_channels=0, -# ): -# super(Generator, self).__init__() -# self.num_kernels = len(resblock_kernel_sizes) -# self.num_upsamples = len(upsample_rates) -# self.conv_pre = Conv1d( -# initial_channel, upsample_initial_channel, 7, 1, padding=3 -# ) -# resblock = common.ResBlock1 if resblock == "1" else common.ResBlock2 -# -# self.ups = nn.ModuleList() -# for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): -# self.ups.append( -# weight_norm( -# ConvTranspose1d( -# upsample_initial_channel // (2**i), -# upsample_initial_channel // (2 ** (i + 1)), -# k, -# u, -# padding=(k - u) // 2, -# ) -# ) -# ) -# -# self.resblocks = nn.ModuleList() -# for i in range(len(self.ups)): -# ch = upsample_initial_channel // (2 ** (i + 1)) -# for j, (k, d) in enumerate( -# zip(resblock_kernel_sizes, resblock_dilation_sizes) -# ): -# self.resblocks.append(resblock(ch, k, d)) -# -# self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) -# self.ups.apply(init_weights) -# -# if gin_channels != 0: -# self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) -# -# def forward(self, x, g=None): -# x = self.conv_pre(x) -# if g is not None: -# x = x + self.cond(g) -# -# for i in range(self.num_upsamples): -# x = F.leaky_relu(x, common.LRELU_SLOPE) -# x = self.ups[i](x) -# xs = None -# for j in range(self.num_kernels): -# if xs is None: -# xs = self.resblocks[i * self.num_kernels + j](x) -# else: -# xs += self.resblocks[i * self.num_kernels + j](x) -# x = xs / self.num_kernels -# x = F.leaky_relu(x) -# x = self.conv_post(x) -# x = torch.tanh(x) -# -# return x -# -# def remove_weight_norm(self): -# print("Removing weight norm...") -# for l in self.ups: -# remove_weight_norm(l) -# for l in self.resblocks: -# l.remove_weight_norm() - - class DiscriminatorP(torch.nn.Module): def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): super(DiscriminatorP, self).__init__() diff --git a/uberduck_ml_dev/trainer/radtts/train.py b/uberduck_ml_dev/trainer/radtts/train.py index 2a90e9cd..ea721089 100644 --- a/uberduck_ml_dev/trainer/radtts/train.py +++ b/uberduck_ml_dev/trainer/radtts/train.py @@ -1,7 +1,11 @@ +import ray import torch from torch.cuda.amp import GradScaler +from ray.air import session from ray.air.integrations.wandb import setup_wandb import ray.train as train +from torchvision.models import resnet18 + from .train_epoch import train_epoch from .load import prepare_dataloaders, warmstart @@ -32,23 +36,28 @@ def train_func(config: dict): model = RADTTS( **model_config, ) + # model = resnet18() if train_config["warmstart_checkpoint_path"] != "": warmstart(train_config["warmstart_checkpoint_path"], model) - # NOTE (Sam): find_unused_parameters=True is necessary for num_workers >1 in ScalingConfig. - model = train.torch.prepare_model( - model, parallel_strategy_kwargs=dict(find_unused_parameters=True) - ) - start_epoch = 0 # NOTE (Sam): what is significance of batch_size=6? Think this is overriden within the dataloader. + print("PREPARING DATALOADER") train_loader, valset, collate_fn = prepare_dataloaders( data_config, 2, # 2 gpus by default train_config["batch_size"], ) train_dataloader = train.torch.prepare_data_loader(train_loader) + print("DONE PREPARING DATA LOSDER") + + print("PREPARING MODEL...") + # NOTE (Sam): find_unused_parameters=True is necessary for num_workers >1 in ScalingConfig. + model = train.torch.prepare_model( + model, parallel_strategy_kwargs=dict(find_unused_parameters=True) + ) + print("Done PREPARING MODEL...") optim = RAdam( model.parameters(), @@ -74,7 +83,7 @@ def train_func(config: dict): ) attention_kl_loss = AttentionBinarizationLoss() iteration = 0 - for epoch in range(start_epoch, start_epoch + epochs): + for _ in range(start_epoch, start_epoch + epochs): iteration = train_epoch( train_dataloader, train_config["log_decoder_samples"], From 0f2a00a5d5dc674589bb1b4f2c51db9e60c333fa Mon Sep 17 00:00:00 2001 From: zach wener Date: Fri, 30 Jun 2023 17:47:58 -0700 Subject: [PATCH 3/9] qol improvements --- uberduck_ml_dev/data/collate.py | 6 ++++++ uberduck_ml_dev/data/data.py | 1 + uberduck_ml_dev/trainer/log.py | 16 ++++++++++++---- uberduck_ml_dev/trainer/radtts/load.py | 8 ++++++-- uberduck_ml_dev/trainer/radtts/log.py | 5 ++++- uberduck_ml_dev/trainer/radtts/train.py | 11 ++++------- uberduck_ml_dev/trainer/radtts/train_epoch.py | 2 ++ uberduck_ml_dev/trainer/radtts/train_step.py | 10 ++++++---- 8 files changed, 41 insertions(+), 18 deletions(-) diff --git a/uberduck_ml_dev/data/collate.py b/uberduck_ml_dev/data/collate.py index 0b9582ac..095b3a02 100644 --- a/uberduck_ml_dev/data/collate.py +++ b/uberduck_ml_dev/data/collate.py @@ -218,10 +218,16 @@ def __call__(self, batch): i, : cur_attn_prior.size(0), : cur_attn_prior.size(1) ] = cur_attn_prior + original_text = [ + batch[ids_sorted_decreasing[i]]["text"] + for i in range(len(ids_sorted_decreasing)) + ] + return { "mel": mel_padded, "speaker_ids": speaker_ids, "text": text_padded, + "original_text": original_text, "input_lengths": input_lengths, "output_lengths": output_lengths, "audiopaths": audiopaths, diff --git a/uberduck_ml_dev/data/data.py b/uberduck_ml_dev/data/data.py index 66bfff1e..6c49ee15 100644 --- a/uberduck_ml_dev/data/data.py +++ b/uberduck_ml_dev/data/data.py @@ -693,6 +693,7 @@ def __getitem__(self, index): return { "mel": mel, "speaker_id": speaker_id, + "text": text, "text_encoded": text_encoded, "audiopath": audiopath, "attn_prior": attn_prior, diff --git a/uberduck_ml_dev/trainer/log.py b/uberduck_ml_dev/trainer/log.py index 8b947d9e..d5756ea6 100644 --- a/uberduck_ml_dev/trainer/log.py +++ b/uberduck_ml_dev/trainer/log.py @@ -4,11 +4,19 @@ @torch.no_grad() -def log(metrics, audios={}): +def log(metrics, audios=None, images=None): + if session.get_world_rank() != 0: + return + audios = audios or {} + images = images or {} wandb_metrics = dict(metrics) for k, v in audios.items(): - wandb_metrics[k] = wandb.Audio(v, sample_rate=22050) + wandb_metrics[k] = wandb.Audio( + v["audio"], sample_rate=22050, caption=v.get("caption") + ) - if session.get_world_rank() == 0: - wandb.log(wandb_metrics) + for k, v in images.items(): + wandb_metrics[k] = wandb.Image(v) + + wandb.log(wandb_metrics) diff --git a/uberduck_ml_dev/trainer/radtts/load.py b/uberduck_ml_dev/trainer/radtts/load.py index c9795105..19c559f4 100644 --- a/uberduck_ml_dev/trainer/radtts/load.py +++ b/uberduck_ml_dev/trainer/radtts/load.py @@ -14,6 +14,10 @@ def warmstart( checkpoint_path, model, include_layers=[], ignore_layers_warmstart=[], strict=False ): pretrained_dict = torch.load(checkpoint_path, map_location="cpu") + iteration = 0 + if "iteration" in pretrained_dict: + iteration = pretrained_dict["iteration"] + pretrained_dict = pretrained_dict["state_dict"] is_module = False @@ -29,9 +33,9 @@ def warmstart( model_dict = model.state_dict() model_dict.update(pretrained_dict) model.load_state_dict(model_dict, strict=strict) - print("Warm started from {}".format(checkpoint_path)) + print(f"Warm started from {checkpoint_path}, iteration {iteration}") model.train() - return model + return (model, iteration) def prepare_dataloaders(data_config, n_gpus, batch_size): diff --git a/uberduck_ml_dev/trainer/radtts/log.py b/uberduck_ml_dev/trainer/radtts/log.py index 56368fb7..f5282d80 100644 --- a/uberduck_ml_dev/trainer/radtts/log.py +++ b/uberduck_ml_dev/trainer/radtts/log.py @@ -171,6 +171,9 @@ def get_log_audio( sample_tag = f"sample_attribute_sigma_{attribute_sigma}" if oos_name is not None: sample_tag = f"{sample_tag}_oos_{oos_name}" - audios[sample_tag] = audio + print("ADDING AUDIO WITH CAPTION: ", batch_dict.get("original_text")[0]) + audios[sample_tag] = dict( + audio=audio, caption=batch_dict.get("original_text")[0] + ) return images, audios diff --git a/uberduck_ml_dev/trainer/radtts/train.py b/uberduck_ml_dev/trainer/radtts/train.py index ea721089..e4ff01ef 100644 --- a/uberduck_ml_dev/trainer/radtts/train.py +++ b/uberduck_ml_dev/trainer/radtts/train.py @@ -4,8 +4,6 @@ from ray.air import session from ray.air.integrations.wandb import setup_wandb import ray.train as train -from torchvision.models import resnet18 - from .train_epoch import train_epoch from .load import prepare_dataloaders, warmstart @@ -36,10 +34,9 @@ def train_func(config: dict): model = RADTTS( **model_config, ) - # model = resnet18() - + iteration = 0 if train_config["warmstart_checkpoint_path"] != "": - warmstart(train_config["warmstart_checkpoint_path"], model) + _, iteration = warmstart(train_config["warmstart_checkpoint_path"], model) start_epoch = 0 # NOTE (Sam): what is significance of batch_size=6? Think this is overriden within the dataloader. @@ -82,8 +79,7 @@ def train_func(config: dict): loss_weights=train_config["loss_weights"], ) attention_kl_loss = AttentionBinarizationLoss() - iteration = 0 - for _ in range(start_epoch, start_epoch + epochs): + for epoch in range(start_epoch, start_epoch + epochs): iteration = train_epoch( train_dataloader, train_config["log_decoder_samples"], @@ -100,6 +96,7 @@ def train_func(config: dict): binarization_start_iter, iteration, vocoder, + epoch=epoch, ) diff --git a/uberduck_ml_dev/trainer/radtts/train_epoch.py b/uberduck_ml_dev/trainer/radtts/train_epoch.py index b30a6bd4..f6cfee61 100644 --- a/uberduck_ml_dev/trainer/radtts/train_epoch.py +++ b/uberduck_ml_dev/trainer/radtts/train_epoch.py @@ -18,6 +18,7 @@ def train_epoch( binarization_start_iter, iteration, vocoder, + epoch=None, ): # def train_epoch(dataset_shard, batch_size, model, optim, steps_per_sample, scaler, scheduler, criterion, attention_kl_loss, kl_loss_start_iter, binarization_start_iter, epoch, iteration): # for batch_idx, ray_batch_df in enumerate( @@ -43,6 +44,7 @@ def train_epoch( kl_loss_start_iter, binarization_start_iter, vocoder, + epoch=epoch, ) iteration += 1 diff --git a/uberduck_ml_dev/trainer/radtts/train_step.py b/uberduck_ml_dev/trainer/radtts/train_step.py index 56128071..c27ae496 100644 --- a/uberduck_ml_dev/trainer/radtts/train_step.py +++ b/uberduck_ml_dev/trainer/radtts/train_step.py @@ -30,6 +30,7 @@ def _train_step( kl_loss_start_iter, binarization_start_iter, vocoder, + epoch=None, ): print(datetime.now(), "entering train step:", iteration) if iteration >= binarization_start_iter: @@ -76,7 +77,9 @@ def _train_step( for k, (v, w) in loss_outputs.items(): if w > 0: loss = v * w if loss is None else loss + v * w - print_list.append(" | {}: {:.3f}".format(k, v)) + print_list.append("{}: {:.3f}".format(k, v)) + print_list.append(f"epoch: {epoch}") + print_list.append(f"iteration: {iteration}") w_bin = criterion.loss_weights.get("binarization_loss_weight", 1.0) if binarize and iteration >= kl_loss_start_iter: @@ -86,7 +89,7 @@ def _train_step( binarization_loss = torch.zeros_like(loss) loss_outputs["binarization_loss"] = (binarization_loss, w_bin) grad_clip_val = 1.0 # TODO (Sam): make this a config option - print(print_list) + print(" | ".join(print_list)) scaler.scale(loss).backward() if grad_clip_val > 0: scaler.unscale_(optim) @@ -99,7 +102,6 @@ def _train_step( for k, (v, w) in loss_outputs.items(): metrics[k] = v.item() - print("iteration: ", iteration, datetime.now()) log_sample = iteration % steps_per_sample == 0 log_checkpoint = iteration % iters_per_checkpoint == 0 @@ -142,7 +144,7 @@ def _train_step( # audio_embedding_oos=audio_embedding_oos, # ) # audios.update(audios_oos) - log(metrics, audios) + log(metrics, audios, images) model.train() else: log(metrics) From f1dfcedc6680d4539000e67366e9a2dcd494169f Mon Sep 17 00:00:00 2001 From: zach wener Date: Mon, 10 Jul 2023 09:11:33 -0700 Subject: [PATCH 4/9] logging --- uberduck_ml_dev/trainer/radtts/log.py | 7 ++++++- uberduck_ml_dev/trainer/radtts/train.py | 2 ++ uberduck_ml_dev/trainer/radtts/train_epoch.py | 2 ++ uberduck_ml_dev/trainer/radtts/train_step.py | 8 +++++--- uberduck_ml_dev/vocoders/hifigan.py | 3 +++ 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/uberduck_ml_dev/trainer/radtts/log.py b/uberduck_ml_dev/trainer/radtts/log.py index f5282d80..27ecbc6a 100644 --- a/uberduck_ml_dev/trainer/radtts/log.py +++ b/uberduck_ml_dev/trainer/radtts/log.py @@ -7,7 +7,11 @@ from ...utils.utils import ( to_gpu, ) -from ...utils.plot import plot_alignment_to_numpy +from ...utils.plot import ( + plot_alignment_to_numpy, + plot_attention_phonemes, + plot_spectrogram, +) # want to test out of sample but can only do proper inference with zero shot dap so lets just look at zero shot decoder samples @@ -65,6 +69,7 @@ def get_log_audio( images = {} audios = {} + images["mel_gt"] = plot_spectrogram(mel[0].data.cpu().numpy()) if attn_used is not None: images["attention_weights"] = plot_alignment_to_numpy( attn_soft[0, 0].data.cpu().numpy().T, title="audioname" diff --git a/uberduck_ml_dev/trainer/radtts/train.py b/uberduck_ml_dev/trainer/radtts/train.py index e4ff01ef..88072682 100644 --- a/uberduck_ml_dev/trainer/radtts/train.py +++ b/uberduck_ml_dev/trainer/radtts/train.py @@ -30,6 +30,7 @@ def train_func(config: dict): sigma = train_config["sigma"] kl_loss_start_iter = train_config["kl_loss_start_iter"] binarization_start_iter = train_config["binarization_start_iter"] + grad_clip_val = train_config["grad_clip_val"] model = RADTTS( **model_config, @@ -97,6 +98,7 @@ def train_func(config: dict): iteration, vocoder, epoch=epoch, + grad_clip_val=grad_clip_val, ) diff --git a/uberduck_ml_dev/trainer/radtts/train_epoch.py b/uberduck_ml_dev/trainer/radtts/train_epoch.py index f6cfee61..57e8a6b2 100644 --- a/uberduck_ml_dev/trainer/radtts/train_epoch.py +++ b/uberduck_ml_dev/trainer/radtts/train_epoch.py @@ -19,6 +19,7 @@ def train_epoch( iteration, vocoder, epoch=None, + grad_clip_val=None, ): # def train_epoch(dataset_shard, batch_size, model, optim, steps_per_sample, scaler, scheduler, criterion, attention_kl_loss, kl_loss_start_iter, binarization_start_iter, epoch, iteration): # for batch_idx, ray_batch_df in enumerate( @@ -45,6 +46,7 @@ def train_epoch( binarization_start_iter, vocoder, epoch=epoch, + grad_clip_val=grad_clip_val, ) iteration += 1 diff --git a/uberduck_ml_dev/trainer/radtts/train_step.py b/uberduck_ml_dev/trainer/radtts/train_step.py index c27ae496..044350ad 100644 --- a/uberduck_ml_dev/trainer/radtts/train_step.py +++ b/uberduck_ml_dev/trainer/radtts/train_step.py @@ -31,6 +31,7 @@ def _train_step( binarization_start_iter, vocoder, epoch=None, + grad_clip_val=None, ): print(datetime.now(), "entering train step:", iteration) if iteration >= binarization_start_iter: @@ -88,17 +89,18 @@ def _train_step( else: binarization_loss = torch.zeros_like(loss) loss_outputs["binarization_loss"] = (binarization_loss, w_bin) - grad_clip_val = 1.0 # TODO (Sam): make this a config option print(" | ".join(print_list)) + metrics = {"loss": loss.item()} scaler.scale(loss).backward() if grad_clip_val > 0: scaler.unscale_(optim) - torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val) + norm_ = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val) + print("total norm: ", norm_.item()) + metrics["grad_norm"] = norm_.item() scaler.step(optim) scaler.update() - metrics = {"loss": loss.item()} for k, (v, w) in loss_outputs.items(): metrics[k] = v.item() diff --git a/uberduck_ml_dev/vocoders/hifigan.py b/uberduck_ml_dev/vocoders/hifigan.py index 65d6caba..95950487 100644 --- a/uberduck_ml_dev/vocoders/hifigan.py +++ b/uberduck_ml_dev/vocoders/hifigan.py @@ -76,6 +76,9 @@ def load_vocoder(vocoder_state_dict, vocoder_config, to_cuda=True): def get_vocoder(hifi_gan_config_path, hifi_gan_checkpoint_path): print("Getting vocoder") + import os + + print("CWD: ", os.getcwd()) with open(hifi_gan_config_path) as f: hifigan_config = json.load(f) From 11afbb1e8999028168eaa39d82b4e59d1f0db535 Mon Sep 17 00:00:00 2001 From: zach wener Date: Wed, 12 Jul 2023 18:31:21 -0700 Subject: [PATCH 5/9] some useful things --- uberduck_ml_dev/data/data.py | 16 ++++++++++------ uberduck_ml_dev/data/get.py | 18 ++++++++++++------ uberduck_ml_dev/models/radtts.py | 5 ++++- uberduck_ml_dev/text/symbols.py | 2 ++ 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/uberduck_ml_dev/data/data.py b/uberduck_ml_dev/data/data.py index 6c49ee15..5cbe4db8 100644 --- a/uberduck_ml_dev/data/data.py +++ b/uberduck_ml_dev/data/data.py @@ -775,25 +775,29 @@ def __init__( resnet_se_model_path, resnet_se_config_path, audiopaths, - subpath_truncation=41, + target_paths, ): self.model = get_pretrained_model( model_path=resnet_se_model_path, config_path=resnet_se_config_path ) self.audiopaths = audiopaths - self.subpath_truncation = subpath_truncation + self.target_paths = target_paths + # self.subpath_truncation = subpath_truncation - def _get_data(self, audiopath): + def _get_data(self, audiopath, target_path): rate, data = read(audiopath) data = torch.FloatTensor(data.astype("float32") / MAX_WAV_VALUE).unsqueeze(0) - sub_path = audiopath[: self.subpath_truncation] + # sub_path = audiopath[: self.subpath_truncation] embedding = self.model(data).squeeze() - emb_path_local = f"{sub_path}/coqui_resnet_512_emb.pt" + # emb_path_local = f"{sub_path}/coqui_resnet_512_emb.pt" + emb_path_local = target_path torch.save(embedding.detach(), emb_path_local) def __getitem__(self, idx): try: - self._get_data(audiopath=self.audiopaths[idx]) + self._get_data( + audiopath=self.audiopaths[idx], target_path=self.target_paths[idx] + ) except Exception as e: print(f"Error while getting data: index = {idx}") diff --git a/uberduck_ml_dev/data/get.py b/uberduck_ml_dev/data/get.py index f31aff06..4d23a3bf 100644 --- a/uberduck_ml_dev/data/get.py +++ b/uberduck_ml_dev/data/get.py @@ -9,8 +9,10 @@ from uberduck_ml_dev.data.collate import CollateBlank -def get_parallel_torch(data): - data_loader = DataLoader(data, batch_size=32, collate_fn=CollateBlank()) +def get_parallel_torch(data, num_workers=0): + data_loader = DataLoader( + data, batch_size=32, collate_fn=CollateBlank(), num_workers=num_workers + ) for batch in data_loader: pass @@ -19,7 +21,7 @@ def get_parallel_torch(data): # NOTE (Sam): assumes data is in a directory structure like: # /tmp/{uuid}/resampled_normalized.wav # These functions add spectrogram.pt, f0.pt, and coqui_resnet_512_emb.pt to each file-specific directory. -def get_mels(paths, data_config, target_paths): +def get_mels(paths, data_config, target_paths, num_workers=0): data = DataMel(audiopaths=paths, data_config=data_config, target_paths=target_paths) collate_fn = CollateBlank() @@ -28,6 +30,7 @@ def get_mels(paths, data_config, target_paths): data, batch_size=32, collate_fn=collate_fn, + num_workers=num_workers, ) for batch in data_loader: pass # computes in loader. @@ -36,15 +39,16 @@ def get_mels(paths, data_config, target_paths): def get_embeddings( paths, data_config, + target_paths, resnet_se_model_path, resnet_se_config_path, - subpath_truncation=41, + num_workers=0, ): data = DataEmbedding( audiopaths=paths, resnet_se_model_path=resnet_se_model_path, resnet_se_config_path=resnet_se_config_path, - subpath_truncation=subpath_truncation, + target_paths=target_paths, ) collate_fn = CollateBlank() @@ -53,6 +57,7 @@ def get_embeddings( data, batch_size=32, collate_fn=collate_fn, + num_workers=num_workers, ) for batch in data_loader: pass # computes in loader. @@ -65,6 +70,7 @@ def get_pitches( target_folders=None, method="parselmouth", sample_rate=None, + num_workers=0, ): data = DataPitch( audiopaths=paths, @@ -73,7 +79,7 @@ def get_pitches( method=method, sample_rate=sample_rate, ) - get_parallel_torch(data) + get_parallel_torch(data, num_workers=num_workers) HUBERT_PATH = "hubert_embedding.pt" diff --git a/uberduck_ml_dev/models/radtts.py b/uberduck_ml_dev/models/radtts.py index fdf8717c..ab5d9866 100644 --- a/uberduck_ml_dev/models/radtts.py +++ b/uberduck_ml_dev/models/radtts.py @@ -135,7 +135,8 @@ def __init__( ) self.n_speaker_dim = n_speaker_dim assert self.n_speaker_dim % 2 == 0 - self.speaker_embedding = torch.nn.Embedding(n_speakers, self.n_speaker_dim) + if n_speakers > 0: + self.speaker_embedding = torch.nn.Embedding(n_speakers, self.n_speaker_dim) self.embedding = torch.nn.Embedding(n_text, n_text_dim) self.flows = torch.nn.ModuleList() self.encoder = Encoder( @@ -352,6 +353,7 @@ def encode_text(self, text, in_lens): text_embeddings = self.embedding(text).transpose(1, 2) # text_enc: b x n_text_dim x encoder_dim (512) + print(text_embeddings.device) if in_lens is None: text_enc = self.encoder.infer(text_embeddings).transpose(1, 2) else: @@ -586,6 +588,7 @@ def forward( z_out.append(mel) z_mel = torch.cat(z_out, 1) + print("SHAPE OF Z MEL: ", z_mel.shape, "SHAPE OF MEL: ", mel.shape) # duration predictor forward pass duration_model_outputs = None diff --git a/uberduck_ml_dev/text/symbols.py b/uberduck_ml_dev/text/symbols.py index cbbdb73b..bd8719dc 100644 --- a/uberduck_ml_dev/text/symbols.py +++ b/uberduck_ml_dev/text/symbols.py @@ -1,3 +1,5 @@ +import re + __all__ = [ "symbols_portuguese", "PORTUGUESE_SYMBOLS", From 5a1037633bcf07b8b95a8f7dd7a3eb39e805b419 Mon Sep 17 00:00:00 2001 From: zach wener Date: Tue, 15 Aug 2023 09:09:20 -0700 Subject: [PATCH 6/9] configs --- configs/config-ljspeech.json | 149 +++++++++++++++++++++++++ configs/config-zeroshot-warmstart.json | 149 +++++++++++++++++++++++++ configs/config-zeroshot.json | 149 +++++++++++++++++++++++++ configs/config.json | 149 +++++++++++++++++++++++++ 4 files changed, 596 insertions(+) create mode 100644 configs/config-ljspeech.json create mode 100644 configs/config-zeroshot-warmstart.json create mode 100644 configs/config-zeroshot.json create mode 100644 configs/config.json diff --git a/configs/config-ljspeech.json b/configs/config-ljspeech.json new file mode 100644 index 00000000..4cb34a8f --- /dev/null +++ b/configs/config-ljspeech.json @@ -0,0 +1,149 @@ +{ + "train_config": { + "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/", + "epochs": 10000000, + "optim_algo": "RAdam", + "learning_rate": 0.0001, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 2500, + "batch_size": 16, + "seed": null, + "checkpoint_path": "", + "ignore_layers": [], + "ignore_layers_warmstart": [], + "steps_per_sample": 500, + "finetune_layers": [], + "include_layers": [], + "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", + "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", + "log_attribute_samples": false, + "log_decoder_samples": true, + "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/model_2500.pt", + "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", + "use_amp": true, + "grad_clip_val": 1.0, + "loss_weights": { + "blank_logprob": -1, + "ctc_loss_weight": 0.1, + "binarization_loss_weight": 1.0, + "dur_loss_weight": 1.0, + "f0_loss_weight": 1.0, + "energy_loss_weight": 1.0, + "vpred_loss_weight": 1.0 + }, + "binarization_start_iter": 6000, + "kl_loss_start_iter": 8000, + "unfreeze_modules": "all" + }, + "data_config": { + "training_files": { + "lj": { + "basedir": "/home/zach/code/uberduck-ml-dev/data", + "audiodir": "", + "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt", + "lmdbpath": "" + } + }, + "validation_files": {}, + "dur_min": 0.1, + "dur_max": 10.2, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "f0_min": 80.0, + "f0_max": 640.0, + "max_wav_value": 32768.0, + "use_f0": true, + "use_log_f0": 0, + "use_energy_avg": true, + "use_scaled_energy": true, + "symbol_set": "radtts", + "cleaner_names": ["radtts_cleaners"], + "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", + "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", + "p_phoneme": 1.0, + "handle_phoneme": "word", + "handle_phoneme_ambiguous": "ignore", + "include_speakers": null, + "n_frames": -1, + "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", + "lmdb_cache_path": "", + "use_attn_prior_masking": true, + "prepend_space_to_text": true, + "append_space_to_text": true, + "add_bos_eos_to_text": false, + "betabinom_scaling_factor": 1.0, + "distance_tx_unvoiced": false, + "is_zero_shot": false, + "mel_noise_scale": 0.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + "model_config": { + "n_speakers": 1, + "n_speaker_dim": 16, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 8, + "n_conv_layers_per_step": 4, + "n_mel_channels": 80, + "n_hidden": 1024, + "mel_encoder_n_hidden": 512, + "dummy_speaker_embedding": false, + "n_early_size": 2, + "n_early_every": 2, + "n_group_size": 2, + "affine_model": "wavenet", + "include_modules": "decatn", + "scaling_fn": "tanh", + "matrix_decomposition": "LUS", + "learn_alignments": true, + "use_speaker_emb_for_alignment": false, + "attn_straight_through_estimator": true, + "use_context_lstm": true, + "context_lstm_norm": "spectral", + "context_lstm_w_f0_and_energy": true, + "text_encoder_lstm_norm": "spectral", + "n_f0_dims": 1, + "n_energy_avg_dims": 1, + "use_first_order_features": false, + "unvoiced_bias_activation": "relu", + "decoder_use_partial_padding": true, + "decoder_use_unvoiced_bias": true, + "ap_pred_log_f0": true, + "ap_use_unvoiced_bias": true, + "ap_use_voiced_embeddings": true, + "dur_model_config": null, + "f0_model_config": null, + "energy_model_config": null, + "v_model_config": { + "name": "dap", + "hparams": { + "n_speaker_dim": 16, + "take_log_of_input": false, + "bottleneck_hparams": { + "in_dim": 512, + "reduction_factor": 16, + "norm": "weightnorm", + "non_linearity": "relu" + }, + "arch_hparams": { + "out_dim": 1, + "n_layers": 2, + "n_channels": 256, + "kernel_size": 3, + "p_dropout": 0.5, + "lstm_type": "", + "use_linear": 1 + } + } + } + } +} diff --git a/configs/config-zeroshot-warmstart.json b/configs/config-zeroshot-warmstart.json new file mode 100644 index 00000000..26f1a5c7 --- /dev/null +++ b/configs/config-zeroshot-warmstart.json @@ -0,0 +1,149 @@ +{ + "train_config": { + "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-13-zeroshot-warmstart/", + "epochs": 10000000, + "optim_algo": "RAdam", + "learning_rate": 0.0001, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 2500, + "batch_size": 16, + "seed": null, + "checkpoint_path": "", + "ignore_layers": [], + "ignore_layers_warmstart": [], + "steps_per_sample": 500, + "finetune_layers": [], + "include_layers": [], + "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", + "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", + "log_attribute_samples": false, + "log_decoder_samples": true, + "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/model_40000.pt", + "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", + "use_amp": true, + "grad_clip_val": 1.0, + "loss_weights": { + "blank_logprob": -1, + "ctc_loss_weight": 0.1, + "binarization_loss_weight": 1.0, + "dur_loss_weight": 1.0, + "f0_loss_weight": 1.0, + "energy_loss_weight": 1.0, + "vpred_loss_weight": 1.0 + }, + "binarization_start_iter": 6000, + "kl_loss_start_iter": 8000, + "unfreeze_modules": "all" + }, + "data_config": { + "training_files": { + "vctk": { + "basedir": "/home/zach/code/uberduck-ml-dev/data", + "audiodir": "", + "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt", + "lmdbpath": "" + } + }, + "validation_files": {}, + "dur_min": 0.1, + "dur_max": 10.2, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "f0_min": 80.0, + "f0_max": 640.0, + "max_wav_value": 32768.0, + "use_f0": true, + "use_log_f0": 0, + "use_energy_avg": true, + "use_scaled_energy": true, + "symbol_set": "radtts", + "cleaner_names": ["radtts_cleaners"], + "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", + "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", + "p_phoneme": 1.0, + "handle_phoneme": "word", + "handle_phoneme_ambiguous": "ignore", + "include_speakers": null, + "n_frames": -1, + "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", + "lmdb_cache_path": "", + "use_attn_prior_masking": true, + "prepend_space_to_text": true, + "append_space_to_text": true, + "add_bos_eos_to_text": false, + "betabinom_scaling_factor": 1.0, + "distance_tx_unvoiced": false, + "is_zero_shot": true, + "mel_noise_scale": 0.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + "model_config": { + "n_speakers": 0, + "n_speaker_dim": 512, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 8, + "n_conv_layers_per_step": 4, + "n_mel_channels": 80, + "n_hidden": 1024, + "mel_encoder_n_hidden": 512, + "dummy_speaker_embedding": false, + "n_early_size": 2, + "n_early_every": 2, + "n_group_size": 2, + "affine_model": "wavenet", + "include_modules": "decatn", + "scaling_fn": "tanh", + "matrix_decomposition": "LUS", + "learn_alignments": true, + "use_speaker_emb_for_alignment": false, + "attn_straight_through_estimator": true, + "use_context_lstm": true, + "context_lstm_norm": "spectral", + "context_lstm_w_f0_and_energy": true, + "text_encoder_lstm_norm": "spectral", + "n_f0_dims": 1, + "n_energy_avg_dims": 1, + "use_first_order_features": false, + "unvoiced_bias_activation": "relu", + "decoder_use_partial_padding": true, + "decoder_use_unvoiced_bias": true, + "ap_pred_log_f0": true, + "ap_use_unvoiced_bias": true, + "ap_use_voiced_embeddings": true, + "dur_model_config": null, + "f0_model_config": null, + "energy_model_config": null, + "v_model_config": { + "name": "dap", + "hparams": { + "n_speaker_dim": 16, + "take_log_of_input": false, + "bottleneck_hparams": { + "in_dim": 512, + "reduction_factor": 16, + "norm": "weightnorm", + "non_linearity": "relu" + }, + "arch_hparams": { + "out_dim": 1, + "n_layers": 2, + "n_channels": 256, + "kernel_size": 3, + "p_dropout": 0.5, + "lstm_type": "", + "use_linear": 1 + } + } + } + } +} diff --git a/configs/config-zeroshot.json b/configs/config-zeroshot.json new file mode 100644 index 00000000..d0e29b0e --- /dev/null +++ b/configs/config-zeroshot.json @@ -0,0 +1,149 @@ +{ + "train_config": { + "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/", + "epochs": 10000000, + "optim_algo": "RAdam", + "learning_rate": 0.0001, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 2500, + "batch_size": 16, + "seed": null, + "checkpoint_path": "", + "ignore_layers": [], + "ignore_layers_warmstart": [], + "steps_per_sample": 500, + "finetune_layers": [], + "include_layers": [], + "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", + "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", + "log_attribute_samples": false, + "log_decoder_samples": true, + "warmstart_checkpoint_path": "", + "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", + "use_amp": true, + "grad_clip_val": 1.0, + "loss_weights": { + "blank_logprob": -1, + "ctc_loss_weight": 0.1, + "binarization_loss_weight": 1.0, + "dur_loss_weight": 1.0, + "f0_loss_weight": 1.0, + "energy_loss_weight": 1.0, + "vpred_loss_weight": 1.0 + }, + "binarization_start_iter": 6000, + "kl_loss_start_iter": 8000, + "unfreeze_modules": "all" + }, + "data_config": { + "training_files": { + "lj": { + "basedir": "/home/zach/code/uberduck-ml-dev/data", + "audiodir": "", + "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt", + "lmdbpath": "" + } + }, + "validation_files": {}, + "dur_min": 0.1, + "dur_max": 10.2, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "f0_min": 80.0, + "f0_max": 640.0, + "max_wav_value": 32768.0, + "use_f0": true, + "use_log_f0": 0, + "use_energy_avg": true, + "use_scaled_energy": true, + "symbol_set": "radtts", + "cleaner_names": ["radtts_cleaners"], + "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", + "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", + "p_phoneme": 1.0, + "handle_phoneme": "word", + "handle_phoneme_ambiguous": "ignore", + "include_speakers": null, + "n_frames": -1, + "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", + "lmdb_cache_path": "", + "use_attn_prior_masking": true, + "prepend_space_to_text": true, + "append_space_to_text": true, + "add_bos_eos_to_text": false, + "betabinom_scaling_factor": 1.0, + "distance_tx_unvoiced": false, + "is_zero_shot": true, + "mel_noise_scale": 0.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + "model_config": { + "n_speakers": 0, + "n_speaker_dim": 512, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 8, + "n_conv_layers_per_step": 4, + "n_mel_channels": 80, + "n_hidden": 1024, + "mel_encoder_n_hidden": 512, + "dummy_speaker_embedding": false, + "n_early_size": 2, + "n_early_every": 2, + "n_group_size": 2, + "affine_model": "wavenet", + "include_modules": "decatn", + "scaling_fn": "tanh", + "matrix_decomposition": "LUS", + "learn_alignments": true, + "use_speaker_emb_for_alignment": false, + "attn_straight_through_estimator": true, + "use_context_lstm": true, + "context_lstm_norm": "spectral", + "context_lstm_w_f0_and_energy": true, + "text_encoder_lstm_norm": "spectral", + "n_f0_dims": 1, + "n_energy_avg_dims": 1, + "use_first_order_features": false, + "unvoiced_bias_activation": "relu", + "decoder_use_partial_padding": true, + "decoder_use_unvoiced_bias": true, + "ap_pred_log_f0": true, + "ap_use_unvoiced_bias": true, + "ap_use_voiced_embeddings": true, + "dur_model_config": null, + "f0_model_config": null, + "energy_model_config": null, + "v_model_config": { + "name": "dap", + "hparams": { + "n_speaker_dim": 16, + "take_log_of_input": false, + "bottleneck_hparams": { + "in_dim": 512, + "reduction_factor": 16, + "norm": "weightnorm", + "non_linearity": "relu" + }, + "arch_hparams": { + "out_dim": 1, + "n_layers": 2, + "n_channels": 256, + "kernel_size": 3, + "p_dropout": 0.5, + "lstm_type": "", + "use_linear": 1 + } + } + } + } +} diff --git a/configs/config.json b/configs/config.json new file mode 100644 index 00000000..2bcaa0c5 --- /dev/null +++ b/configs/config.json @@ -0,0 +1,149 @@ +{ + "train_config": { + "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/", + "epochs": 10000000, + "optim_algo": "RAdam", + "learning_rate": 0.0001, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 2500, + "batch_size": 16, + "seed": null, + "checkpoint_path": "", + "ignore_layers": [], + "ignore_layers_warmstart": [], + "steps_per_sample": 500, + "finetune_layers": [], + "include_layers": [], + "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", + "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", + "log_attribute_samples": false, + "log_decoder_samples": true, + "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/model_477500.pt", + "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", + "use_amp": true, + "grad_clip_val": 1.0, + "loss_weights": { + "blank_logprob": -1, + "ctc_loss_weight": 0.1, + "binarization_loss_weight": 1.0, + "dur_loss_weight": 1.0, + "f0_loss_weight": 1.0, + "energy_loss_weight": 1.0, + "vpred_loss_weight": 1.0 + }, + "binarization_start_iter": 24000, + "kl_loss_start_iter": 36000, + "unfreeze_modules": "all" + }, + "data_config": { + "training_files": { + "vctk": { + "basedir": "/home/zach/code/uberduck-ml-dev/data", + "audiodir": "", + "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt", + "lmdbpath": "" + } + }, + "validation_files": {}, + "dur_min": 0.1, + "dur_max": 10.2, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "f0_min": 80.0, + "f0_max": 640.0, + "max_wav_value": 32768.0, + "use_f0": true, + "use_log_f0": 0, + "use_energy_avg": true, + "use_scaled_energy": true, + "symbol_set": "radtts", + "cleaner_names": ["radtts_cleaners"], + "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", + "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", + "p_phoneme": 1.0, + "handle_phoneme": "word", + "handle_phoneme_ambiguous": "ignore", + "include_speakers": null, + "n_frames": -1, + "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", + "lmdb_cache_path": "", + "use_attn_prior_masking": true, + "prepend_space_to_text": true, + "append_space_to_text": true, + "add_bos_eos_to_text": false, + "betabinom_scaling_factor": 1.0, + "distance_tx_unvoiced": false, + "is_zero_shot": false, + "mel_noise_scale": 0.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + "model_config": { + "n_speakers": 109, + "n_speaker_dim": 16, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 8, + "n_conv_layers_per_step": 4, + "n_mel_channels": 80, + "n_hidden": 1024, + "mel_encoder_n_hidden": 512, + "dummy_speaker_embedding": false, + "n_early_size": 2, + "n_early_every": 2, + "n_group_size": 2, + "affine_model": "wavenet", + "include_modules": "decatn", + "scaling_fn": "tanh", + "matrix_decomposition": "LUS", + "learn_alignments": true, + "use_speaker_emb_for_alignment": false, + "attn_straight_through_estimator": true, + "use_context_lstm": true, + "context_lstm_norm": "spectral", + "context_lstm_w_f0_and_energy": true, + "text_encoder_lstm_norm": "spectral", + "n_f0_dims": 1, + "n_energy_avg_dims": 1, + "use_first_order_features": false, + "unvoiced_bias_activation": "relu", + "decoder_use_partial_padding": true, + "decoder_use_unvoiced_bias": true, + "ap_pred_log_f0": true, + "ap_use_unvoiced_bias": true, + "ap_use_voiced_embeddings": true, + "dur_model_config": null, + "f0_model_config": null, + "energy_model_config": null, + "v_model_config": { + "name": "dap", + "hparams": { + "n_speaker_dim": 16, + "take_log_of_input": false, + "bottleneck_hparams": { + "in_dim": 512, + "reduction_factor": 16, + "norm": "weightnorm", + "non_linearity": "relu" + }, + "arch_hparams": { + "out_dim": 1, + "n_layers": 2, + "n_channels": 256, + "kernel_size": 3, + "p_dropout": 0.5, + "lstm_type": "", + "use_linear": 1 + } + } + } + } +} From 6c212e90a32365c6fb2fb7cf77c31750649d8ee3 Mon Sep 17 00:00:00 2001 From: zach wener Date: Tue, 15 Aug 2023 18:46:06 -0700 Subject: [PATCH 7/9] save --- uberduck_ml_dev/text/symbols.py | 100 ++++++++++++++++---------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/uberduck_ml_dev/text/symbols.py b/uberduck_ml_dev/text/symbols.py index bd8719dc..eb507350 100644 --- a/uberduck_ml_dev/text/symbols.py +++ b/uberduck_ml_dev/text/symbols.py @@ -199,56 +199,6 @@ NVIDIA_TACO2_SYMBOLS = "nvidia_taco2" GRAD_TTS_SYMBOLS = "gradtts" -SYMBOL_SETS = { - DEFAULT_SYMBOLS: symbols, - IPA_SYMBOLS: symbols_with_ipa, - NVIDIA_TACO2_SYMBOLS: symbols_nvidia_taco2, - GRAD_TTS_SYMBOLS: grad_tts_symbols, - PORTUGUESE_SYMBOLS: symbols_portuguese, - POLISH_SYMBOLS: symbols_polish, - DUTCH_SYMBOLS: symbols_dutch, - SPANISH_SYMBOLS: symbols_spanish, - NORWEGIAN_SYMBOLS: symbols_norwegian, - TURKISH_SYMBOLS: symbols_turkish, - RUSSIAN_SYMBOLS: symbols_russian, - UKRAINIAN_SYMBOLS: symbols_ukrainian, -} - - -import re - -symbol_to_id = { - DEFAULT_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[DEFAULT_SYMBOLS])}, - IPA_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[IPA_SYMBOLS])}, - NVIDIA_TACO2_SYMBOLS: { - s: i for i, s in enumerate(SYMBOL_SETS[NVIDIA_TACO2_SYMBOLS]) - }, - GRAD_TTS_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[GRAD_TTS_SYMBOLS])}, - PORTUGUESE_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[PORTUGUESE_SYMBOLS])}, - POLISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[POLISH_SYMBOLS])}, - DUTCH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[DUTCH_SYMBOLS])}, - SPANISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[SPANISH_SYMBOLS])}, - NORWEGIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[NORWEGIAN_SYMBOLS])}, - TURKISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[TURKISH_SYMBOLS])}, - RUSSIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[RUSSIAN_SYMBOLS])}, - UKRAINIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[UKRAINIAN_SYMBOLS])}, -} -id_to_symbol = { - DEFAULT_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[DEFAULT_SYMBOLS])}, - IPA_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[IPA_SYMBOLS])}, - NVIDIA_TACO2_SYMBOLS: { - i: s for i, s in enumerate(SYMBOL_SETS[NVIDIA_TACO2_SYMBOLS]) - }, - GRAD_TTS_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[GRAD_TTS_SYMBOLS])}, - PORTUGUESE_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[PORTUGUESE_SYMBOLS])}, - POLISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[POLISH_SYMBOLS])}, - DUTCH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[DUTCH_SYMBOLS])}, - SPANISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[SPANISH_SYMBOLS])}, - NORWEGIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[NORWEGIAN_SYMBOLS])}, - TURKISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[TURKISH_SYMBOLS])}, - RUSSIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[RUSSIAN_SYMBOLS])}, - UKRAINIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[UKRAINIAN_SYMBOLS])}, -} curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)") words_re = re.compile( @@ -413,3 +363,53 @@ def get_symbols(symbol_set): raise Exception("{} symbol set does not exist".format(symbol_set)) return symbols + + +SYMBOL_SETS = { + DEFAULT_SYMBOLS: symbols, + IPA_SYMBOLS: symbols_with_ipa, + NVIDIA_TACO2_SYMBOLS: symbols_nvidia_taco2, + GRAD_TTS_SYMBOLS: grad_tts_symbols, + PORTUGUESE_SYMBOLS: symbols_portuguese, + POLISH_SYMBOLS: symbols_polish, + DUTCH_SYMBOLS: symbols_dutch, + SPANISH_SYMBOLS: symbols_spanish, + NORWEGIAN_SYMBOLS: symbols_norwegian, + TURKISH_SYMBOLS: symbols_turkish, + RUSSIAN_SYMBOLS: symbols_russian, + UKRAINIAN_SYMBOLS: symbols_ukrainian, + "radtts": get_symbols("radtts"), +} + +symbol_to_id = { + DEFAULT_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[DEFAULT_SYMBOLS])}, + IPA_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[IPA_SYMBOLS])}, + NVIDIA_TACO2_SYMBOLS: { + s: i for i, s in enumerate(SYMBOL_SETS[NVIDIA_TACO2_SYMBOLS]) + }, + GRAD_TTS_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[GRAD_TTS_SYMBOLS])}, + PORTUGUESE_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[PORTUGUESE_SYMBOLS])}, + POLISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[POLISH_SYMBOLS])}, + DUTCH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[DUTCH_SYMBOLS])}, + SPANISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[SPANISH_SYMBOLS])}, + NORWEGIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[NORWEGIAN_SYMBOLS])}, + TURKISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[TURKISH_SYMBOLS])}, + RUSSIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[RUSSIAN_SYMBOLS])}, + UKRAINIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[UKRAINIAN_SYMBOLS])}, +} +id_to_symbol = { + DEFAULT_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[DEFAULT_SYMBOLS])}, + IPA_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[IPA_SYMBOLS])}, + NVIDIA_TACO2_SYMBOLS: { + i: s for i, s in enumerate(SYMBOL_SETS[NVIDIA_TACO2_SYMBOLS]) + }, + GRAD_TTS_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[GRAD_TTS_SYMBOLS])}, + PORTUGUESE_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[PORTUGUESE_SYMBOLS])}, + POLISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[POLISH_SYMBOLS])}, + DUTCH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[DUTCH_SYMBOLS])}, + SPANISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[SPANISH_SYMBOLS])}, + NORWEGIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[NORWEGIAN_SYMBOLS])}, + TURKISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[TURKISH_SYMBOLS])}, + RUSSIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[RUSSIAN_SYMBOLS])}, + UKRAINIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[UKRAINIAN_SYMBOLS])}, +} From d3516ca73926d997510fdbdb23930e40c578b1ba Mon Sep 17 00:00:00 2001 From: "sjkoelle@gmail.com" Date: Thu, 7 Sep 2023 14:14:54 -0700 Subject: [PATCH 8/9] fix --- configs/config-ljspeech.json | 149 ------------------------- configs/config-zeroshot-warmstart.json | 149 ------------------------- configs/config-zeroshot.json | 149 ------------------------- configs/config.json | 149 ------------------------- 4 files changed, 596 deletions(-) delete mode 100644 configs/config-ljspeech.json delete mode 100644 configs/config-zeroshot-warmstart.json delete mode 100644 configs/config-zeroshot.json delete mode 100644 configs/config.json diff --git a/configs/config-ljspeech.json b/configs/config-ljspeech.json deleted file mode 100644 index 4cb34a8f..00000000 --- a/configs/config-ljspeech.json +++ /dev/null @@ -1,149 +0,0 @@ -{ - "train_config": { - "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/", - "epochs": 10000000, - "optim_algo": "RAdam", - "learning_rate": 0.0001, - "weight_decay": 1e-6, - "sigma": 1.0, - "iters_per_checkpoint": 2500, - "batch_size": 16, - "seed": null, - "checkpoint_path": "", - "ignore_layers": [], - "ignore_layers_warmstart": [], - "steps_per_sample": 500, - "finetune_layers": [], - "include_layers": [], - "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", - "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", - "log_attribute_samples": false, - "log_decoder_samples": true, - "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/model_2500.pt", - "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", - "use_amp": true, - "grad_clip_val": 1.0, - "loss_weights": { - "blank_logprob": -1, - "ctc_loss_weight": 0.1, - "binarization_loss_weight": 1.0, - "dur_loss_weight": 1.0, - "f0_loss_weight": 1.0, - "energy_loss_weight": 1.0, - "vpred_loss_weight": 1.0 - }, - "binarization_start_iter": 6000, - "kl_loss_start_iter": 8000, - "unfreeze_modules": "all" - }, - "data_config": { - "training_files": { - "lj": { - "basedir": "/home/zach/code/uberduck-ml-dev/data", - "audiodir": "", - "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt", - "lmdbpath": "" - } - }, - "validation_files": {}, - "dur_min": 0.1, - "dur_max": 10.2, - "sampling_rate": 22050, - "filter_length": 1024, - "hop_length": 256, - "win_length": 1024, - "n_mel_channels": 80, - "mel_fmin": 0.0, - "mel_fmax": 8000.0, - "f0_min": 80.0, - "f0_max": 640.0, - "max_wav_value": 32768.0, - "use_f0": true, - "use_log_f0": 0, - "use_energy_avg": true, - "use_scaled_energy": true, - "symbol_set": "radtts", - "cleaner_names": ["radtts_cleaners"], - "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", - "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", - "p_phoneme": 1.0, - "handle_phoneme": "word", - "handle_phoneme_ambiguous": "ignore", - "include_speakers": null, - "n_frames": -1, - "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", - "lmdb_cache_path": "", - "use_attn_prior_masking": true, - "prepend_space_to_text": true, - "append_space_to_text": true, - "add_bos_eos_to_text": false, - "betabinom_scaling_factor": 1.0, - "distance_tx_unvoiced": false, - "is_zero_shot": false, - "mel_noise_scale": 0.0 - }, - "dist_config": { - "dist_backend": "nccl", - "dist_url": "tcp://localhost:54321" - }, - "model_config": { - "n_speakers": 1, - "n_speaker_dim": 16, - "n_text": 185, - "n_text_dim": 512, - "n_flows": 8, - "n_conv_layers_per_step": 4, - "n_mel_channels": 80, - "n_hidden": 1024, - "mel_encoder_n_hidden": 512, - "dummy_speaker_embedding": false, - "n_early_size": 2, - "n_early_every": 2, - "n_group_size": 2, - "affine_model": "wavenet", - "include_modules": "decatn", - "scaling_fn": "tanh", - "matrix_decomposition": "LUS", - "learn_alignments": true, - "use_speaker_emb_for_alignment": false, - "attn_straight_through_estimator": true, - "use_context_lstm": true, - "context_lstm_norm": "spectral", - "context_lstm_w_f0_and_energy": true, - "text_encoder_lstm_norm": "spectral", - "n_f0_dims": 1, - "n_energy_avg_dims": 1, - "use_first_order_features": false, - "unvoiced_bias_activation": "relu", - "decoder_use_partial_padding": true, - "decoder_use_unvoiced_bias": true, - "ap_pred_log_f0": true, - "ap_use_unvoiced_bias": true, - "ap_use_voiced_embeddings": true, - "dur_model_config": null, - "f0_model_config": null, - "energy_model_config": null, - "v_model_config": { - "name": "dap", - "hparams": { - "n_speaker_dim": 16, - "take_log_of_input": false, - "bottleneck_hparams": { - "in_dim": 512, - "reduction_factor": 16, - "norm": "weightnorm", - "non_linearity": "relu" - }, - "arch_hparams": { - "out_dim": 1, - "n_layers": 2, - "n_channels": 256, - "kernel_size": 3, - "p_dropout": 0.5, - "lstm_type": "", - "use_linear": 1 - } - } - } - } -} diff --git a/configs/config-zeroshot-warmstart.json b/configs/config-zeroshot-warmstart.json deleted file mode 100644 index 26f1a5c7..00000000 --- a/configs/config-zeroshot-warmstart.json +++ /dev/null @@ -1,149 +0,0 @@ -{ - "train_config": { - "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-13-zeroshot-warmstart/", - "epochs": 10000000, - "optim_algo": "RAdam", - "learning_rate": 0.0001, - "weight_decay": 1e-6, - "sigma": 1.0, - "iters_per_checkpoint": 2500, - "batch_size": 16, - "seed": null, - "checkpoint_path": "", - "ignore_layers": [], - "ignore_layers_warmstart": [], - "steps_per_sample": 500, - "finetune_layers": [], - "include_layers": [], - "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", - "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", - "log_attribute_samples": false, - "log_decoder_samples": true, - "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/model_40000.pt", - "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", - "use_amp": true, - "grad_clip_val": 1.0, - "loss_weights": { - "blank_logprob": -1, - "ctc_loss_weight": 0.1, - "binarization_loss_weight": 1.0, - "dur_loss_weight": 1.0, - "f0_loss_weight": 1.0, - "energy_loss_weight": 1.0, - "vpred_loss_weight": 1.0 - }, - "binarization_start_iter": 6000, - "kl_loss_start_iter": 8000, - "unfreeze_modules": "all" - }, - "data_config": { - "training_files": { - "vctk": { - "basedir": "/home/zach/code/uberduck-ml-dev/data", - "audiodir": "", - "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt", - "lmdbpath": "" - } - }, - "validation_files": {}, - "dur_min": 0.1, - "dur_max": 10.2, - "sampling_rate": 22050, - "filter_length": 1024, - "hop_length": 256, - "win_length": 1024, - "n_mel_channels": 80, - "mel_fmin": 0.0, - "mel_fmax": 8000.0, - "f0_min": 80.0, - "f0_max": 640.0, - "max_wav_value": 32768.0, - "use_f0": true, - "use_log_f0": 0, - "use_energy_avg": true, - "use_scaled_energy": true, - "symbol_set": "radtts", - "cleaner_names": ["radtts_cleaners"], - "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", - "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", - "p_phoneme": 1.0, - "handle_phoneme": "word", - "handle_phoneme_ambiguous": "ignore", - "include_speakers": null, - "n_frames": -1, - "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", - "lmdb_cache_path": "", - "use_attn_prior_masking": true, - "prepend_space_to_text": true, - "append_space_to_text": true, - "add_bos_eos_to_text": false, - "betabinom_scaling_factor": 1.0, - "distance_tx_unvoiced": false, - "is_zero_shot": true, - "mel_noise_scale": 0.0 - }, - "dist_config": { - "dist_backend": "nccl", - "dist_url": "tcp://localhost:54321" - }, - "model_config": { - "n_speakers": 0, - "n_speaker_dim": 512, - "n_text": 185, - "n_text_dim": 512, - "n_flows": 8, - "n_conv_layers_per_step": 4, - "n_mel_channels": 80, - "n_hidden": 1024, - "mel_encoder_n_hidden": 512, - "dummy_speaker_embedding": false, - "n_early_size": 2, - "n_early_every": 2, - "n_group_size": 2, - "affine_model": "wavenet", - "include_modules": "decatn", - "scaling_fn": "tanh", - "matrix_decomposition": "LUS", - "learn_alignments": true, - "use_speaker_emb_for_alignment": false, - "attn_straight_through_estimator": true, - "use_context_lstm": true, - "context_lstm_norm": "spectral", - "context_lstm_w_f0_and_energy": true, - "text_encoder_lstm_norm": "spectral", - "n_f0_dims": 1, - "n_energy_avg_dims": 1, - "use_first_order_features": false, - "unvoiced_bias_activation": "relu", - "decoder_use_partial_padding": true, - "decoder_use_unvoiced_bias": true, - "ap_pred_log_f0": true, - "ap_use_unvoiced_bias": true, - "ap_use_voiced_embeddings": true, - "dur_model_config": null, - "f0_model_config": null, - "energy_model_config": null, - "v_model_config": { - "name": "dap", - "hparams": { - "n_speaker_dim": 16, - "take_log_of_input": false, - "bottleneck_hparams": { - "in_dim": 512, - "reduction_factor": 16, - "norm": "weightnorm", - "non_linearity": "relu" - }, - "arch_hparams": { - "out_dim": 1, - "n_layers": 2, - "n_channels": 256, - "kernel_size": 3, - "p_dropout": 0.5, - "lstm_type": "", - "use_linear": 1 - } - } - } - } -} diff --git a/configs/config-zeroshot.json b/configs/config-zeroshot.json deleted file mode 100644 index d0e29b0e..00000000 --- a/configs/config-zeroshot.json +++ /dev/null @@ -1,149 +0,0 @@ -{ - "train_config": { - "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/", - "epochs": 10000000, - "optim_algo": "RAdam", - "learning_rate": 0.0001, - "weight_decay": 1e-6, - "sigma": 1.0, - "iters_per_checkpoint": 2500, - "batch_size": 16, - "seed": null, - "checkpoint_path": "", - "ignore_layers": [], - "ignore_layers_warmstart": [], - "steps_per_sample": 500, - "finetune_layers": [], - "include_layers": [], - "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", - "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", - "log_attribute_samples": false, - "log_decoder_samples": true, - "warmstart_checkpoint_path": "", - "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", - "use_amp": true, - "grad_clip_val": 1.0, - "loss_weights": { - "blank_logprob": -1, - "ctc_loss_weight": 0.1, - "binarization_loss_weight": 1.0, - "dur_loss_weight": 1.0, - "f0_loss_weight": 1.0, - "energy_loss_weight": 1.0, - "vpred_loss_weight": 1.0 - }, - "binarization_start_iter": 6000, - "kl_loss_start_iter": 8000, - "unfreeze_modules": "all" - }, - "data_config": { - "training_files": { - "lj": { - "basedir": "/home/zach/code/uberduck-ml-dev/data", - "audiodir": "", - "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt", - "lmdbpath": "" - } - }, - "validation_files": {}, - "dur_min": 0.1, - "dur_max": 10.2, - "sampling_rate": 22050, - "filter_length": 1024, - "hop_length": 256, - "win_length": 1024, - "n_mel_channels": 80, - "mel_fmin": 0.0, - "mel_fmax": 8000.0, - "f0_min": 80.0, - "f0_max": 640.0, - "max_wav_value": 32768.0, - "use_f0": true, - "use_log_f0": 0, - "use_energy_avg": true, - "use_scaled_energy": true, - "symbol_set": "radtts", - "cleaner_names": ["radtts_cleaners"], - "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", - "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", - "p_phoneme": 1.0, - "handle_phoneme": "word", - "handle_phoneme_ambiguous": "ignore", - "include_speakers": null, - "n_frames": -1, - "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", - "lmdb_cache_path": "", - "use_attn_prior_masking": true, - "prepend_space_to_text": true, - "append_space_to_text": true, - "add_bos_eos_to_text": false, - "betabinom_scaling_factor": 1.0, - "distance_tx_unvoiced": false, - "is_zero_shot": true, - "mel_noise_scale": 0.0 - }, - "dist_config": { - "dist_backend": "nccl", - "dist_url": "tcp://localhost:54321" - }, - "model_config": { - "n_speakers": 0, - "n_speaker_dim": 512, - "n_text": 185, - "n_text_dim": 512, - "n_flows": 8, - "n_conv_layers_per_step": 4, - "n_mel_channels": 80, - "n_hidden": 1024, - "mel_encoder_n_hidden": 512, - "dummy_speaker_embedding": false, - "n_early_size": 2, - "n_early_every": 2, - "n_group_size": 2, - "affine_model": "wavenet", - "include_modules": "decatn", - "scaling_fn": "tanh", - "matrix_decomposition": "LUS", - "learn_alignments": true, - "use_speaker_emb_for_alignment": false, - "attn_straight_through_estimator": true, - "use_context_lstm": true, - "context_lstm_norm": "spectral", - "context_lstm_w_f0_and_energy": true, - "text_encoder_lstm_norm": "spectral", - "n_f0_dims": 1, - "n_energy_avg_dims": 1, - "use_first_order_features": false, - "unvoiced_bias_activation": "relu", - "decoder_use_partial_padding": true, - "decoder_use_unvoiced_bias": true, - "ap_pred_log_f0": true, - "ap_use_unvoiced_bias": true, - "ap_use_voiced_embeddings": true, - "dur_model_config": null, - "f0_model_config": null, - "energy_model_config": null, - "v_model_config": { - "name": "dap", - "hparams": { - "n_speaker_dim": 16, - "take_log_of_input": false, - "bottleneck_hparams": { - "in_dim": 512, - "reduction_factor": 16, - "norm": "weightnorm", - "non_linearity": "relu" - }, - "arch_hparams": { - "out_dim": 1, - "n_layers": 2, - "n_channels": 256, - "kernel_size": 3, - "p_dropout": 0.5, - "lstm_type": "", - "use_linear": 1 - } - } - } - } -} diff --git a/configs/config.json b/configs/config.json deleted file mode 100644 index 2bcaa0c5..00000000 --- a/configs/config.json +++ /dev/null @@ -1,149 +0,0 @@ -{ - "train_config": { - "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/", - "epochs": 10000000, - "optim_algo": "RAdam", - "learning_rate": 0.0001, - "weight_decay": 1e-6, - "sigma": 1.0, - "iters_per_checkpoint": 2500, - "batch_size": 16, - "seed": null, - "checkpoint_path": "", - "ignore_layers": [], - "ignore_layers_warmstart": [], - "steps_per_sample": 500, - "finetune_layers": [], - "include_layers": [], - "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", - "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", - "log_attribute_samples": false, - "log_decoder_samples": true, - "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/model_477500.pt", - "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", - "use_amp": true, - "grad_clip_val": 1.0, - "loss_weights": { - "blank_logprob": -1, - "ctc_loss_weight": 0.1, - "binarization_loss_weight": 1.0, - "dur_loss_weight": 1.0, - "f0_loss_weight": 1.0, - "energy_loss_weight": 1.0, - "vpred_loss_weight": 1.0 - }, - "binarization_start_iter": 24000, - "kl_loss_start_iter": 36000, - "unfreeze_modules": "all" - }, - "data_config": { - "training_files": { - "vctk": { - "basedir": "/home/zach/code/uberduck-ml-dev/data", - "audiodir": "", - "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt", - "lmdbpath": "" - } - }, - "validation_files": {}, - "dur_min": 0.1, - "dur_max": 10.2, - "sampling_rate": 22050, - "filter_length": 1024, - "hop_length": 256, - "win_length": 1024, - "n_mel_channels": 80, - "mel_fmin": 0.0, - "mel_fmax": 8000.0, - "f0_min": 80.0, - "f0_max": 640.0, - "max_wav_value": 32768.0, - "use_f0": true, - "use_log_f0": 0, - "use_energy_avg": true, - "use_scaled_energy": true, - "symbol_set": "radtts", - "cleaner_names": ["radtts_cleaners"], - "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", - "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", - "p_phoneme": 1.0, - "handle_phoneme": "word", - "handle_phoneme_ambiguous": "ignore", - "include_speakers": null, - "n_frames": -1, - "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", - "lmdb_cache_path": "", - "use_attn_prior_masking": true, - "prepend_space_to_text": true, - "append_space_to_text": true, - "add_bos_eos_to_text": false, - "betabinom_scaling_factor": 1.0, - "distance_tx_unvoiced": false, - "is_zero_shot": false, - "mel_noise_scale": 0.0 - }, - "dist_config": { - "dist_backend": "nccl", - "dist_url": "tcp://localhost:54321" - }, - "model_config": { - "n_speakers": 109, - "n_speaker_dim": 16, - "n_text": 185, - "n_text_dim": 512, - "n_flows": 8, - "n_conv_layers_per_step": 4, - "n_mel_channels": 80, - "n_hidden": 1024, - "mel_encoder_n_hidden": 512, - "dummy_speaker_embedding": false, - "n_early_size": 2, - "n_early_every": 2, - "n_group_size": 2, - "affine_model": "wavenet", - "include_modules": "decatn", - "scaling_fn": "tanh", - "matrix_decomposition": "LUS", - "learn_alignments": true, - "use_speaker_emb_for_alignment": false, - "attn_straight_through_estimator": true, - "use_context_lstm": true, - "context_lstm_norm": "spectral", - "context_lstm_w_f0_and_energy": true, - "text_encoder_lstm_norm": "spectral", - "n_f0_dims": 1, - "n_energy_avg_dims": 1, - "use_first_order_features": false, - "unvoiced_bias_activation": "relu", - "decoder_use_partial_padding": true, - "decoder_use_unvoiced_bias": true, - "ap_pred_log_f0": true, - "ap_use_unvoiced_bias": true, - "ap_use_voiced_embeddings": true, - "dur_model_config": null, - "f0_model_config": null, - "energy_model_config": null, - "v_model_config": { - "name": "dap", - "hparams": { - "n_speaker_dim": 16, - "take_log_of_input": false, - "bottleneck_hparams": { - "in_dim": 512, - "reduction_factor": 16, - "norm": "weightnorm", - "non_linearity": "relu" - }, - "arch_hparams": { - "out_dim": 1, - "n_layers": 2, - "n_channels": 256, - "kernel_size": 3, - "p_dropout": 0.5, - "lstm_type": "", - "use_linear": 1 - } - } - } - } -} From d19fee07c6d3d6fd76c04a2d0701ada50e4f7922 Mon Sep 17 00:00:00 2001 From: "sjkoelle@gmail.com" Date: Thu, 7 Sep 2023 14:19:38 -0700 Subject: [PATCH 9/9] save --- configs/config-ljspeech.json | 149 +++++++++++++++++++++++++ configs/config-zeroshot-warmstart.json | 149 +++++++++++++++++++++++++ configs/config-zeroshot.json | 149 +++++++++++++++++++++++++ configs/config.json | 149 +++++++++++++++++++++++++ 4 files changed, 596 insertions(+) create mode 100644 configs/config-ljspeech.json create mode 100644 configs/config-zeroshot-warmstart.json create mode 100644 configs/config-zeroshot.json create mode 100644 configs/config.json diff --git a/configs/config-ljspeech.json b/configs/config-ljspeech.json new file mode 100644 index 00000000..4cb34a8f --- /dev/null +++ b/configs/config-ljspeech.json @@ -0,0 +1,149 @@ +{ + "train_config": { + "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/", + "epochs": 10000000, + "optim_algo": "RAdam", + "learning_rate": 0.0001, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 2500, + "batch_size": 16, + "seed": null, + "checkpoint_path": "", + "ignore_layers": [], + "ignore_layers_warmstart": [], + "steps_per_sample": 500, + "finetune_layers": [], + "include_layers": [], + "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", + "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", + "log_attribute_samples": false, + "log_decoder_samples": true, + "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/model_2500.pt", + "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", + "use_amp": true, + "grad_clip_val": 1.0, + "loss_weights": { + "blank_logprob": -1, + "ctc_loss_weight": 0.1, + "binarization_loss_weight": 1.0, + "dur_loss_weight": 1.0, + "f0_loss_weight": 1.0, + "energy_loss_weight": 1.0, + "vpred_loss_weight": 1.0 + }, + "binarization_start_iter": 6000, + "kl_loss_start_iter": 8000, + "unfreeze_modules": "all" + }, + "data_config": { + "training_files": { + "lj": { + "basedir": "/home/zach/code/uberduck-ml-dev/data", + "audiodir": "", + "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt", + "lmdbpath": "" + } + }, + "validation_files": {}, + "dur_min": 0.1, + "dur_max": 10.2, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "f0_min": 80.0, + "f0_max": 640.0, + "max_wav_value": 32768.0, + "use_f0": true, + "use_log_f0": 0, + "use_energy_avg": true, + "use_scaled_energy": true, + "symbol_set": "radtts", + "cleaner_names": ["radtts_cleaners"], + "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", + "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", + "p_phoneme": 1.0, + "handle_phoneme": "word", + "handle_phoneme_ambiguous": "ignore", + "include_speakers": null, + "n_frames": -1, + "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", + "lmdb_cache_path": "", + "use_attn_prior_masking": true, + "prepend_space_to_text": true, + "append_space_to_text": true, + "add_bos_eos_to_text": false, + "betabinom_scaling_factor": 1.0, + "distance_tx_unvoiced": false, + "is_zero_shot": false, + "mel_noise_scale": 0.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + "model_config": { + "n_speakers": 1, + "n_speaker_dim": 16, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 8, + "n_conv_layers_per_step": 4, + "n_mel_channels": 80, + "n_hidden": 1024, + "mel_encoder_n_hidden": 512, + "dummy_speaker_embedding": false, + "n_early_size": 2, + "n_early_every": 2, + "n_group_size": 2, + "affine_model": "wavenet", + "include_modules": "decatn", + "scaling_fn": "tanh", + "matrix_decomposition": "LUS", + "learn_alignments": true, + "use_speaker_emb_for_alignment": false, + "attn_straight_through_estimator": true, + "use_context_lstm": true, + "context_lstm_norm": "spectral", + "context_lstm_w_f0_and_energy": true, + "text_encoder_lstm_norm": "spectral", + "n_f0_dims": 1, + "n_energy_avg_dims": 1, + "use_first_order_features": false, + "unvoiced_bias_activation": "relu", + "decoder_use_partial_padding": true, + "decoder_use_unvoiced_bias": true, + "ap_pred_log_f0": true, + "ap_use_unvoiced_bias": true, + "ap_use_voiced_embeddings": true, + "dur_model_config": null, + "f0_model_config": null, + "energy_model_config": null, + "v_model_config": { + "name": "dap", + "hparams": { + "n_speaker_dim": 16, + "take_log_of_input": false, + "bottleneck_hparams": { + "in_dim": 512, + "reduction_factor": 16, + "norm": "weightnorm", + "non_linearity": "relu" + }, + "arch_hparams": { + "out_dim": 1, + "n_layers": 2, + "n_channels": 256, + "kernel_size": 3, + "p_dropout": 0.5, + "lstm_type": "", + "use_linear": 1 + } + } + } + } +} diff --git a/configs/config-zeroshot-warmstart.json b/configs/config-zeroshot-warmstart.json new file mode 100644 index 00000000..26f1a5c7 --- /dev/null +++ b/configs/config-zeroshot-warmstart.json @@ -0,0 +1,149 @@ +{ + "train_config": { + "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-13-zeroshot-warmstart/", + "epochs": 10000000, + "optim_algo": "RAdam", + "learning_rate": 0.0001, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 2500, + "batch_size": 16, + "seed": null, + "checkpoint_path": "", + "ignore_layers": [], + "ignore_layers_warmstart": [], + "steps_per_sample": 500, + "finetune_layers": [], + "include_layers": [], + "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", + "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", + "log_attribute_samples": false, + "log_decoder_samples": true, + "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/model_40000.pt", + "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", + "use_amp": true, + "grad_clip_val": 1.0, + "loss_weights": { + "blank_logprob": -1, + "ctc_loss_weight": 0.1, + "binarization_loss_weight": 1.0, + "dur_loss_weight": 1.0, + "f0_loss_weight": 1.0, + "energy_loss_weight": 1.0, + "vpred_loss_weight": 1.0 + }, + "binarization_start_iter": 6000, + "kl_loss_start_iter": 8000, + "unfreeze_modules": "all" + }, + "data_config": { + "training_files": { + "vctk": { + "basedir": "/home/zach/code/uberduck-ml-dev/data", + "audiodir": "", + "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt", + "lmdbpath": "" + } + }, + "validation_files": {}, + "dur_min": 0.1, + "dur_max": 10.2, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "f0_min": 80.0, + "f0_max": 640.0, + "max_wav_value": 32768.0, + "use_f0": true, + "use_log_f0": 0, + "use_energy_avg": true, + "use_scaled_energy": true, + "symbol_set": "radtts", + "cleaner_names": ["radtts_cleaners"], + "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", + "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", + "p_phoneme": 1.0, + "handle_phoneme": "word", + "handle_phoneme_ambiguous": "ignore", + "include_speakers": null, + "n_frames": -1, + "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", + "lmdb_cache_path": "", + "use_attn_prior_masking": true, + "prepend_space_to_text": true, + "append_space_to_text": true, + "add_bos_eos_to_text": false, + "betabinom_scaling_factor": 1.0, + "distance_tx_unvoiced": false, + "is_zero_shot": true, + "mel_noise_scale": 0.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + "model_config": { + "n_speakers": 0, + "n_speaker_dim": 512, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 8, + "n_conv_layers_per_step": 4, + "n_mel_channels": 80, + "n_hidden": 1024, + "mel_encoder_n_hidden": 512, + "dummy_speaker_embedding": false, + "n_early_size": 2, + "n_early_every": 2, + "n_group_size": 2, + "affine_model": "wavenet", + "include_modules": "decatn", + "scaling_fn": "tanh", + "matrix_decomposition": "LUS", + "learn_alignments": true, + "use_speaker_emb_for_alignment": false, + "attn_straight_through_estimator": true, + "use_context_lstm": true, + "context_lstm_norm": "spectral", + "context_lstm_w_f0_and_energy": true, + "text_encoder_lstm_norm": "spectral", + "n_f0_dims": 1, + "n_energy_avg_dims": 1, + "use_first_order_features": false, + "unvoiced_bias_activation": "relu", + "decoder_use_partial_padding": true, + "decoder_use_unvoiced_bias": true, + "ap_pred_log_f0": true, + "ap_use_unvoiced_bias": true, + "ap_use_voiced_embeddings": true, + "dur_model_config": null, + "f0_model_config": null, + "energy_model_config": null, + "v_model_config": { + "name": "dap", + "hparams": { + "n_speaker_dim": 16, + "take_log_of_input": false, + "bottleneck_hparams": { + "in_dim": 512, + "reduction_factor": 16, + "norm": "weightnorm", + "non_linearity": "relu" + }, + "arch_hparams": { + "out_dim": 1, + "n_layers": 2, + "n_channels": 256, + "kernel_size": 3, + "p_dropout": 0.5, + "lstm_type": "", + "use_linear": 1 + } + } + } + } +} diff --git a/configs/config-zeroshot.json b/configs/config-zeroshot.json new file mode 100644 index 00000000..d0e29b0e --- /dev/null +++ b/configs/config-zeroshot.json @@ -0,0 +1,149 @@ +{ + "train_config": { + "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/", + "epochs": 10000000, + "optim_algo": "RAdam", + "learning_rate": 0.0001, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 2500, + "batch_size": 16, + "seed": null, + "checkpoint_path": "", + "ignore_layers": [], + "ignore_layers_warmstart": [], + "steps_per_sample": 500, + "finetune_layers": [], + "include_layers": [], + "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", + "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", + "log_attribute_samples": false, + "log_decoder_samples": true, + "warmstart_checkpoint_path": "", + "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", + "use_amp": true, + "grad_clip_val": 1.0, + "loss_weights": { + "blank_logprob": -1, + "ctc_loss_weight": 0.1, + "binarization_loss_weight": 1.0, + "dur_loss_weight": 1.0, + "f0_loss_weight": 1.0, + "energy_loss_weight": 1.0, + "vpred_loss_weight": 1.0 + }, + "binarization_start_iter": 6000, + "kl_loss_start_iter": 8000, + "unfreeze_modules": "all" + }, + "data_config": { + "training_files": { + "lj": { + "basedir": "/home/zach/code/uberduck-ml-dev/data", + "audiodir": "", + "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt", + "lmdbpath": "" + } + }, + "validation_files": {}, + "dur_min": 0.1, + "dur_max": 10.2, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "f0_min": 80.0, + "f0_max": 640.0, + "max_wav_value": 32768.0, + "use_f0": true, + "use_log_f0": 0, + "use_energy_avg": true, + "use_scaled_energy": true, + "symbol_set": "radtts", + "cleaner_names": ["radtts_cleaners"], + "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", + "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", + "p_phoneme": 1.0, + "handle_phoneme": "word", + "handle_phoneme_ambiguous": "ignore", + "include_speakers": null, + "n_frames": -1, + "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", + "lmdb_cache_path": "", + "use_attn_prior_masking": true, + "prepend_space_to_text": true, + "append_space_to_text": true, + "add_bos_eos_to_text": false, + "betabinom_scaling_factor": 1.0, + "distance_tx_unvoiced": false, + "is_zero_shot": true, + "mel_noise_scale": 0.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + "model_config": { + "n_speakers": 0, + "n_speaker_dim": 512, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 8, + "n_conv_layers_per_step": 4, + "n_mel_channels": 80, + "n_hidden": 1024, + "mel_encoder_n_hidden": 512, + "dummy_speaker_embedding": false, + "n_early_size": 2, + "n_early_every": 2, + "n_group_size": 2, + "affine_model": "wavenet", + "include_modules": "decatn", + "scaling_fn": "tanh", + "matrix_decomposition": "LUS", + "learn_alignments": true, + "use_speaker_emb_for_alignment": false, + "attn_straight_through_estimator": true, + "use_context_lstm": true, + "context_lstm_norm": "spectral", + "context_lstm_w_f0_and_energy": true, + "text_encoder_lstm_norm": "spectral", + "n_f0_dims": 1, + "n_energy_avg_dims": 1, + "use_first_order_features": false, + "unvoiced_bias_activation": "relu", + "decoder_use_partial_padding": true, + "decoder_use_unvoiced_bias": true, + "ap_pred_log_f0": true, + "ap_use_unvoiced_bias": true, + "ap_use_voiced_embeddings": true, + "dur_model_config": null, + "f0_model_config": null, + "energy_model_config": null, + "v_model_config": { + "name": "dap", + "hparams": { + "n_speaker_dim": 16, + "take_log_of_input": false, + "bottleneck_hparams": { + "in_dim": 512, + "reduction_factor": 16, + "norm": "weightnorm", + "non_linearity": "relu" + }, + "arch_hparams": { + "out_dim": 1, + "n_layers": 2, + "n_channels": 256, + "kernel_size": 3, + "p_dropout": 0.5, + "lstm_type": "", + "use_linear": 1 + } + } + } + } +} diff --git a/configs/config.json b/configs/config.json new file mode 100644 index 00000000..2bcaa0c5 --- /dev/null +++ b/configs/config.json @@ -0,0 +1,149 @@ +{ + "train_config": { + "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/", + "epochs": 10000000, + "optim_algo": "RAdam", + "learning_rate": 0.0001, + "weight_decay": 1e-6, + "sigma": 1.0, + "iters_per_checkpoint": 2500, + "batch_size": 16, + "seed": null, + "checkpoint_path": "", + "ignore_layers": [], + "ignore_layers_warmstart": [], + "steps_per_sample": 500, + "finetune_layers": [], + "include_layers": [], + "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json", + "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust", + "log_attribute_samples": false, + "log_decoder_samples": true, + "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/model_477500.pt", + "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt", + "use_amp": true, + "grad_clip_val": 1.0, + "loss_weights": { + "blank_logprob": -1, + "ctc_loss_weight": 0.1, + "binarization_loss_weight": 1.0, + "dur_loss_weight": 1.0, + "f0_loss_weight": 1.0, + "energy_loss_weight": 1.0, + "vpred_loss_weight": 1.0 + }, + "binarization_start_iter": 24000, + "kl_loss_start_iter": 36000, + "unfreeze_modules": "all" + }, + "data_config": { + "training_files": { + "vctk": { + "basedir": "/home/zach/code/uberduck-ml-dev/data", + "audiodir": "", + "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt", + "lmdbpath": "" + } + }, + "validation_files": {}, + "dur_min": 0.1, + "dur_max": 10.2, + "sampling_rate": 22050, + "filter_length": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": 8000.0, + "f0_min": 80.0, + "f0_max": 640.0, + "max_wav_value": 32768.0, + "use_f0": true, + "use_log_f0": 0, + "use_energy_avg": true, + "use_scaled_energy": true, + "symbol_set": "radtts", + "cleaner_names": ["radtts_cleaners"], + "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms", + "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b", + "p_phoneme": 1.0, + "handle_phoneme": "word", + "handle_phoneme_ambiguous": "ignore", + "include_speakers": null, + "n_frames": -1, + "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/", + "lmdb_cache_path": "", + "use_attn_prior_masking": true, + "prepend_space_to_text": true, + "append_space_to_text": true, + "add_bos_eos_to_text": false, + "betabinom_scaling_factor": 1.0, + "distance_tx_unvoiced": false, + "is_zero_shot": false, + "mel_noise_scale": 0.0 + }, + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:54321" + }, + "model_config": { + "n_speakers": 109, + "n_speaker_dim": 16, + "n_text": 185, + "n_text_dim": 512, + "n_flows": 8, + "n_conv_layers_per_step": 4, + "n_mel_channels": 80, + "n_hidden": 1024, + "mel_encoder_n_hidden": 512, + "dummy_speaker_embedding": false, + "n_early_size": 2, + "n_early_every": 2, + "n_group_size": 2, + "affine_model": "wavenet", + "include_modules": "decatn", + "scaling_fn": "tanh", + "matrix_decomposition": "LUS", + "learn_alignments": true, + "use_speaker_emb_for_alignment": false, + "attn_straight_through_estimator": true, + "use_context_lstm": true, + "context_lstm_norm": "spectral", + "context_lstm_w_f0_and_energy": true, + "text_encoder_lstm_norm": "spectral", + "n_f0_dims": 1, + "n_energy_avg_dims": 1, + "use_first_order_features": false, + "unvoiced_bias_activation": "relu", + "decoder_use_partial_padding": true, + "decoder_use_unvoiced_bias": true, + "ap_pred_log_f0": true, + "ap_use_unvoiced_bias": true, + "ap_use_voiced_embeddings": true, + "dur_model_config": null, + "f0_model_config": null, + "energy_model_config": null, + "v_model_config": { + "name": "dap", + "hparams": { + "n_speaker_dim": 16, + "take_log_of_input": false, + "bottleneck_hparams": { + "in_dim": 512, + "reduction_factor": 16, + "norm": "weightnorm", + "non_linearity": "relu" + }, + "arch_hparams": { + "out_dim": 1, + "n_layers": 2, + "n_channels": 256, + "kernel_size": 3, + "p_dropout": 0.5, + "lstm_type": "", + "use_linear": 1 + } + } + } + } +}