From 9ec46d7b0ecd97d4a0b2101bf58353cd1aa3066c Mon Sep 17 00:00:00 2001
From: zach wener <z@uberduck.ai>
Date: Thu, 29 Jun 2023 15:57:20 -0700
Subject: [PATCH 1/9] bugfix and reduce deps

---
 settings.ini                 | 5 +----
 uberduck_ml_dev/data/data.py | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/settings.ini b/settings.ini
index 067761b1..1ceddc46 100644
--- a/settings.ini
+++ b/settings.ini
@@ -24,10 +24,7 @@ license = apache2
 # From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive
 status = 2
 
-# Optional. Same format as setuptools requirements.  Torch version seems to effect random number generator (not 100% certain).
-# TODO (Sam): our goal is to rely on as few 3rd party packages as possible.  We should try to remove as many of these as possible and integrate torch code directly.
-# NOTE (Sam): is it possible to specify no-deps here?
-requirements = Cython pytest phonemizer inflect librosa>=0.8.0 matplotlib nltk>=3.6.5 numpy>=1.23.5 csvw clldutils pandas pydub scipy scikit-learn soundfile tensorboardX torch>=1.13.0 torchaudio>=0.9.0 unidecode seaborn mdutils wordcloud wordfreq Pillow einops g2p_en@git+https://github.com/uberduck-ai/g2p emoji text-unidecode gdown pre-commit lmdb  ray[default] praat-parselmouth>=0.4.3
+requirements = librosa pandas numpy scipy scikit-learn soundfile torch  torchaudio einops g2p_en@git+https://github.com/uberduck-ai/g2p
 
 # Optional. Same format as setuptools console_scripts
 # console_scripts =
diff --git a/uberduck_ml_dev/data/data.py b/uberduck_ml_dev/data/data.py
index 544d44a9..3b486598 100644
--- a/uberduck_ml_dev/data/data.py
+++ b/uberduck_ml_dev/data/data.py
@@ -529,7 +529,7 @@ def filter_by_duration_(self, dur_min, dur_max):
         ]
 
     def create_speaker_lookup_table(self, data):
-        speaker_ids = np.sort(np.unique([x["speaker"] for x in data]))
+        speaker_ids = np.sort(np.unique([int(x["speaker"]) for x in data]))
         d = {speaker_ids[i]: i for i in range(len(speaker_ids))}
         print("Number of speakers:", len(d))
         print("Speaker IDS", d)

From 7d6c13ebbe516999ee32582ed6826153d7eb8cf0 Mon Sep 17 00:00:00 2001
From: zach wener <z@uberduck.ai>
Date: Thu, 29 Jun 2023 22:14:06 -0700
Subject: [PATCH 2/9] compute missing mels lazily, coerce speaker ids, drop dead code

---
 settings.ini                            |  2 +-
 uberduck_ml_dev/data/data.py            | 16 ++++-
 uberduck_ml_dev/models/radtts.py        | 17 +-----
 uberduck_ml_dev/models/vits.py          | 77 -------------------------
 uberduck_ml_dev/trainer/radtts/train.py | 21 +++++--
 5 files changed, 33 insertions(+), 100 deletions(-)

diff --git a/settings.ini b/settings.ini
index 1ceddc46..71ed7950 100644
--- a/settings.ini
+++ b/settings.ini
@@ -24,7 +24,7 @@ license = apache2
 # From 1-7: Planning Pre-Alpha Alpha Beta Production Mature Inactive
 status = 2
 
-requirements = librosa pandas numpy scipy scikit-learn soundfile torch  torchaudio einops g2p_en@git+https://github.com/uberduck-ai/g2p
+requirements = librosa pandas numpy scipy scikit-learn soundfile torch torchaudio einops g2p_en@git+https://github.com/uberduck-ai/g2p
 
 # Optional. Same format as setuptools console_scripts
 # console_scripts =
diff --git a/uberduck_ml_dev/data/data.py b/uberduck_ml_dev/data/data.py
index 3b486598..66bfff1e 100644
--- a/uberduck_ml_dev/data/data.py
+++ b/uberduck_ml_dev/data/data.py
@@ -514,7 +514,7 @@ def load_data(self, datasets, split="|"):
         return dataset
 
     def filter_by_speakers_(self, speakers, include=True):
-        print("Include spaker {}: {}".format(speakers, include))
+        print("Include speaker {}: {}".format(speakers, include))
         if include:
             self.data = [x for x in self.data if x["speaker"] in speakers]
         else:
@@ -609,6 +609,9 @@ def get_speaker_id(self, speaker):
         if self.speaker_map is not None and speaker in self.speaker_map:
             speaker = self.speaker_map[speaker]
 
+        if speaker not in self.speaker_ids and int(speaker) in self.speaker_ids:
+            speaker = int(speaker)
+
         return torch.LongTensor([self.speaker_ids[speaker]])
 
     def get_text(self, text):
@@ -656,6 +659,17 @@ def __getitem__(self, index):
             distance_map[distance_map <= 0] = 0.0
             f0 = f0 - distance_map
 
+        if not os.path.exists(mel_path):
+            _, audio = read(audiopath)
+            # sub_path = audiopath.split("resampled_unnormalized.wav")[0]
+            audio = np.asarray(audio / (np.abs(audio).max() * 2))
+            audio_norm = torch.tensor(audio, dtype=torch.float32)
+            audio_norm = audio_norm.unsqueeze(0)
+            melspec = self.stft.mel_spectrogram(audio_norm)
+            melspec = torch.squeeze(melspec, 0)
+            melspec = (melspec + 5.5) / 2
+            torch.save(melspec.detach(), mel_path)
+
         mel = torch.load(mel_path)
 
         energy_avg = None
diff --git a/uberduck_ml_dev/models/radtts.py b/uberduck_ml_dev/models/radtts.py
index ed103881..fdf8717c 100644
--- a/uberduck_ml_dev/models/radtts.py
+++ b/uberduck_ml_dev/models/radtts.py
@@ -20,6 +20,7 @@
 # DEALINGS IN THE SOFTWARE.
 from typing import Optional
 
+
 import torch
 from torch import nn
 from .common import (
@@ -84,20 +85,6 @@ def forward(self, z, context, inverse=False, seq_lens=None):
             return z, log_det_W, log_s
 
 
-# # NOTE (Sam): comment this out for GPU
-# def get_mask_from_lengths(lengths):
-#     """Constructs binary mask from a 1D torch tensor of input lengths
-#     Args:
-#         lengths (torch.tensor): 1D tensor
-#     Returns:
-#         mask (torch.tensor): num_sequences x max_length x 1 binary tensor
-#     """
-#     max_len = torch.max(lengths).item()
-#     ids = torch.arange(0, max_len, out=torch.LongTensor(max_len))
-#     mask = (ids < lengths.unsqueeze(1)).bool()
-#     return mask
-
-
 class RADTTS(torch.nn.Module):
     def __init__(
         self,
@@ -1011,7 +998,7 @@ def remove_norms(self):
     "n_early_every": 2,
     "n_group_size": 2,
     "affine_model": "wavenet",
-    "include_modules": "decatndpmvpredapm",
+    "include_modules": ["dec", "atn", "dpm", "vpred", "apm"],
     "scaling_fn": "tanh",
     "matrix_decomposition": "LUS",
     "learn_alignments": True,
diff --git a/uberduck_ml_dev/models/vits.py b/uberduck_ml_dev/models/vits.py
index 6cc242df..03506f13 100644
--- a/uberduck_ml_dev/models/vits.py
+++ b/uberduck_ml_dev/models/vits.py
@@ -279,83 +279,6 @@ def forward(self, x, x_lengths, g=None, local_conditioning=None):
         return z, m, logs, x_mask
 
 
-# class Generator(torch.nn.Module):
-#     def __init__(
-#         self,
-#         initial_channel,
-#         resblock,
-#         resblock_kernel_sizes,
-#         resblock_dilation_sizes,
-#         upsample_rates,
-#         upsample_initial_channel,
-#         upsample_kernel_sizes,
-#         gin_channels=0,
-#     ):
-#         super(Generator, self).__init__()
-#         self.num_kernels = len(resblock_kernel_sizes)
-#         self.num_upsamples = len(upsample_rates)
-#         self.conv_pre = Conv1d(
-#             initial_channel, upsample_initial_channel, 7, 1, padding=3
-#         )
-#         resblock = common.ResBlock1 if resblock == "1" else common.ResBlock2
-#
-#         self.ups = nn.ModuleList()
-#         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-#             self.ups.append(
-#                 weight_norm(
-#                     ConvTranspose1d(
-#                         upsample_initial_channel // (2**i),
-#                         upsample_initial_channel // (2 ** (i + 1)),
-#                         k,
-#                         u,
-#                         padding=(k - u) // 2,
-#                     )
-#                 )
-#             )
-#
-#         self.resblocks = nn.ModuleList()
-#         for i in range(len(self.ups)):
-#             ch = upsample_initial_channel // (2 ** (i + 1))
-#             for j, (k, d) in enumerate(
-#                 zip(resblock_kernel_sizes, resblock_dilation_sizes)
-#             ):
-#                 self.resblocks.append(resblock(ch, k, d))
-#
-#         self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-#         self.ups.apply(init_weights)
-#
-#         if gin_channels != 0:
-#             self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-#
-#     def forward(self, x, g=None):
-#         x = self.conv_pre(x)
-#         if g is not None:
-#             x = x + self.cond(g)
-#
-#         for i in range(self.num_upsamples):
-#             x = F.leaky_relu(x, common.LRELU_SLOPE)
-#             x = self.ups[i](x)
-#             xs = None
-#             for j in range(self.num_kernels):
-#                 if xs is None:
-#                     xs = self.resblocks[i * self.num_kernels + j](x)
-#                 else:
-#                     xs += self.resblocks[i * self.num_kernels + j](x)
-#             x = xs / self.num_kernels
-#         x = F.leaky_relu(x)
-#         x = self.conv_post(x)
-#         x = torch.tanh(x)
-#
-#         return x
-#
-#     def remove_weight_norm(self):
-#         print("Removing weight norm...")
-#         for l in self.ups:
-#             remove_weight_norm(l)
-#         for l in self.resblocks:
-#             l.remove_weight_norm()
-
-
 class DiscriminatorP(torch.nn.Module):
     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
         super(DiscriminatorP, self).__init__()
diff --git a/uberduck_ml_dev/trainer/radtts/train.py b/uberduck_ml_dev/trainer/radtts/train.py
index 2a90e9cd..ea721089 100644
--- a/uberduck_ml_dev/trainer/radtts/train.py
+++ b/uberduck_ml_dev/trainer/radtts/train.py
@@ -1,7 +1,11 @@
+import ray
 import torch
 from torch.cuda.amp import GradScaler
+from ray.air import session
 from ray.air.integrations.wandb import setup_wandb
 import ray.train as train
+from torchvision.models import resnet18
+
 
 from .train_epoch import train_epoch
 from .load import prepare_dataloaders, warmstart
@@ -32,23 +36,28 @@ def train_func(config: dict):
     model = RADTTS(
         **model_config,
     )
+    # model = resnet18()
 
     if train_config["warmstart_checkpoint_path"] != "":
         warmstart(train_config["warmstart_checkpoint_path"], model)
 
-    # NOTE (Sam): find_unused_parameters=True is necessary for num_workers >1 in ScalingConfig.
-    model = train.torch.prepare_model(
-        model, parallel_strategy_kwargs=dict(find_unused_parameters=True)
-    )
-
     start_epoch = 0
     # NOTE (Sam): what is significance of batch_size=6?  Think this is overriden within the dataloader.
+    print("PREPARING DATALOADER")
     train_loader, valset, collate_fn = prepare_dataloaders(
         data_config,
         2,  # 2 gpus by default
         train_config["batch_size"],
     )
     train_dataloader = train.torch.prepare_data_loader(train_loader)
+    print("DONE PREPARING DATA LOSDER")
+
+    print("PREPARING MODEL...")
+    # NOTE (Sam): find_unused_parameters=True is necessary for num_workers >1 in ScalingConfig.
+    model = train.torch.prepare_model(
+        model, parallel_strategy_kwargs=dict(find_unused_parameters=True)
+    )
+    print("Done PREPARING MODEL...")
 
     optim = RAdam(
         model.parameters(),
@@ -74,7 +83,7 @@ def train_func(config: dict):
     )
     attention_kl_loss = AttentionBinarizationLoss()
     iteration = 0
-    for epoch in range(start_epoch, start_epoch + epochs):
+    for _ in range(start_epoch, start_epoch + epochs):
         iteration = train_epoch(
             train_dataloader,
             train_config["log_decoder_samples"],

From 0f2a00a5d5dc674589bb1b4f2c51db9e60c333fa Mon Sep 17 00:00:00 2001
From: zach wener <z@uberduck.ai>
Date: Fri, 30 Jun 2023 17:47:58 -0700
Subject: [PATCH 3/9] log captions and images to wandb, resume iteration on warmstart

---
 uberduck_ml_dev/data/collate.py               |  6 ++++++
 uberduck_ml_dev/data/data.py                  |  1 +
 uberduck_ml_dev/trainer/log.py                | 16 ++++++++++++----
 uberduck_ml_dev/trainer/radtts/load.py        |  8 ++++++--
 uberduck_ml_dev/trainer/radtts/log.py         |  5 ++++-
 uberduck_ml_dev/trainer/radtts/train.py       | 11 ++++-------
 uberduck_ml_dev/trainer/radtts/train_epoch.py |  2 ++
 uberduck_ml_dev/trainer/radtts/train_step.py  | 10 ++++++----
 8 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/uberduck_ml_dev/data/collate.py b/uberduck_ml_dev/data/collate.py
index 0b9582ac..095b3a02 100644
--- a/uberduck_ml_dev/data/collate.py
+++ b/uberduck_ml_dev/data/collate.py
@@ -218,10 +218,16 @@ def __call__(self, batch):
                     i, : cur_attn_prior.size(0), : cur_attn_prior.size(1)
                 ] = cur_attn_prior
 
+        original_text = [
+            batch[ids_sorted_decreasing[i]]["text"]
+            for i in range(len(ids_sorted_decreasing))
+        ]
+
         return {
             "mel": mel_padded,
             "speaker_ids": speaker_ids,
             "text": text_padded,
+            "original_text": original_text,
             "input_lengths": input_lengths,
             "output_lengths": output_lengths,
             "audiopaths": audiopaths,
diff --git a/uberduck_ml_dev/data/data.py b/uberduck_ml_dev/data/data.py
index 66bfff1e..6c49ee15 100644
--- a/uberduck_ml_dev/data/data.py
+++ b/uberduck_ml_dev/data/data.py
@@ -693,6 +693,7 @@ def __getitem__(self, index):
         return {
             "mel": mel,
             "speaker_id": speaker_id,
+            "text": text,
             "text_encoded": text_encoded,
             "audiopath": audiopath,
             "attn_prior": attn_prior,
diff --git a/uberduck_ml_dev/trainer/log.py b/uberduck_ml_dev/trainer/log.py
index 8b947d9e..d5756ea6 100644
--- a/uberduck_ml_dev/trainer/log.py
+++ b/uberduck_ml_dev/trainer/log.py
@@ -4,11 +4,19 @@
 
 
 @torch.no_grad()
-def log(metrics, audios={}):
+def log(metrics, audios=None, images=None):
+    if session.get_world_rank() != 0:
+        return
+    audios = audios or {}
+    images = images or {}
     wandb_metrics = dict(metrics)
 
     for k, v in audios.items():
-        wandb_metrics[k] = wandb.Audio(v, sample_rate=22050)
+        wandb_metrics[k] = wandb.Audio(
+            v["audio"], sample_rate=22050, caption=v.get("caption")
+        )
 
-    if session.get_world_rank() == 0:
-        wandb.log(wandb_metrics)
+    for k, v in images.items():
+        wandb_metrics[k] = wandb.Image(v)
+
+    wandb.log(wandb_metrics)
diff --git a/uberduck_ml_dev/trainer/radtts/load.py b/uberduck_ml_dev/trainer/radtts/load.py
index c9795105..19c559f4 100644
--- a/uberduck_ml_dev/trainer/radtts/load.py
+++ b/uberduck_ml_dev/trainer/radtts/load.py
@@ -14,6 +14,10 @@ def warmstart(
     checkpoint_path, model, include_layers=[], ignore_layers_warmstart=[], strict=False
 ):
     pretrained_dict = torch.load(checkpoint_path, map_location="cpu")
+    iteration = 0
+    if "iteration" in pretrained_dict:
+        iteration = pretrained_dict["iteration"]
+
     pretrained_dict = pretrained_dict["state_dict"]
 
     is_module = False
@@ -29,9 +33,9 @@ def warmstart(
     model_dict = model.state_dict()
     model_dict.update(pretrained_dict)
     model.load_state_dict(model_dict, strict=strict)
-    print("Warm started from {}".format(checkpoint_path))
+    print(f"Warm started from {checkpoint_path}, iteration {iteration}")
     model.train()
-    return model
+    return (model, iteration)
 
 
 def prepare_dataloaders(data_config, n_gpus, batch_size):
diff --git a/uberduck_ml_dev/trainer/radtts/log.py b/uberduck_ml_dev/trainer/radtts/log.py
index 56368fb7..f5282d80 100644
--- a/uberduck_ml_dev/trainer/radtts/log.py
+++ b/uberduck_ml_dev/trainer/radtts/log.py
@@ -171,6 +171,9 @@ def get_log_audio(
                     sample_tag = f"sample_attribute_sigma_{attribute_sigma}"
                 if oos_name is not None:
                     sample_tag = f"{sample_tag}_oos_{oos_name}"
-                audios[sample_tag] = audio
+                print("ADDING AUDIO WITH CAPTION: ", batch_dict.get("original_text")[0])
+                audios[sample_tag] = dict(
+                    audio=audio, caption=batch_dict.get("original_text")[0]
+                )
 
     return images, audios
diff --git a/uberduck_ml_dev/trainer/radtts/train.py b/uberduck_ml_dev/trainer/radtts/train.py
index ea721089..e4ff01ef 100644
--- a/uberduck_ml_dev/trainer/radtts/train.py
+++ b/uberduck_ml_dev/trainer/radtts/train.py
@@ -4,8 +4,6 @@
 from ray.air import session
 from ray.air.integrations.wandb import setup_wandb
 import ray.train as train
-from torchvision.models import resnet18
-
 
 from .train_epoch import train_epoch
 from .load import prepare_dataloaders, warmstart
@@ -36,10 +34,9 @@ def train_func(config: dict):
     model = RADTTS(
         **model_config,
     )
-    # model = resnet18()
-
+    iteration = 0
     if train_config["warmstart_checkpoint_path"] != "":
-        warmstart(train_config["warmstart_checkpoint_path"], model)
+        _, iteration = warmstart(train_config["warmstart_checkpoint_path"], model)
 
     start_epoch = 0
     # NOTE (Sam): what is significance of batch_size=6?  Think this is overriden within the dataloader.
@@ -82,8 +79,7 @@ def train_func(config: dict):
         loss_weights=train_config["loss_weights"],
     )
     attention_kl_loss = AttentionBinarizationLoss()
-    iteration = 0
-    for _ in range(start_epoch, start_epoch + epochs):
+    for epoch in range(start_epoch, start_epoch + epochs):
         iteration = train_epoch(
             train_dataloader,
             train_config["log_decoder_samples"],
@@ -100,6 +96,7 @@ def train_func(config: dict):
             binarization_start_iter,
             iteration,
             vocoder,
+            epoch=epoch,
         )
 
 
diff --git a/uberduck_ml_dev/trainer/radtts/train_epoch.py b/uberduck_ml_dev/trainer/radtts/train_epoch.py
index b30a6bd4..f6cfee61 100644
--- a/uberduck_ml_dev/trainer/radtts/train_epoch.py
+++ b/uberduck_ml_dev/trainer/radtts/train_epoch.py
@@ -18,6 +18,7 @@ def train_epoch(
     binarization_start_iter,
     iteration,
     vocoder,
+    epoch=None,
 ):
     # def train_epoch(dataset_shard, batch_size, model, optim, steps_per_sample, scaler, scheduler, criterion, attention_kl_loss, kl_loss_start_iter, binarization_start_iter, epoch, iteration):
     # for batch_idx, ray_batch_df in enumerate(
@@ -43,6 +44,7 @@ def train_epoch(
             kl_loss_start_iter,
             binarization_start_iter,
             vocoder,
+            epoch=epoch,
         )
         iteration += 1
 
diff --git a/uberduck_ml_dev/trainer/radtts/train_step.py b/uberduck_ml_dev/trainer/radtts/train_step.py
index 56128071..c27ae496 100644
--- a/uberduck_ml_dev/trainer/radtts/train_step.py
+++ b/uberduck_ml_dev/trainer/radtts/train_step.py
@@ -30,6 +30,7 @@ def _train_step(
     kl_loss_start_iter,
     binarization_start_iter,
     vocoder,
+    epoch=None,
 ):
     print(datetime.now(), "entering train step:", iteration)
     if iteration >= binarization_start_iter:
@@ -76,7 +77,9 @@ def _train_step(
         for k, (v, w) in loss_outputs.items():
             if w > 0:
                 loss = v * w if loss is None else loss + v * w
-            print_list.append("  |  {}: {:.3f}".format(k, v))
+            print_list.append("{}: {:.3f}".format(k, v))
+        print_list.append(f"epoch: {epoch}")
+        print_list.append(f"iteration: {iteration}")
 
         w_bin = criterion.loss_weights.get("binarization_loss_weight", 1.0)
         if binarize and iteration >= kl_loss_start_iter:
@@ -86,7 +89,7 @@ def _train_step(
             binarization_loss = torch.zeros_like(loss)
         loss_outputs["binarization_loss"] = (binarization_loss, w_bin)
     grad_clip_val = 1.0  # TODO (Sam): make this a config option
-    print(print_list)
+    print("  |  ".join(print_list))
     scaler.scale(loss).backward()
     if grad_clip_val > 0:
         scaler.unscale_(optim)
@@ -99,7 +102,6 @@ def _train_step(
     for k, (v, w) in loss_outputs.items():
         metrics[k] = v.item()
 
-    print("iteration: ", iteration, datetime.now())
     log_sample = iteration % steps_per_sample == 0
     log_checkpoint = iteration % iters_per_checkpoint == 0
 
@@ -142,7 +144,7 @@ def _train_step(
         #         audio_embedding_oos=audio_embedding_oos,
         #     )
         #     audios.update(audios_oos)
-        log(metrics, audios)
+        log(metrics, audios, images)
         model.train()
     else:
         log(metrics)

From f1dfcedc6680d4539000e67366e9a2dcd494169f Mon Sep 17 00:00:00 2001
From: zach wener <z@uberduck.ai>
Date: Mon, 10 Jul 2023 09:11:33 -0700
Subject: [PATCH 4/9] make grad_clip_val configurable and log gradient norm

---
 uberduck_ml_dev/trainer/radtts/log.py         | 7 ++++++-
 uberduck_ml_dev/trainer/radtts/train.py       | 2 ++
 uberduck_ml_dev/trainer/radtts/train_epoch.py | 2 ++
 uberduck_ml_dev/trainer/radtts/train_step.py  | 8 +++++---
 uberduck_ml_dev/vocoders/hifigan.py           | 3 +++
 5 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/uberduck_ml_dev/trainer/radtts/log.py b/uberduck_ml_dev/trainer/radtts/log.py
index f5282d80..27ecbc6a 100644
--- a/uberduck_ml_dev/trainer/radtts/log.py
+++ b/uberduck_ml_dev/trainer/radtts/log.py
@@ -7,7 +7,11 @@
 from ...utils.utils import (
     to_gpu,
 )
-from ...utils.plot import plot_alignment_to_numpy
+from ...utils.plot import (
+    plot_alignment_to_numpy,
+    plot_attention_phonemes,
+    plot_spectrogram,
+)
 
 
 # want to test out of sample but can only do proper inference with zero shot dap so lets just look at zero shot decoder samples
@@ -65,6 +69,7 @@ def get_log_audio(
 
     images = {}
     audios = {}
+    images["mel_gt"] = plot_spectrogram(mel[0].data.cpu().numpy())
     if attn_used is not None:
         images["attention_weights"] = plot_alignment_to_numpy(
             attn_soft[0, 0].data.cpu().numpy().T, title="audioname"
diff --git a/uberduck_ml_dev/trainer/radtts/train.py b/uberduck_ml_dev/trainer/radtts/train.py
index e4ff01ef..88072682 100644
--- a/uberduck_ml_dev/trainer/radtts/train.py
+++ b/uberduck_ml_dev/trainer/radtts/train.py
@@ -30,6 +30,7 @@ def train_func(config: dict):
     sigma = train_config["sigma"]
     kl_loss_start_iter = train_config["kl_loss_start_iter"]
     binarization_start_iter = train_config["binarization_start_iter"]
+    grad_clip_val = train_config["grad_clip_val"]
 
     model = RADTTS(
         **model_config,
@@ -97,6 +98,7 @@ def train_func(config: dict):
             iteration,
             vocoder,
             epoch=epoch,
+            grad_clip_val=grad_clip_val,
         )
 
 
diff --git a/uberduck_ml_dev/trainer/radtts/train_epoch.py b/uberduck_ml_dev/trainer/radtts/train_epoch.py
index f6cfee61..57e8a6b2 100644
--- a/uberduck_ml_dev/trainer/radtts/train_epoch.py
+++ b/uberduck_ml_dev/trainer/radtts/train_epoch.py
@@ -19,6 +19,7 @@ def train_epoch(
     iteration,
     vocoder,
     epoch=None,
+    grad_clip_val=None,
 ):
     # def train_epoch(dataset_shard, batch_size, model, optim, steps_per_sample, scaler, scheduler, criterion, attention_kl_loss, kl_loss_start_iter, binarization_start_iter, epoch, iteration):
     # for batch_idx, ray_batch_df in enumerate(
@@ -45,6 +46,7 @@ def train_epoch(
             binarization_start_iter,
             vocoder,
             epoch=epoch,
+            grad_clip_val=grad_clip_val,
         )
         iteration += 1
 
diff --git a/uberduck_ml_dev/trainer/radtts/train_step.py b/uberduck_ml_dev/trainer/radtts/train_step.py
index c27ae496..044350ad 100644
--- a/uberduck_ml_dev/trainer/radtts/train_step.py
+++ b/uberduck_ml_dev/trainer/radtts/train_step.py
@@ -31,6 +31,7 @@ def _train_step(
     binarization_start_iter,
     vocoder,
     epoch=None,
+    grad_clip_val=None,
 ):
     print(datetime.now(), "entering train step:", iteration)
     if iteration >= binarization_start_iter:
@@ -88,17 +89,18 @@ def _train_step(
         else:
             binarization_loss = torch.zeros_like(loss)
         loss_outputs["binarization_loss"] = (binarization_loss, w_bin)
-    grad_clip_val = 1.0  # TODO (Sam): make this a config option
     print("  |  ".join(print_list))
+    metrics = {"loss": loss.item()}
     scaler.scale(loss).backward()
     if grad_clip_val > 0:
         scaler.unscale_(optim)
-        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val)
+        norm_ = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val)
+        print("total norm: ", norm_.item())
+        metrics["grad_norm"] = norm_.item()
 
     scaler.step(optim)
     scaler.update()
 
-    metrics = {"loss": loss.item()}
     for k, (v, w) in loss_outputs.items():
         metrics[k] = v.item()
 
diff --git a/uberduck_ml_dev/vocoders/hifigan.py b/uberduck_ml_dev/vocoders/hifigan.py
index 65d6caba..95950487 100644
--- a/uberduck_ml_dev/vocoders/hifigan.py
+++ b/uberduck_ml_dev/vocoders/hifigan.py
@@ -76,6 +76,9 @@ def load_vocoder(vocoder_state_dict, vocoder_config, to_cuda=True):
 def get_vocoder(hifi_gan_config_path, hifi_gan_checkpoint_path):
     print("Getting vocoder")
 
+    import os
+
+    print("CWD: ", os.getcwd())
     with open(hifi_gan_config_path) as f:
         hifigan_config = json.load(f)
 

From 11afbb1e8999028168eaa39d82b4e59d1f0db535 Mon Sep 17 00:00:00 2001
From: zach wener <z@uberduck.ai>
Date: Wed, 12 Jul 2023 18:31:21 -0700
Subject: [PATCH 5/9] add num_workers and explicit target_paths to feature extraction

---
 uberduck_ml_dev/data/data.py     | 16 ++++++++++------
 uberduck_ml_dev/data/get.py      | 18 ++++++++++++------
 uberduck_ml_dev/models/radtts.py |  5 ++++-
 uberduck_ml_dev/text/symbols.py  |  2 ++
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/uberduck_ml_dev/data/data.py b/uberduck_ml_dev/data/data.py
index 6c49ee15..5cbe4db8 100644
--- a/uberduck_ml_dev/data/data.py
+++ b/uberduck_ml_dev/data/data.py
@@ -775,25 +775,29 @@ def __init__(
         resnet_se_model_path,
         resnet_se_config_path,
         audiopaths,
-        subpath_truncation=41,
+        target_paths,
     ):
         self.model = get_pretrained_model(
             model_path=resnet_se_model_path, config_path=resnet_se_config_path
         )
         self.audiopaths = audiopaths
-        self.subpath_truncation = subpath_truncation
+        self.target_paths = target_paths
+        # self.subpath_truncation = subpath_truncation
 
-    def _get_data(self, audiopath):
+    def _get_data(self, audiopath, target_path):
         rate, data = read(audiopath)
         data = torch.FloatTensor(data.astype("float32") / MAX_WAV_VALUE).unsqueeze(0)
-        sub_path = audiopath[: self.subpath_truncation]
+        # sub_path = audiopath[: self.subpath_truncation]
         embedding = self.model(data).squeeze()
-        emb_path_local = f"{sub_path}/coqui_resnet_512_emb.pt"
+        # emb_path_local = f"{sub_path}/coqui_resnet_512_emb.pt"
+        emb_path_local = target_path
         torch.save(embedding.detach(), emb_path_local)
 
     def __getitem__(self, idx):
         try:
-            self._get_data(audiopath=self.audiopaths[idx])
+            self._get_data(
+                audiopath=self.audiopaths[idx], target_path=self.target_paths[idx]
+            )
 
         except Exception as e:
             print(f"Error while getting data: index = {idx}")
diff --git a/uberduck_ml_dev/data/get.py b/uberduck_ml_dev/data/get.py
index f31aff06..4d23a3bf 100644
--- a/uberduck_ml_dev/data/get.py
+++ b/uberduck_ml_dev/data/get.py
@@ -9,8 +9,10 @@
 from uberduck_ml_dev.data.collate import CollateBlank
 
 
-def get_parallel_torch(data):
-    data_loader = DataLoader(data, batch_size=32, collate_fn=CollateBlank())
+def get_parallel_torch(data, num_workers=0):
+    data_loader = DataLoader(
+        data, batch_size=32, collate_fn=CollateBlank(), num_workers=num_workers
+    )
     for batch in data_loader:
         pass
 
@@ -19,7 +21,7 @@ def get_parallel_torch(data):
 # NOTE (Sam): assumes data is in a directory structure like:
 # /tmp/{uuid}/resampled_normalized.wav
 # These functions add spectrogram.pt, f0.pt, and coqui_resnet_512_emb.pt to each file-specific directory.
-def get_mels(paths, data_config, target_paths):
+def get_mels(paths, data_config, target_paths, num_workers=0):
     data = DataMel(audiopaths=paths, data_config=data_config, target_paths=target_paths)
 
     collate_fn = CollateBlank()
@@ -28,6 +30,7 @@ def get_mels(paths, data_config, target_paths):
         data,
         batch_size=32,
         collate_fn=collate_fn,
+        num_workers=num_workers,
     )
     for batch in data_loader:
         pass  # computes in loader.
@@ -36,15 +39,16 @@ def get_mels(paths, data_config, target_paths):
 def get_embeddings(
     paths,
     data_config,
+    target_paths,
     resnet_se_model_path,
     resnet_se_config_path,
-    subpath_truncation=41,
+    num_workers=0,
 ):
     data = DataEmbedding(
         audiopaths=paths,
         resnet_se_model_path=resnet_se_model_path,
         resnet_se_config_path=resnet_se_config_path,
-        subpath_truncation=subpath_truncation,
+        target_paths=target_paths,
     )
 
     collate_fn = CollateBlank()
@@ -53,6 +57,7 @@ def get_embeddings(
         data,
         batch_size=32,
         collate_fn=collate_fn,
+        num_workers=num_workers,
     )
     for batch in data_loader:
         pass  # computes in loader.
@@ -65,6 +70,7 @@ def get_pitches(
     target_folders=None,
     method="parselmouth",
     sample_rate=None,
+    num_workers=0,
 ):
     data = DataPitch(
         audiopaths=paths,
@@ -73,7 +79,7 @@ def get_pitches(
         method=method,
         sample_rate=sample_rate,
     )
-    get_parallel_torch(data)
+    get_parallel_torch(data, num_workers=num_workers)
 
 
 HUBERT_PATH = "hubert_embedding.pt"
diff --git a/uberduck_ml_dev/models/radtts.py b/uberduck_ml_dev/models/radtts.py
index fdf8717c..ab5d9866 100644
--- a/uberduck_ml_dev/models/radtts.py
+++ b/uberduck_ml_dev/models/radtts.py
@@ -135,7 +135,8 @@ def __init__(
         )
         self.n_speaker_dim = n_speaker_dim
         assert self.n_speaker_dim % 2 == 0
-        self.speaker_embedding = torch.nn.Embedding(n_speakers, self.n_speaker_dim)
+        if n_speakers > 0:
+            self.speaker_embedding = torch.nn.Embedding(n_speakers, self.n_speaker_dim)
         self.embedding = torch.nn.Embedding(n_text, n_text_dim)
         self.flows = torch.nn.ModuleList()
         self.encoder = Encoder(
@@ -352,6 +353,7 @@ def encode_text(self, text, in_lens):
         text_embeddings = self.embedding(text).transpose(1, 2)
         # text_enc: b x n_text_dim x encoder_dim (512)
 
+        print(text_embeddings.device)
         if in_lens is None:
             text_enc = self.encoder.infer(text_embeddings).transpose(1, 2)
         else:
@@ -586,6 +588,7 @@ def forward(
 
             z_out.append(mel)
             z_mel = torch.cat(z_out, 1)
+            print("SHAPE OF Z MEL: ", z_mel.shape, "SHAPE OF MEL: ", mel.shape)
 
         # duration predictor forward pass
         duration_model_outputs = None
diff --git a/uberduck_ml_dev/text/symbols.py b/uberduck_ml_dev/text/symbols.py
index cbbdb73b..bd8719dc 100644
--- a/uberduck_ml_dev/text/symbols.py
+++ b/uberduck_ml_dev/text/symbols.py
@@ -1,3 +1,5 @@
+import re
+
 __all__ = [
     "symbols_portuguese",
     "PORTUGUESE_SYMBOLS",

From 5a1037633bcf07b8b95a8f7dd7a3eb39e805b419 Mon Sep 17 00:00:00 2001
From: zach wener <z@uberduck.ai>
Date: Tue, 15 Aug 2023 09:09:20 -0700
Subject: [PATCH 6/9] configs

---
 configs/config-ljspeech.json           | 149 +++++++++++++++++++++++++
 configs/config-zeroshot-warmstart.json | 149 +++++++++++++++++++++++++
 configs/config-zeroshot.json           | 149 +++++++++++++++++++++++++
 configs/config.json                    | 149 +++++++++++++++++++++++++
 4 files changed, 596 insertions(+)
 create mode 100644 configs/config-ljspeech.json
 create mode 100644 configs/config-zeroshot-warmstart.json
 create mode 100644 configs/config-zeroshot.json
 create mode 100644 configs/config.json

diff --git a/configs/config-ljspeech.json b/configs/config-ljspeech.json
new file mode 100644
index 00000000..4cb34a8f
--- /dev/null
+++ b/configs/config-ljspeech.json
@@ -0,0 +1,149 @@
+{
+    "train_config": {
+        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/",
+        "epochs": 10000000,
+        "optim_algo": "RAdam",
+        "learning_rate": 0.0001,
+        "weight_decay": 1e-6,
+        "sigma": 1.0,
+        "iters_per_checkpoint": 2500,
+        "batch_size": 16,
+        "seed": null,
+        "checkpoint_path": "",
+        "ignore_layers": [],
+        "ignore_layers_warmstart": [],
+        "steps_per_sample": 500,
+        "finetune_layers": [],
+        "include_layers": [],
+        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
+        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
+        "log_attribute_samples": false,
+        "log_decoder_samples": true,
+        "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/model_2500.pt",
+        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
+        "use_amp": true,
+        "grad_clip_val": 1.0,
+        "loss_weights": {
+            "blank_logprob": -1,
+            "ctc_loss_weight": 0.1,
+            "binarization_loss_weight": 1.0,
+            "dur_loss_weight": 1.0,
+            "f0_loss_weight": 1.0,
+            "energy_loss_weight": 1.0,
+            "vpred_loss_weight": 1.0
+        },
+        "binarization_start_iter": 6000,
+        "kl_loss_start_iter": 8000,
+        "unfreeze_modules": "all"
+    },
+    "data_config": {
+        "training_files": {
+            "lj": {
+                "basedir": "/home/zach/code/uberduck-ml-dev/data",
+                "audiodir": "",
+                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt",
+                "lmdbpath": ""
+            }
+        },
+        "validation_files": {},
+        "dur_min": 0.1,
+        "dur_max": 10.2,
+        "sampling_rate": 22050,
+        "filter_length": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mel_channels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000.0,
+        "f0_min": 80.0,
+        "f0_max": 640.0,
+        "max_wav_value": 32768.0,
+        "use_f0": true,
+        "use_log_f0": 0,
+        "use_energy_avg": true,
+        "use_scaled_energy": true,
+        "symbol_set": "radtts",
+        "cleaner_names": ["radtts_cleaners"],
+        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
+        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
+        "p_phoneme": 1.0,
+        "handle_phoneme": "word",
+        "handle_phoneme_ambiguous": "ignore",
+        "include_speakers": null,
+        "n_frames": -1,
+        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
+        "lmdb_cache_path": "", 
+        "use_attn_prior_masking": true,
+        "prepend_space_to_text": true,
+        "append_space_to_text": true,
+        "add_bos_eos_to_text": false,
+        "betabinom_scaling_factor": 1.0,
+        "distance_tx_unvoiced": false,
+        "is_zero_shot": false,
+        "mel_noise_scale": 0.0
+    },
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321"
+    },
+    "model_config": {
+        "n_speakers": 1,
+        "n_speaker_dim": 16,
+        "n_text": 185,
+        "n_text_dim": 512,
+        "n_flows": 8,
+        "n_conv_layers_per_step": 4,
+        "n_mel_channels": 80,
+        "n_hidden": 1024,
+        "mel_encoder_n_hidden": 512,
+        "dummy_speaker_embedding": false,
+        "n_early_size": 2,
+        "n_early_every": 2,
+        "n_group_size": 2,
+        "affine_model": "wavenet",
+        "include_modules": "decatn",
+        "scaling_fn": "tanh",
+        "matrix_decomposition": "LUS",
+        "learn_alignments": true,
+        "use_speaker_emb_for_alignment": false,
+        "attn_straight_through_estimator": true,
+        "use_context_lstm": true,
+        "context_lstm_norm": "spectral",
+        "context_lstm_w_f0_and_energy": true,
+        "text_encoder_lstm_norm": "spectral",
+        "n_f0_dims": 1,
+        "n_energy_avg_dims": 1,
+        "use_first_order_features": false,
+        "unvoiced_bias_activation": "relu",
+        "decoder_use_partial_padding": true,
+        "decoder_use_unvoiced_bias": true,
+        "ap_pred_log_f0": true,
+        "ap_use_unvoiced_bias": true,
+        "ap_use_voiced_embeddings": true,
+        "dur_model_config": null,
+        "f0_model_config": null,
+        "energy_model_config": null,
+        "v_model_config": {
+            "name": "dap",
+            "hparams": {
+                "n_speaker_dim": 16,
+                "take_log_of_input": false,
+                "bottleneck_hparams": {
+                    "in_dim": 512,
+                    "reduction_factor": 16,
+                    "norm": "weightnorm",
+                    "non_linearity": "relu"
+                },
+                "arch_hparams": {
+                    "out_dim": 1,
+                    "n_layers": 2,
+                    "n_channels": 256,
+                    "kernel_size": 3,
+                    "p_dropout": 0.5,
+                    "lstm_type": "",
+                    "use_linear": 1
+                }
+            }
+        }
+    }
+}
diff --git a/configs/config-zeroshot-warmstart.json b/configs/config-zeroshot-warmstart.json
new file mode 100644
index 00000000..26f1a5c7
--- /dev/null
+++ b/configs/config-zeroshot-warmstart.json
@@ -0,0 +1,149 @@
+{
+    "train_config": {
+        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-13-zeroshot-warmstart/",
+        "epochs": 10000000,
+        "optim_algo": "RAdam",
+        "learning_rate": 0.0001,
+        "weight_decay": 1e-6,
+        "sigma": 1.0,
+        "iters_per_checkpoint": 2500,
+        "batch_size": 16,
+        "seed": null,
+        "checkpoint_path": "",
+        "ignore_layers": [],
+        "ignore_layers_warmstart": [],
+        "steps_per_sample": 500,
+        "finetune_layers": [],
+        "include_layers": [],
+        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
+        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
+        "log_attribute_samples": false,
+        "log_decoder_samples": true,
+        "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/model_40000.pt",
+        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
+        "use_amp": true,
+        "grad_clip_val": 1.0,
+        "loss_weights": {
+            "blank_logprob": -1,
+            "ctc_loss_weight": 0.1,
+            "binarization_loss_weight": 1.0,
+            "dur_loss_weight": 1.0,
+            "f0_loss_weight": 1.0,
+            "energy_loss_weight": 1.0,
+            "vpred_loss_weight": 1.0
+        },
+        "binarization_start_iter": 6000,
+        "kl_loss_start_iter": 8000,
+        "unfreeze_modules": "all"
+    },
+    "data_config": {
+        "training_files": {
+            "vctk": {
+                "basedir": "/home/zach/code/uberduck-ml-dev/data",
+                "audiodir": "",
+                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt",
+                "lmdbpath": ""
+            }
+        },
+        "validation_files": {},
+        "dur_min": 0.1,
+        "dur_max": 10.2,
+        "sampling_rate": 22050,
+        "filter_length": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mel_channels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000.0,
+        "f0_min": 80.0,
+        "f0_max": 640.0,
+        "max_wav_value": 32768.0,
+        "use_f0": true,
+        "use_log_f0": 0,
+        "use_energy_avg": true,
+        "use_scaled_energy": true,
+        "symbol_set": "radtts",
+        "cleaner_names": ["radtts_cleaners"],
+        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
+        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
+        "p_phoneme": 1.0,
+        "handle_phoneme": "word",
+        "handle_phoneme_ambiguous": "ignore",
+        "include_speakers": null,
+        "n_frames": -1,
+        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
+        "lmdb_cache_path": "", 
+        "use_attn_prior_masking": true,
+        "prepend_space_to_text": true,
+        "append_space_to_text": true,
+        "add_bos_eos_to_text": false,
+        "betabinom_scaling_factor": 1.0,
+        "distance_tx_unvoiced": false,
+        "is_zero_shot": true,
+        "mel_noise_scale": 0.0
+    },
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321"
+    },
+    "model_config": {
+        "n_speakers": 0,
+        "n_speaker_dim": 512,
+        "n_text": 185,
+        "n_text_dim": 512,
+        "n_flows": 8,
+        "n_conv_layers_per_step": 4,
+        "n_mel_channels": 80,
+        "n_hidden": 1024,
+        "mel_encoder_n_hidden": 512,
+        "dummy_speaker_embedding": false,
+        "n_early_size": 2,
+        "n_early_every": 2,
+        "n_group_size": 2,
+        "affine_model": "wavenet",
+        "include_modules": "decatn",
+        "scaling_fn": "tanh",
+        "matrix_decomposition": "LUS",
+        "learn_alignments": true,
+        "use_speaker_emb_for_alignment": false,
+        "attn_straight_through_estimator": true,
+        "use_context_lstm": true,
+        "context_lstm_norm": "spectral",
+        "context_lstm_w_f0_and_energy": true,
+        "text_encoder_lstm_norm": "spectral",
+        "n_f0_dims": 1,
+        "n_energy_avg_dims": 1,
+        "use_first_order_features": false,
+        "unvoiced_bias_activation": "relu",
+        "decoder_use_partial_padding": true,
+        "decoder_use_unvoiced_bias": true,
+        "ap_pred_log_f0": true,
+        "ap_use_unvoiced_bias": true,
+        "ap_use_voiced_embeddings": true,
+        "dur_model_config": null,
+        "f0_model_config": null,
+        "energy_model_config": null,
+        "v_model_config": {
+            "name": "dap",
+            "hparams": {
+                "n_speaker_dim": 16,
+                "take_log_of_input": false,
+                "bottleneck_hparams": {
+                    "in_dim": 512,
+                    "reduction_factor": 16,
+                    "norm": "weightnorm",
+                    "non_linearity": "relu"
+                },
+                "arch_hparams": {
+                    "out_dim": 1,
+                    "n_layers": 2,
+                    "n_channels": 256,
+                    "kernel_size": 3,
+                    "p_dropout": 0.5,
+                    "lstm_type": "",
+                    "use_linear": 1
+                }
+            }
+        }
+    }
+}
diff --git a/configs/config-zeroshot.json b/configs/config-zeroshot.json
new file mode 100644
index 00000000..d0e29b0e
--- /dev/null
+++ b/configs/config-zeroshot.json
@@ -0,0 +1,149 @@
+{
+    "train_config": {
+        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/",
+        "epochs": 10000000,
+        "optim_algo": "RAdam",
+        "learning_rate": 0.0001,
+        "weight_decay": 1e-6,
+        "sigma": 1.0,
+        "iters_per_checkpoint": 2500,
+        "batch_size": 16,
+        "seed": null,
+        "checkpoint_path": "",
+        "ignore_layers": [],
+        "ignore_layers_warmstart": [],
+        "steps_per_sample": 500,
+        "finetune_layers": [],
+        "include_layers": [],
+        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
+        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
+        "log_attribute_samples": false,
+        "log_decoder_samples": true,
+        "warmstart_checkpoint_path": "",
+        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
+        "use_amp": true,
+        "grad_clip_val": 1.0,
+        "loss_weights": {
+            "blank_logprob": -1,
+            "ctc_loss_weight": 0.1,
+            "binarization_loss_weight": 1.0,
+            "dur_loss_weight": 1.0,
+            "f0_loss_weight": 1.0,
+            "energy_loss_weight": 1.0,
+            "vpred_loss_weight": 1.0
+        },
+        "binarization_start_iter": 6000,
+        "kl_loss_start_iter": 8000,
+        "unfreeze_modules": "all"
+    },
+    "data_config": {
+        "training_files": {
+            "lj": {
+                "basedir": "/home/zach/code/uberduck-ml-dev/data",
+                "audiodir": "",
+                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt",
+                "lmdbpath": ""
+            }
+        },
+        "validation_files": {},
+        "dur_min": 0.1,
+        "dur_max": 10.2,
+        "sampling_rate": 22050,
+        "filter_length": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mel_channels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000.0,
+        "f0_min": 80.0,
+        "f0_max": 640.0,
+        "max_wav_value": 32768.0,
+        "use_f0": true,
+        "use_log_f0": 0,
+        "use_energy_avg": true,
+        "use_scaled_energy": true,
+        "symbol_set": "radtts",
+        "cleaner_names": ["radtts_cleaners"],
+        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
+        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
+        "p_phoneme": 1.0,
+        "handle_phoneme": "word",
+        "handle_phoneme_ambiguous": "ignore",
+        "include_speakers": null,
+        "n_frames": -1,
+        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
+        "lmdb_cache_path": "", 
+        "use_attn_prior_masking": true,
+        "prepend_space_to_text": true,
+        "append_space_to_text": true,
+        "add_bos_eos_to_text": false,
+        "betabinom_scaling_factor": 1.0,
+        "distance_tx_unvoiced": false,
+        "is_zero_shot": true,
+        "mel_noise_scale": 0.0
+    },
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321"
+    },
+    "model_config": {
+        "n_speakers": 0,
+        "n_speaker_dim": 512,
+        "n_text": 185,
+        "n_text_dim": 512,
+        "n_flows": 8,
+        "n_conv_layers_per_step": 4,
+        "n_mel_channels": 80,
+        "n_hidden": 1024,
+        "mel_encoder_n_hidden": 512,
+        "dummy_speaker_embedding": false,
+        "n_early_size": 2,
+        "n_early_every": 2,
+        "n_group_size": 2,
+        "affine_model": "wavenet",
+        "include_modules": "decatn",
+        "scaling_fn": "tanh",
+        "matrix_decomposition": "LUS",
+        "learn_alignments": true,
+        "use_speaker_emb_for_alignment": false,
+        "attn_straight_through_estimator": true,
+        "use_context_lstm": true,
+        "context_lstm_norm": "spectral",
+        "context_lstm_w_f0_and_energy": true,
+        "text_encoder_lstm_norm": "spectral",
+        "n_f0_dims": 1,
+        "n_energy_avg_dims": 1,
+        "use_first_order_features": false,
+        "unvoiced_bias_activation": "relu",
+        "decoder_use_partial_padding": true,
+        "decoder_use_unvoiced_bias": true,
+        "ap_pred_log_f0": true,
+        "ap_use_unvoiced_bias": true,
+        "ap_use_voiced_embeddings": true,
+        "dur_model_config": null,
+        "f0_model_config": null,
+        "energy_model_config": null,
+        "v_model_config": {
+            "name": "dap",
+            "hparams": {
+                "n_speaker_dim": 16,
+                "take_log_of_input": false,
+                "bottleneck_hparams": {
+                    "in_dim": 512,
+                    "reduction_factor": 16,
+                    "norm": "weightnorm",
+                    "non_linearity": "relu"
+                },
+                "arch_hparams": {
+                    "out_dim": 1,
+                    "n_layers": 2,
+                    "n_channels": 256,
+                    "kernel_size": 3,
+                    "p_dropout": 0.5,
+                    "lstm_type": "",
+                    "use_linear": 1
+                }
+            }
+        }
+    }
+}
diff --git a/configs/config.json b/configs/config.json
new file mode 100644
index 00000000..2bcaa0c5
--- /dev/null
+++ b/configs/config.json
@@ -0,0 +1,149 @@
+{
+    "train_config": {
+        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/",
+        "epochs": 10000000,
+        "optim_algo": "RAdam",
+        "learning_rate": 0.0001,
+        "weight_decay": 1e-6,
+        "sigma": 1.0,
+        "iters_per_checkpoint": 2500,
+        "batch_size": 16,
+        "seed": null,
+        "checkpoint_path": "",
+        "ignore_layers": [],
+        "ignore_layers_warmstart": [],
+        "steps_per_sample": 500,
+        "finetune_layers": [],
+        "include_layers": [],
+        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
+        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
+        "log_attribute_samples": false,
+        "log_decoder_samples": true,
+        "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/model_477500.pt",
+        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
+        "use_amp": true,
+        "grad_clip_val": 1.0,
+        "loss_weights": {
+            "blank_logprob": -1,
+            "ctc_loss_weight": 0.1,
+            "binarization_loss_weight": 1.0,
+            "dur_loss_weight": 1.0,
+            "f0_loss_weight": 1.0,
+            "energy_loss_weight": 1.0,
+            "vpred_loss_weight": 1.0
+        },
+        "binarization_start_iter": 24000,
+        "kl_loss_start_iter": 36000,
+        "unfreeze_modules": "all"
+    },
+    "data_config": {
+        "training_files": {
+            "vctk": {
+                "basedir": "/home/zach/code/uberduck-ml-dev/data",
+                "audiodir": "",
+                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt",
+                "lmdbpath": ""
+            }
+        },
+        "validation_files": {},
+        "dur_min": 0.1,
+        "dur_max": 10.2,
+        "sampling_rate": 22050,
+        "filter_length": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mel_channels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000.0,
+        "f0_min": 80.0,
+        "f0_max": 640.0,
+        "max_wav_value": 32768.0,
+        "use_f0": true,
+        "use_log_f0": 0,
+        "use_energy_avg": true,
+        "use_scaled_energy": true,
+        "symbol_set": "radtts",
+        "cleaner_names": ["radtts_cleaners"],
+        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
+        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
+        "p_phoneme": 1.0,
+        "handle_phoneme": "word",
+        "handle_phoneme_ambiguous": "ignore",
+        "include_speakers": null,
+        "n_frames": -1,
+        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
+        "lmdb_cache_path": "", 
+        "use_attn_prior_masking": true,
+        "prepend_space_to_text": true,
+        "append_space_to_text": true,
+        "add_bos_eos_to_text": false,
+        "betabinom_scaling_factor": 1.0,
+        "distance_tx_unvoiced": false,
+        "is_zero_shot": false,
+        "mel_noise_scale": 0.0
+    },
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321"
+    },
+    "model_config": {
+        "n_speakers": 109,
+        "n_speaker_dim": 16,
+        "n_text": 185,
+        "n_text_dim": 512,
+        "n_flows": 8,
+        "n_conv_layers_per_step": 4,
+        "n_mel_channels": 80,
+        "n_hidden": 1024,
+        "mel_encoder_n_hidden": 512,
+        "dummy_speaker_embedding": false,
+        "n_early_size": 2,
+        "n_early_every": 2,
+        "n_group_size": 2,
+        "affine_model": "wavenet",
+        "include_modules": "decatn",
+        "scaling_fn": "tanh",
+        "matrix_decomposition": "LUS",
+        "learn_alignments": true,
+        "use_speaker_emb_for_alignment": false,
+        "attn_straight_through_estimator": true,
+        "use_context_lstm": true,
+        "context_lstm_norm": "spectral",
+        "context_lstm_w_f0_and_energy": true,
+        "text_encoder_lstm_norm": "spectral",
+        "n_f0_dims": 1,
+        "n_energy_avg_dims": 1,
+        "use_first_order_features": false,
+        "unvoiced_bias_activation": "relu",
+        "decoder_use_partial_padding": true,
+        "decoder_use_unvoiced_bias": true,
+        "ap_pred_log_f0": true,
+        "ap_use_unvoiced_bias": true,
+        "ap_use_voiced_embeddings": true,
+        "dur_model_config": null,
+        "f0_model_config": null,
+        "energy_model_config": null,
+        "v_model_config": {
+            "name": "dap",
+            "hparams": {
+                "n_speaker_dim": 16,
+                "take_log_of_input": false,
+                "bottleneck_hparams": {
+                    "in_dim": 512,
+                    "reduction_factor": 16,
+                    "norm": "weightnorm",
+                    "non_linearity": "relu"
+                },
+                "arch_hparams": {
+                    "out_dim": 1,
+                    "n_layers": 2,
+                    "n_channels": 256,
+                    "kernel_size": 3,
+                    "p_dropout": 0.5,
+                    "lstm_type": "",
+                    "use_linear": 1
+                }
+            }
+        }
+    }
+}

From 6c212e90a32365c6fb2fb7cf77c31750649d8ee3 Mon Sep 17 00:00:00 2001
From: zach wener <z@uberduck.ai>
Date: Tue, 15 Aug 2023 18:46:06 -0700
Subject: [PATCH 7/9] Define symbol lookup tables after get_symbols and add radtts symbol set

---
 uberduck_ml_dev/text/symbols.py | 100 ++++++++++++++++----------------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/uberduck_ml_dev/text/symbols.py b/uberduck_ml_dev/text/symbols.py
index bd8719dc..eb507350 100644
--- a/uberduck_ml_dev/text/symbols.py
+++ b/uberduck_ml_dev/text/symbols.py
@@ -199,56 +199,6 @@
 NVIDIA_TACO2_SYMBOLS = "nvidia_taco2"
 GRAD_TTS_SYMBOLS = "gradtts"
 
-SYMBOL_SETS = {
-    DEFAULT_SYMBOLS: symbols,
-    IPA_SYMBOLS: symbols_with_ipa,
-    NVIDIA_TACO2_SYMBOLS: symbols_nvidia_taco2,
-    GRAD_TTS_SYMBOLS: grad_tts_symbols,
-    PORTUGUESE_SYMBOLS: symbols_portuguese,
-    POLISH_SYMBOLS: symbols_polish,
-    DUTCH_SYMBOLS: symbols_dutch,
-    SPANISH_SYMBOLS: symbols_spanish,
-    NORWEGIAN_SYMBOLS: symbols_norwegian,
-    TURKISH_SYMBOLS: symbols_turkish,
-    RUSSIAN_SYMBOLS: symbols_russian,
-    UKRAINIAN_SYMBOLS: symbols_ukrainian,
-}
-
-
-import re
-
-symbol_to_id = {
-    DEFAULT_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[DEFAULT_SYMBOLS])},
-    IPA_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[IPA_SYMBOLS])},
-    NVIDIA_TACO2_SYMBOLS: {
-        s: i for i, s in enumerate(SYMBOL_SETS[NVIDIA_TACO2_SYMBOLS])
-    },
-    GRAD_TTS_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[GRAD_TTS_SYMBOLS])},
-    PORTUGUESE_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[PORTUGUESE_SYMBOLS])},
-    POLISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[POLISH_SYMBOLS])},
-    DUTCH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[DUTCH_SYMBOLS])},
-    SPANISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[SPANISH_SYMBOLS])},
-    NORWEGIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[NORWEGIAN_SYMBOLS])},
-    TURKISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[TURKISH_SYMBOLS])},
-    RUSSIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[RUSSIAN_SYMBOLS])},
-    UKRAINIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[UKRAINIAN_SYMBOLS])},
-}
-id_to_symbol = {
-    DEFAULT_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[DEFAULT_SYMBOLS])},
-    IPA_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[IPA_SYMBOLS])},
-    NVIDIA_TACO2_SYMBOLS: {
-        i: s for i, s in enumerate(SYMBOL_SETS[NVIDIA_TACO2_SYMBOLS])
-    },
-    GRAD_TTS_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[GRAD_TTS_SYMBOLS])},
-    PORTUGUESE_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[PORTUGUESE_SYMBOLS])},
-    POLISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[POLISH_SYMBOLS])},
-    DUTCH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[DUTCH_SYMBOLS])},
-    SPANISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[SPANISH_SYMBOLS])},
-    NORWEGIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[NORWEGIAN_SYMBOLS])},
-    TURKISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[TURKISH_SYMBOLS])},
-    RUSSIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[RUSSIAN_SYMBOLS])},
-    UKRAINIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[UKRAINIAN_SYMBOLS])},
-}
 
 curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
 words_re = re.compile(
@@ -413,3 +363,53 @@ def get_symbols(symbol_set):
         raise Exception("{} symbol set does not exist".format(symbol_set))
 
     return symbols
+
+
+SYMBOL_SETS = {
+    DEFAULT_SYMBOLS: symbols,
+    IPA_SYMBOLS: symbols_with_ipa,
+    NVIDIA_TACO2_SYMBOLS: symbols_nvidia_taco2,
+    GRAD_TTS_SYMBOLS: grad_tts_symbols,
+    PORTUGUESE_SYMBOLS: symbols_portuguese,
+    POLISH_SYMBOLS: symbols_polish,
+    DUTCH_SYMBOLS: symbols_dutch,
+    SPANISH_SYMBOLS: symbols_spanish,
+    NORWEGIAN_SYMBOLS: symbols_norwegian,
+    TURKISH_SYMBOLS: symbols_turkish,
+    RUSSIAN_SYMBOLS: symbols_russian,
+    UKRAINIAN_SYMBOLS: symbols_ukrainian,
+    "radtts": get_symbols("radtts"),
+}
+
+symbol_to_id = {
+    DEFAULT_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[DEFAULT_SYMBOLS])},
+    IPA_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[IPA_SYMBOLS])},
+    NVIDIA_TACO2_SYMBOLS: {
+        s: i for i, s in enumerate(SYMBOL_SETS[NVIDIA_TACO2_SYMBOLS])
+    },
+    GRAD_TTS_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[GRAD_TTS_SYMBOLS])},
+    PORTUGUESE_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[PORTUGUESE_SYMBOLS])},
+    POLISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[POLISH_SYMBOLS])},
+    DUTCH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[DUTCH_SYMBOLS])},
+    SPANISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[SPANISH_SYMBOLS])},
+    NORWEGIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[NORWEGIAN_SYMBOLS])},
+    TURKISH_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[TURKISH_SYMBOLS])},
+    RUSSIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[RUSSIAN_SYMBOLS])},
+    UKRAINIAN_SYMBOLS: {s: i for i, s in enumerate(SYMBOL_SETS[UKRAINIAN_SYMBOLS])},
+}
+id_to_symbol = {
+    DEFAULT_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[DEFAULT_SYMBOLS])},
+    IPA_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[IPA_SYMBOLS])},
+    NVIDIA_TACO2_SYMBOLS: {
+        i: s for i, s in enumerate(SYMBOL_SETS[NVIDIA_TACO2_SYMBOLS])
+    },
+    GRAD_TTS_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[GRAD_TTS_SYMBOLS])},
+    PORTUGUESE_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[PORTUGUESE_SYMBOLS])},
+    POLISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[POLISH_SYMBOLS])},
+    DUTCH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[DUTCH_SYMBOLS])},
+    SPANISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[SPANISH_SYMBOLS])},
+    NORWEGIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[NORWEGIAN_SYMBOLS])},
+    TURKISH_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[TURKISH_SYMBOLS])},
+    RUSSIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[RUSSIAN_SYMBOLS])},
+    UKRAINIAN_SYMBOLS: {i: s for i, s in enumerate(SYMBOL_SETS[UKRAINIAN_SYMBOLS])},
+}

From d3516ca73926d997510fdbdb23930e40c578b1ba Mon Sep 17 00:00:00 2001
From: "sjkoelle@gmail.com" <sjkoelle@gmail.com>
Date: Thu, 7 Sep 2023 14:14:54 -0700
Subject: [PATCH 8/9] Remove machine-specific training configs from configs/

---
 configs/config-ljspeech.json           | 149 -------------------------
 configs/config-zeroshot-warmstart.json | 149 -------------------------
 configs/config-zeroshot.json           | 149 -------------------------
 configs/config.json                    | 149 -------------------------
 4 files changed, 596 deletions(-)
 delete mode 100644 configs/config-ljspeech.json
 delete mode 100644 configs/config-zeroshot-warmstart.json
 delete mode 100644 configs/config-zeroshot.json
 delete mode 100644 configs/config.json

diff --git a/configs/config-ljspeech.json b/configs/config-ljspeech.json
deleted file mode 100644
index 4cb34a8f..00000000
--- a/configs/config-ljspeech.json
+++ /dev/null
@@ -1,149 +0,0 @@
-{
-    "train_config": {
-        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/",
-        "epochs": 10000000,
-        "optim_algo": "RAdam",
-        "learning_rate": 0.0001,
-        "weight_decay": 1e-6,
-        "sigma": 1.0,
-        "iters_per_checkpoint": 2500,
-        "batch_size": 16,
-        "seed": null,
-        "checkpoint_path": "",
-        "ignore_layers": [],
-        "ignore_layers_warmstart": [],
-        "steps_per_sample": 500,
-        "finetune_layers": [],
-        "include_layers": [],
-        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
-        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
-        "log_attribute_samples": false,
-        "log_decoder_samples": true,
-        "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/model_2500.pt",
-        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
-        "use_amp": true,
-        "grad_clip_val": 1.0,
-        "loss_weights": {
-            "blank_logprob": -1,
-            "ctc_loss_weight": 0.1,
-            "binarization_loss_weight": 1.0,
-            "dur_loss_weight": 1.0,
-            "f0_loss_weight": 1.0,
-            "energy_loss_weight": 1.0,
-            "vpred_loss_weight": 1.0
-        },
-        "binarization_start_iter": 6000,
-        "kl_loss_start_iter": 8000,
-        "unfreeze_modules": "all"
-    },
-    "data_config": {
-        "training_files": {
-            "lj": {
-                "basedir": "/home/zach/code/uberduck-ml-dev/data",
-                "audiodir": "",
-                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt",
-                "lmdbpath": ""
-            }
-        },
-        "validation_files": {},
-        "dur_min": 0.1,
-        "dur_max": 10.2,
-        "sampling_rate": 22050,
-        "filter_length": 1024,
-        "hop_length": 256,
-        "win_length": 1024,
-        "n_mel_channels": 80,
-        "mel_fmin": 0.0,
-        "mel_fmax": 8000.0,
-        "f0_min": 80.0,
-        "f0_max": 640.0,
-        "max_wav_value": 32768.0,
-        "use_f0": true,
-        "use_log_f0": 0,
-        "use_energy_avg": true,
-        "use_scaled_energy": true,
-        "symbol_set": "radtts",
-        "cleaner_names": ["radtts_cleaners"],
-        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
-        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
-        "p_phoneme": 1.0,
-        "handle_phoneme": "word",
-        "handle_phoneme_ambiguous": "ignore",
-        "include_speakers": null,
-        "n_frames": -1,
-        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
-        "lmdb_cache_path": "", 
-        "use_attn_prior_masking": true,
-        "prepend_space_to_text": true,
-        "append_space_to_text": true,
-        "add_bos_eos_to_text": false,
-        "betabinom_scaling_factor": 1.0,
-        "distance_tx_unvoiced": false,
-        "is_zero_shot": false,
-        "mel_noise_scale": 0.0
-    },
-    "dist_config": {
-        "dist_backend": "nccl",
-        "dist_url": "tcp://localhost:54321"
-    },
-    "model_config": {
-        "n_speakers": 1,
-        "n_speaker_dim": 16,
-        "n_text": 185,
-        "n_text_dim": 512,
-        "n_flows": 8,
-        "n_conv_layers_per_step": 4,
-        "n_mel_channels": 80,
-        "n_hidden": 1024,
-        "mel_encoder_n_hidden": 512,
-        "dummy_speaker_embedding": false,
-        "n_early_size": 2,
-        "n_early_every": 2,
-        "n_group_size": 2,
-        "affine_model": "wavenet",
-        "include_modules": "decatn",
-        "scaling_fn": "tanh",
-        "matrix_decomposition": "LUS",
-        "learn_alignments": true,
-        "use_speaker_emb_for_alignment": false,
-        "attn_straight_through_estimator": true,
-        "use_context_lstm": true,
-        "context_lstm_norm": "spectral",
-        "context_lstm_w_f0_and_energy": true,
-        "text_encoder_lstm_norm": "spectral",
-        "n_f0_dims": 1,
-        "n_energy_avg_dims": 1,
-        "use_first_order_features": false,
-        "unvoiced_bias_activation": "relu",
-        "decoder_use_partial_padding": true,
-        "decoder_use_unvoiced_bias": true,
-        "ap_pred_log_f0": true,
-        "ap_use_unvoiced_bias": true,
-        "ap_use_voiced_embeddings": true,
-        "dur_model_config": null,
-        "f0_model_config": null,
-        "energy_model_config": null,
-        "v_model_config": {
-            "name": "dap",
-            "hparams": {
-                "n_speaker_dim": 16,
-                "take_log_of_input": false,
-                "bottleneck_hparams": {
-                    "in_dim": 512,
-                    "reduction_factor": 16,
-                    "norm": "weightnorm",
-                    "non_linearity": "relu"
-                },
-                "arch_hparams": {
-                    "out_dim": 1,
-                    "n_layers": 2,
-                    "n_channels": 256,
-                    "kernel_size": 3,
-                    "p_dropout": 0.5,
-                    "lstm_type": "",
-                    "use_linear": 1
-                }
-            }
-        }
-    }
-}
diff --git a/configs/config-zeroshot-warmstart.json b/configs/config-zeroshot-warmstart.json
deleted file mode 100644
index 26f1a5c7..00000000
--- a/configs/config-zeroshot-warmstart.json
+++ /dev/null
@@ -1,149 +0,0 @@
-{
-    "train_config": {
-        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-13-zeroshot-warmstart/",
-        "epochs": 10000000,
-        "optim_algo": "RAdam",
-        "learning_rate": 0.0001,
-        "weight_decay": 1e-6,
-        "sigma": 1.0,
-        "iters_per_checkpoint": 2500,
-        "batch_size": 16,
-        "seed": null,
-        "checkpoint_path": "",
-        "ignore_layers": [],
-        "ignore_layers_warmstart": [],
-        "steps_per_sample": 500,
-        "finetune_layers": [],
-        "include_layers": [],
-        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
-        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
-        "log_attribute_samples": false,
-        "log_decoder_samples": true,
-        "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/model_40000.pt",
-        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
-        "use_amp": true,
-        "grad_clip_val": 1.0,
-        "loss_weights": {
-            "blank_logprob": -1,
-            "ctc_loss_weight": 0.1,
-            "binarization_loss_weight": 1.0,
-            "dur_loss_weight": 1.0,
-            "f0_loss_weight": 1.0,
-            "energy_loss_weight": 1.0,
-            "vpred_loss_weight": 1.0
-        },
-        "binarization_start_iter": 6000,
-        "kl_loss_start_iter": 8000,
-        "unfreeze_modules": "all"
-    },
-    "data_config": {
-        "training_files": {
-            "vctk": {
-                "basedir": "/home/zach/code/uberduck-ml-dev/data",
-                "audiodir": "",
-                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt",
-                "lmdbpath": ""
-            }
-        },
-        "validation_files": {},
-        "dur_min": 0.1,
-        "dur_max": 10.2,
-        "sampling_rate": 22050,
-        "filter_length": 1024,
-        "hop_length": 256,
-        "win_length": 1024,
-        "n_mel_channels": 80,
-        "mel_fmin": 0.0,
-        "mel_fmax": 8000.0,
-        "f0_min": 80.0,
-        "f0_max": 640.0,
-        "max_wav_value": 32768.0,
-        "use_f0": true,
-        "use_log_f0": 0,
-        "use_energy_avg": true,
-        "use_scaled_energy": true,
-        "symbol_set": "radtts",
-        "cleaner_names": ["radtts_cleaners"],
-        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
-        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
-        "p_phoneme": 1.0,
-        "handle_phoneme": "word",
-        "handle_phoneme_ambiguous": "ignore",
-        "include_speakers": null,
-        "n_frames": -1,
-        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
-        "lmdb_cache_path": "", 
-        "use_attn_prior_masking": true,
-        "prepend_space_to_text": true,
-        "append_space_to_text": true,
-        "add_bos_eos_to_text": false,
-        "betabinom_scaling_factor": 1.0,
-        "distance_tx_unvoiced": false,
-        "is_zero_shot": true,
-        "mel_noise_scale": 0.0
-    },
-    "dist_config": {
-        "dist_backend": "nccl",
-        "dist_url": "tcp://localhost:54321"
-    },
-    "model_config": {
-        "n_speakers": 0,
-        "n_speaker_dim": 512,
-        "n_text": 185,
-        "n_text_dim": 512,
-        "n_flows": 8,
-        "n_conv_layers_per_step": 4,
-        "n_mel_channels": 80,
-        "n_hidden": 1024,
-        "mel_encoder_n_hidden": 512,
-        "dummy_speaker_embedding": false,
-        "n_early_size": 2,
-        "n_early_every": 2,
-        "n_group_size": 2,
-        "affine_model": "wavenet",
-        "include_modules": "decatn",
-        "scaling_fn": "tanh",
-        "matrix_decomposition": "LUS",
-        "learn_alignments": true,
-        "use_speaker_emb_for_alignment": false,
-        "attn_straight_through_estimator": true,
-        "use_context_lstm": true,
-        "context_lstm_norm": "spectral",
-        "context_lstm_w_f0_and_energy": true,
-        "text_encoder_lstm_norm": "spectral",
-        "n_f0_dims": 1,
-        "n_energy_avg_dims": 1,
-        "use_first_order_features": false,
-        "unvoiced_bias_activation": "relu",
-        "decoder_use_partial_padding": true,
-        "decoder_use_unvoiced_bias": true,
-        "ap_pred_log_f0": true,
-        "ap_use_unvoiced_bias": true,
-        "ap_use_voiced_embeddings": true,
-        "dur_model_config": null,
-        "f0_model_config": null,
-        "energy_model_config": null,
-        "v_model_config": {
-            "name": "dap",
-            "hparams": {
-                "n_speaker_dim": 16,
-                "take_log_of_input": false,
-                "bottleneck_hparams": {
-                    "in_dim": 512,
-                    "reduction_factor": 16,
-                    "norm": "weightnorm",
-                    "non_linearity": "relu"
-                },
-                "arch_hparams": {
-                    "out_dim": 1,
-                    "n_layers": 2,
-                    "n_channels": 256,
-                    "kernel_size": 3,
-                    "p_dropout": 0.5,
-                    "lstm_type": "",
-                    "use_linear": 1
-                }
-            }
-        }
-    }
-}
diff --git a/configs/config-zeroshot.json b/configs/config-zeroshot.json
deleted file mode 100644
index d0e29b0e..00000000
--- a/configs/config-zeroshot.json
+++ /dev/null
@@ -1,149 +0,0 @@
-{
-    "train_config": {
-        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/",
-        "epochs": 10000000,
-        "optim_algo": "RAdam",
-        "learning_rate": 0.0001,
-        "weight_decay": 1e-6,
-        "sigma": 1.0,
-        "iters_per_checkpoint": 2500,
-        "batch_size": 16,
-        "seed": null,
-        "checkpoint_path": "",
-        "ignore_layers": [],
-        "ignore_layers_warmstart": [],
-        "steps_per_sample": 500,
-        "finetune_layers": [],
-        "include_layers": [],
-        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
-        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
-        "log_attribute_samples": false,
-        "log_decoder_samples": true,
-        "warmstart_checkpoint_path": "",
-        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
-        "use_amp": true,
-        "grad_clip_val": 1.0,
-        "loss_weights": {
-            "blank_logprob": -1,
-            "ctc_loss_weight": 0.1,
-            "binarization_loss_weight": 1.0,
-            "dur_loss_weight": 1.0,
-            "f0_loss_weight": 1.0,
-            "energy_loss_weight": 1.0,
-            "vpred_loss_weight": 1.0
-        },
-        "binarization_start_iter": 6000,
-        "kl_loss_start_iter": 8000,
-        "unfreeze_modules": "all"
-    },
-    "data_config": {
-        "training_files": {
-            "lj": {
-                "basedir": "/home/zach/code/uberduck-ml-dev/data",
-                "audiodir": "",
-                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt",
-                "lmdbpath": ""
-            }
-        },
-        "validation_files": {},
-        "dur_min": 0.1,
-        "dur_max": 10.2,
-        "sampling_rate": 22050,
-        "filter_length": 1024,
-        "hop_length": 256,
-        "win_length": 1024,
-        "n_mel_channels": 80,
-        "mel_fmin": 0.0,
-        "mel_fmax": 8000.0,
-        "f0_min": 80.0,
-        "f0_max": 640.0,
-        "max_wav_value": 32768.0,
-        "use_f0": true,
-        "use_log_f0": 0,
-        "use_energy_avg": true,
-        "use_scaled_energy": true,
-        "symbol_set": "radtts",
-        "cleaner_names": ["radtts_cleaners"],
-        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
-        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
-        "p_phoneme": 1.0,
-        "handle_phoneme": "word",
-        "handle_phoneme_ambiguous": "ignore",
-        "include_speakers": null,
-        "n_frames": -1,
-        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
-        "lmdb_cache_path": "", 
-        "use_attn_prior_masking": true,
-        "prepend_space_to_text": true,
-        "append_space_to_text": true,
-        "add_bos_eos_to_text": false,
-        "betabinom_scaling_factor": 1.0,
-        "distance_tx_unvoiced": false,
-        "is_zero_shot": true,
-        "mel_noise_scale": 0.0
-    },
-    "dist_config": {
-        "dist_backend": "nccl",
-        "dist_url": "tcp://localhost:54321"
-    },
-    "model_config": {
-        "n_speakers": 0,
-        "n_speaker_dim": 512,
-        "n_text": 185,
-        "n_text_dim": 512,
-        "n_flows": 8,
-        "n_conv_layers_per_step": 4,
-        "n_mel_channels": 80,
-        "n_hidden": 1024,
-        "mel_encoder_n_hidden": 512,
-        "dummy_speaker_embedding": false,
-        "n_early_size": 2,
-        "n_early_every": 2,
-        "n_group_size": 2,
-        "affine_model": "wavenet",
-        "include_modules": "decatn",
-        "scaling_fn": "tanh",
-        "matrix_decomposition": "LUS",
-        "learn_alignments": true,
-        "use_speaker_emb_for_alignment": false,
-        "attn_straight_through_estimator": true,
-        "use_context_lstm": true,
-        "context_lstm_norm": "spectral",
-        "context_lstm_w_f0_and_energy": true,
-        "text_encoder_lstm_norm": "spectral",
-        "n_f0_dims": 1,
-        "n_energy_avg_dims": 1,
-        "use_first_order_features": false,
-        "unvoiced_bias_activation": "relu",
-        "decoder_use_partial_padding": true,
-        "decoder_use_unvoiced_bias": true,
-        "ap_pred_log_f0": true,
-        "ap_use_unvoiced_bias": true,
-        "ap_use_voiced_embeddings": true,
-        "dur_model_config": null,
-        "f0_model_config": null,
-        "energy_model_config": null,
-        "v_model_config": {
-            "name": "dap",
-            "hparams": {
-                "n_speaker_dim": 16,
-                "take_log_of_input": false,
-                "bottleneck_hparams": {
-                    "in_dim": 512,
-                    "reduction_factor": 16,
-                    "norm": "weightnorm",
-                    "non_linearity": "relu"
-                },
-                "arch_hparams": {
-                    "out_dim": 1,
-                    "n_layers": 2,
-                    "n_channels": 256,
-                    "kernel_size": 3,
-                    "p_dropout": 0.5,
-                    "lstm_type": "",
-                    "use_linear": 1
-                }
-            }
-        }
-    }
-}
diff --git a/configs/config.json b/configs/config.json
deleted file mode 100644
index 2bcaa0c5..00000000
--- a/configs/config.json
+++ /dev/null
@@ -1,149 +0,0 @@
-{
-    "train_config": {
-        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/",
-        "epochs": 10000000,
-        "optim_algo": "RAdam",
-        "learning_rate": 0.0001,
-        "weight_decay": 1e-6,
-        "sigma": 1.0,
-        "iters_per_checkpoint": 2500,
-        "batch_size": 16,
-        "seed": null,
-        "checkpoint_path": "",
-        "ignore_layers": [],
-        "ignore_layers_warmstart": [],
-        "steps_per_sample": 500,
-        "finetune_layers": [],
-        "include_layers": [],
-        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
-        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
-        "log_attribute_samples": false,
-        "log_decoder_samples": true,
-        "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/model_477500.pt",
-        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
-        "use_amp": true,
-        "grad_clip_val": 1.0,
-        "loss_weights": {
-            "blank_logprob": -1,
-            "ctc_loss_weight": 0.1,
-            "binarization_loss_weight": 1.0,
-            "dur_loss_weight": 1.0,
-            "f0_loss_weight": 1.0,
-            "energy_loss_weight": 1.0,
-            "vpred_loss_weight": 1.0
-        },
-        "binarization_start_iter": 24000,
-        "kl_loss_start_iter": 36000,
-        "unfreeze_modules": "all"
-    },
-    "data_config": {
-        "training_files": {
-            "vctk": {
-                "basedir": "/home/zach/code/uberduck-ml-dev/data",
-                "audiodir": "",
-                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt",
-                "lmdbpath": ""
-            }
-        },
-        "validation_files": {},
-        "dur_min": 0.1,
-        "dur_max": 10.2,
-        "sampling_rate": 22050,
-        "filter_length": 1024,
-        "hop_length": 256,
-        "win_length": 1024,
-        "n_mel_channels": 80,
-        "mel_fmin": 0.0,
-        "mel_fmax": 8000.0,
-        "f0_min": 80.0,
-        "f0_max": 640.0,
-        "max_wav_value": 32768.0,
-        "use_f0": true,
-        "use_log_f0": 0,
-        "use_energy_avg": true,
-        "use_scaled_energy": true,
-        "symbol_set": "radtts",
-        "cleaner_names": ["radtts_cleaners"],
-        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
-        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
-        "p_phoneme": 1.0,
-        "handle_phoneme": "word",
-        "handle_phoneme_ambiguous": "ignore",
-        "include_speakers": null,
-        "n_frames": -1,
-        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
-        "lmdb_cache_path": "", 
-        "use_attn_prior_masking": true,
-        "prepend_space_to_text": true,
-        "append_space_to_text": true,
-        "add_bos_eos_to_text": false,
-        "betabinom_scaling_factor": 1.0,
-        "distance_tx_unvoiced": false,
-        "is_zero_shot": false,
-        "mel_noise_scale": 0.0
-    },
-    "dist_config": {
-        "dist_backend": "nccl",
-        "dist_url": "tcp://localhost:54321"
-    },
-    "model_config": {
-        "n_speakers": 109,
-        "n_speaker_dim": 16,
-        "n_text": 185,
-        "n_text_dim": 512,
-        "n_flows": 8,
-        "n_conv_layers_per_step": 4,
-        "n_mel_channels": 80,
-        "n_hidden": 1024,
-        "mel_encoder_n_hidden": 512,
-        "dummy_speaker_embedding": false,
-        "n_early_size": 2,
-        "n_early_every": 2,
-        "n_group_size": 2,
-        "affine_model": "wavenet",
-        "include_modules": "decatn",
-        "scaling_fn": "tanh",
-        "matrix_decomposition": "LUS",
-        "learn_alignments": true,
-        "use_speaker_emb_for_alignment": false,
-        "attn_straight_through_estimator": true,
-        "use_context_lstm": true,
-        "context_lstm_norm": "spectral",
-        "context_lstm_w_f0_and_energy": true,
-        "text_encoder_lstm_norm": "spectral",
-        "n_f0_dims": 1,
-        "n_energy_avg_dims": 1,
-        "use_first_order_features": false,
-        "unvoiced_bias_activation": "relu",
-        "decoder_use_partial_padding": true,
-        "decoder_use_unvoiced_bias": true,
-        "ap_pred_log_f0": true,
-        "ap_use_unvoiced_bias": true,
-        "ap_use_voiced_embeddings": true,
-        "dur_model_config": null,
-        "f0_model_config": null,
-        "energy_model_config": null,
-        "v_model_config": {
-            "name": "dap",
-            "hparams": {
-                "n_speaker_dim": 16,
-                "take_log_of_input": false,
-                "bottleneck_hparams": {
-                    "in_dim": 512,
-                    "reduction_factor": 16,
-                    "norm": "weightnorm",
-                    "non_linearity": "relu"
-                },
-                "arch_hparams": {
-                    "out_dim": 1,
-                    "n_layers": 2,
-                    "n_channels": 256,
-                    "kernel_size": 3,
-                    "p_dropout": 0.5,
-                    "lstm_type": "",
-                    "use_linear": 1
-                }
-            }
-        }
-    }
-}

From d19fee07c6d3d6fd76c04a2d0701ada50e4f7922 Mon Sep 17 00:00:00 2001
From: "sjkoelle@gmail.com" <sjkoelle@gmail.com>
Date: Thu, 7 Sep 2023 14:19:38 -0700
Subject: [PATCH 9/9] save

---
 configs/config-ljspeech.json           | 149 +++++++++++++++++++++++++
 configs/config-zeroshot-warmstart.json | 149 +++++++++++++++++++++++++
 configs/config-zeroshot.json           | 149 +++++++++++++++++++++++++
 configs/config.json                    | 149 +++++++++++++++++++++++++
 4 files changed, 596 insertions(+)
 create mode 100644 configs/config-ljspeech.json
 create mode 100644 configs/config-zeroshot-warmstart.json
 create mode 100644 configs/config-zeroshot.json
 create mode 100644 configs/config.json

diff --git a/configs/config-ljspeech.json b/configs/config-ljspeech.json
new file mode 100644
index 00000000..4cb34a8f
--- /dev/null
+++ b/configs/config-ljspeech.json
@@ -0,0 +1,149 @@
+{
+    "train_config": {
+        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/",
+        "epochs": 10000000,
+        "optim_algo": "RAdam",
+        "learning_rate": 0.0001,
+        "weight_decay": 1e-6,
+        "sigma": 1.0,
+        "iters_per_checkpoint": 2500,
+        "batch_size": 16,
+        "seed": null,
+        "checkpoint_path": "",
+        "ignore_layers": [],
+        "ignore_layers_warmstart": [],
+        "steps_per_sample": 500,
+        "finetune_layers": [],
+        "include_layers": [],
+        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
+        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
+        "log_attribute_samples": false,
+        "log_decoder_samples": true,
+        "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-ljspeech/model_2500.pt",
+        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
+        "use_amp": true,
+        "grad_clip_val": 1.0,
+        "loss_weights": {
+            "blank_logprob": -1,
+            "ctc_loss_weight": 0.1,
+            "binarization_loss_weight": 1.0,
+            "dur_loss_weight": 1.0,
+            "f0_loss_weight": 1.0,
+            "energy_loss_weight": 1.0,
+            "vpred_loss_weight": 1.0
+        },
+        "binarization_start_iter": 6000,
+        "kl_loss_start_iter": 8000,
+        "unfreeze_modules": "all"
+    },
+    "data_config": {
+        "training_files": {
+            "lj": {
+                "basedir": "/home/zach/code/uberduck-ml-dev/data",
+                "audiodir": "",
+                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt",
+                "lmdbpath": ""
+            }
+        },
+        "validation_files": {},
+        "dur_min": 0.1,
+        "dur_max": 10.2,
+        "sampling_rate": 22050,
+        "filter_length": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mel_channels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000.0,
+        "f0_min": 80.0,
+        "f0_max": 640.0,
+        "max_wav_value": 32768.0,
+        "use_f0": true,
+        "use_log_f0": 0,
+        "use_energy_avg": true,
+        "use_scaled_energy": true,
+        "symbol_set": "radtts",
+        "cleaner_names": ["radtts_cleaners"],
+        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
+        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
+        "p_phoneme": 1.0,
+        "handle_phoneme": "word",
+        "handle_phoneme_ambiguous": "ignore",
+        "include_speakers": null,
+        "n_frames": -1,
+        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
+        "lmdb_cache_path": "", 
+        "use_attn_prior_masking": true,
+        "prepend_space_to_text": true,
+        "append_space_to_text": true,
+        "add_bos_eos_to_text": false,
+        "betabinom_scaling_factor": 1.0,
+        "distance_tx_unvoiced": false,
+        "is_zero_shot": false,
+        "mel_noise_scale": 0.0
+    },
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321"
+    },
+    "model_config": {
+        "n_speakers": 1,
+        "n_speaker_dim": 16,
+        "n_text": 185,
+        "n_text_dim": 512,
+        "n_flows": 8,
+        "n_conv_layers_per_step": 4,
+        "n_mel_channels": 80,
+        "n_hidden": 1024,
+        "mel_encoder_n_hidden": 512,
+        "dummy_speaker_embedding": false,
+        "n_early_size": 2,
+        "n_early_every": 2,
+        "n_group_size": 2,
+        "affine_model": "wavenet",
+        "include_modules": "decatn",
+        "scaling_fn": "tanh",
+        "matrix_decomposition": "LUS",
+        "learn_alignments": true,
+        "use_speaker_emb_for_alignment": false,
+        "attn_straight_through_estimator": true,
+        "use_context_lstm": true,
+        "context_lstm_norm": "spectral",
+        "context_lstm_w_f0_and_energy": true,
+        "text_encoder_lstm_norm": "spectral",
+        "n_f0_dims": 1,
+        "n_energy_avg_dims": 1,
+        "use_first_order_features": false,
+        "unvoiced_bias_activation": "relu",
+        "decoder_use_partial_padding": true,
+        "decoder_use_unvoiced_bias": true,
+        "ap_pred_log_f0": true,
+        "ap_use_unvoiced_bias": true,
+        "ap_use_voiced_embeddings": true,
+        "dur_model_config": null,
+        "f0_model_config": null,
+        "energy_model_config": null,
+        "v_model_config": {
+            "name": "dap",
+            "hparams": {
+                "n_speaker_dim": 16,
+                "take_log_of_input": false,
+                "bottleneck_hparams": {
+                    "in_dim": 512,
+                    "reduction_factor": 16,
+                    "norm": "weightnorm",
+                    "non_linearity": "relu"
+                },
+                "arch_hparams": {
+                    "out_dim": 1,
+                    "n_layers": 2,
+                    "n_channels": 256,
+                    "kernel_size": 3,
+                    "p_dropout": 0.5,
+                    "lstm_type": "",
+                    "use_linear": 1
+                }
+            }
+        }
+    }
+}
diff --git a/configs/config-zeroshot-warmstart.json b/configs/config-zeroshot-warmstart.json
new file mode 100644
index 00000000..26f1a5c7
--- /dev/null
+++ b/configs/config-zeroshot-warmstart.json
@@ -0,0 +1,149 @@
+{
+    "train_config": {
+        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-13-zeroshot-warmstart/",
+        "epochs": 10000000,
+        "optim_algo": "RAdam",
+        "learning_rate": 0.0001,
+        "weight_decay": 1e-6,
+        "sigma": 1.0,
+        "iters_per_checkpoint": 2500,
+        "batch_size": 16,
+        "seed": null,
+        "checkpoint_path": "",
+        "ignore_layers": [],
+        "ignore_layers_warmstart": [],
+        "steps_per_sample": 500,
+        "finetune_layers": [],
+        "include_layers": [],
+        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
+        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
+        "log_attribute_samples": false,
+        "log_decoder_samples": true,
+        "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/model_40000.pt",
+        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
+        "use_amp": true,
+        "grad_clip_val": 1.0,
+        "loss_weights": {
+            "blank_logprob": -1,
+            "ctc_loss_weight": 0.1,
+            "binarization_loss_weight": 1.0,
+            "dur_loss_weight": 1.0,
+            "f0_loss_weight": 1.0,
+            "energy_loss_weight": 1.0,
+            "vpred_loss_weight": 1.0
+        },
+        "binarization_start_iter": 6000,
+        "kl_loss_start_iter": 8000,
+        "unfreeze_modules": "all"
+    },
+    "data_config": {
+        "training_files": {
+            "vctk": {
+                "basedir": "/home/zach/code/uberduck-ml-dev/data",
+                "audiodir": "",
+                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt",
+                "lmdbpath": ""
+            }
+        },
+        "validation_files": {},
+        "dur_min": 0.1,
+        "dur_max": 10.2,
+        "sampling_rate": 22050,
+        "filter_length": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mel_channels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000.0,
+        "f0_min": 80.0,
+        "f0_max": 640.0,
+        "max_wav_value": 32768.0,
+        "use_f0": true,
+        "use_log_f0": 0,
+        "use_energy_avg": true,
+        "use_scaled_energy": true,
+        "symbol_set": "radtts",
+        "cleaner_names": ["radtts_cleaners"],
+        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
+        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
+        "p_phoneme": 1.0,
+        "handle_phoneme": "word",
+        "handle_phoneme_ambiguous": "ignore",
+        "include_speakers": null,
+        "n_frames": -1,
+        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
+        "lmdb_cache_path": "", 
+        "use_attn_prior_masking": true,
+        "prepend_space_to_text": true,
+        "append_space_to_text": true,
+        "add_bos_eos_to_text": false,
+        "betabinom_scaling_factor": 1.0,
+        "distance_tx_unvoiced": false,
+        "is_zero_shot": true,
+        "mel_noise_scale": 0.0
+    },
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321"
+    },
+    "model_config": {
+        "n_speakers": 0,
+        "n_speaker_dim": 512,
+        "n_text": 185,
+        "n_text_dim": 512,
+        "n_flows": 8,
+        "n_conv_layers_per_step": 4,
+        "n_mel_channels": 80,
+        "n_hidden": 1024,
+        "mel_encoder_n_hidden": 512,
+        "dummy_speaker_embedding": false,
+        "n_early_size": 2,
+        "n_early_every": 2,
+        "n_group_size": 2,
+        "affine_model": "wavenet",
+        "include_modules": "decatn",
+        "scaling_fn": "tanh",
+        "matrix_decomposition": "LUS",
+        "learn_alignments": true,
+        "use_speaker_emb_for_alignment": false,
+        "attn_straight_through_estimator": true,
+        "use_context_lstm": true,
+        "context_lstm_norm": "spectral",
+        "context_lstm_w_f0_and_energy": true,
+        "text_encoder_lstm_norm": "spectral",
+        "n_f0_dims": 1,
+        "n_energy_avg_dims": 1,
+        "use_first_order_features": false,
+        "unvoiced_bias_activation": "relu",
+        "decoder_use_partial_padding": true,
+        "decoder_use_unvoiced_bias": true,
+        "ap_pred_log_f0": true,
+        "ap_use_unvoiced_bias": true,
+        "ap_use_voiced_embeddings": true,
+        "dur_model_config": null,
+        "f0_model_config": null,
+        "energy_model_config": null,
+        "v_model_config": {
+            "name": "dap",
+            "hparams": {
+                "n_speaker_dim": 16,
+                "take_log_of_input": false,
+                "bottleneck_hparams": {
+                    "in_dim": 512,
+                    "reduction_factor": 16,
+                    "norm": "weightnorm",
+                    "non_linearity": "relu"
+                },
+                "arch_hparams": {
+                    "out_dim": 1,
+                    "n_layers": 2,
+                    "n_channels": 256,
+                    "kernel_size": 3,
+                    "p_dropout": 0.5,
+                    "lstm_type": "",
+                    "use_linear": 1
+                }
+            }
+        }
+    }
+}
diff --git a/configs/config-zeroshot.json b/configs/config-zeroshot.json
new file mode 100644
index 00000000..d0e29b0e
--- /dev/null
+++ b/configs/config-zeroshot.json
@@ -0,0 +1,149 @@
+{
+    "train_config": {
+        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-12-zeroshot/",
+        "epochs": 10000000,
+        "optim_algo": "RAdam",
+        "learning_rate": 0.0001,
+        "weight_decay": 1e-6,
+        "sigma": 1.0,
+        "iters_per_checkpoint": 2500,
+        "batch_size": 16,
+        "seed": null,
+        "checkpoint_path": "",
+        "ignore_layers": [],
+        "ignore_layers_warmstart": [],
+        "steps_per_sample": 500,
+        "finetune_layers": [],
+        "include_layers": [],
+        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
+        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
+        "log_attribute_samples": false,
+        "log_decoder_samples": true,
+        "warmstart_checkpoint_path": "",
+        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
+        "use_amp": true,
+        "grad_clip_val": 1.0,
+        "loss_weights": {
+            "blank_logprob": -1,
+            "ctc_loss_weight": 0.1,
+            "binarization_loss_weight": 1.0,
+            "dur_loss_weight": 1.0,
+            "f0_loss_weight": 1.0,
+            "energy_loss_weight": 1.0,
+            "vpred_loss_weight": 1.0
+        },
+        "binarization_start_iter": 6000,
+        "kl_loss_start_iter": 8000,
+        "unfreeze_modules": "all"
+    },
+    "data_config": {
+        "training_files": {
+            "lj": {
+                "basedir": "/home/zach/code/uberduck-ml-dev/data",
+                "audiodir": "",
+                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/lj-filelist.txt",
+                "lmdbpath": ""
+            }
+        },
+        "validation_files": {},
+        "dur_min": 0.1,
+        "dur_max": 10.2,
+        "sampling_rate": 22050,
+        "filter_length": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mel_channels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000.0,
+        "f0_min": 80.0,
+        "f0_max": 640.0,
+        "max_wav_value": 32768.0,
+        "use_f0": true,
+        "use_log_f0": 0,
+        "use_energy_avg": true,
+        "use_scaled_energy": true,
+        "symbol_set": "radtts",
+        "cleaner_names": ["radtts_cleaners"],
+        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
+        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
+        "p_phoneme": 1.0,
+        "handle_phoneme": "word",
+        "handle_phoneme_ambiguous": "ignore",
+        "include_speakers": null,
+        "n_frames": -1,
+        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
+        "lmdb_cache_path": "", 
+        "use_attn_prior_masking": true,
+        "prepend_space_to_text": true,
+        "append_space_to_text": true,
+        "add_bos_eos_to_text": false,
+        "betabinom_scaling_factor": 1.0,
+        "distance_tx_unvoiced": false,
+        "is_zero_shot": true,
+        "mel_noise_scale": 0.0
+    },
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321"
+    },
+    "model_config": {
+        "n_speakers": 0,
+        "n_speaker_dim": 512,
+        "n_text": 185,
+        "n_text_dim": 512,
+        "n_flows": 8,
+        "n_conv_layers_per_step": 4,
+        "n_mel_channels": 80,
+        "n_hidden": 1024,
+        "mel_encoder_n_hidden": 512,
+        "dummy_speaker_embedding": false,
+        "n_early_size": 2,
+        "n_early_every": 2,
+        "n_group_size": 2,
+        "affine_model": "wavenet",
+        "include_modules": "decatn",
+        "scaling_fn": "tanh",
+        "matrix_decomposition": "LUS",
+        "learn_alignments": true,
+        "use_speaker_emb_for_alignment": false,
+        "attn_straight_through_estimator": true,
+        "use_context_lstm": true,
+        "context_lstm_norm": "spectral",
+        "context_lstm_w_f0_and_energy": true,
+        "text_encoder_lstm_norm": "spectral",
+        "n_f0_dims": 1,
+        "n_energy_avg_dims": 1,
+        "use_first_order_features": false,
+        "unvoiced_bias_activation": "relu",
+        "decoder_use_partial_padding": true,
+        "decoder_use_unvoiced_bias": true,
+        "ap_pred_log_f0": true,
+        "ap_use_unvoiced_bias": true,
+        "ap_use_voiced_embeddings": true,
+        "dur_model_config": null,
+        "f0_model_config": null,
+        "energy_model_config": null,
+        "v_model_config": {
+            "name": "dap",
+            "hparams": {
+                "n_speaker_dim": 16,
+                "take_log_of_input": false,
+                "bottleneck_hparams": {
+                    "in_dim": 512,
+                    "reduction_factor": 16,
+                    "norm": "weightnorm",
+                    "non_linearity": "relu"
+                },
+                "arch_hparams": {
+                    "out_dim": 1,
+                    "n_layers": 2,
+                    "n_channels": 256,
+                    "kernel_size": 3,
+                    "p_dropout": 0.5,
+                    "lstm_type": "",
+                    "use_linear": 1
+                }
+            }
+        }
+    }
+}
diff --git a/configs/config.json b/configs/config.json
new file mode 100644
index 00000000..2bcaa0c5
--- /dev/null
+++ b/configs/config.json
@@ -0,0 +1,149 @@
+{
+    "train_config": {
+        "output_directory": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/",
+        "epochs": 10000000,
+        "optim_algo": "RAdam",
+        "learning_rate": 0.0001,
+        "weight_decay": 1e-6,
+        "sigma": 1.0,
+        "iters_per_checkpoint": 2500,
+        "batch_size": 16,
+        "seed": null,
+        "checkpoint_path": "",
+        "ignore_layers": [],
+        "ignore_layers_warmstart": [],
+        "steps_per_sample": 500,
+        "finetune_layers": [],
+        "include_layers": [],
+        "vocoder_config_path": "/home/zach/code/uberduck-ml-dev/models/hifi_gan_config.json",
+        "vocoder_checkpoint_path": "/home/zach/code/uberduck-ml-dev/models/g_hifi_crust",
+        "log_attribute_samples": false,
+        "log_decoder_samples": true,
+        "warmstart_checkpoint_path": "/home/zach/code/uberduck-ml-dev/outputs-2023-07-01/model_477500.pt",
+        "______warmstartasdf": "/home/zach/code/uberduck-ml-dev/outputs/model_115000.pt",
+        "use_amp": true,
+        "grad_clip_val": 1.0,
+        "loss_weights": {
+            "blank_logprob": -1,
+            "ctc_loss_weight": 0.1,
+            "binarization_loss_weight": 1.0,
+            "dur_loss_weight": 1.0,
+            "f0_loss_weight": 1.0,
+            "energy_loss_weight": 1.0,
+            "vpred_loss_weight": 1.0
+        },
+        "binarization_start_iter": 24000,
+        "kl_loss_start_iter": 36000,
+        "unfreeze_modules": "all"
+    },
+    "data_config": {
+        "training_files": {
+            "vctk": {
+                "basedir": "/home/zach/code/uberduck-ml-dev/data",
+                "audiodir": "",
+                "filelist": "/home/zach/code/uberduck-ml-dev/filelists/vctk-radtts.txt",
+                "lmdbpath": ""
+            }
+        },
+        "validation_files": {},
+        "dur_min": 0.1,
+        "dur_max": 10.2,
+        "sampling_rate": 22050,
+        "filter_length": 1024,
+        "hop_length": 256,
+        "win_length": 1024,
+        "n_mel_channels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": 8000.0,
+        "f0_min": 80.0,
+        "f0_max": 640.0,
+        "max_wav_value": 32768.0,
+        "use_f0": true,
+        "use_log_f0": 0,
+        "use_energy_avg": true,
+        "use_scaled_energy": true,
+        "symbol_set": "radtts",
+        "cleaner_names": ["radtts_cleaners"],
+        "heteronyms_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/heteronyms",
+        "phoneme_dict_path": "/home/zach/code/uberduck-ml-dev/uberduck_ml_dev/text/cmudict-0.7b",
+        "p_phoneme": 1.0,
+        "handle_phoneme": "word",
+        "handle_phoneme_ambiguous": "ignore",
+        "include_speakers": null,
+        "n_frames": -1,
+        "betabinom_cache_path": "/home/zach/code/uberduck-ml-dev/data_cache/",
+        "lmdb_cache_path": "",
+        "use_attn_prior_masking": true,
+        "prepend_space_to_text": true,
+        "append_space_to_text": true,
+        "add_bos_eos_to_text": false,
+        "betabinom_scaling_factor": 1.0,
+        "distance_tx_unvoiced": false,
+        "is_zero_shot": false,
+        "mel_noise_scale": 0.0
+    },
+    "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:54321"
+    },
+    "model_config": {
+        "n_speakers": 109,
+        "n_speaker_dim": 16,
+        "n_text": 185,
+        "n_text_dim": 512,
+        "n_flows": 8,
+        "n_conv_layers_per_step": 4,
+        "n_mel_channels": 80,
+        "n_hidden": 1024,
+        "mel_encoder_n_hidden": 512,
+        "dummy_speaker_embedding": false,
+        "n_early_size": 2,
+        "n_early_every": 2,
+        "n_group_size": 2,
+        "affine_model": "wavenet",
+        "include_modules": "decatn",
+        "scaling_fn": "tanh",
+        "matrix_decomposition": "LUS",
+        "learn_alignments": true,
+        "use_speaker_emb_for_alignment": false,
+        "attn_straight_through_estimator": true,
+        "use_context_lstm": true,
+        "context_lstm_norm": "spectral",
+        "context_lstm_w_f0_and_energy": true,
+        "text_encoder_lstm_norm": "spectral",
+        "n_f0_dims": 1,
+        "n_energy_avg_dims": 1,
+        "use_first_order_features": false,
+        "unvoiced_bias_activation": "relu",
+        "decoder_use_partial_padding": true,
+        "decoder_use_unvoiced_bias": true,
+        "ap_pred_log_f0": true,
+        "ap_use_unvoiced_bias": true,
+        "ap_use_voiced_embeddings": true,
+        "dur_model_config": null,
+        "f0_model_config": null,
+        "energy_model_config": null,
+        "v_model_config": {
+            "name": "dap",
+            "hparams": {
+                "n_speaker_dim": 16,
+                "take_log_of_input": false,
+                "bottleneck_hparams": {
+                    "in_dim": 512,
+                    "reduction_factor": 16,
+                    "norm": "weightnorm",
+                    "non_linearity": "relu"
+                },
+                "arch_hparams": {
+                    "out_dim": 1,
+                    "n_layers": 2,
+                    "n_channels": 256,
+                    "kernel_size": 3,
+                    "p_dropout": 0.5,
+                    "lstm_type": "",
+                    "use_linear": 1
+                }
+            }
+        }
+    }
+}