diff --git a/.gitignore b/.gitignore
index 67f8d188..ab94a18b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,4 +61,8 @@ logs
source_audio
result
conversion_results
-get_available_gpu.py
\ No newline at end of file
+get_available_gpu.py
+
+#slurm files
+slurm.sh
+*.log
\ No newline at end of file
diff --git a/bins/vc/inference.py b/bins/vc/inference.py
new file mode 100644
index 00000000..f5977902
--- /dev/null
+++ b/bins/vc/inference.py
@@ -0,0 +1,259 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+import glob
+import shutil
+from tqdm import tqdm
+import json
+import torch
+import time
+
+
+from models.vc.transformer.transformer_inference import TransformerInference
+from models.vc.vits.vits_inference import VitsInference
+from utils.util import load_config
+from utils.audio_slicer import split_audio, merge_segments_encodec
+from processors import acoustic_extractor, content_extractor
+
+
+def build_inference(args, cfg, infer_type="from_dataset"):
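+ """Instantiate the inference class specified by cfg.model_type ("TransformerVC" or "VitsVC")."""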
+ supported_inference = {
+ "TransformerVC": TransformerInference,
+ "VitsVC": VitsInference,
+ }
+
+ inference_class = supported_inference[cfg.model_type]
+ return inference_class(args, cfg, infer_type)
+
+
+def prepare_for_audio_file(args, cfg, num_workers=1):
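+ """Prepare a single source audio file for conversion: split it into segments,
+ write the segment metadata (eval.json), and extract acoustic and content features.
+ Returns the updated (args, cfg) and the temporary cache directory."""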
+ preprocess_path = cfg.preprocess.processed_dir
+ audio_name = cfg.inference.source_audio_name
+ temp_audio_dir = os.path.join(preprocess_path, audio_name)
+
+ ### eval file
+ t = time.time()
+ eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name)
+ args.source = eval_file
+ with open(eval_file, "r") as f:
+ metadata = json.load(f)
+ print("Prepare for meta eval data: {:.1f}s".format(time.time() - t))
+
+ ### acoustic features
+ t = time.time()
+ acoustic_extractor.extract_utt_acoustic_features_serial(
+ metadata, temp_audio_dir, cfg
+ )
+ if cfg.preprocess.use_min_max_norm_mel == True:
+ acoustic_extractor.cal_mel_min_max(
+ dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
+ )
+ print("Prepare for acoustic features: {:.1f}s".format(time.time() - t))
+
+ ### content features
+ t = time.time()
+ content_extractor.extract_utt_content_features_dataloader(
+ cfg, metadata, num_workers
+ )
+ print("Prepare for content features: {:.1f}s".format(time.time() - t))
+ return args, cfg, temp_audio_dir
+
+
+def merge_for_audio_segments(audio_files, args, cfg):
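+ """Merge the converted (overlapping) segments into one wav named
+ "{source_audio_name}_{target_name}.wav" under args.output_dir, then delete the
+ temporary per-segment files."""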
+ audio_name = cfg.inference.source_audio_name
+ target_singer_name = os.path.basename(args.target).split(".")[0]
+
+ merge_segments_encodec(
+ wav_files=audio_files,
+ fs=cfg.preprocess.sample_rate,
+ output_path=os.path.join(
+ args.output_dir, "{}_{}.wav".format(audio_name, target_singer_name)
+ ),
+ overlap_duration=cfg.inference.segments_overlap_duration,
+ )
+
+ for tmp_file in audio_files:
+ os.remove(tmp_file)
+
+
+def prepare_source_eval_file(cfg, temp_audio_dir, audio_name):
+ """
+ Split the source audio into segments and prepare the eval file (json) for them
+ """
+
+ audio_chunks_results = split_audio(
+ wav_file=cfg.inference.source_audio_path,
+ target_sr=cfg.preprocess.sample_rate,
+ output_dir=os.path.join(temp_audio_dir, "wavs"),
+ max_duration_of_segment=cfg.inference.segments_max_duration,
+ overlap_duration=cfg.inference.segments_overlap_duration,
+ )
+
+ metadata = []
+ for i, res in enumerate(audio_chunks_results):
+ res["index"] = i
+ res["Dataset"] = audio_name
+ res["Singer"] = audio_name
+ res["Uid"] = "{}_{}".format(audio_name, res["Uid"])
+ metadata.append(res)
+
+ eval_file = os.path.join(temp_audio_dir, "eval.json")
+ with open(eval_file, "w") as f:
+ json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True)
+
+ return eval_file
+
+
+def cuda_relevant(deterministic=False):
+ torch.cuda.empty_cache()
+ # TF32 on Ampere and above
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.enabled = True
+ torch.backends.cudnn.allow_tf32 = True
+ # Deterministic
+ torch.backends.cudnn.deterministic = deterministic
+ torch.backends.cudnn.benchmark = not deterministic
+ torch.use_deterministic_algorithms(deterministic)
+
+
+def infer(args, cfg, infer_type):
+ # Build inference
+ t = time.time()
+ trainer = build_inference(args, cfg, infer_type)
+ print("Model Init: {:.1f}s".format(time.time() - t))
+
+ # Run inference
+ t = time.time()
+ output_audio_files = trainer.inference()
+ print("Model inference: {:.1f}s".format(time.time() - t))
+ return output_audio_files
+
+
+def build_parser():
+ r"""Build argument parser for inference.py.
+ Anything else should be put in an extra config YAML file.
+ """
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--config",
+ type=str,
+ required=True,
+ help="JSON/YAML file for configurations.",
+ )
+ parser.add_argument(
+ "--acoustics_dir",
+ type=str,
+ help="Acoustics model checkpoint directory. If a directory is given, "
+ "search for the latest checkpoint dir in the directory. If a specific "
+ "checkpoint dir is given, directly load the checkpoint.",
+ )
+ parser.add_argument(
+ "--vocoder_dir",
+ type=str,
+ required=True,
+ help="Vocoder checkpoint directory. Searching behavior is the same as "
+ "the acoustics one.",
+ )
+ parser.add_argument(
+ "--target",
+ type=str,
+ required=True,
+ help="Target audio file.",
+ )
+ parser.add_argument(
+ "--trans_key",
+ default=0,
+ help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
+ )
+ parser.add_argument(
+ "--source",
+ type=str,
+ default="source_audio",
+ help="Source audio file or directory. If a JSON file is given, "
+ "inference from dataset is applied. If a directory is given, "
+ "inference from all wav/flac/mp3 audio files in the directory is applied. "
+ "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
+ )
+ parser.add_argument(
+ "--output_dir",
+ type=str,
+ default="conversion_results",
+ help="Output directory. Default: ./conversion_results",
+ )
+ parser.add_argument(
+ "--log_level",
+ type=str,
+ default="warning",
+ help="Logging level. Default: warning",
+ )
+ parser.add_argument(
+ "--keep_cache",
+ action="store_true",
+ default=True,
+ help="Keep cache files. Only applicable to inference from files.",
+ )
+ parser.add_argument(
+ "--diffusion_inference_steps",
+ type=int,
+ default=50,
+ help="Number of inference steps. Only applicable to diffusion inference.",
+ )
+ return parser
+
+
+def main():
+ ### Parse arguments and config
+ args = build_parser().parse_args()
+ cfg = load_config(args.config)
+
+ # CUDA settings
+ cuda_relevant()
+
+ if os.path.isdir(args.source):
+ ### Infer from file
+
+ # Get all the source audio files (.wav, .flac, .mp3)
+ source_audio_dir = args.source
+ audio_list = []
+ for suffix in ["wav", "flac", "mp3"]:
+ audio_list += glob.glob(
+ os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
+ )
+ print("There are {} source audios: ".format(len(audio_list)))
+
+ # Infer for every file as dataset
+ output_root_path = args.output_dir
+ for audio_path in tqdm(audio_list):
+ audio_name = audio_path.split("/")[-1].split(".")[0]
+ args.output_dir = os.path.join(output_root_path, audio_name)
+ print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))
+
+ cfg.inference.source_audio_path = audio_path
+ cfg.inference.source_audio_name = audio_name
+ cfg.inference.segments_max_duration = 10.0
+ cfg.inference.segments_overlap_duration = 1.0
+
+ # Prepare metadata and features
+ args, cfg, cache_dir = prepare_for_audio_file(args, cfg)
+
+ # Infer from file
+ output_audio_files = infer(args, cfg, infer_type="from_file")
+
+ # Merge the split segments
+ merge_for_audio_segments(output_audio_files, args, cfg)
+
+ # Keep or remove caches
+ if not args.keep_cache:
+ shutil.rmtree(cache_dir)
+
+ else:
+ ### Infer from dataset
+ infer(args, cfg, infer_type="from_dataset")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/bins/vc/preprocess.py b/bins/vc/preprocess.py
new file mode 100644
index 00000000..641ff759
--- /dev/null
+++ b/bins/vc/preprocess.py
@@ -0,0 +1,183 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faulthandler
+
+faulthandler.enable()
+
+import os
+import argparse
+import json
+from multiprocessing import cpu_count
+
+
+from utils.util import load_config
+from preprocessors.processor import preprocess_dataset
+from preprocessors.metadata import cal_metadata
+from processors import acoustic_extractor, content_extractor, data_augment
+
+
+def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
+ """Extract acoustic features of utterances in the dataset
+
+ Args:
+ dataset (str): name of dataset, e.g. opencpop
+ output_path (str): directory that stores train, test and feature files of datasets
+ cfg (dict): dictionary that stores configurations
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
+ """
+ types = ["train", "test", "valid"] if "eval" not in dataset else ["test"]
+ metadata = []
+ dataset_output = os.path.join(output_path, dataset)
+
+ for dataset_type in types:
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
+ with open(dataset_file, "r") as f:
+ metadata.extend(json.load(f))
+
+ # acoustic_extractor.extract_utt_acoustic_features_parallel(
+ # metadata, dataset_output, cfg, n_workers=n_workers
+ # )
+ acoustic_extractor.extract_utt_acoustic_features_serial(
+ metadata, dataset_output, cfg
+ )
+
+
+def extract_content_features(dataset, output_path, cfg, num_workers=1):
+ """Extract content features of utterances in the dataset
+
+ Args:
+ dataset (str): name of dataset, e.g. opencpop
+ output_path (str): directory that stores train, test and feature files of datasets
+ cfg (dict): dictionary that stores configurations
+ """
+ types = ["train", "test", "valid"] if "eval" not in dataset else ["test"]
+ metadata = []
+ for dataset_type in types:
+ dataset_output = os.path.join(output_path, dataset)
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
+ with open(dataset_file, "r") as f:
+ metadata.extend(json.load(f))
+
+ content_extractor.extract_utt_content_features_dataloader(
+ cfg, metadata, num_workers
+ )
+
+
+def preprocess(cfg, args):
+ """Proprocess raw data of single or multiple datasets (in cfg.dataset)
+
+ Args:
+ cfg (dict): dictionary that stores configurations
+ args (ArgumentParser): specify the configuration file and num_workers
+ """
+ # Specify the output root path to save the processed data
+ output_path = cfg.preprocess.processed_dir
+ os.makedirs(output_path, exist_ok=True)
+
+ ## Split train and test sets
+ for dataset in cfg.dataset:
+ print("Preprocess {}...".format(dataset))
+ preprocess_dataset(
+ dataset,
+ cfg.dataset_path[dataset],
+ output_path,
+ cfg.preprocess,
+ cfg.task_type,
+ is_custom_dataset=cfg.use_custom_dataset,
+ )
+
+ # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
+ try:
+ assert isinstance(
+ cfg.preprocess.data_augment, list
+ ), "Please provide a list of datasets need to be augmented."
+ if len(cfg.preprocess.data_augment) > 0:
+ new_datasets_list = []
+ for dataset in cfg.preprocess.data_augment:
+ new_datasets = data_augment.augment_dataset(cfg, dataset)
+ new_datasets_list.extend(new_datasets)
+ cfg.dataset.extend(new_datasets_list)
+ print("Augmentation datasets: ", cfg.dataset)
+ except Exception:
+ print("No Data Augmentation.")
+
+ # Dump metadata of datasets (singers, train/test durations, etc.)
+ cal_metadata(cfg, dataset_types=["train", "test", "valid"])
+
+ ## Prepare the acoustic features
+ for dataset in cfg.dataset:
+ # Skip augmented datasets which do not need to extract acoustic features
+ # We will copy acoustic features from the original dataset later
+ if (
+ "pitch_shift" in dataset
+ or "formant_shift" in dataset
+ or "equalizer" in dataset in dataset
+ ):
+ continue
+ print(
+ "Extracting acoustic features for {} using {} workers ...".format(
+ dataset, args.num_workers
+ )
+ )
+ extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
+ # Calculate the statistics of acoustic features
+ if cfg.preprocess.mel_min_max_norm:
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
+
+ if cfg.preprocess.extract_pitch:
+ acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg)
+
+ # Copy acoustic features for augmented datasets by creating soft-links
+ for dataset in cfg.dataset:
+ if "pitch_shift" in dataset:
+ src_dataset = dataset.replace("_pitch_shift", "")
+ src_dataset_dir = os.path.join(output_path, src_dataset)
+ elif "formant_shift" in dataset:
+ src_dataset = dataset.replace("_formant_shift", "")
+ src_dataset_dir = os.path.join(output_path, src_dataset)
+ elif "equalizer" in dataset:
+ src_dataset = dataset.replace("_equalizer", "")
+ src_dataset_dir = os.path.join(output_path, src_dataset)
+ else:
+ continue
+ dataset_dir = os.path.join(output_path, dataset)
+ metadata = []
+ for split in ["train", "test"] if not "eval" in dataset else ["test"]:
+ metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
+ with open(metadata_file_path, "r") as f:
+ metadata.extend(json.load(f))
+ print("Copying acoustic features for {}...".format(dataset))
+ acoustic_extractor.copy_acoustic_features(
+ metadata, dataset_dir, src_dataset_dir, cfg
+ )
+ if cfg.preprocess.mel_min_max_norm:
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
+
+ if cfg.preprocess.extract_pitch:
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
+
+ # Prepare the content features
+ for dataset in cfg.dataset:
+ print("Extracting content features for {}...".format(dataset))
+ extract_content_features(dataset, output_path, cfg, args.num_workers)
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--config", default="config.json", help="json files for configurations."
+ )
+ parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
+ parser.add_argument("--prepare_alignment", type=bool, default=False)
+
+ args = parser.parse_args()
+ cfg = load_config(args.config)
+
+ preprocess(cfg, args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/bins/vc/train.py b/bins/vc/train.py
new file mode 100644
index 00000000..0444b8e1
--- /dev/null
+++ b/bins/vc/train.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+import torch
+
+
+from models.vc.transformer.transformer_trainer import TransformerTrainer
+from models.vc.vits.vits_trainer import VitsVCTrainer
+from utils.util import load_config
+
+
+def build_trainer(args, cfg):
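+ """Instantiate the trainer class specified by cfg.model_type ("TransformerVC" or "VitsVC")."""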
+ supported_trainer = {"TransformerVC": TransformerTrainer, "VitsVC": VitsVCTrainer}
+
+ trainer_class = supported_trainer[cfg.model_type]
+ trainer = trainer_class(args, cfg)
+ return trainer
+
+
+def cuda_relevant(deterministic=False):
+ torch.cuda.empty_cache()
+ # TF32 on Ampere and above
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.enabled = True
+ torch.backends.cudnn.allow_tf32 = True
+ # Deterministic
+ torch.backends.cudnn.deterministic = deterministic
+ torch.backends.cudnn.benchmark = not deterministic
+ torch.use_deterministic_algorithms(deterministic)
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--config",
+ default="config.json",
+ help="json files for configurations.",
+ required=True,
+ )
+ parser.add_argument(
+ "--exp_name",
+ type=str,
+ default="exp_name",
+ help="A specific name to note the experiment",
+ required=True,
+ )
+ parser.add_argument(
+ "--resume",
+ action="store_true",
+ help="If specified, to resume from the existing checkpoint.",
+ )
+ parser.add_argument(
+ "--resume_from_ckpt_path",
+ type=str,
+ default="",
+ help="The specific checkpoint path that you want to resume from.",
+ )
+ parser.add_argument(
+ "--resume_type",
+ type=str,
+ default="",
+ help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights",
+ )
+
+ parser.add_argument(
+ "--log_level", default="warning", help="logging level (debug, info, warning)"
+ )
+ args = parser.parse_args()
+ cfg = load_config(args.config)
+
+ # Data Augmentation
+ if (
+ isinstance(cfg.preprocess.data_augment, list)
+ and len(cfg.preprocess.data_augment) > 0
+ ):
+ new_datasets_list = []
+ for dataset in cfg.preprocess.data_augment:
+ new_datasets = [
+ f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
+ (
+ f"{dataset}_formant_shift"
+ if cfg.preprocess.use_formant_shift
+ else None
+ ),
+ f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
+ f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
+ ]
+ new_datasets_list.extend(filter(None, new_datasets))
+ cfg.dataset.extend(new_datasets_list)
+
+ # CUDA settings
+ cuda_relevant()
+
+ # Build trainer
+ trainer = build_trainer(args, cfg)
+
+ trainer.train_loop()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/config/transformer.json b/config/transformer.json
index be3514e9..60c16127 100644
--- a/config/transformer.json
+++ b/config/transformer.json
@@ -24,6 +24,8 @@
"wenet_sample_rate": 16000,
"extract_mert_feature": false,
"mert_sample_rate": 16000,
+ "extract_hubert_feature": false,
+ "hubert_sample_rate": 16000,
// Default config for whisper
"whisper_frameshift": 0.01,
"whisper_downsample_rate": 2,
@@ -39,6 +41,8 @@
"wenet_frameshift": 0.01,
// wenetspeech is 4, gigaspeech is 6
"wenet_downsample_rate": 4,
+ // Default config for hubert
+ "hubert_frameshift": 0.02,
// Default config
"n_mel": 100,
"win_size": 1024,
@@ -65,6 +69,7 @@
"contentvec_dir": "contentvec",
"wenet_dir": "wenet",
"mert_dir": "mert",
+ "hubert_dir": "hubert",
// Extract content features using dataloader
"pin_memory": true,
"num_workers": 8,
@@ -97,9 +102,10 @@
"n_bins_loudness": 256,
"output_loudness_dim": 384,
"use_whisper": false,
- "use_contentvec": true,
+ "use_contentvec": false,
"use_wenet": false,
"use_mert": false,
+ "use_hubert": false,
"whisper_dim": 1024,
"contentvec_dim": 256,
"mert_dim": 256,
diff --git a/config/vitsvc.json b/config/vitsvc.json
new file mode 100644
index 00000000..c56de793
--- /dev/null
+++ b/config/vitsvc.json
@@ -0,0 +1,193 @@
+{
+ "base_config": "config/base.json",
+ "model_type": "VITS",
+ "task_type": "svc",
+ "preprocess": {
+ "extract_phone": false,
+ "extract_mel": true,
+ "extract_linear_spec": true,
+ "extract_audio": true,
+ "use_linear": true,
+ "use_mel": true,
+ "use_audio": true,
+ "use_text": false,
+ "use_phone": true,
+
+ "fmin": 0,
+ "fmax": null,
+ "f0_min": 50,
+ "f0_max": 1100,
+ // f0_bin in sovits
+ "pitch_bin": 256,
+ // filter_length in sovits
+ "n_fft": 2048,
+ // hop_length in sovits
+ "hop_size": 512,
+ // win_length in sovits
+ "win_size": 2048,
+ "segment_size": 8192,
+ "n_mel": 100,
+ "sample_rate": 44100,
+
+ "mel_min_max_stats_dir": "mel_min_max_stats",
+ "whisper_dir": "whisper",
+ "contentvec_dir": "contentvec",
+ "wenet_dir": "wenet",
+ "mert_dir": "mert",
+ "hubert_dir": "hubert",
+ },
+ "model": {
+ "condition_encoder": {
+ "merge_mode": "add",
+ "input_melody_dim": 1,
+ "use_log_f0": true,
+ "n_bins_melody": 256,
+ //# Quantization (0 for not quantization)
+ "output_melody_dim": 196,
+ "input_loudness_dim": 1,
+ "use_log_loudness": false,
+ "n_bins_loudness": 256,
+ "output_loudness_dim": 196,
+ "use_whisper": false,
+ "use_contentvec": false,
+ "use_wenet": false,
+ "use_mert": false,
+ "whisper_dim": 1024,
+ "contentvec_dim": 256,
+ "mert_dim": 256,
+ "wenet_dim": 512,
+ "content_encoder_dim": 196,
+ "output_singer_dim": 196,
+ "singer_table_size": 512,
+ "output_content_dim": 196,
+ "use_spkid": true
+ },
+ "vits": {
+ "filter_channels": 256,
+ "gin_channels": 256,
+ "hidden_channels": 192,
+ "inter_channels": 192,
+ "kernel_size": 3,
+ "n_flow_layer": 4,
+ "n_heads": 2,
+ "n_layers": 6,
+ "n_layers_q": 3,
+ "n_speakers": 512,
+ "p_dropout": 0.1,
+ "ssl_dim": 256,
+ "use_spectral_norm": false,
+ },
+ "generator": "hifigan",
+ "generator_config": {
+ "hifigan": {
+ "resblock": "1",
+ "resblock_kernel_sizes": [
+ 3,
+ 5,
+ 7
+ ],
+ "upsample_rates": [
+ 8,4,2,2,2
+ ],
+ "upsample_kernel_sizes": [
+ 16,8,4,4,4
+ ],
+ "upsample_initial_channel": 512,
+ "resblock_dilation_sizes": [
+ [1,3,5],
+ [1,3,5],
+ [1,3,5]
+ ]
+ },
+ "melgan": {
+ "ratios": [8, 8, 2, 2, 2],
+ "ngf": 32,
+ "n_residual_layers": 3,
+ "num_D": 3,
+ "ndf": 16,
+ "n_layers": 4,
+ "downsampling_factor": 4
+ },
+ "bigvgan": {
+ "resblock": "1",
+ "activation": "snakebeta",
+ "snake_logscale": true,
+ "upsample_rates": [
+ 8,8,2,2,2,
+ ],
+ "upsample_kernel_sizes": [
+ 16,16,4,4,4,
+ ],
+ "upsample_initial_channel": 512,
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "resblock_dilation_sizes": [
+ [1,3,5],
+ [1,3,5],
+ [1,3,5]
+ ]
+ },
+ "nsfhifigan": {
+ "resblock": "1",
+ "harmonic_num": 8,
+ "upsample_rates": [
+ 8,4,2,2,2,
+ ],
+ "upsample_kernel_sizes": [
+ 16,8,4,4,4,
+ ],
+ "upsample_initial_channel": 768,
+ "resblock_kernel_sizes": [
+ 3,
+ 5,
+ 7
+ ],
+ "resblock_dilation_sizes": [
+ [1,3,5],
+ [1,3,5],
+ [1,3,5]
+ ]
+ },
+ "apnet": {
+ "ASP_channel": 512,
+ "ASP_resblock_kernel_sizes": [3,7,11],
+ "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "ASP_input_conv_kernel_size": 7,
+ "ASP_output_conv_kernel_size": 7,
+
+ "PSP_channel": 512,
+ "PSP_resblock_kernel_sizes": [3,7,11],
+ "PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "PSP_input_conv_kernel_size": 7,
+ "PSP_output_R_conv_kernel_size": 7,
+ "PSP_output_I_conv_kernel_size": 7,
+ }
+ },
+ },
+ "train": {
+ "fp16_run": true,
+ "learning_rate": 2e-4,
+ "betas": [
+ 0.8,
+ 0.99
+ ],
+ "eps": 1e-9,
+ "batch_size": 16,
+ "lr_decay": 0.999875,
+ // "segment_size": 8192,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0,
+ "AdamW": {
+ "betas": [
+ 0.8,
+ 0.99
+ ],
+ "eps": 1e-9,
+ }
+ }
+}
\ No newline at end of file
diff --git a/egs/tts/VITS/README.md b/egs/tts/VITS/README.md
index 5fed5492..8df28790 100644
--- a/egs/tts/VITS/README.md
+++ b/egs/tts/VITS/README.md
@@ -143,11 +143,6 @@ Here are some example scenarios to better understand how to use these arguments:
## 4. Inference
-### Pre-trained Model Download
-
-We released a pre-trained Amphion VITS model trained on LJSpeech. So you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech according to the following inference instruction.
-
-
### Configuration
For inference, you need to specify the following configurations when running `run.sh`:
diff --git a/egs/vc/README.md b/egs/vc/README.md
new file mode 100755
index 00000000..f8be4303
--- /dev/null
+++ b/egs/vc/README.md
@@ -0,0 +1,34 @@
+# Amphion Voice Conversion (VC) Recipe
+
+## Quick Start
+
+We provide two recipes, **[TransformerVC](TransformerVC)** and **[VitsVC](VitsVC)**, to demonstrate how to train a cutting-edge VC model.
+
+## Supported Model Architectures
+
+The main idea of VC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):
+
+
+
+

+
+
+
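+As a toy illustration of this pipeline, converting one utterance boils down to the following sketch (the helper names and shapes are made up for clarity and are not the actual Amphion APIs):
+
+```python
+import numpy as np
+
+# Stand-in "models": each one just returns an array of the right shape.
+content_encoder = lambda wav: np.zeros((len(wav) // 320, 256))  # speaker-agnostic content features (e.g., HuBERT)
+speaker_encoder = lambda wav: np.zeros(256)                     # speaker embedding of the reference audio
+acoustic_decoder = lambda c, s: np.zeros((c.shape[0], 100))     # acoustic decoder (e.g., TransformerVC) predicting mel frames
+vocoder = lambda mel: np.zeros(mel.shape[0] * 320)              # waveform synthesizer (e.g., BigVGAN)
+
+source_wav = np.random.randn(24000)     # 1 second of source speech at 24 kHz
+reference_wav = np.random.randn(24000)  # reference audio of the target speaker
+
+content = content_encoder(source_wav)      # 1. disentangle speaker-agnostic representations
+spk_emb = speaker_encoder(reference_wav)   # 2. extract the desired speaker information
+mel = acoustic_decoder(content, spk_emb)   # 3. decode into acoustic features
+converted_wav = vocoder(mel)               # 4. synthesize the converted waveform
+```
+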
+Until now, Amphion VC has supported the following features and models:
+
+- **Speaker-agnostic Representations**:
+ - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), [ContentVec](https://github.com/auspicious3000/contentvec), and HuBERT.
+ - Prosody Features: F0 and energy.
+- **Speaker Embeddings**:
+ - Speaker Look-Up Table.
+ - Reference Encoder (👨💻 developing): It can be used for zero-shot VC.
+- **Acoustic Decoders**:
+ - Transformer-based models:
+ - **[TransformerVC](TransformerVC)**: Encoder-only and non-autoregressive Transformer architecture.
+ - VAE- and Flow-based models:
+ - **[VitsVC](VitsVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
+- **Waveform Synthesizers (Vocoders)**:
+ - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
diff --git a/egs/vc/TransformerVC/README.md b/egs/vc/TransformerVC/README.md
new file mode 100644
index 00000000..0df6692d
--- /dev/null
+++ b/egs/vc/TransformerVC/README.md
@@ -0,0 +1,154 @@
+# Transformer for Voice Conversion
+
+This is an implementation of a **vanilla transformer encoder**/**conformer** as the acoustic model for voice conversion.
+
+There are four stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Training
+4. Inference/conversion
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+
+By default, we utilize the LibriTTS dataset for training. How to download it is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+ "dataset": [
+ "libritts"
+ ],
+ "dataset_path": {
+ // TODO: Fill in your dataset path
+ "libritts": "[LibriTTS dataset path]"
+ },
+```
+
+## 2. Features Extraction
+
+### Content-based Pretrained Models Download
+
+By default, we utilize HuBERT to extract content features. How to download it is detailed [here](../../../pretrained/README.md).
+
+### Configuration
+
+Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
+
+```json
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vc"
+ "log_dir": "ckpts/vc",
+ "preprocess": {
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
+ "processed_dir": "data",
+ ...
+ },
+```
+
+### Run
+
+Run `run.sh` as the preprocessing stage (set `--stage 1`).
+
+```bash
+sh egs/vc/TransformerVC/run.sh --stage 1
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
+
+## 3. Training
+
+### Configuration
+Specify the detailed configuration of the transformer block in `exp_config.json`. For the key `type`, both `conformer` and `transformer` are supported:
+```json
+"model": {
+ ...
+ "transformer":{
+ // 'conformer' or 'transformer'
+ "type": "conformer",
+ "input_dim": 384,
+ "output_dim": 100,
+ "n_heads": 2,
+ "n_layers": 6,
+ "filter_channels":512,
+ "dropout":0.1,
+ }
+ }
+```
+We provide the default hyperparameters in `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
+
+```json
+"train": {
+ "batch_size": 32,
+ ...
+ "adamw": {
+ "lr": 2.0e-4
+ },
+ ...
+ }
+```
+
+### Run
+
+Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vc/[YourExptName]`.
+
+```bash
+sh egs/vc/TransformerVC/run.sh --stage 2 --name [YourExptName]
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
+
+## 4. Inference/Conversion
+
+### Pretrained Vocoder Download
+
+We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
+
+### Run
+
+For inference/conversion, you need to specify the following configurations when running `run.sh`:
+
+| Parameters | Description | Example |
+| --------------------------------------------------- |---------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
+| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
+| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
+| `--infer_target_speaker` | The audio file of the target speaker you want to convert into.| `[Your path to the target audio file]` |
+
+For example, if you want to make the speaker in `reference.wav` speak the utterances in `[Your Audios Folder]`, just run:
+
+```bash
+cd Amphion
+sh egs/vc/TransformerVC/run.sh --stage 3 --gpu "0" \
+ --infer_expt_dir Amphion/ckpts/vc/[YourExptName] \
+ --infer_output_dir Amphion/ckpts/vc/[YourExptName]/result \
+ --infer_source_audio_dir [Your Audios Folder] \
+ --infer_target_speaker "reference.wav"
+```
+
+## Citations
+
+```bibtex
+@inproceedings{transformer,
+ author = {Ashish Vaswani and
+ Noam Shazeer and
+ Niki Parmar and
+ Jakob Uszkoreit and
+ Llion Jones and
+ Aidan N. Gomez and
+ Lukasz Kaiser and
+ Illia Polosukhin},
+ title = {Attention is All you Need},
+ booktitle = {{NIPS}},
+ pages = {5998--6008},
+ year = {2017}
+}
+```
\ No newline at end of file
diff --git a/egs/vc/TransformerVC/exp_config.json b/egs/vc/TransformerVC/exp_config.json
new file mode 100644
index 00000000..ce9f1149
--- /dev/null
+++ b/egs/vc/TransformerVC/exp_config.json
@@ -0,0 +1,119 @@
+{
+ "base_config": "config/transformer.json",
+ "model_type": "TransformerVC",
+ "dataset": [
+ "libritts",
+ ],
+ "dataset_path": {
+ // TODO: Fill in your dataset path
+ "libritts": "/home/mingyang/LibriTTS/LibriTTS",
+ },
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vc"
+ "log_dir": "ckpts/vc",
+ "preprocess": {
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
+ "processed_dir": "data",
+ // Config for features extraction
+ "extract_mel": true,
+ "extract_pitch": true,
+ "extract_uv": false,
+ "extract_duration": false,
+ "extract_energy": false,
+ "extract_speaker": true,
+ "extract_whisper_feature": false,
+ "extract_contentvec_feature": false,
+ "extract_wenet_feature": false,
+ "extract_hubert_feature": true,
+ "speaker_dir": "speaker",
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
+ "contentvec_batch_size": 1,
+ // Fill in the content-based pretrained model's path
+ "hubert_model_path": "pretrained/hubert/mhubert_base_vp_en_es_fr_it3.pt",
+ "hubert_km_path": "pretrained/hubert/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin",
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
+ "whisper_model": "medium",
+ "whisper_model_path": "pretrained/whisper/medium.pt",
+ // Config for features usage
+ "use_mel": true,
+ "use_min_max_norm_mel": true,
+ "use_frame_pitch": true,
+ "use_frame_energy": false,
+ "use_uv": false,
+ "use_spkid": false,
+ "use_spkemb": true,
+ "use_whisper": false,
+ "use_contentvec": false,
+ "use_wenet": false,
+ "use_hubert": true,
+ "n_mel": 100,
+ "sample_rate": 24000
+ },
+ "model": {
+ "condition_encoder": {
+ // Config for features usage
+ "use_whisper": false,
+ "use_contentvec": false,
+ "use_wenet": false,
+ "use_hubert": true,
+ "spkemb_dim": 256,
+ "whisper_dim": 1024,
+ "contentvec_dim": 256,
+ "wenet_dim": 512,
+ "use_singer_encoder": false,
+ "pitch_min": 50,
+ "pitch_max": 1100,
+ "f0_min": 0,
+ "f0_max": 1,
+ "use_spkemb": true,
+ "use_spkid": false
+ },
+ "transformer": {
+ // 'conformer' or 'transformer'
+ "type": "conformer",
+ "input_dim": 384,
+ "output_dim": 100,
+ "n_heads": 2,
+ "n_layers": 6,
+ "filter_channels": 512,
+ "dropout": 0.1,
+ }
+ },
+ "train": {
+ "batch_size": 128,
+ "gradient_accumulation_step": 1,
+ "max_epoch": -1, // -1 means no limit
+ "save_checkpoint_stride": [
+ 5,
+ 50
+ ],
+ "keep_last": [
+ 5,
+ -1
+ ],
+ "run_eval": [
+ false,
+ true
+ ],
+ "adamw": {
+ "lr": 4.0e-4
+ },
+ "reducelronplateau": {
+ "factor": 0.8,
+ "patience": 10,
+ "min_lr": 1.0e-4
+ },
+ "dataloader": {
+ "num_worker": 8,
+ "pin_memory": true
+ },
+ "sampler": {
+ "holistic_shuffle": false,
+ "drop_last": true
+ }
+ },
+ "inference": {
+
+ }
+}
\ No newline at end of file
diff --git a/egs/vc/TransformerVC/run.sh b/egs/vc/TransformerVC/run.sh
new file mode 120000
index 00000000..f8daac3d
--- /dev/null
+++ b/egs/vc/TransformerVC/run.sh
@@ -0,0 +1 @@
+../_template/run.sh
\ No newline at end of file
diff --git a/egs/vc/VitsVC/README.md b/egs/vc/VitsVC/README.md
new file mode 100644
index 00000000..c3990cb3
--- /dev/null
+++ b/egs/vc/VitsVC/README.md
@@ -0,0 +1,115 @@
+# VITS for Voice Conversion
+
+This is an implementation of VITS as the acoustic model for end-to-end voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), a content encoder (HuBERT by default in this recipe) is used to extract content features from the source audio. These feature vectors are directly fed into VITS without the need for conversion to a text-based intermediate representation.
+
+There are four stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Training
+4. Inference/conversion
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## 1. Data Preparation
+
+### Dataset Download
+
+By default, we utilize the LibriTTS dataset for training. How to download it is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+ "dataset": [
+ "libritts"
+ ],
+ "dataset_path": {
+ // TODO: Fill in your dataset path
+ "libritts": "[LibriTTS dataset path]"
+ },
+```
+
+## 2. Features Extraction
+
+### Content-based Pretrained Models Download
+
+By default, we utilize HuBERT to extract content features. How to download it is detailed [here](../../../pretrained/README.md).
+
+### Configuration
+
+Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
+
+```json
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vc"
+ "log_dir": "ckpts/vc",
+ "preprocess": {
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
+ "processed_dir": "data",
+ ...
+ },
+```
+
+### Run
+
+Run `run.sh` as the preprocessing stage (set `--stage 1`).
+
+```bash
+sh egs/vc/VitsVC/run.sh --stage 1
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
+
+## 3. Training
+
+### Configuration
+
+We provide the default hyperparameters in `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
+
+```json
+"train": {
+ "batch_size": 32,
+ ...
+ "adamw": {
+ "lr": 2.0e-4
+ },
+ ...
+ }
+```
+
+### Run
+
+Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vc/[YourExptName]`.
+
+```bash
+sh egs/vc/VitsVC/run.sh --stage 2 --name [YourExptName]
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
+
+## 4. Inference/Conversion
+
+### Run
+
+For inference/conversion, you need to specify the following configurations when running `run.sh`:
+
+| Parameters | Description | Example |
+| --------------------------------------------------- |---------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
+| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
+| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
+| `--infer_target_speaker` | The audio file of the target speaker you want to convert into.| `[Your path to the target audio file]` |
+
+For example, if you want to make the speaker in `reference.wav` speak the utterances in `[Your Audios Folder]`, just run:
+
+```bash
+sh egs/vc/VitsVC/run.sh --stage 3 --gpu "0" \
+ --infer_expt_dir Amphion/ckpts/vc/[YourExptName] \
+ --infer_output_dir Amphion/ckpts/vc/[YourExptName]/result \
+ --infer_source_audio_dir [Your Audios Folder] \
+ --infer_target_speaker "reference.wav"
+```
\ No newline at end of file
diff --git a/egs/vc/VitsVC/exp_config.json b/egs/vc/VitsVC/exp_config.json
new file mode 100644
index 00000000..a6497a5c
--- /dev/null
+++ b/egs/vc/VitsVC/exp_config.json
@@ -0,0 +1,167 @@
+{
+ "base_config": "config/vitsvc.json",
+ "model_type": "VitsVC",
+ "dataset": [
+ "libritts"
+ ],
+ "dataset_path": {
+ // TODO: Fill in your dataset path
+ "libritts": "/home/mingyang/LibriTTS/LibriTTS"
+ },
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vc"
+ "log_dir": "ckpts/vc",
+ "preprocess": {
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
+ "processed_dir": "data",
+ "speaker_dir": "speaker",
+ "f0_min": 50,
+ "f0_max": 1100,
+ // f0_bin in sovits
+ "pitch_bin": 256,
+ // filter_length in sovits
+ "n_fft": 1024,
+ // hop_length in sovits
+ "hop_size": 256,
+ // win_length in sovits
+ "win_size": 1024,
+ "segment_size": 8192,
+ "n_mel": 100,
+ "sample_rate": 24000,
+
+ // Config for features extraction
+ "extract_mel": true,
+ "extract_pitch": true,
+ "pitch_extractor": "parselmouth",
+ "extract_energy": false,
+ "extract_speaker": true,
+ "extract_uv": false,
+ "extract_linear_spec": true,
+ "extract_audio": true,
+ // contentvec
+ "extract_contentvec_feature": false,
+ "contentvec_sample_rate": 16000,
+ "contentvec_batch_size": 1,
+ "contentvec_frameshift": 0.02,
+ // whisper
+ "extract_whisper_feature": false,
+ "whisper_sample_rate": 16000,
+ "whisper_frameshift": 0.01,
+ "whisper_downsample_rate": 2,
+ // hubert
+ "extract_hubert_feature": true,
+ "hubert_sample_rate": 16000,
+ "hubert_frameshift": 0.02,
+ // Fill in the content-based pretrained model's path
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
+ "whisper_model": "medium",
+ "whisper_model_path": "pretrained/whisper/medium.pt",
+ "hubert_model_path": "pretrained/hubert/mhubert_base_vp_en_es_fr_it3.pt",
+ "hubert_km_path": "pretrained/hubert/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin",
+ // Config for features usage
+ "use_mel": true,
+ "use_frame_pitch": true,
+ "use_uv": false,
+ "use_spkid": false,
+ "use_spkemb": true,
+ "use_contentvec": false,
+ "use_whisper": false,
+ "use_hubert": true,
+ "use_text": false,
+ "use_phone": false,
+
+ // Extract content features using dataloader
+ "pin_memory": true,
+ "num_workers": 8,
+ "content_feature_batch_size": 16,
+ // Meta file
+ "train_file": "train.json",
+ "valid_file": "test.json",
+ "spk2id": "singers.json",
+ "utt2spk": "utt2singer"
+ },
+ "model": {
+ "condition_encoder": {
+ // Config for features usage
+ "merge_mode": "add",
+ "input_melody_dim": 1,
+ "use_log_f0": true,
+ "n_bins_melody": 256,
+ //# Quantization (0 for not quantization)
+ "output_melody_dim": 192,
+
+ "use_contentvec": false,
+ "use_whisper": false,
+ "use_mert": false,
+ "use_wenet": false,
+ "use_hubert": true,
+ "whisper_dim": 1024,
+ "contentvec_dim": 256,
+ "content_encoder_dim": 192,
+ "output_singer_dim": 192,
+ "singer_table_size": 512,
+ "output_content_dim": 192,
+ "use_spkid": false,
+ "use_spkemb": true,
+ "spkemb_dim": 256,
+ "f0_min": 0,
+ "f0_max": 1,
+ "pitch_max": 1100.0,
+ "pitch_min": 50.0
+ },
+ "vits": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 256,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0.1,
+ "ssl_dim": 256,
+ "n_flow_layer": 4,
+ "n_layers_q": 3,
+ "gin_channels": 256,
+ "n_speakers": 512,
+ "use_spectral_norm": false
+ },
+ "generator": "nsfhifigan"
+ },
+ "train": {
+ "batch_size": 32,
+ "learning_rate": 2e-4,
+ "gradient_accumulation_step": 1,
+ "max_epoch": -1, // -1 means no limit
+ "save_checkpoint_stride": [
+ 3,
+ 50
+ ],
+ "keep_last": [
+ 3,
+ 2
+ ],
+ "run_eval": [
+ true,
+ true
+ ],
+ "adamw": {
+ "lr": 2.0e-4
+ },
+ "reducelronplateau": {
+ "factor": 0.8,
+ "patience": 30,
+ "min_lr": 1.0e-4
+ },
+ "dataloader": {
+ "num_worker": 8,
+ "pin_memory": true
+ },
+ "sampler": {
+ "holistic_shuffle": false,
+ "drop_last": true
+ }
+ },
+ "inference": {
+ "batch_size": 1
+ }
+}
\ No newline at end of file
diff --git a/egs/vc/VitsVC/run.sh b/egs/vc/VitsVC/run.sh
new file mode 120000
index 00000000..f8daac3d
--- /dev/null
+++ b/egs/vc/VitsVC/run.sh
@@ -0,0 +1 @@
+../_template/run.sh
\ No newline at end of file
diff --git a/egs/vc/_template/run.sh b/egs/vc/_template/run.sh
new file mode 100644
index 00000000..64e7917e
--- /dev/null
+++ b/egs/vc/_template/run.sh
@@ -0,0 +1,150 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+######## Build Experiment Environment ###########
+exp_dir=$(cd `dirname $0`; pwd)
+work_dir=$(dirname $(dirname $(dirname $exp_dir)))
+
+export WORK_DIR=$work_dir
+export PYTHONPATH=$work_dir
+export PYTHONIOENCODING=UTF-8
+
+######## Parse the Given Parameters from the Command ###########
+options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
+eval set -- "$options"
+
+while true; do
+ case $1 in
+ # Experimental Configuration File
+ -c | --config) shift; exp_config=$1 ; shift ;;
+ # Experimental Name
+ -n | --name) shift; exp_name=$1 ; shift ;;
+ # Running Stage
+ -s | --stage) shift; running_stage=$1 ; shift ;;
+ # Visible GPU machines. The default value is "0".
+ --gpu) shift; gpu=$1 ; shift ;;
+
+ # [Only for Training] Resume configuration
+ --resume) shift; resume=$1 ; shift ;;
+ # [Only for Training] The specific checkpoint path that you want to resume from.
+ --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
+ --resume_type) shift; resume_type=$1 ; shift ;;
+
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
+ # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
+ --infer_source_file) shift; infer_source_file=$1 ; shift ;;
+ --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
+ # [Only for Inference] Specify the reference audio file of the target speaker you want to convert to.
+ --infer_target_speaker) shift; infer_target=$1 ; shift ;;
+ # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
+ --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
+ # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
+ --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
+
+ --) shift ; break ;;
+ *) echo "Invalid option: $1" exit 1 ;;
+ esac
+done
+
+
+### Value check ###
+if [ -z "$running_stage" ]; then
+ echo "[Error] Please specify the running stage"
+ exit 1
+fi
+
+if [ -z "$exp_config" ]; then
+ exp_config="${exp_dir}"/exp_config.json
+fi
+echo "Exprimental Configuration File: $exp_config"
+
+if [ -z "$gpu" ]; then
+ gpu="0"
+fi
+
+######## Features Extraction ###########
+if [ $running_stage -eq 1 ]; then
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vc/preprocess.py \
+ --config $exp_config \
+ --num_workers 4
+fi
+
+######## Training ###########
+if [ $running_stage -eq 2 ]; then
+ if [ -z "$exp_name" ]; then
+ echo "[Error] Please specify the experiments name"
+ exit 1
+ fi
+ echo "Exprimental Name: $exp_name"
+
+ if [ "$resume" = true ]; then
+ echo "Automatically resume from the experimental dir..."
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vc/train.py \
+ --config "$exp_config" \
+ --exp_name "$exp_name" \
+ --log_level info \
+ --resume
+ else
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vc/train.py \
+ --config "$exp_config" \
+ --exp_name "$exp_name" \
+ --log_level info \
+ --resume_from_ckpt_path "$resume_from_ckpt_path" \
+ --resume_type "$resume_type"
+ fi
+fi
+
+######## Inference/Conversion ###########
+if [ $running_stage -eq 3 ]; then
+ if [ -z "$infer_expt_dir" ]; then
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
+ exit 1
+ fi
+
+ if [ -z "$infer_output_dir" ]; then
+ infer_output_dir="$expt_dir/result"
+ fi
+
+ if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
+ echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
+ exit 1
+ fi
+
+ if [ -z "$infer_source_file" ]; then
+ infer_source=$infer_source_audio_dir
+ fi
+
+ if [ -z "$infer_source_audio_dir" ]; then
+ infer_source=$infer_source_file
+ fi
+
+ if [ -z "$infer_target_speaker" ]; then
+ echo "[Error] Please specify the target audio file."
+ exit 1
+ fi
+
+ if [ -z "$infer_key_shift" ]; then
+ infer_key_shift="autoshift"
+ fi
+
+ if [ -z "$infer_vocoder_dir" ]; then
+ infer_vocoder_dir="$work_dir"/pretrained/bigvgan
+ echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphoion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
+ fi
+
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vc/inference.py \
+ --config $exp_config \
+ --acoustics_dir $infer_expt_dir \
+ --vocoder_dir $infer_vocoder_dir \
+ --target $infer_target \
+ --trans_key $infer_key_shift \
+ --source $infer_source \
+ --output_dir $infer_output_dir \
+ --log_level debug
+fi
\ No newline at end of file
diff --git a/models/base/base_dataset.py b/models/base/base_dataset.py
index 8c1216a2..769df5c1 100644
--- a/models/base/base_dataset.py
+++ b/models/base/base_dataset.py
@@ -51,6 +51,20 @@ def __init__(self, cfg, dataset, is_valid=False):
utt, spk = line.strip().split("\t")
self.utt2spk[utt] = spk
+ if cfg.preprocess.use_spkemb:
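+ # Per-utterance speaker embedding stored at:
+ # <processed_dir>/<dataset>/<speaker_dir>/<uid>.npy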
+ self.utt2spk_path = {}
+ for utt_info in self.metadata:
+ dataset = utt_info["Dataset"]
+ uid = utt_info["Uid"]
+ utt = "{}_{}".format(dataset, uid)
+
+ self.utt2spk_path[utt] = os.path.join(
+ cfg.preprocess.processed_dir,
+ dataset,
+ cfg.preprocess.speaker_dir,
+ uid + ".npy",
+ )
+
if cfg.preprocess.use_uv:
self.utt2uv_path = {}
for utt_info in self.metadata:
@@ -208,6 +222,8 @@ def __getitem__(self, index):
single_feature["spk_id"] = np.array(
[self.spk2id[self.utt2spk[utt]]], dtype=np.int32
)
+ if self.cfg.preprocess.use_spkemb:
+ single_feature["spkemb"] = np.load(self.utt2spk_path[utt])
if self.cfg.preprocess.use_mel:
mel = np.load(self.utt2mel_path[utt])
diff --git a/models/vc/__init__.py b/models/vc/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/models/vc/base/__init__.py b/models/vc/base/__init__.py
new file mode 100644
index 00000000..e19ec0dd
--- /dev/null
+++ b/models/vc/base/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .vc_inference import VCInference
+from .vc_trainer import VCTrainer
diff --git a/models/vc/base/vc_dataset.py b/models/vc/base/vc_dataset.py
new file mode 100644
index 00000000..65bb0703
--- /dev/null
+++ b/models/vc/base/vc_dataset.py
@@ -0,0 +1,461 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import random
+import torch
+from torch.nn.utils.rnn import pad_sequence
+import json
+import os
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+import resemblyzer
+from utils.data_utils import *
+from processors.acoustic_extractor import cal_normalized_mel, load_mel_extrema
+from processors.content_extractor import (
+ ContentvecExtractor,
+ WhisperExtractor,
+ WenetExtractor,
+ HubertExtractor,
+)
+from models.base.base_dataset import (
+ BaseCollator,
+ BaseDataset,
+)
+from models.base.new_dataset import BaseTestDataset
+
+EPS = 1.0e-12
+
+
+class VCDataset(BaseDataset):
+ def __init__(self, cfg, dataset, is_valid=False):
+ BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid)
+
+ cfg = self.cfg
+ if cfg.preprocess.segment_size is not None:
+ metadata_new = []
+ for item in self.metadata:
+ if (
+ item["Duration"] * cfg.preprocess.sample_rate
+ > cfg.preprocess.segment_size
+ ):
+ metadata_new.append(item)
+ self.metadata = metadata_new
+
+ if cfg.model.condition_encoder.use_whisper:
+ self.whisper_aligner = WhisperExtractor(self.cfg)
+ self.utt2whisper_path = load_content_feature_path(
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir
+ )
+
+ if cfg.model.condition_encoder.use_contentvec:
+ self.contentvec_aligner = ContentvecExtractor(self.cfg)
+ self.utt2contentVec_path = load_content_feature_path(
+ self.metadata,
+ cfg.preprocess.processed_dir,
+ cfg.preprocess.contentvec_dir,
+ )
+
+ if cfg.model.condition_encoder.use_mert:
+ self.utt2mert_path = load_content_feature_path(
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir
+ )
+ if cfg.model.condition_encoder.use_wenet:
+ self.wenet_aligner = WenetExtractor(self.cfg)
+ self.utt2wenet_path = load_content_feature_path(
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir
+ )
+ if cfg.model.condition_encoder.use_hubert:
+ self.hubert_aligner = HubertExtractor(self.cfg)
+ self.utt2hubert_path = load_content_feature_path(
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.hubert_dir
+ )
+
+ def __getitem__(self, index):
+ single_feature = BaseDataset.__getitem__(self, index)
+
+ utt_info = self.metadata[index]
+ dataset = utt_info["Dataset"]
+ uid = utt_info["Uid"]
+ utt = "{}_{}".format(dataset, uid)
+
+ if self.cfg.preprocess.use_frame_pitch:
+ assert "frame_pitch" in single_feature.keys()
+ scaler = MinMaxScaler()
+ scaler.fit(single_feature["frame_pitch"].reshape(-1, 1))
+ single_feature["frame_pitch"] = scaler.transform(
+ single_feature["frame_pitch"].reshape(-1, 1)
+ )
+ single_feature["frame_pitch"] = single_feature["frame_pitch"].reshape(-1)
+
+ if self.cfg.model.condition_encoder.use_whisper:
+ assert "target_len" in single_feature.keys()
+ aligned_whisper_feat = self.whisper_aligner.offline_align(
+ np.load(self.utt2whisper_path[utt]), single_feature["target_len"]
+ )
+ single_feature["whisper_feat"] = aligned_whisper_feat
+
+ if self.cfg.model.condition_encoder.use_contentvec:
+ assert "target_len" in single_feature.keys()
+ aligned_contentvec = self.contentvec_aligner.offline_align(
+ np.load(self.utt2contentVec_path[utt]), single_feature["target_len"]
+ )
+ single_feature["contentvec_feat"] = aligned_contentvec
+
+ if self.cfg.model.condition_encoder.use_mert:
+ assert "target_len" in single_feature.keys()
+ aligned_mert_feat = align_content_feature_length(
+ np.load(self.utt2mert_path[utt]),
+ single_feature["target_len"],
+ source_hop=self.cfg.preprocess.mert_hop_size,
+ )
+ single_feature["mert_feat"] = aligned_mert_feat
+
+ if self.cfg.model.condition_encoder.use_wenet:
+ assert "target_len" in single_feature.keys()
+ aligned_wenet_feat = self.wenet_aligner.offline_align(
+ np.load(self.utt2wenet_path[utt]), single_feature["target_len"]
+ )
+ single_feature["wenet_feat"] = aligned_wenet_feat
+
+ if self.cfg.model.condition_encoder.use_hubert:
+ assert "target_len" in single_feature.keys()
+ aligned_hubert_feat = self.hubert_aligner.offline_align(
+ np.load(self.utt2hubert_path[utt]), single_feature["target_len"]
+ )
+ single_feature["hubert_feat"] = aligned_hubert_feat.astype(np.int32)
+
+ # print(single_feature.keys())
+ # for k, v in single_feature.items():
+ # if type(v) in [torch.Tensor, np.ndarray]:
+ # print(k, v.shape)
+ # else:
+ # print(k, v)
+ # exit()
+
+ return self.clip_if_too_long(single_feature)
+
+ def __len__(self):
+ return len(self.metadata)
+
+ def random_select(self, feature_seq_len, max_seq_len, ending_ts=2812):
+ """
+ ending_ts: to avoid invalid whisper features for over 30s audios
+ 2812 = 30 * 24000 // 256
+ """
+ ts = max(feature_seq_len - max_seq_len, 0)
+ ts = min(ts, ending_ts - max_seq_len)
+
+ start = random.randint(0, ts)
+ end = start + max_seq_len
+ return start, end
+
+ def clip_if_too_long(self, sample, max_seq_len=512):
+ """
+ sample :
+ {
+ 'spk_id': (1,),
+ 'target_len': int
+ 'mel': (seq_len, dim),
+ 'frame_pitch': (seq_len,)
+ 'frame_energy': (seq_len,)
+ 'content_vector_feat': (seq_len, dim)
+ }
+ """
+
+ if sample["target_len"] <= max_seq_len:
+ return sample
+
+ start, end = self.random_select(sample["target_len"], max_seq_len)
+ sample["target_len"] = end - start
+
+ for k in sample.keys():
+ if k == "audio":
+ # audio should be clipped in hop_size scale
+ sample[k] = sample[k][
+ start
+ * self.cfg.preprocess.hop_size : end
+ * self.cfg.preprocess.hop_size
+ ]
+ elif k == "audio_len":
+ sample[k] = (end - start) * self.cfg.preprocess.hop_size
+ elif k not in ["spk_id", "target_len", "spkemb"]:
+ sample[k] = sample[k][start:end]
+
+ return sample
+
+
+class VCCollator(BaseCollator):
+ """Zero-pads model inputs and targets based on number of frames per step"""
+
+ def __init__(self, cfg):
+ BaseCollator.__init__(self, cfg)
+
+ def __call__(self, batch):
+ parsed_batch_features = BaseCollator.__call__(self, batch)
+ return parsed_batch_features
+
+
+class VCTestDataset(BaseTestDataset):
+ def __init__(self, args, cfg, infer_type):
+ BaseTestDataset.__init__(self, args, cfg, infer_type)
+ self.metadata = self.get_metadata()
+
+ self.target = args.target
+ self.cfg = cfg
+ self.trans_key = args.trans_key
+
+ self.target_dataset = cfg.dataset[0]
+ if cfg.preprocess.mel_min_max_norm:
+ self.target_mel_extrema = load_mel_extrema(
+ cfg.preprocess, self.target_dataset
+ )
+ self.target_mel_extrema = torch.as_tensor(
+ self.target_mel_extrema[0]
+ ), torch.as_tensor(self.target_mel_extrema[1])
+
+ ######### Load source acoustic features #########
+ if cfg.preprocess.use_spkid:
+ spk2id_path = os.path.join(args.acoustics_dir, cfg.preprocess.spk2id)
+ # utt2sp_path = os.path.join(self.data_root, cfg.preprocess.utt2spk)
+
+ with open(spk2id_path, "r") as f:
+ self.spk2id = json.load(f)
+ # print("self.spk2id", self.spk2id)
+
+ if cfg.preprocess.use_spkemb:
+ self.utt2spk_path = {}
+ for utt_info in self.metadata:
+ dataset = utt_info["Dataset"]
+ uid = utt_info["Uid"]
+ utt = "{}_{}".format(dataset, uid)
+
+ self.utt2spk_path[utt] = os.path.join(
+ cfg.preprocess.processed_dir,
+ dataset,
+ cfg.preprocess.speaker_dir,
+ uid + ".npy",
+ )
+
+ if cfg.preprocess.use_uv:
+ self.utt2uv_path = {
+ f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
+ cfg.preprocess.processed_dir,
+ utt_info["Dataset"],
+ cfg.preprocess.uv_dir,
+ utt_info["Uid"] + ".npy",
+ )
+ for utt_info in self.metadata
+ }
+
+ if cfg.preprocess.use_frame_pitch:
+ self.utt2frame_pitch_path = {
+ f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
+ cfg.preprocess.processed_dir,
+ utt_info["Dataset"],
+ cfg.preprocess.pitch_dir,
+ utt_info["Uid"] + ".npy",
+ )
+ for utt_info in self.metadata
+ }
+
+ if cfg.preprocess.use_frame_energy:
+ self.utt2frame_energy_path = {
+ f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
+ cfg.preprocess.processed_dir,
+ utt_info["Dataset"],
+ cfg.preprocess.energy_dir,
+ utt_info["Uid"] + ".npy",
+ )
+ for utt_info in self.metadata
+ }
+
+ if cfg.preprocess.use_mel:
+ self.utt2mel_path = {
+ f'{utt_info["Dataset"]}_{utt_info["Uid"]}': os.path.join(
+ cfg.preprocess.processed_dir,
+ utt_info["Dataset"],
+ cfg.preprocess.mel_dir,
+ utt_info["Uid"] + ".npy",
+ )
+ for utt_info in self.metadata
+ }
+
+ ######### Load source content features' path #########
+ if cfg.model.condition_encoder.use_whisper:
+ self.whisper_aligner = WhisperExtractor(cfg)
+ self.utt2whisper_path = load_content_feature_path(
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.whisper_dir
+ )
+
+ if cfg.model.condition_encoder.use_contentvec:
+ self.contentvec_aligner = ContentvecExtractor(cfg)
+ self.utt2contentVec_path = load_content_feature_path(
+ self.metadata,
+ cfg.preprocess.processed_dir,
+ cfg.preprocess.contentvec_dir,
+ )
+
+ if cfg.model.condition_encoder.use_mert:
+ self.utt2mert_path = load_content_feature_path(
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.mert_dir
+ )
+ if cfg.model.condition_encoder.use_wenet:
+ self.wenet_aligner = WenetExtractor(cfg)
+ self.utt2wenet_path = load_content_feature_path(
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.wenet_dir
+ )
+ if cfg.model.condition_encoder.use_hubert:
+ self.hubert_aligner = HubertExtractor(cfg)
+ self.utt2hubert_path = load_content_feature_path(
+ self.metadata, cfg.preprocess.processed_dir, cfg.preprocess.hubert_dir
+ )
+
+ def __getitem__(self, index):
+ single_feature = {}
+
+ utt_info = self.metadata[index]
+ dataset = utt_info["Dataset"]
+ uid = utt_info["Uid"]
+ utt = "{}_{}".format(dataset, uid)
+
+ source_dataset = self.metadata[index]["Dataset"]
+
+ if self.cfg.preprocess.use_spkid:
+ single_feature["spk_id"] = np.array(
+ [self.spk2id[f"{self.target_dataset}_{self.target_singer}"]],
+ dtype=np.int32,
+ )
+
+ if self.cfg.preprocess.use_spkemb:
+ voice_encoder = resemblyzer.VoiceEncoder("cpu", verbose=False)
+ target_wav = resemblyzer.preprocess_wav(self.target)
+ single_feature["spkemb"] = voice_encoder.embed_utterance(target_wav)
+
+ ######### Get Acoustic Features Item #########
+ if self.cfg.preprocess.use_mel:
+ mel = np.load(self.utt2mel_path[utt])
+ assert mel.shape[0] == self.cfg.preprocess.n_mel # [n_mels, T]
+ if self.cfg.preprocess.use_min_max_norm_mel:
+ # mel norm
+ mel = cal_normalized_mel(mel, source_dataset, self.cfg.preprocess)
+
+ if "target_len" not in single_feature.keys():
+ single_feature["target_len"] = mel.shape[1]
+ single_feature["mel"] = mel.T # [T, n_mels]
+
+ if self.cfg.preprocess.use_frame_pitch:
+ frame_pitch_path = self.utt2frame_pitch_path[utt]
+ frame_pitch = np.load(frame_pitch_path)
+
+ if "target_len" not in single_feature.keys():
+ single_feature["target_len"] = len(frame_pitch)
+ aligned_frame_pitch = align_length(
+ frame_pitch, single_feature["target_len"]
+ )
+ single_feature["frame_pitch"] = aligned_frame_pitch
+ scaler = MinMaxScaler()
+ scaler.fit(single_feature["frame_pitch"].reshape(-1, 1))
+ single_feature["frame_pitch"] = scaler.transform(
+ single_feature["frame_pitch"].reshape(-1, 1)
+ )
+ single_feature["frame_pitch"] = single_feature["frame_pitch"].reshape(-1)
+
+ if self.cfg.preprocess.use_uv:
+ frame_uv_path = self.utt2uv_path[utt]
+ frame_uv = np.load(frame_uv_path)
+ aligned_frame_uv = align_length(frame_uv, single_feature["target_len"])
+            aligned_frame_uv = [0 if uv else 1 for uv in aligned_frame_uv]
+ aligned_frame_uv = np.array(aligned_frame_uv)
+ single_feature["frame_uv"] = aligned_frame_uv
+
+ if self.cfg.preprocess.use_frame_energy:
+ frame_energy_path = self.utt2frame_energy_path[utt]
+ frame_energy = np.load(frame_energy_path)
+ if "target_len" not in single_feature.keys():
+ single_feature["target_len"] = len(frame_energy)
+ aligned_frame_energy = align_length(
+ frame_energy, single_feature["target_len"]
+ )
+ single_feature["frame_energy"] = aligned_frame_energy
+
+ ######### Get Content Features Item #########
+ if self.cfg.model.condition_encoder.use_whisper:
+ assert "target_len" in single_feature.keys()
+ aligned_whisper_feat = self.whisper_aligner.offline_align(
+ np.load(self.utt2whisper_path[utt]), single_feature["target_len"]
+ )
+ single_feature["whisper_feat"] = aligned_whisper_feat
+
+ if self.cfg.model.condition_encoder.use_contentvec:
+ assert "target_len" in single_feature.keys()
+ aligned_contentvec = self.contentvec_aligner.offline_align(
+ np.load(self.utt2contentVec_path[utt]), single_feature["target_len"]
+ )
+ single_feature["contentvec_feat"] = aligned_contentvec
+
+ if self.cfg.model.condition_encoder.use_mert:
+ assert "target_len" in single_feature.keys()
+ aligned_mert_feat = align_content_feature_length(
+ np.load(self.utt2mert_path[utt]),
+ single_feature["target_len"],
+ source_hop=self.cfg.preprocess.mert_hop_size,
+ )
+ single_feature["mert_feat"] = aligned_mert_feat
+
+ if self.cfg.model.condition_encoder.use_wenet:
+ assert "target_len" in single_feature.keys()
+ aligned_wenet_feat = self.wenet_aligner.offline_align(
+ np.load(self.utt2wenet_path[utt]), single_feature["target_len"]
+ )
+ single_feature["wenet_feat"] = aligned_wenet_feat
+
+ if self.cfg.model.condition_encoder.use_hubert:
+ assert "target_len" in single_feature.keys()
+ aligned_hubert_feat = self.hubert_aligner.offline_align(
+ np.load(self.utt2hubert_path[utt]), single_feature["target_len"]
+ )
+ single_feature["hubert_feat"] = aligned_hubert_feat.astype(np.int32)
+
+ return single_feature
+
+ def __len__(self):
+ return len(self.metadata)
+
+
+class VCTestCollator:
+ """Zero-pads model inputs and targets based on number of frames per step"""
+
+ def __init__(self, cfg):
+ self.cfg = cfg
+
+ def __call__(self, batch):
+ packed_batch_features = dict()
+
+        # mel: [b, T, n_mels]
+        # frame_pitch, frame_energy: [b, T]
+        # target_len: [b]
+        # spk_id: [b, 1]
+        # mask: [b, T, 1]
+
+ for key in batch[0].keys():
+ if key == "target_len":
+ packed_batch_features["target_len"] = torch.LongTensor(
+ [b["target_len"] for b in batch]
+ )
+ masks = [
+ torch.ones((b["target_len"], 1), dtype=torch.long) for b in batch
+ ]
+ packed_batch_features["mask"] = pad_sequence(
+ masks, batch_first=True, padding_value=0
+ )
+ else:
+ values = [torch.from_numpy(b[key]) for b in batch]
+ packed_batch_features[key] = pad_sequence(
+ values, batch_first=True, padding_value=0
+ )
+
+ return packed_batch_features
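+
+
+if __name__ == "__main__":
+    # Minimal shape check for VCTestCollator with two fake items (hypothetical
+    # feature sizes, not real preprocessed data): every per-frame feature is
+    # zero-padded to the longest length in the batch.
+    fake_batch = [
+        {"mel": np.random.randn(100, 80).astype(np.float32), "target_len": 100},
+        {"mel": np.random.randn(80, 80).astype(np.float32), "target_len": 80},
+    ]
+    out = VCTestCollator(cfg=None)(fake_batch)
+    print(out["mel"].shape)  # torch.Size([2, 100, 80])
+    print(out["target_len"])  # tensor([100,  80])
+    print(out["mask"].shape)  # torch.Size([2, 100, 1])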
diff --git a/models/vc/base/vc_inference.py b/models/vc/base/vc_inference.py
new file mode 100644
index 00000000..c7eee255
--- /dev/null
+++ b/models/vc/base/vc_inference.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import os
+import torch
+
+from models.base.new_inference import BaseInference
+from models.vc.base.vc_dataset import VCTestCollator, VCTestDataset
+
+from utils.io import save_audio
+from utils.util import load_config
+from utils.audio_slicer import is_silence
+from models.vocoders.vocoder_inference import synthesis
+
+EPS = 1.0e-12
+
+
+class VCInference(BaseInference):
+ def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
+ BaseInference.__init__(self, args, cfg, infer_type)
+
+ def _build_test_dataset(self):
+ return VCTestDataset, VCTestCollator
+
+ @torch.inference_mode()
+ def inference(self):
+ for i, batch in enumerate(self.test_dataloader):
+ y_pred = self._inference_each_batch(batch).cpu()
+ mel_min, mel_max = self.test_dataset.target_mel_extrema
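+            # Undo the [-1, 1] min-max normalization: map predictions back to the
+            # target singer's mel range [mel_min, mel_max] (EPS guards against a
+            # zero-width range).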
+ y_pred = (y_pred + 1.0) / 2.0 * (mel_max - mel_min + EPS) + mel_min
+ y_ls = y_pred.chunk(self.test_batch_size)
+ tgt_ls = batch["target_len"].cpu().chunk(self.test_batch_size)
+ j = 0
+ for it, l in zip(y_ls, tgt_ls):
+ l = l.item()
+ it = it.squeeze(0)[:l]
+ uid = self.test_dataset.metadata[i * self.test_batch_size + j]["Uid"]
+ torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt"))
+ j += 1
+
+ vocoder_cfg = load_config(
+ os.path.join(self.args.vocoder_dir, "args.json"), lowercase=True
+ )
+
+ res = synthesis(
+ cfg=vocoder_cfg,
+ vocoder_weight_file=self.args.vocoder_dir,
+ n_samples=None,
+ pred=[
+ torch.load(
+ os.path.join(self.args.output_dir, "{}.pt".format(i["Uid"]))
+ ).numpy(force=True)
+ for i in self.test_dataset.metadata
+ ],
+ )
+
+ output_audio_files = []
+ for it, wav in zip(self.test_dataset.metadata, res):
+ uid = it["Uid"]
+ file = os.path.join(self.args.output_dir, f"{uid}.wav")
+ output_audio_files.append(file)
+
+ wav = wav.numpy(force=True)
+ save_audio(
+ file,
+ wav,
+ self.cfg.preprocess.sample_rate,
+ add_silence=False,
+ turn_up=not is_silence(wav, self.cfg.preprocess.sample_rate),
+ )
+ os.remove(os.path.join(self.args.output_dir, f"{uid}.pt"))
+
+ return sorted(output_audio_files)
diff --git a/models/vc/base/vc_trainer.py b/models/vc/base/vc_trainer.py
new file mode 100644
index 00000000..48800284
--- /dev/null
+++ b/models/vc/base/vc_trainer.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import os
+
+import torch
+import torch.nn as nn
+
+from models.base.new_trainer import BaseTrainer
+from models.vc.base.vc_dataset import VCCollator, VCDataset
+
+
+class VCTrainer(BaseTrainer):
+ r"""The base trainer for all SVC models. It inherits from BaseTrainer and implements
+ ``build_criterion``, ``_build_dataset`` and ``_build_singer_lut`` methods. You can inherit from this
+ class, and implement ``_build_model``, ``_forward_step``.
+ """
+
+ def __init__(self, args=None, cfg=None):
+ self.args = args
+ self.cfg = cfg
+
+ self._init_accelerator()
+
+        # Only for VC tasks
+ with self.accelerator.main_process_first():
+ self.singers = self._build_singer_lut()
+
+ # Super init
+ BaseTrainer.__init__(self, args, cfg)
+
+        # Only for VC tasks
+        self.task_type = "VC"
+        self.logger.info("Task type: {}".format(self.task_type))
+
+    ### Following are methods only for VC tasks ###
+ # TODO: LEGACY CODE, NEED TO BE REFACTORED
+ def _build_dataset(self):
+ return VCDataset, VCCollator
+
+ @staticmethod
+ def _build_criterion():
+ criterion = nn.MSELoss(reduction="none")
+ return criterion
+
+ @staticmethod
+ def _compute_loss(criterion, y_pred, y_gt, loss_mask):
+ """
+ Args:
+ criterion: MSELoss(reduction='none')
+ y_pred, y_gt: (bs, seq_len, D)
+ loss_mask: (bs, seq_len, 1)
+ Returns:
+ loss: Tensor of shape []
+ """
+
+ # (bs, seq_len, D)
+ loss = criterion(y_pred, y_gt)
+ # expand loss_mask to (bs, seq_len, D)
+ loss_mask = loss_mask.repeat(1, 1, loss.shape[-1])
+
+ loss = torch.sum(loss * loss_mask) / torch.sum(loss_mask)
+ return loss
+
+ def _save_auxiliary_states(self):
+ """
+ To save the singer's look-up table in the checkpoint saving path
+ """
+ with open(
+ os.path.join(self.tmp_checkpoint_save_path, self.cfg.preprocess.spk2id), "w"
+ ) as f:
+ json.dump(self.singers, f, indent=4, ensure_ascii=False)
+
+ def _build_singer_lut(self):
+ resumed_singer_path = None
+ if self.args.resume_from_ckpt_path and self.args.resume_from_ckpt_path != "":
+ resumed_singer_path = os.path.join(
+ self.args.resume_from_ckpt_path, self.cfg.preprocess.spk2id
+ )
+ if os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)):
+ resumed_singer_path = os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
+
+ if resumed_singer_path:
+ with open(resumed_singer_path, "r") as f:
+ singers = json.load(f)
+ else:
+ singers = dict()
+
+ for dataset in self.cfg.dataset:
+ singer_lut_path = os.path.join(
+ self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
+ )
+ with open(singer_lut_path, "r") as singer_lut_path:
+ singer_lut = json.load(singer_lut_path)
+ for singer in singer_lut.keys():
+ if singer not in singers:
+ singers[singer] = len(singers)
+
+ with open(
+ os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "w"
+ ) as singer_file:
+ json.dump(singers, singer_file, indent=4, ensure_ascii=False)
+ print(
+ "singers have been dumped to {}".format(
+ os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
+ )
+ )
+ return singers
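+
+
+# Example of the merged singer look-up table written to <exp_dir>/spk2id (the
+# dataset and singer names below are hypothetical):
+# {"vctk_p225": 0, "vctk_p226": 1, "libritts_19": 2}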
diff --git a/models/vc/transformer/__init__.py b/models/vc/transformer/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/models/vc/transformer/conformer.py b/models/vc/transformer/conformer.py
new file mode 100644
index 00000000..5e48019c
--- /dev/null
+++ b/models/vc/transformer/conformer.py
@@ -0,0 +1,405 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import torch
+import numpy as np
+import torch.nn as nn
+from utils.util import convert_pad_shape
+
+
+class BaseModule(torch.nn.Module):
+ def __init__(self):
+ super(BaseModule, self).__init__()
+
+ @property
+ def nparams(self):
+ """
+ Returns number of trainable parameters of the module.
+ """
+ num_params = 0
+ for name, param in self.named_parameters():
+ if param.requires_grad:
+ num_params += np.prod(param.detach().cpu().numpy().shape)
+ return num_params
+
+ def relocate_input(self, x: list):
+ """
+ Relocates provided tensors to the same device set for the module.
+ """
+ device = next(self.parameters()).device
+ for i in range(len(x)):
+ if isinstance(x[i], torch.Tensor) and x[i].device != device:
+ x[i] = x[i].to(device)
+ return x
+
+
+class LayerNorm(BaseModule):
+ def __init__(self, channels, eps=1e-4):
+ super(LayerNorm, self).__init__()
+ self.channels = channels
+ self.eps = eps
+
+ self.gamma = torch.nn.Parameter(torch.ones(channels))
+ self.beta = torch.nn.Parameter(torch.zeros(channels))
+
+ def forward(self, x):
+ n_dims = len(x.shape)
+ mean = torch.mean(x, 1, keepdim=True)
+ variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
+
+ x = (x - mean) * torch.rsqrt(variance + self.eps)
+
+ shape = [1, -1] + [1] * (n_dims - 2)
+ x = x * self.gamma.view(*shape) + self.beta.view(*shape)
+ return x
+
+
+class ConvReluNorm(BaseModule):
+ def __init__(
+ self,
+ in_channels,
+ hidden_channels,
+ out_channels,
+ kernel_size,
+ n_layers,
+ p_dropout,
+ eps=1e-5,
+ ):
+ super(ConvReluNorm, self).__init__()
+ self.in_channels = in_channels
+ self.hidden_channels = hidden_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.p_dropout = p_dropout
+ self.eps = eps
+
+ self.conv_layers = torch.nn.ModuleList()
+ self.conv_layers.append(
+ torch.nn.Conv1d(
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
+ )
+ )
+ self.relu_drop = torch.nn.Sequential(
+ torch.nn.ReLU(), torch.nn.Dropout(p_dropout)
+ )
+ for _ in range(n_layers - 1):
+ self.conv_layers.append(
+ torch.nn.Conv1d(
+ hidden_channels,
+ hidden_channels,
+ kernel_size,
+ padding=kernel_size // 2,
+ )
+ )
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1)
+ self.proj.weight.data.zero_()
+ self.proj.bias.data.zero_()
+
+ def forward(self, x, x_mask):
+ for i in range(self.n_layers):
+ x = self.conv_layers[i](x * x_mask)
+ x = self.instance_norm(x, x_mask)
+ x = self.relu_drop(x)
+ x = self.proj(x)
+ return x * x_mask
+
+ def instance_norm(self, x, mask, return_mean_std=False):
+ mean, std = self.calc_mean_std(x, mask)
+ x = (x - mean) / std
+ if return_mean_std:
+ return x, mean, std
+ else:
+ return x
+
+ def calc_mean_std(self, x, mask=None):
+ x = x * mask
+ B, C = x.shape[:2]
+ mn = x.view(B, C, -1).mean(-1)
+ sd = (x.view(B, C, -1).var(-1) + self.eps).sqrt()
+ mn = mn.view(B, C, *((len(x.shape) - 2) * [1]))
+ sd = sd.view(B, C, *((len(x.shape) - 2) * [1]))
+ return mn, sd
+
+
+class MultiHeadAttention(BaseModule):
+ def __init__(
+ self,
+ channels,
+ out_channels,
+ n_heads,
+ window_size=None,
+ heads_share=True,
+ p_dropout=0.0,
+ proximal_bias=False,
+ proximal_init=False,
+ ):
+ super(MultiHeadAttention, self).__init__()
+ assert channels % n_heads == 0
+
+ self.channels = channels
+ self.out_channels = out_channels
+ self.n_heads = n_heads
+ self.window_size = window_size
+ self.heads_share = heads_share
+ self.proximal_bias = proximal_bias
+ self.p_dropout = p_dropout
+ self.attn = None
+
+ self.k_channels = channels // n_heads
+ self.conv_q = torch.nn.Conv1d(channels, channels, 1)
+ self.conv_k = torch.nn.Conv1d(channels, channels, 1)
+ self.conv_v = torch.nn.Conv1d(channels, channels, 1)
+ if window_size is not None:
+ n_heads_rel = 1 if heads_share else n_heads
+ rel_stddev = self.k_channels**-0.5
+ self.emb_rel_k = torch.nn.Parameter(
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+ * rel_stddev
+ )
+ self.emb_rel_v = torch.nn.Parameter(
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+ * rel_stddev
+ )
+ self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
+ self.drop = torch.nn.Dropout(p_dropout)
+
+ torch.nn.init.xavier_uniform_(self.conv_q.weight)
+ torch.nn.init.xavier_uniform_(self.conv_k.weight)
+ if proximal_init:
+ self.conv_k.weight.data.copy_(self.conv_q.weight.data)
+ self.conv_k.bias.data.copy_(self.conv_q.bias.data)
+ torch.nn.init.xavier_uniform_(self.conv_v.weight)
+
+ def forward(self, x, c, attn_mask=None):
+ q = self.conv_q(x)
+ k = self.conv_k(c)
+ v = self.conv_v(c)
+
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+ x = self.conv_o(x)
+ return x
+
+ def attention(self, query, key, value, mask=None):
+ b, d, t_s, t_t = (*key.size(), query.size(2))
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+ scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
+ if self.window_size is not None:
+ assert (
+ t_s == t_t
+ ), "Relative attention is only available for self-attention."
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+ rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings)
+ rel_logits = self._relative_position_to_absolute_position(rel_logits)
+ scores_local = rel_logits / math.sqrt(self.k_channels)
+ scores = scores + scores_local
+ if self.proximal_bias:
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
+ scores = scores + self._attention_bias_proximal(t_s).to(
+ device=scores.device, dtype=scores.dtype
+ )
+ if mask is not None:
+ scores = scores.masked_fill(mask == 0, -1e4)
+ p_attn = torch.nn.functional.softmax(scores, dim=-1)
+ p_attn = self.drop(p_attn)
+ output = torch.matmul(p_attn, value)
+ if self.window_size is not None:
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
+ value_relative_embeddings = self._get_relative_embeddings(
+ self.emb_rel_v, t_s
+ )
+ output = output + self._matmul_with_relative_values(
+ relative_weights, value_relative_embeddings
+ )
+ output = output.transpose(2, 3).contiguous().view(b, d, t_t)
+ return output, p_attn
+
+ def _matmul_with_relative_values(self, x, y):
+ ret = torch.matmul(x, y.unsqueeze(0))
+ return ret
+
+ def _matmul_with_relative_keys(self, x, y):
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+ return ret
+
+ def _get_relative_embeddings(self, relative_embeddings, length):
+ pad_length = max(length - (self.window_size + 1), 0)
+ slice_start_position = max((self.window_size + 1) - length, 0)
+ slice_end_position = slice_start_position + 2 * length - 1
+ if pad_length > 0:
+ padded_relative_embeddings = torch.nn.functional.pad(
+ relative_embeddings,
+ convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+ )
+ else:
+ padded_relative_embeddings = relative_embeddings
+ used_relative_embeddings = padded_relative_embeddings[
+ :, slice_start_position:slice_end_position
+ ]
+ return used_relative_embeddings
+
+ def _relative_position_to_absolute_position(self, x):
+ batch, heads, length, _ = x.size()
+ x = torch.nn.functional.pad(
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
+ )
+ x_flat = x.view([batch, heads, length * 2 * length])
+ x_flat = torch.nn.functional.pad(
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+ )
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
+ :, :, :length, length - 1 :
+ ]
+ return x_final
+
+ def _absolute_position_to_relative_position(self, x):
+ batch, heads, length, _ = x.size()
+ x = torch.nn.functional.pad(
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+ )
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+ x_flat = torch.nn.functional.pad(
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])
+ )
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+ return x_final
+
+ def _attention_bias_proximal(self, length):
+ r = torch.arange(length, dtype=torch.float32)
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
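+        # The bias above is -log(1 + |i - j|): 0 on the diagonal and increasingly
+        # negative for distant positions, softly steering attention toward nearby
+        # frames when proximal_bias is enabled.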
+
+
+class FFN(BaseModule):
+ def __init__(
+ self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0
+ ):
+ super(FFN, self).__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+
+ self.conv_1 = torch.nn.Conv1d(
+ in_channels, filter_channels, kernel_size, padding=kernel_size // 2
+ )
+ self.conv_2 = torch.nn.Conv1d(
+ filter_channels, out_channels, kernel_size, padding=kernel_size // 2
+ )
+ self.drop = torch.nn.Dropout(p_dropout)
+
+ def forward(self, x, x_mask):
+ x = self.conv_1(x * x_mask)
+ x = torch.relu(x)
+ x = self.drop(x)
+ x = self.conv_2(x * x_mask)
+ return x * x_mask
+
+
+class Encoder(BaseModule):
+ def __init__(
+ self,
+ hidden_channels,
+ filter_channels,
+ n_heads=2,
+ n_layers=6,
+ kernel_size=3,
+ p_dropout=0.1,
+ window_size=4,
+ **kwargs
+ ):
+ super(Encoder, self).__init__()
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.window_size = window_size
+
+ self.drop = torch.nn.Dropout(p_dropout)
+ self.attn_layers = torch.nn.ModuleList()
+ self.norm_layers_1 = torch.nn.ModuleList()
+ self.ffn_layers = torch.nn.ModuleList()
+ self.norm_layers_2 = torch.nn.ModuleList()
+ for _ in range(self.n_layers):
+ self.attn_layers.append(
+ MultiHeadAttention(
+ hidden_channels,
+ hidden_channels,
+ n_heads,
+ window_size=window_size,
+ p_dropout=p_dropout,
+ )
+ )
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
+ self.ffn_layers.append(
+ FFN(
+ hidden_channels,
+ hidden_channels,
+ filter_channels,
+ kernel_size,
+ p_dropout=p_dropout,
+ )
+ )
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+ def forward(self, x, x_mask):
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ for i in range(self.n_layers):
+ x = x * x_mask
+ y = self.attn_layers[i](x, x, attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_1[i](x + y)
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.drop(y)
+ x = self.norm_layers_2[i](x + y)
+ x = x * x_mask
+ return x
+
+
+class Conformer(BaseModule):
+ def __init__(self, cfg):
+ super().__init__()
+ self.cfg = cfg
+ self.n_heads = self.cfg.n_heads
+ self.n_layers = self.cfg.n_layers
+ self.hidden_channels = self.cfg.input_dim
+ self.filter_channels = self.cfg.filter_channels
+ self.output_dim = self.cfg.output_dim
+ self.dropout = self.cfg.dropout
+
+ self.conformer_encoder = Encoder(
+ self.hidden_channels,
+ self.filter_channels,
+ n_heads=self.n_heads,
+ n_layers=self.n_layers,
+ kernel_size=3,
+ p_dropout=self.dropout,
+ window_size=4,
+ )
+ self.projection = nn.Conv1d(self.hidden_channels, self.output_dim, 1)
+
+ def forward(self, x, x_mask):
+ """
+ Args:
+ x: (N, seq_len, input_dim)
+ Returns:
+ output: (N, seq_len, output_dim)
+ """
+        # (N, seq_len, input_dim) -> (N, input_dim, seq_len) for the conv/attention stack
+        x = x.transpose(1, 2)
+        x_mask = x_mask.transpose(1, 2)
+        output = self.conformer_encoder(x, x_mask)
+        # (N, output_dim, seq_len)
+        output = self.projection(output)
+        # back to (N, seq_len, output_dim)
+        output = output.transpose(1, 2)
+        return output
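+
+
+if __name__ == "__main__":
+    # Minimal smoke test sketch showing the expected in/out shapes. The config
+    # values below are hypothetical, chosen only so the module runs standalone.
+    from types import SimpleNamespace
+
+    cfg = SimpleNamespace(
+        n_heads=2, n_layers=2, input_dim=64, filter_channels=128, output_dim=80, dropout=0.1
+    )
+    model = Conformer(cfg)
+    x = torch.randn(3, 100, 64)  # (N, seq_len, input_dim)
+    x_mask = torch.ones(3, 100, 1)  # (N, seq_len, 1)
+    y = model(x, x_mask)
+    print(y.shape)  # torch.Size([3, 100, 80])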
diff --git a/models/vc/transformer/transformer.py b/models/vc/transformer/transformer.py
new file mode 100644
index 00000000..fd3cdb6c
--- /dev/null
+++ b/models/vc/transformer/transformer.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import torch
+import torch.nn as nn
+from torch.nn import TransformerEncoder, TransformerEncoderLayer
+
+
+class Transformer(nn.Module):
+ def __init__(self, cfg):
+ super().__init__()
+ self.cfg = cfg
+
+ dropout = self.cfg.dropout
+ nhead = self.cfg.n_heads
+ nlayers = self.cfg.n_layers
+ input_dim = self.cfg.input_dim
+ output_dim = self.cfg.output_dim
+
+ d_model = input_dim
+ self.pos_encoder = PositionalEncoding(d_model, dropout)
+ encoder_layers = TransformerEncoderLayer(
+ d_model, nhead, dropout=dropout, batch_first=True
+ )
+ self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
+
+ self.output_mlp = nn.Linear(d_model, output_dim)
+
+ def forward(self, x, mask=None):
+ """
+ Args:
+ x: (N, seq_len, input_dim)
+ Returns:
+ output: (N, seq_len, output_dim)
+ """
+ # (N, seq_len, d_model)
+ src = self.pos_encoder(x)
+ # model_stats["pos_embedding"] = x
+ # (N, seq_len, d_model)
+ output = self.transformer_encoder(src)
+ # (N, seq_len, output_dim)
+ output = self.output_mlp(output)
+ return output
+
+
+class PositionalEncoding(nn.Module):
+ def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
+ super().__init__()
+ self.dropout = nn.Dropout(p=dropout)
+
+ position = torch.arange(max_len).unsqueeze(1)
+ div_term = torch.exp(
+ torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
+ )
+
+ # Assume that x is (seq_len, N, d)
+ # pe = torch.zeros(max_len, 1, d_model)
+ # pe[:, 0, 0::2] = torch.sin(position * div_term)
+ # pe[:, 0, 1::2] = torch.cos(position * div_term)
+
+ # Assume that x in (N, seq_len, d)
+ pe = torch.zeros(1, max_len, d_model)
+ pe[0, :, 0::2] = torch.sin(position * div_term)
+ pe[0, :, 1::2] = torch.cos(position * div_term)
+
+ self.register_buffer("pe", pe)
+
+ def forward(self, x):
+ """
+ Args:
+ x: Tensor, shape [N, seq_len, d]
+ """
+ # Old: Assume that x is (seq_len, N, d), and self.pe is (max_len, 1, d_model)
+ # x = x + self.pe[: x.size(0)]
+
+ # Now: self.pe is (1, max_len, d)
+ x = x + self.pe[:, : x.size(1), :]
+
+ return self.dropout(x)
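+
+
+if __name__ == "__main__":
+    # Minimal smoke test sketch showing the expected in/out shapes. The config
+    # values below are hypothetical, chosen only so the module runs standalone.
+    from types import SimpleNamespace
+
+    cfg = SimpleNamespace(dropout=0.1, n_heads=2, n_layers=2, input_dim=64, output_dim=80)
+    model = Transformer(cfg)
+    x = torch.randn(3, 100, 64)  # (N, seq_len, input_dim)
+    y = model(x)
+    print(y.shape)  # torch.Size([3, 100, 80])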
diff --git a/models/vc/transformer/transformer_inference.py b/models/vc/transformer/transformer_inference.py
new file mode 100644
index 00000000..f1c8f943
--- /dev/null
+++ b/models/vc/transformer/transformer_inference.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import time
+import numpy as np
+import torch
+from tqdm import tqdm
+import torch.nn as nn
+from collections import OrderedDict
+
+from models.vc.base import VCInference
+from modules.encoder.condition_encoder import ConditionEncoder
+from models.vc.transformer.transformer import Transformer
+from models.vc.transformer.conformer import Conformer
+
+
+class TransformerInference(VCInference):
+ def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
+ VCInference.__init__(self, args, cfg, infer_type)
+
+ def _build_model(self):
+ self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
+ self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
+ self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
+ if self.cfg.model.transformer.type == "transformer":
+ self.acoustic_mapper = Transformer(self.cfg.model.transformer)
+ elif self.cfg.model.transformer.type == "conformer":
+ self.acoustic_mapper = Conformer(self.cfg.model.transformer)
+ else:
+ raise NotImplementedError
+ model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])
+ return model
+
+ def _inference_each_batch(self, batch_data):
+ device = self.accelerator.device
+ for k, v in batch_data.items():
+ batch_data[k] = v.to(device)
+
+ condition = self.condition_encoder(batch_data)
+ y_pred = self.acoustic_mapper(condition, batch_data["mask"])
+
+ return y_pred
diff --git a/models/vc/transformer/transformer_trainer.py b/models/vc/transformer/transformer_trainer.py
new file mode 100644
index 00000000..247db706
--- /dev/null
+++ b/models/vc/transformer/transformer_trainer.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from models.vc.base import VCTrainer
+from modules.encoder.condition_encoder import ConditionEncoder
+from models.vc.transformer.transformer import Transformer
+from models.vc.transformer.conformer import Conformer
+from utils.ssim import SSIM
+
+
+class TransformerTrainer(VCTrainer):
+ def __init__(self, args, cfg):
+ VCTrainer.__init__(self, args, cfg)
+ self.ssim_loss = SSIM()
+
+ def _build_model(self):
+ # self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
+ # self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
+ self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
+ if self.cfg.model.transformer.type == "transformer":
+ self.acoustic_mapper = Transformer(self.cfg.model.transformer)
+ elif self.cfg.model.transformer.type == "conformer":
+ self.acoustic_mapper = Conformer(self.cfg.model.transformer)
+ else:
+ raise NotImplementedError
+ model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])
+ return model
+
+ def _forward_step(self, batch):
+ total_loss = 0
+ device = self.accelerator.device
+ mel = batch["mel"]
+ mask = batch["mask"]
+
+ condition = self.condition_encoder(batch)
+ mel_pred = self.acoustic_mapper(condition, mask)
+
+ l1_loss = torch.sum(torch.abs(mel_pred - mel) * batch["mask"]) / torch.sum(
+ batch["mask"]
+ )
+ self._check_nan(l1_loss, mel_pred, mel)
+ total_loss += l1_loss
+ ssim_loss = self.ssim_loss(mel_pred, mel)
+ ssim_loss = torch.sum(ssim_loss * batch["mask"]) / torch.sum(batch["mask"])
+ self._check_nan(ssim_loss, mel_pred, mel)
+ total_loss += ssim_loss
+
+ return total_loss
diff --git a/models/vc/vits/__init__.py b/models/vc/vits/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/models/vc/vits/vits.py b/models/vc/vits/vits.py
new file mode 100644
index 00000000..29088b42
--- /dev/null
+++ b/models/vc/vits/vits.py
@@ -0,0 +1,294 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This code is modified from https://github.com/svc-develop-team/so-vits-svc/blob/4.1-Stable/models.py
+import copy
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from utils.util import *
+from utils.f0 import f0_to_coarse
+
+from modules.transformer.attentions import Encoder
+from models.tts.vits.vits import ResidualCouplingBlock, PosteriorEncoder
+from models.vocoders.gan.generator.bigvgan import BigVGAN
+from models.vocoders.gan.generator.hifigan import HiFiGAN
+from models.vocoders.gan.generator.nsfhifigan import NSFHiFiGAN
+from models.vocoders.gan.generator.melgan import MelGAN
+from models.vocoders.gan.generator.apnet import APNet
+from modules.encoder.condition_encoder import ConditionEncoder
+
+
+def slice_pitch_segments(x, ids_str, segment_size=4):
+ ret = torch.zeros_like(x[:, :segment_size])
+ for i in range(x.size(0)):
+ idx_str = ids_str[i]
+ idx_end = idx_str + segment_size
+ ret[i] = x[i, idx_str:idx_end]
+ return ret
+
+
+def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size + 1
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size)
+ ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size)
+ return ret, ret_pitch, ids_str
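+    # Example (hypothetical sizes): with x of shape (B, D, T=400), pitch of shape
+    # (B, 400) and segment_size=32, each item draws a random start in
+    # [0, 400 - 32], and the same start indices crop both the latent z and the
+    # pitch contour so they stay frame-aligned.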
+
+
+class ContentEncoder(nn.Module):
+ def __init__(
+ self,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ n_layers,
+ gin_channels=0,
+ filter_channels=None,
+ n_heads=None,
+ p_dropout=None,
+ ):
+ super().__init__()
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.f0_emb = nn.Embedding(256, hidden_channels)
+
+ self.enc_ = Encoder(
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+ )
+
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ # condition_encoder ver.
+    def forward(self, x, x_mask, noise_scale=1):
+        x = self.enc_(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs) * noise_scale) * x_mask
+
+ return z, m, logs, x_mask
+
+
+class SynthesizerTrn(nn.Module):
+ """
+ Synthesizer for Training
+ """
+
+ def __init__(self, spec_channels, segment_size, cfg):
+ super().__init__()
+ self.spec_channels = spec_channels
+ self.segment_size = segment_size
+ self.cfg = cfg
+ self.inter_channels = cfg.model.vits.inter_channels
+ self.hidden_channels = cfg.model.vits.hidden_channels
+ self.filter_channels = cfg.model.vits.filter_channels
+ self.n_heads = cfg.model.vits.n_heads
+ self.n_layers = cfg.model.vits.n_layers
+ self.kernel_size = cfg.model.vits.kernel_size
+ self.p_dropout = cfg.model.vits.p_dropout
+ self.ssl_dim = cfg.model.vits.ssl_dim
+ self.n_flow_layer = cfg.model.vits.n_flow_layer
+ self.gin_channels = cfg.model.vits.gin_channels
+ self.n_speakers = cfg.model.vits.n_speakers
+
+ # f0
+ self.n_bins = cfg.preprocess.pitch_bin
+ self.f0_min = cfg.preprocess.f0_min
+ self.f0_max = cfg.preprocess.f0_max
+
+ # TODO: sort out the config
+ self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
+
+ self.emb_g = nn.Embedding(self.n_speakers, self.gin_channels)
+
+ self.enc_p = ContentEncoder(
+ self.inter_channels,
+ self.hidden_channels,
+ filter_channels=self.filter_channels,
+ n_heads=self.n_heads,
+ n_layers=self.n_layers,
+ kernel_size=self.kernel_size,
+ p_dropout=self.p_dropout,
+ )
+
+ assert cfg.model.generator in [
+ "bigvgan",
+ "hifigan",
+ "melgan",
+ "nsfhifigan",
+ "apnet",
+ ]
+ self.dec_name = cfg.model.generator
+ temp_cfg = copy.deepcopy(cfg)
+ temp_cfg.preprocess.n_mel = self.inter_channels
+ if cfg.model.generator == "bigvgan":
+ temp_cfg.model.bigvgan = cfg.model.generator_config.bigvgan
+ self.dec = BigVGAN(temp_cfg)
+ elif cfg.model.generator == "hifigan":
+ temp_cfg.model.hifigan = cfg.model.generator_config.hifigan
+ self.dec = HiFiGAN(temp_cfg)
+ elif cfg.model.generator == "melgan":
+ temp_cfg.model.melgan = cfg.model.generator_config.melgan
+ self.dec = MelGAN(temp_cfg)
+ elif cfg.model.generator == "nsfhifigan":
+ temp_cfg.model.nsfhifigan = cfg.model.generator_config.nsfhifigan
+ self.dec = NSFHiFiGAN(temp_cfg) # TODO: nsf need f0
+ elif cfg.model.generator == "apnet":
+ temp_cfg.model.apnet = cfg.model.generator_config.apnet
+ self.dec = APNet(temp_cfg)
+
+ self.enc_q = PosteriorEncoder(
+ self.spec_channels,
+ self.inter_channels,
+ self.hidden_channels,
+ 5,
+ 1,
+ 16,
+ gin_channels=self.gin_channels,
+ )
+
+ self.flow = ResidualCouplingBlock(
+ self.inter_channels,
+ self.hidden_channels,
+ 5,
+ 1,
+ self.n_flow_layer,
+ gin_channels=self.gin_channels,
+ )
+
+ def forward(self, data):
+ """VitsSVC forward function.
+
+ Args:
+ data (dict): condition data & audio data, including:
+ B: batch size, T: target length
+ {
+ "spk_id": [B, singer_table_size]
+ "target_len": [B]
+ "mask": [B, T, 1]
+ "mel": [B, T, n_mel]
+ "linear": [B, T, n_fft // 2 + 1]
+ "frame_pitch": [B, T]
+ "frame_uv": [B, T]
+ "audio": [B, audio_len]
+ "audio_len": [B]
+ "contentvec_feat": [B, T, contentvec_dim]
+ "whisper_feat": [B, T, whisper_dim]
+ ...
+ }
+ """
+
+ # TODO: elegantly handle the dimensions
+ if "contentvec_feat" in data.keys():
+ c = data["contentvec_feat"].transpose(1, 2)
+ elif "whisper_feat" in data.keys():
+ c = data["whisper_feat"].transpose(1, 2)
+ elif "mert_feat" in data.keys():
+ c = data["mert_feat"].transpose(1, 2)
+ elif "wenet_feat" in data.keys():
+ c = data["wenet_feat"].transpose(1, 2)
+ elif "hubert_feat" in data.keys():
+ c = data["hubert_feat"].transpose(1, 2)
+
+ spec = data["linear"].transpose(1, 2)
+
+ if self.cfg.model.condition_encoder.use_spkid:
+ g = data["spk_id"]
+ g = self.emb_g(g).transpose(1, 2)
+ elif self.cfg.model.condition_encoder.use_spkemb:
+ g = data["spkemb"].unsqueeze(-1)
+
+ c_lengths = data["target_len"]
+ spec_lengths = data["target_len"]
+ f0 = data["frame_pitch"]
+
+ x_mask = torch.unsqueeze(sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
+ # condition_encoder ver.
+ x = self.condition_encoder(data).transpose(1, 2)
+
+ # prior encoder
+ z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask)
+ # posterior encoder
+ z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
+
+ # flow
+ z_p = self.flow(z, spec_mask, g=g)
+ z_slice, pitch_slice, ids_slice = rand_slice_segments_with_pitch(
+ z, f0, spec_lengths, self.segment_size
+ )
+
+ if self.dec_name == "nsfhifigan":
+ o = self.dec(z_slice, f0=f0.float())
+ elif self.dec_name == "apnet":
+ _, _, _, _, o = self.dec(z_slice)
+ else:
+ o = self.dec(z_slice)
+
+ outputs = {
+ "y_hat": o,
+ "ids_slice": ids_slice,
+ "x_mask": x_mask,
+ "z_mask": data["mask"].transpose(1, 2),
+ "z": z,
+ "z_p": z_p,
+ "m_p": m_p,
+ "logs_p": logs_p,
+ "m_q": m_q,
+ "logs_q": logs_q,
+ }
+ return outputs
+
+ @torch.no_grad()
+ def infer(self, data, noise_scale=0.35, seed=52468):
+ # c, f0, uv, g
+ if "contentvec_feat" in data.keys():
+ c = data["contentvec_feat"].transpose(1, 2)
+ elif "whisper_feat" in data.keys():
+ c = data["whisper_feat"].transpose(1, 2)
+ elif "mert_feat" in data.keys():
+ c = data["mert_feat"].transpose(1, 2)
+ elif "wenet_feat" in data.keys():
+ c = data["wenet_feat"].transpose(1, 2)
+ elif "hubert_feat" in data.keys():
+ c = data["hubert_feat"].transpose(1, 2)
+
+ f0 = data["frame_pitch"]
+ if self.cfg.model.condition_encoder.use_spkid:
+ g = data["spk_id"]
+ if g.dim() == 1:
+ g = g.unsqueeze(0)
+ g = self.emb_g(g).transpose(1, 2)
+ elif self.cfg.model.condition_encoder.use_spkemb:
+ g = data["spkemb"].unsqueeze(-1)
+
+ if c.device == torch.device("cuda"):
+ torch.cuda.manual_seed_all(seed)
+ else:
+ torch.manual_seed(seed)
+
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
+
+ x_mask = torch.unsqueeze(sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
+ # condition_encoder ver.
+ x = self.condition_encoder(data).transpose(1, 2)
+
+        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, noise_scale=noise_scale)
+ z = self.flow(z_p, c_mask, g=g, reverse=True)
+
+ if self.dec_name == "nsfhifigan":
+ o = self.dec(z * c_mask, f0=f0.float())
+ elif self.dec_name == "apnet":
+ _, _, _, _, o = self.dec(z * c_mask)
+ else:
+ o = self.dec(z * c_mask)
+ return o, f0
diff --git a/models/vc/vits/vits_inference.py b/models/vc/vits/vits_inference.py
new file mode 100644
index 00000000..7c2c0253
--- /dev/null
+++ b/models/vc/vits/vits_inference.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import os
+import time
+import numpy as np
+from tqdm import tqdm
+import torch
+
+from models.vc.base import VCInference
+from models.vc.vits.vits import SynthesizerTrn
+
+from models.vc.base.vc_dataset import VCTestDataset, VCTestCollator
+from utils.io import save_audio
+from utils.audio_slicer import is_silence
+
+
+class VitsInference(VCInference):
+ def __init__(self, args=None, cfg=None, infer_type="from_dataset"):
+        VCInference.__init__(self, args, cfg, infer_type)
+
+ def _build_model(self):
+ net_g = SynthesizerTrn(
+ self.cfg.preprocess.n_fft // 2 + 1,
+ self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size,
+ self.cfg,
+ )
+ self.model = net_g
+ return net_g
+
+ def build_save_dir(self, dataset, speaker):
+ save_dir = os.path.join(
+ self.args.output_dir,
+ "vc_am_step-{}_{}".format(self.am_restore_step, self.args.mode),
+ )
+ if dataset is not None:
+ save_dir = os.path.join(save_dir, "data_{}".format(dataset))
+ if speaker != -1:
+ save_dir = os.path.join(
+ save_dir,
+ "spk_{}".format(speaker),
+ )
+ os.makedirs(save_dir, exist_ok=True)
+ print("Saving to ", save_dir)
+ return save_dir
+
+ @torch.inference_mode()
+ def inference(self):
+ res = []
+ for i, batch in enumerate(self.test_dataloader):
+ pred_audio_list = self._inference_each_batch(batch)
+ for it, wav in zip(self.test_dataset.metadata, pred_audio_list):
+ uid = it["Uid"]
+ file = os.path.join(self.args.output_dir, f"{uid}.wav")
+
+ wav = wav.numpy(force=True)
+ save_audio(
+ file,
+ wav,
+ self.cfg.preprocess.sample_rate,
+ add_silence=False,
+ turn_up=not is_silence(wav, self.cfg.preprocess.sample_rate),
+ )
+ res.append(file)
+ return res
+
+ def _inference_each_batch(self, batch_data, noise_scale=0.667):
+ device = self.accelerator.device
+ pred_res = []
+ self.model.eval()
+ with torch.no_grad():
+ # Put the data to device
+ # device = self.accelerator.device
+ for k, v in batch_data.items():
+ batch_data[k] = v.to(device)
+
+ audios, f0 = self.model.infer(batch_data, noise_scale=noise_scale)
+
+ pred_res.extend(audios)
+
+ return pred_res
diff --git a/models/vc/vits/vits_trainer.py b/models/vc/vits/vits_trainer.py
new file mode 100644
index 00000000..132cd189
--- /dev/null
+++ b/models/vc/vits/vits_trainer.py
@@ -0,0 +1,564 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch.optim.lr_scheduler import ExponentialLR
+from tqdm import tqdm
+from pathlib import Path
+
+import accelerate
+
+# from models.svc.base import SVCTrainer
+from models.vc.base.vc_dataset import VCCollator, VCDataset
+from models.vc.vits.vits import *
+from models.tts.base import TTSTrainer
+
+from utils.mel import mel_spectrogram_torch
+import json
+
+from models.vocoders.gan.discriminator.mpd import (
+ MultiPeriodDiscriminator_vits as MultiPeriodDiscriminator,
+)
+
+
+class VitsVCTrainer(TTSTrainer):
+ def __init__(self, args, cfg):
+ self.args = args
+ self.cfg = cfg
+ self._init_accelerator()
+
+ TTSTrainer.__init__(self, args, cfg)
+
+ def _build_model(self):
+ net_g = SynthesizerTrn(
+ self.cfg.preprocess.n_fft // 2 + 1,
+ self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size,
+ # directly use cfg
+ self.cfg,
+ )
+ net_d = MultiPeriodDiscriminator(self.cfg.model.vits.use_spectral_norm)
+ model = {"generator": net_g, "discriminator": net_d}
+
+ return model
+
+ def _build_dataset(self):
+ return VCDataset, VCCollator
+
+ def _build_optimizer(self):
+ optimizer_g = torch.optim.AdamW(
+ self.model["generator"].parameters(),
+ self.cfg.train.learning_rate,
+ betas=self.cfg.train.AdamW.betas,
+ eps=self.cfg.train.AdamW.eps,
+ )
+ optimizer_d = torch.optim.AdamW(
+ self.model["discriminator"].parameters(),
+ self.cfg.train.learning_rate,
+ betas=self.cfg.train.AdamW.betas,
+ eps=self.cfg.train.AdamW.eps,
+ )
+ optimizer = {"optimizer_g": optimizer_g, "optimizer_d": optimizer_d}
+
+ return optimizer
+
+ def _build_scheduler(self):
+ scheduler_g = ExponentialLR(
+ self.optimizer["optimizer_g"],
+ gamma=self.cfg.train.lr_decay,
+ last_epoch=self.epoch - 1,
+ )
+ scheduler_d = ExponentialLR(
+ self.optimizer["optimizer_d"],
+ gamma=self.cfg.train.lr_decay,
+ last_epoch=self.epoch - 1,
+ )
+
+ scheduler = {"scheduler_g": scheduler_g, "scheduler_d": scheduler_d}
+ return scheduler
+
+ def _build_criterion(self):
+ class GeneratorLoss(nn.Module):
+ def __init__(self, cfg):
+ super(GeneratorLoss, self).__init__()
+ self.cfg = cfg
+ self.l1_loss = nn.L1Loss()
+
+ def generator_loss(self, disc_outputs):
+ loss = 0
+ gen_losses = []
+ for dg in disc_outputs:
+ dg = dg.float()
+ l = torch.mean((1 - dg) ** 2)
+ gen_losses.append(l)
+ loss += l
+
+ return loss, gen_losses
+
+ def feature_loss(self, fmap_r, fmap_g):
+ loss = 0
+ for dr, dg in zip(fmap_r, fmap_g):
+ for rl, gl in zip(dr, dg):
+ rl = rl.float().detach()
+ gl = gl.float()
+ loss += torch.mean(torch.abs(rl - gl))
+
+ return loss * 2
+
+ def kl_loss(self, z_p, logs_q, m_p, logs_p, z_mask):
+ """
+ z_p, logs_q: [b, h, t_t]
+ m_p, logs_p: [b, h, t_t]
+ """
+ z_p = z_p.float()
+ logs_q = logs_q.float()
+ m_p = m_p.float()
+ logs_p = logs_p.float()
+ z_mask = z_mask.float()
+
+ kl = logs_p - logs_q - 0.5
+ kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
+ kl = torch.sum(kl * z_mask)
+ l = kl / torch.sum(z_mask)
+ return l
+
+ def forward(
+ self,
+ outputs_g,
+ outputs_d,
+ y_mel,
+ y_hat_mel,
+ ):
+ loss_g = {}
+
+ # mel loss
+ loss_mel = self.l1_loss(y_mel, y_hat_mel) * self.cfg.train.c_mel
+ loss_g["loss_mel"] = loss_mel
+
+ # kl loss
+ loss_kl = (
+ self.kl_loss(
+ outputs_g["z_p"],
+ outputs_g["logs_q"],
+ outputs_g["m_p"],
+ outputs_g["logs_p"],
+ outputs_g["z_mask"],
+ )
+ * self.cfg.train.c_kl
+ )
+ loss_g["loss_kl"] = loss_kl
+
+ # feature loss
+ loss_fm = self.feature_loss(outputs_d["fmap_rs"], outputs_d["fmap_gs"])
+ loss_g["loss_fm"] = loss_fm
+
+ # gan loss
+ loss_gen, losses_gen = self.generator_loss(outputs_d["y_d_hat_g"])
+ loss_g["loss_gen"] = loss_gen
+ loss_g["loss_gen_all"] = loss_mel + loss_kl + loss_fm + loss_gen
+
+ return loss_g
+
+ class DiscriminatorLoss(nn.Module):
+ def __init__(self, cfg):
+ super(DiscriminatorLoss, self).__init__()
+ self.cfg = cfg
+ self.l1Loss = torch.nn.L1Loss(reduction="mean")
+
+ def __call__(self, disc_real_outputs, disc_generated_outputs):
+ loss_d = {}
+
+ loss = 0
+ r_losses = []
+ g_losses = []
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+ dr = dr.float()
+ dg = dg.float()
+ r_loss = torch.mean((1 - dr) ** 2)
+ g_loss = torch.mean(dg**2)
+ loss += r_loss + g_loss
+ r_losses.append(r_loss.item())
+ g_losses.append(g_loss.item())
+
+ loss_d["loss_disc_all"] = loss
+
+ return loss_d
+
+ criterion = {
+ "generator": GeneratorLoss(self.cfg),
+ "discriminator": DiscriminatorLoss(self.cfg),
+ }
+ return criterion
+
+ def _check_resume(self):
+ if self.args.resume:
+ if self.args.resume_from_ckpt_path == "":
+                ## Automatically resume according to the current experiment name
+ self.logger.info(
+ "Automatically resuming from latest checkpoint in {}...".format(
+ self.checkpoint_dir
+ )
+ )
+ start = time.monotonic_ns()
+ ckpt_path = self.__load_model(
+ checkpoint_dir=self.checkpoint_dir,
+ resume_type=self.args.resume_type,
+ )
+ end = time.monotonic_ns()
+ self.logger.info(
+ f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
+ )
+ self.checkpoints_path = json.load(
+ open(os.path.join(ckpt_path, "ckpts.json"), "r")
+ )
+ else:
+ ## Resume from the given checkpoint path
+ if not os.path.exists(self.args.resume_from_ckpt_path):
+ raise ValueError(
+ "[Error] The resumed checkpoint path {} don't exist.".format(
+ self.args.resume_from_ckpt_path
+ )
+ )
+ self.logger.info(
+ "Resuming from {}...".format(self.args.resume_from_ckpt_path)
+ )
+ start = time.monotonic_ns()
+ ckpt_path = self.__load_model(
+ checkpoint_path=self.args.resume_from_ckpt_path,
+ resume_type=self.args.resume_type,
+ )
+ end = time.monotonic_ns()
+ self.logger.info(
+ f"Resuming from checkpoint done in {(end - start) / 1e6:.2f}ms"
+ )
+
+ def __load_model(
+ self,
+ checkpoint_dir: str = None,
+ checkpoint_path: str = None,
+ resume_type: str = "",
+ ):
+ r"""Load model from checkpoint. If checkpoint_path is None, it will
+ load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
+ None, it will load the checkpoint specified by checkpoint_path. **Only use this
+ method after** ``accelerator.prepare()``.
+ """
+ if checkpoint_path is None:
+ ls = [str(i) for i in Path(checkpoint_dir).glob("*")]
+ ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
+ checkpoint_path = ls[0]
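+            # Assuming checkpoint folders named like
+            # "epoch-0012_step-0034000_loss-0.123456", this picks the folder with
+            # the largest epoch number; the same fields are parsed again below to
+            # restore self.epoch and self.step.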
+ self.logger.info("Resume from {}...".format(checkpoint_path))
+
+ if resume_type in ["resume", ""]:
+ # Load all the things, including model weights, optimizer, scheduler, and random states.
+ self.accelerator.load_state(input_dir=checkpoint_path)
+
+ # set epoch and step
+ self.epoch = int(checkpoint_path.split("_")[-3].split("-")[-1]) + 1
+ self.step = int(checkpoint_path.split("_")[-2].split("-")[-1]) + 1
+
+ elif resume_type == "finetune":
+ # Load only the model weights
+ accelerate.load_checkpoint_and_dispatch(
+ self.accelerator.unwrap_model(self.model),
+ os.path.join(checkpoint_path, "pytorch_model.bin"),
+ )
+ self.logger.info("Load model weights for finetune...")
+
+ else:
+ raise ValueError("Resume_type must be `resume` or `finetune`.")
+
+ return checkpoint_path
+
+ # Keep legacy unchanged
+ def write_summary(
+ self,
+ losses,
+ stats,
+ images={},
+ audios={},
+ audio_sampling_rate=24000,
+ tag="train",
+ ):
+ for key, value in losses.items():
+ self.sw.add_scalar(tag + "/" + key, value, self.step)
+ self.sw.add_scalar(
+ "learning_rate",
+ self.optimizer["optimizer_g"].param_groups[0]["lr"],
+ self.step,
+ )
+
+ if len(images) != 0:
+ for key, value in images.items():
+ self.sw.add_image(key, value, self.global_step, batchformats="HWC")
+ if len(audios) != 0:
+ for key, value in audios.items():
+ self.sw.add_audio(key, value, self.global_step, audio_sampling_rate)
+
+ def write_valid_summary(
+ self, losses, stats, images={}, audios={}, audio_sampling_rate=24000, tag="val"
+ ):
+ for key, value in losses.items():
+ self.sw.add_scalar(tag + "/" + key, value, self.step)
+
+ if len(images) != 0:
+ for key, value in images.items():
+ self.sw.add_image(key, value, self.global_step, batchformats="HWC")
+ if len(audios) != 0:
+ for key, value in audios.items():
+ self.sw.add_audio(key, value, self.global_step, audio_sampling_rate)
+
+ def _get_state_dict(self):
+ state_dict = {
+ "generator": self.model["generator"].state_dict(),
+ "discriminator": self.model["discriminator"].state_dict(),
+ "optimizer_g": self.optimizer["optimizer_g"].state_dict(),
+ "optimizer_d": self.optimizer["optimizer_d"].state_dict(),
+ "scheduler_g": self.scheduler["scheduler_g"].state_dict(),
+ "scheduler_d": self.scheduler["scheduler_d"].state_dict(),
+ "step": self.step,
+ "epoch": self.epoch,
+ "batch_size": self.cfg.train.batch_size,
+ }
+ return state_dict
+
+ def get_state_dict(self):
+ state_dict = {
+ "generator": self.model["generator"].state_dict(),
+ "discriminator": self.model["discriminator"].state_dict(),
+ "optimizer_g": self.optimizer["optimizer_g"].state_dict(),
+ "optimizer_d": self.optimizer["optimizer_d"].state_dict(),
+ "scheduler_g": self.scheduler["scheduler_g"].state_dict(),
+ "scheduler_d": self.scheduler["scheduler_d"].state_dict(),
+ "step": self.step,
+ "epoch": self.epoch,
+ "batch_size": self.cfg.train.batch_size,
+ }
+ return state_dict
+
+ def load_model(self, checkpoint):
+ self.step = checkpoint["step"]
+ self.epoch = checkpoint["epoch"]
+ self.model["generator"].load_state_dict(checkpoint["generator"])
+ self.model["discriminator"].load_state_dict(checkpoint["discriminator"])
+ self.optimizer["optimizer_g"].load_state_dict(checkpoint["optimizer_g"])
+ self.optimizer["optimizer_d"].load_state_dict(checkpoint["optimizer_d"])
+ self.scheduler["scheduler_g"].load_state_dict(checkpoint["scheduler_g"])
+ self.scheduler["scheduler_d"].load_state_dict(checkpoint["scheduler_d"])
+
+ @torch.inference_mode()
+ def _valid_step(self, batch):
+ r"""Testing forward step. Should return average loss of a sample over
+ one batch. Provoke ``_forward_step`` is recommended except for special case.
+ See ``_test_epoch`` for usage.
+ """
+
+ valid_losses = {}
+ total_loss = 0
+ valid_stats = {}
+
+ # Discriminator
+ # Generator output
+ outputs_g = self.model["generator"](batch)
+
+ y_mel = slice_segments(
+ batch["mel"].transpose(1, 2),
+ outputs_g["ids_slice"],
+ self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size,
+ )
+ y_hat_mel = mel_spectrogram_torch(
+ outputs_g["y_hat"].squeeze(1), self.cfg.preprocess
+ )
+ y = slice_segments(
+ batch["audio"].unsqueeze(1),
+ outputs_g["ids_slice"] * self.cfg.preprocess.hop_size,
+ self.cfg.preprocess.segment_size,
+ )
+
+ # Discriminator output
+ outputs_d = self.model["discriminator"](y, outputs_g["y_hat"].detach())
+ ## Discriminator loss
+ loss_d = self.criterion["discriminator"](
+ outputs_d["y_d_hat_r"], outputs_d["y_d_hat_g"]
+ )
+ valid_losses.update(loss_d)
+
+ ## Generator
+ outputs_d = self.model["discriminator"](y, outputs_g["y_hat"])
+ loss_g = self.criterion["generator"](outputs_g, outputs_d, y_mel, y_hat_mel)
+ valid_losses.update(loss_g)
+
+ for item in valid_losses:
+ valid_losses[item] = valid_losses[item].item()
+
+ total_loss = loss_g["loss_gen_all"] + loss_d["loss_disc_all"]
+
+ return (
+ total_loss.item(),
+ valid_losses,
+ valid_stats,
+ )
+
+ def _train_step(self, batch):
+ r"""Forward step for training and inference. This function is called
+ in ``_train_step`` & ``_test_step`` function.
+ """
+
+ train_losses = {}
+ total_loss = 0
+ training_stats = {}
+
+ ## Train Discriminator
+ # Generator output
+ outputs_g = self.model["generator"](batch)
+
+ y_mel = slice_segments(
+ batch["mel"].transpose(1, 2),
+ outputs_g["ids_slice"],
+ self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size,
+ )
+ y_hat_mel = mel_spectrogram_torch(
+ outputs_g["y_hat"].squeeze(1), self.cfg.preprocess
+ )
+
+ y = slice_segments(
+ # [1, 168418] -> [1, 1, 168418]
+ batch["audio"].unsqueeze(1),
+ outputs_g["ids_slice"] * self.cfg.preprocess.hop_size,
+ self.cfg.preprocess.segment_size,
+ )
+
+ # Discriminator output
+ outputs_d = self.model["discriminator"](y, outputs_g["y_hat"].detach())
+ # Discriminator loss
+ loss_d = self.criterion["discriminator"](
+ outputs_d["y_d_hat_r"], outputs_d["y_d_hat_g"]
+ )
+ train_losses.update(loss_d)
+
+ # BP and Grad Updated
+ self.optimizer["optimizer_d"].zero_grad()
+ self.accelerator.backward(loss_d["loss_disc_all"])
+ self.optimizer["optimizer_d"].step()
+
+ ## Train Generator
+ outputs_d = self.model["discriminator"](y, outputs_g["y_hat"])
+ loss_g = self.criterion["generator"](outputs_g, outputs_d, y_mel, y_hat_mel)
+ train_losses.update(loss_g)
+
+ # BP and Grad Updated
+ self.optimizer["optimizer_g"].zero_grad()
+ self.accelerator.backward(loss_g["loss_gen_all"])
+ self.optimizer["optimizer_g"].step()
+
+ for item in train_losses:
+ train_losses[item] = train_losses[item].item()
+
+ total_loss = loss_g["loss_gen_all"] + loss_d["loss_disc_all"]
+
+ return (
+ total_loss.item(),
+ train_losses,
+ training_stats,
+ )
+
+ def _train_epoch(self):
+ r"""Training epoch. Should return average loss of a batch (sample) over
+ one epoch. See ``train_loop`` for usage.
+ """
+ epoch_sum_loss: float = 0.0
+ epoch_losses: dict = {}
+ epoch_step: int = 0
+ for batch in tqdm(
+ self.train_dataloader,
+ desc=f"Training Epoch {self.epoch}",
+ unit="batch",
+ colour="GREEN",
+ leave=False,
+ dynamic_ncols=True,
+ smoothing=0.04,
+ disable=not self.accelerator.is_main_process,
+ ):
+ # Do training step and BP
+ with self.accelerator.accumulate(self.model):
+ total_loss, train_losses, training_stats = self._train_step(batch)
+ self.batch_count += 1
+
+ # Update info for each step
+ if self.batch_count % self.cfg.train.gradient_accumulation_step == 0:
+ epoch_sum_loss += total_loss
+ for key, value in train_losses.items():
+ if key not in epoch_losses.keys():
+ epoch_losses[key] = value
+ else:
+ epoch_losses[key] += value
+
+ self.accelerator.log(
+ {
+ "Step/Generator Loss": train_losses["loss_gen_all"],
+ "Step/Discriminator Loss": train_losses["loss_disc_all"],
+ "Step/Generator Learning Rate": self.optimizer[
+ "optimizer_d"
+ ].param_groups[0]["lr"],
+ "Step/Discriminator Learning Rate": self.optimizer[
+ "optimizer_g"
+ ].param_groups[0]["lr"],
+ },
+ step=self.step,
+ )
+ self.step += 1
+ epoch_step += 1
+
+ self.accelerator.wait_for_everyone()
+
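+ # Losses were accumulated once every gradient_accumulation_step batches,
+ # so rescale by that factor to report the mean loss per optimizer step.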
+ epoch_sum_loss = (
+ epoch_sum_loss
+ / len(self.train_dataloader)
+ * self.cfg.train.gradient_accumulation_step
+ )
+
+ for key in epoch_losses.keys():
+ epoch_losses[key] = (
+ epoch_losses[key]
+ / len(self.train_dataloader)
+ * self.cfg.train.gradient_accumulation_step
+ )
+
+ return epoch_sum_loss, epoch_losses
+
+ def _build_singer_lut(self):
+ resumed_singer_path = None
+ if self.args.resume_from_ckpt_path and self.args.resume_from_ckpt_path != "":
+ resumed_singer_path = os.path.join(
+ self.args.resume_from_ckpt_path, self.cfg.preprocess.spk2id
+ )
+ if os.path.exists(os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)):
+ resumed_singer_path = os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
+
+ if resumed_singer_path:
+ with open(resumed_singer_path, "r") as f:
+ singers = json.load(f)
+ else:
+ singers = dict()
+
+ for dataset in self.cfg.dataset:
+ singer_lut_path = os.path.join(
+ self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
+ )
+ with open(singer_lut_path, "r") as singer_lut_file:
+ singer_lut = json.load(singer_lut_file)
+ for singer in singer_lut.keys():
+ if singer not in singers:
+ singers[singer] = len(singers)
+
+ with open(
+ os.path.join(self.exp_dir, self.cfg.preprocess.spk2id), "w"
+ ) as singer_file:
+ json.dump(singers, singer_file, indent=4, ensure_ascii=False)
+ print(
+ "singers have been dumped to {}".format(
+ os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
+ )
+ )
+ return singers
diff --git a/modules/encoder/condition_encoder.py b/modules/encoder/condition_encoder.py
index 1600d078..2bd4b67f 100644
--- a/modules/encoder/condition_encoder.py
+++ b/modules/encoder/condition_encoder.py
@@ -52,6 +52,8 @@ def __init__(self, cfg):
self.input_dim = self.cfg.input_melody_dim
self.output_dim = self.cfg.output_melody_dim
self.n_bins = self.cfg.n_bins_melody
+ self.pitch_min = self.cfg.pitch_min
+ self.pitch_max = self.cfg.pitch_max
if self.input_dim != 0:
if self.n_bins == 0:
@@ -167,6 +169,15 @@ def __init__(self, cfg):
self.wenet_encoder = ContentEncoder(
self.cfg, self.cfg.wenet_dim, self.cfg.content_encoder_dim
)
+ if cfg.use_hubert:
+ self.hubert_lookup = nn.Embedding(
+ num_embeddings=1000,
+ embedding_dim=self.cfg.content_encoder_dim,
+ padding_idx=None,
+ )
+ self.hubert_encoder = ContentEncoder(
+ self.cfg, self.cfg.content_encoder_dim, self.cfg.content_encoder_dim
+ )
### Prosody Features ###
if cfg.use_f0:
@@ -177,6 +188,10 @@ def __init__(self, cfg):
### Speaker Features ###
if cfg.use_spkid:
self.singer_encoder = SingerEncoder(self.cfg)
+ if cfg.use_spkemb:
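+ # Continuous speaker embeddings (e.g. resemblyzer d-vectors) are
+ # projected into the content encoder dimension.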
+ self.speaker_project = nn.Linear(
+ self.cfg.spkemb_dim, self.cfg.content_encoder_dim
+ )
def forward(self, x):
outputs = []
@@ -221,6 +236,12 @@ def forward(self, x):
outputs.append(wenet_enc_out)
seq_len = wenet_enc_out.shape[1]
+ if self.cfg.use_hubert:
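+ # hubert_feat carries discrete k-means cluster indices, so an embedding
+ # lookup is used here instead of a linear projection.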
+ hubert_enc_out = self.hubert_lookup(x["hubert_feat"].squeeze(-1))
+ hubert_enc_out = self.hubert_encoder(hubert_enc_out, length=x["target_len"])
+ outputs.append(hubert_enc_out)
+ seq_len = hubert_enc_out.shape[1]
+
if self.cfg.use_spkid:
speaker_enc_out = self.singer_encoder(x["spk_id"]) # [b, 1, 384]
assert (
@@ -228,10 +249,18 @@ def forward(self, x):
or "contentvec_feat" in x.keys()
or "mert_feat" in x.keys()
or "wenet_feat" in x.keys()
+ or "hubert_feat" in x.keys()
)
singer_info = speaker_enc_out.expand(-1, seq_len, -1)
outputs.append(singer_info)
+ if self.cfg.use_spkemb:
+ speaker_embedding = self.speaker_project(
+ x["spkemb"].unsqueeze(1)
+ ) # [b, 1, 384]
+ speaker_embedding = speaker_embedding.expand(-1, seq_len, -1)
+ outputs.append(speaker_embedding)
+
encoder_output = None
if self.merge_mode == "concat":
encoder_output = torch.cat(outputs, dim=-1)
diff --git a/processors/acoustic_extractor.py b/processors/acoustic_extractor.py
index 9c4d9be7..be3d4a74 100644
--- a/processors/acoustic_extractor.py
+++ b/processors/acoustic_extractor.py
@@ -8,6 +8,7 @@
import numpy as np
import json
+import resemblyzer
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from utils.io import save_feature, save_txt, save_torch_audio
@@ -119,6 +120,14 @@ def __extract_utt_acoustic_features(dataset_output, cfg, utt):
wav = wav_torch.cpu().numpy()
# extract features
+ if cfg.preprocess.extract_speaker:
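+ # resemblyzer produces an utterance-level, L2-normalized d-vector
+ # (256-dim) that is saved as the speaker embedding.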
+ voice_encoder = resemblyzer.VoiceEncoder("cpu", verbose=False)
+ speaker_wav = resemblyzer.preprocess_wav(wav_path)
+ speaker_embedding = voice_encoder.embed_utterance(speaker_wav)
+ save_feature(
+ dataset_output, cfg.preprocess.speaker_dir, uid, speaker_embedding
+ )
+
if cfg.preprocess.extract_duration:
durations, phones, start, end = duration.get_duration(
utt, wav, cfg.preprocess
diff --git a/processors/content_extractor.py b/processors/content_extractor.py
index 34b54917..51723d2c 100644
--- a/processors/content_extractor.py
+++ b/processors/content_extractor.py
@@ -8,6 +8,7 @@
import numpy as np
import yaml
import copy
+import joblib
from tqdm import tqdm
from torchaudio.compliance import kaldi
from torch.nn.utils.rnn import pad_sequence
@@ -68,7 +69,7 @@ def __init__(self, cfg, extractor_type):
def init_for_retrans(self):
target_hop = self.cfg.preprocess.hop_size
- assert self.extractor_type in ["whisper", "contentvec", "wenet"]
+ assert self.extractor_type in ["whisper", "contentvec", "wenet", "hubert"]
if self.extractor_type == "whisper":
source_hop = (
self.cfg.preprocess.whisper_frameshift
@@ -86,6 +87,10 @@ def init_for_retrans(self):
* self.cfg.preprocess.wenet_downsample_rate
* self.cfg.preprocess.sample_rate
)
+ elif self.extractor_type == "hubert":
+ source_hop = (
+ self.cfg.preprocess.hubert_frameshift * self.cfg.preprocess.sample_rate
+ )
source_hop = int(source_hop)
factor = np.gcd(source_hop, target_hop)
source_hop //= factor
@@ -230,6 +235,8 @@ def get_valid_features(self, utt, content_feature):
) # 40ms
elif self.extractor_type == "mert":
frameshift = self.cfg.preprocess.mert_frameshift
+ elif self.extractor_type == "hubert":
+ frameshift = self.cfg.preprocess.hubert_frameshift
else:
raise NotImplementedError
@@ -495,6 +502,53 @@ def extract_content_features(self, wavs):
return mert_features
+class HubertExtractor(BaseExtractor):
+ def __init__(self, cfg):
+ super(HubertExtractor, self).__init__(cfg)
+ self.extractor_type = "hubert"
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ def load_model(self):
+ # load the HuBERT checkpoint and its k-means quantizer
+ print("Loading Hubert Model...")
+ (model, _, _) = checkpoint_utils.load_model_ensemble_and_task(
+ [self.cfg.preprocess.hubert_model_path]
+ )
+ model = model[0]
+ if torch.cuda.is_available():
+ print("Using GPU...\n")
+ model = model.cuda()
+ else:
+ print("Using CPU...\n")
+
+ self.model = model.eval()
+ self.km_model = joblib.load(self.cfg.preprocess.hubert_km_path)
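+ # Cache the k-means cluster centers (D x K) and their squared norms so
+ # the nearest-cluster assignment below reduces to a single matmul.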
+ self.C_np = self.km_model.cluster_centers_.transpose()
+ self.Cnorm_np = (self.C_np**2).sum(0, keepdims=True)
+ self.C = torch.from_numpy(self.C_np)
+ self.Cnorm = torch.from_numpy(self.Cnorm_np)
+
+ def extract_content_features(self, wavs):
+ hubert_features = []
+ for wav in wavs:
+ feat, _ = self.model.extract_features(
+ source=wav.view(1, -1).to(self.device),
+ padding_mask=None,
+ mask=False,
+ output_layer=11,
+ )
+ feat = feat.squeeze(0).cpu().detach()
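+ # Nearest k-means cluster per frame via the expansion
+ # ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2; the argmin is the cluster id.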
+ dist = (
+ feat.pow(2).sum(1, keepdim=True)
+ - 2 * torch.matmul(feat, self.C)
+ + self.Cnorm
+ )
+ feat = dist.argmin(dim=1).unsqueeze(-1)
+ hubert_features.append(feat)
+
+ return hubert_features
+
+
def extract_utt_content_features_dataloader(cfg, metadata, num_workers):
dataset_name = metadata[0]["Dataset"]
with torch.no_grad():
@@ -624,3 +678,69 @@ def extract_utt_content_features_dataloader(cfg, metadata, num_workers):
batch_content_features = extractor.extract_content_features(wavs)
for index, utt in enumerate(_metadata):
extractor.save_feature(utt, batch_content_features[index])
+
+ if cfg.preprocess.extract_hubert_feature:
+ feat_dir = os.path.join(
+ cfg.preprocess.processed_dir, dataset_name, "hubert"
+ )
+ os.makedirs(feat_dir, exist_ok=True)
+ feat_files_num = len(os.listdir(feat_dir))
+ if feat_files_num != len(metadata):
+ hubert_waveforms = LibrosaDataset(
+ cfg,
+ dataset_name,
+ cfg.preprocess.hubert_sample_rate,
+ metadata=metadata,
+ )
+ data_loader = DataLoader(
+ hubert_waveforms,
+ num_workers=num_workers,
+ shuffle=False,
+ pin_memory=cfg.preprocess.pin_memory,
+ batch_size=cfg.preprocess.content_feature_batch_size,
+ collate_fn=collate_batch,
+ drop_last=False,
+ )
+ extractor = HubertExtractor(cfg)
+ extractor.load_model()
+ for batch_idx, items in enumerate(tqdm(data_loader)):
+ _metadata, wavs, lens = items
+
+ batch_content_features = extractor.extract_content_features(
+ wavs,
+ )
+ for index, utt in enumerate(_metadata):
+ extractor.save_feature(utt, batch_content_features[index])