rwth-i6 · AndreasPlt · Jan 7, 2025 · Jan 7, 2025 · Jan 28, 2025 · Jan 28, 2025
diff --git a/...peech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_hard.py b/...peech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_hard.py
@@ -0,0 +1,63 @@
+from sisyphus import tk
+import os
+
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \
+        get_fairseq_root, \
+        run_fairseq_pretraining
+
+
+# Pretraining: "other_target" run whose out_models entries are passed as `checkpoint` in the loop below
+other_target_pretrain_job = run_fairseq_pretraining(
+    exp_name="monophone_negatives_other_target_v1",
+    commit="1397363c5c0e3c4e3ab620be562730399c852493",
+    python_exe_hash_overwrite="itc_python_launcher_py310_torch",
+    negative_sampling_strategy="other_target",
+)
+
+# "hard_negatives" pretraining that is not given a start checkpoint
+neg_hard_pretrain_job = run_fairseq_pretraining(
+        exp_name="monophone_negatives_hard_v1",
+        commit="be51394d876428ad531e0786d80de43d6a8818af",
+        python_exe_hash_overwrite="itc_python_launcher_py310_torch",
+        negative_sampling_strategy="hard_negatives",
+    )
+
+neg_hard_pretrain_jobs = dict()  # key: start_cp taken from the "other_target" run, value: pretraining job
+neg_hard_pretrain_jobs[0] = neg_hard_pretrain_job  # key 0: the run without a start checkpoint
+for start_cp in [50, 100, 150, 200, 250]:  # same list as in the finetuning loop further down
+    neg_hard_pretrain_jobs[start_cp] = run_fairseq_pretraining(
+        exp_name=f"monophone_negatives_hard_after_{start_cp}ep_other_v1",
+        commit="be51394d876428ad531e0786d80de43d6a8818af",
+        python_exe_hash_overwrite="itc_python_launcher_py310_torch",
+        checkpoint=other_target_pretrain_job.out_models[start_cp].model,  # presumably the model to continue from
+        negative_sampling_strategy="hard_negatives",
+    )
+
+# fairseq root, requested with /usr/bin/python3 as `fairseq_exe`
+fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3"))
+
+# Finetuning: settings shared by all runs; each run copies this dict and adds "w2v_path"
+base_model_conf = {
+    "_name": "wav2vec_ctc",
+    "apply_mask": True,
+    "mask_prob": 0.65,
+    "mask_channel_prob": 0.5,
+    "mask_channel_length": 64,
+    "layerdrop": 0.1,
+    "activation_dropout": 0.1,
+    "feature_grad_mult": 0.0,
+    "freeze_finetune_updates": 10000,  # was 0 in fairseq config
+}
+
+for start_cp in [50, 100, 150, 200, 250]:  # NOTE(review): neg_hard_pretrain_jobs[0] is not finetuned here
+    for additional_cp in range(50, 600+1-start_cp, 50):  # steps of 50 with start_cp + additional_cp <= 600
+        model_conf_w2v = base_model_conf.copy()  # copy so that base_model_conf itself is not modified
+        model_conf_w2v["w2v_path"] = neg_hard_pretrain_jobs[start_cp].out_models[start_cp + additional_cp].model
+        eow_phon_ls100_ctc_base(
+            model_conf_w2v=model_conf_w2v,
+            train_name_suffix=os.path.join("w2v_negatives_hard", f"other_{start_cp}_hard_{additional_cp}"),
+            fairseq_root=fairseq_root,
+        )
diff --git a/...eech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other.py b/...eech_100_ctc/fairseq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other.py
@@ -0,0 +1,123 @@
+from sisyphus import tk
+import os
+
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \
+        get_fairseq_root, \
+        run_fairseq_pretraining
+
+
+# Pretraining with the "other_target" negative sampling strategy; its out_models are finetuned below
+neg_other_pretrain_job = run_fairseq_pretraining(
+    exp_name="monophone_negatives_other_target_v1",
+    commit="1397363c5c0e3c4e3ab620be562730399c852493",
+    python_exe_hash_overwrite="itc_python_launcher_py310_torch",
+    negative_sampling_strategy="other_target",
+    )
+
+# fairseq root, requested with /usr/bin/python3 as `fairseq_exe`
+fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3"))
+
+# Finetuning: settings shared by all runs below; each run copies this dict and adds or overrides keys
+
+base_model_conf = {
+    "_name": "wav2vec_ctc",
+    "apply_mask": True,
+    "mask_prob": 0.65,
+    "mask_channel_prob": 0.5,
+    "mask_channel_length": 64,
+    "layerdrop": 0.1,
+    "activation_dropout": 0.1,
+    "feature_grad_mult": 0.0,
+    "freeze_finetune_updates": 10000,  # was 0 in fairseq config
+}
+
+checkpoints = [100, 200, 300, 400, 500, 600]  # keys into neg_other_pretrain_job.out_models
+for checkpoint in checkpoints:
+    # "other_target" negative sampling model, finetuned with the unmodified base settings
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[checkpoint].model
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join("w2v_neg_sampling_other_target", f"checkpoint_{checkpoint}"),
+        fairseq_root=fairseq_root,
+    )
+
+
+# The remaining finetuning experiments all use only the last checkpoint (final_cp)
+final_cp = 600
+# random vs phoneme mask in finetuning: "random_spec" first, then "phoneme_spec"
+model_conf_w2v = base_model_conf.copy()  # base settings: `mask_strategy` and `mask_length` are left unset
+model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target",
+        "random_spec",
+        f"checkpoint_{final_cp}"
+        ),
+    fairseq_root=fairseq_root,
+)
+model_conf_w2v = base_model_conf.copy()  # fresh copy, the previous run's dict is not reused
+model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_strategy"] = "phonemes"
+model_conf_w2v["mask_length"] = 1
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target",
+        "phoneme_spec",
+        f"checkpoint_{final_cp}"
+        ),
+    fairseq_root=fairseq_root,
+)
+
+# phoneme mask lengths in finetuning; mask_len=1 builds the same config as the "phoneme_spec" run
+for mask_len in [1, 2]:
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+    model_conf_w2v["mask_strategy"] = "phonemes"
+    model_conf_w2v["mask_length"] = mask_len
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join(
+            "w2v_neg_sampling_other_target",
+            f"{mask_len}_phoneme_spec",
+            f"checkpoint_{final_cp}"
+            ),
+        fairseq_root=fairseq_root,
+    )
+# "1_2_phoneme_spec": mask_length=1 combined with mask_selection="uniform" and mask_other=1
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_strategy"] = "phonemes"
+model_conf_w2v["mask_length"] = 1
+model_conf_w2v["mask_selection"] = "uniform"
+model_conf_w2v["mask_other"] = 1  # presumably yields mask lengths of 1 or 2, as the run name suggests -- TODO confirm
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target",
+        "1_2_phoneme_spec",
+        f"checkpoint_{final_cp}"
+        ),
+    fairseq_root=fairseq_root,
+)
+
+# mask probability in finetuning (0.65 is also the value in base_model_conf)
+for mask_prob in [0.35, 0.5, 0.65, 0.8]:
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_pretrain_job.out_models[final_cp].model
+    model_conf_w2v["mask_strategy"] = "phonemes"  # NOTE(review): "mask_length" is not set in this sweep -- confirm
+    model_conf_w2v["mask_prob"] = mask_prob
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join(
+            "w2v_neg_sampling_other_target",
+            f"{str(mask_prob).replace('.', '_')}_phoneme_mask_prob",  # replace "." with "_" for the folder name
+            f"checkpoint_{final_cp}"
+            ),
+        fairseq_root=fairseq_root,
+    )
diff --git a/...seq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other_phoneme_boundary.py b/...seq_finetuning/ctc_standalone/sisyphus_configs/config_negatives_other_phoneme_boundary.py
@@ -0,0 +1,132 @@
+from sisyphus import tk
+import os
+
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_100_ctc.fairseq_finetuning.ctc_standalone.experiments.ctc_phon.baseline import eow_phon_ls100_ctc_base
+from i6_experiments.users.vieting.experiments.librispeech.\
+    librispeech_960_pretraining.wav2vec2.config_02_fairseq_phoneme import \
+        get_fairseq_root, \
+        run_fairseq_pretraining
+
+# Pretraining: "other_target" negatives with mask_strategy="phonemes" and mask_length=1
+neg_other_trg_phon_boundary_pretrain_job = run_fairseq_pretraining(
+    exp_name="monophone_negatives_other_target_boundary_masking_v1",
+    commit="87dec4ffcba2fd71e8838ca099a09816cddeff5b",
+    negative_sampling_strategy="other_target",  # NOTE(review): no python_exe_hash_overwrite passed here -- confirm
+    mask_strategy="phonemes",
+    mask_length=1,
+    )
+
+# fairseq root, requested with /usr/bin/python3 as `fairseq_exe`
+fairseq_root = get_fairseq_root(fairseq_exe=tk.Path("/usr/bin/python3"))
+
+# Finetuning: settings shared by all runs below; each run copies this dict and adds or overrides keys
+base_model_conf = {
+    "_name": "wav2vec_ctc",
+    "apply_mask": True,
+    "mask_prob": 0.65,
+    "mask_channel_prob": 0.5,
+    "mask_channel_length": 64,
+    "layerdrop": 0.1,
+    "activation_dropout": 0.1,
+    "feature_grad_mult": 0.0,
+    "freeze_finetune_updates": 10000,  # was 0 in fairseq config
+}
+
+checkpoints = [100, 200, 300, 400, 500, 600]  # keys into the pretraining job's out_models
+for checkpoint in checkpoints:
+    # negative sampling + phoneme boundary masking model, finetuned with the unmodified base settings
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[checkpoint].model
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join(
+            "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+            f"checkpoint_{checkpoint}"
+            ),
+        fairseq_root=fairseq_root,
+    )
+
+
+# The remaining finetuning experiments all use only the last checkpoint (final_cp)
+final_cp = 600
+# random vs phoneme mask in finetuning: "phoneme_spec" first, then "random_spec"
+model_conf_w2v = base_model_conf.copy()  # NOTE(review): no mask option set, yet named "phoneme_spec" -- confirm
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "phoneme_spec",
+        f"checkpoint_{final_cp}"
+        ),
+    fairseq_root=fairseq_root,
+)
+model_conf_w2v = base_model_conf.copy()  # fresh copy, the previous run's dict is not reused
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_strategy"] = "random"
+model_conf_w2v["mask_length"] = 10
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "random_spec",
+        f"checkpoint_{final_cp}"
+        ),
+    fairseq_root=fairseq_root,
+)
+
+# phoneme mask lengths in finetuning: "1_phoneme_spec", "2_phoneme_spec", then "1_2_phoneme_spec"
+model_conf_w2v = base_model_conf.copy()  # `mask_length` left unset; same config as the "phoneme_spec" run
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "1_phoneme_spec",
+        f"checkpoint_{final_cp}"
+        ),
+    fairseq_root=fairseq_root,
+)
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_length"] = 2  # only the length is set; `mask_strategy` stays unset
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "2_phoneme_spec",
+        f"checkpoint_{final_cp}"
+        ),
+    fairseq_root=fairseq_root,
+)
+# "1_2_phoneme_spec": mask_length=1 combined with mask_other=1 and mask_selection="uniform"
+model_conf_w2v = base_model_conf.copy()
+model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+model_conf_w2v["mask_length"] = 1
+model_conf_w2v["mask_other"] = 1  # presumably yields mask lengths of 1 or 2, as the run name suggests -- TODO confirm
+model_conf_w2v["mask_selection"] = "uniform"
+eow_phon_ls100_ctc_base(
+    model_conf_w2v=model_conf_w2v,
+    train_name_suffix=os.path.join(
+        "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+        "1_2_phoneme_spec",
+        f"checkpoint_{final_cp}"
+        ),
+    fairseq_root=fairseq_root,
+)
+
+# mask probability in finetuning (0.65 is also the value in base_model_conf)
+for mask_prob in [0.35, 0.5, 0.65, 0.8]:
+    model_conf_w2v = base_model_conf.copy()
+    model_conf_w2v["w2v_path"] = neg_other_trg_phon_boundary_pretrain_job.out_models[final_cp].model
+    model_conf_w2v["mask_prob"] = mask_prob  # only "mask_prob" is overridden; no other mask option is set
+    eow_phon_ls100_ctc_base(
+        model_conf_w2v=model_conf_w2v,
+        train_name_suffix=os.path.join(
+            "w2v_neg_sampling_other_target_phoneme_boundary_masking",
+            f"{str(mask_prob).replace('.', '_')}_phoneme_mask_prob",  # replace '.' with '_' for the folder name
+            f"checkpoint_{final_cp}"
+            ),
+        fairseq_root=fairseq_root,
+    )