diff --git a/users/zeyer/experiments/exp2024_04_23_baselines/ctc_claix2023.py b/users/zeyer/experiments/exp2024_04_23_baselines/ctc_claix2023.py index e284e6a8b..91a2a51bc 100644 --- a/users/zeyer/experiments/exp2024_04_23_baselines/ctc_claix2023.py +++ b/users/zeyer/experiments/exp2024_04_23_baselines/ctc_claix2023.py @@ -30,27 +30,48 @@ def py(): from returnn.frontend.encoder.conformer import ConformerConvSubsample # Consistency regularization (CR) (crLoss). - for opts in [ + for opts, cr_ctc_variants in [ # Baseline (n12) has {"dev-clean": 2.35, "dev-other": 5.65, "test-clean": 2.66, "test-other": 5.94}. # CLAIX baseline: {"dev-clean": 2.54, "dev-other": 5.93, "test-clean": 2.68, "test-other": 6.27} # CLAIX CR: {"dev-clean": 2.49, "dev-other": 5.99, "test-clean": 2.68, "test-other": 6.05} # v6-relPosAttDef-noBias-aedLoss-bhv20-11gb-f32-bs15k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-featBN-speedpertV2-spm10k-bpeSample001 # {"num_enc_layers": 12, "batch_size": 200_000, "vocab": "spm10k"}, - {"num_enc_layers": 12, "batch_size": 150_000, "vocab": "spm10k"}, + ( + {"num_enc_layers": 12, "batch_size": 150_000, "vocab": "spm10k"}, + [ + None, + {"cr_loss_scale": 0.1}, + {"cr_loss_scale": 0.2}, + ], + ), # Baseline (n16, spm10k) has {"dev-clean": 2.26, "dev-other": 5.44, "test-clean": 2.5, "test-other": 5.62}. # v6-n16-relPosAttDef-noBias-aedLoss-bhv20-11gb-f32-bs10k-accgrad1-mgpu4-pavg100-wd1e_2-lrlin1e_5_295k-featBN-speedpertV2-spm10k-bpeSample001 # This here is now spm512 though. # Note: In the original CR paper, they don't have time-downsampling! # {"num_enc_layers": 16, "batch_size": 10_000, "vocab": "spm512"}, # No CR: 6.18, CR 0.2: 5.96, CR 0.5: 6.05, CR 1.0: 6.22 - {"num_enc_layers": 12, "batch_size": 200_000, "vocab": "spm512"}, + ( + {"num_enc_layers": 12, "batch_size": 200_000, "vocab": "spm512"}, + [ + None, + {"cr_loss_scale": 0.1}, + {"cr_loss_scale": 0.2}, + {"cr_loss_scale": 0.2, "cr_loss_on_aux_probs": True}, + ], + ), # {"num_enc_layers": 12, "batch_size": 150_000, "vocab": "spm512", "time_downsampling": 4}, # {"num_enc_layers": 12, "batch_size": 75_000, "vocab": "spm512", "time_downsampling": 2}, ]: - for cr_ctc in [None, {"cr_loss_scale": 0.1}, {"cr_loss_scale": 0.2}]: + for cr_ctc in cr_ctc_variants: # TODO also adapt specaug for CR... use_cr_ctc = cr_ctc is not None - name = f"crLoss{cr_ctc['cr_loss_scale']}-" if use_cr_ctc else "" + if use_cr_ctc: + name = f"crLoss{cr_ctc['cr_loss_scale']}" + if cr_ctc.get("cr_loss_on_aux_probs"): + name += "_withAux" + name += "-" + else: + name = "" if opts.get("time_downsampling"): name += f"time{opts['time_downsampling']}-" name += f"n{opts['num_enc_layers']}-{opts['vocab']}-auxAED"