From 795979e667b2730ee87b1ba62eb1e883e058c3d0 Mon Sep 17 00:00:00 2001 From: Max-Ryujin Date: Thu, 5 Sep 2024 22:50:40 +0200 Subject: [PATCH 1/9] added difference --- .../switchboard/ctc/feat/experiments.py | 27 ++++++++++++++++--- .../network_helpers/specaug_configurable.py | 13 ++++++++- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/users/vieting/experiments/switchboard/ctc/feat/experiments.py b/users/vieting/experiments/switchboard/ctc/feat/experiments.py index c0f04a15b..a00f2d86b 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/experiments.py +++ b/users/vieting/experiments/switchboard/ctc/feat/experiments.py @@ -858,13 +858,13 @@ def run_scf_specaug(): "max_number_masks_for_filter_based_specaug": 75, }, ), - "peakToAverage_based_specaug": dict( + "peakToAverageRatio_based_specaug": dict( returnn_args={ **base_returnn_args, "specaug_config": { "steps_per_epoch": 4100, "enable_sorting": False, - "filter_based_masking_strategy": "peakToAverage", + "filter_based_masking_strategy": "peakToAverageRatio", "enable_logging": True, "filter_factor": 0.5, "max_number_masks_for_filter_based_specaug": 75, @@ -874,7 +874,28 @@ def run_scf_specaug(): lr_args=lr_args, report_args={ "batch_size": "2x5k", - "filter_based_masking_strategy": "peakToAverage", + "filter_based_masking_strategy": "peakToAverageRatio", + "filter_factor": 0.5, + "max_number_masks_for_filter_based_specaug": 75, + }, + ), + "peakToAverageDifference_based_specaug": dict( + returnn_args={ + **base_returnn_args, + "specaug_config": { + "steps_per_epoch": 4100, + "enable_sorting": False, + "filter_based_masking_strategy": "peakToAverageDifference", + "enable_logging": True, + "filter_factor": 0.5, + "max_number_masks_for_filter_based_specaug": 75, + }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={ + "batch_size": "2x5k", + "filter_based_masking_strategy": "peakToAverageDifference", "filter_factor": 0.5, "max_number_masks_for_filter_based_specaug": 75, }, 
diff --git a/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug_configurable.py b/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug_configurable.py index 5f4fdf8cc..8382e12d3 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug_configurable.py +++ b/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug_configurable.py @@ -371,7 +371,7 @@ def get_masked(): probs = variance / tf.reduce_sum(variance) uniform_probs = tf.ones_like(probs) / tf.cast(n_features, tf.float32) final_probs = filter_factor * probs + (1 - filter_factor) * uniform_probs - elif config["filter_based_masking_strategy"] == "peakToAverage": + elif config["filter_based_masking_strategy"] == "peakToAverageRatio": # Get peak to average ratio for each filter f_resp = get_frequency_response(filter_layer) peak = tf.reduce_max(f_resp, axis=0) @@ -382,6 +382,17 @@ def get_masked(): probs = ratio / tf.reduce_sum(ratio) uniform_probs = tf.ones_like(probs) / tf.cast(n_features, tf.float32) final_probs = filter_factor * probs + (1 - filter_factor) * uniform_probs + elif config["filter_based_masking_strategy"] == "peakToAverageDifference": + # Get peak to average ratio for each filter + f_resp = get_frequency_response(filter_layer) + peak = tf.reduce_max(f_resp, axis=0) + average = tf.reduce_mean(f_resp, axis=0) + ratio = peak - average + n_features = tf.shape(x)[data.feature_dim_axis] + # Normalize the ratio to get probabilities + probs = ratio / tf.reduce_sum(ratio) + uniform_probs = tf.ones_like(probs) / tf.cast(n_features, tf.float32) + final_probs = filter_factor * probs + (1 - filter_factor) * uniform_probs enable_logging = tf.convert_to_tensor(config["enable_logging"], dtype=tf.bool) From bf987ef854e298daf35200d72f2da8635a376f5a Mon Sep 17 00:00:00 2001 From: Max-Ryujin Date: Mon, 9 Sep 2024 13:54:24 +0200 Subject: [PATCH 2/9] Revert "added difference" This reverts commit 795979e667b2730ee87b1ba62eb1e883e058c3d0. 
--- .../switchboard/ctc/feat/experiments.py | 27 +++---------------- .../network_helpers/specaug_configurable.py | 13 +-------- 2 files changed, 4 insertions(+), 36 deletions(-) diff --git a/users/vieting/experiments/switchboard/ctc/feat/experiments.py b/users/vieting/experiments/switchboard/ctc/feat/experiments.py index a00f2d86b..c0f04a15b 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/experiments.py +++ b/users/vieting/experiments/switchboard/ctc/feat/experiments.py @@ -858,13 +858,13 @@ def run_scf_specaug(): "max_number_masks_for_filter_based_specaug": 75, }, ), - "peakToAverageRatio_based_specaug": dict( + "peakToAverage_based_specaug": dict( returnn_args={ **base_returnn_args, "specaug_config": { "steps_per_epoch": 4100, "enable_sorting": False, - "filter_based_masking_strategy": "peakToAverageRatio", + "filter_based_masking_strategy": "peakToAverage", "enable_logging": True, "filter_factor": 0.5, "max_number_masks_for_filter_based_specaug": 75, @@ -874,28 +874,7 @@ def run_scf_specaug(): lr_args=lr_args, report_args={ "batch_size": "2x5k", - "filter_based_masking_strategy": "peakToAverageRatio", - "filter_factor": 0.5, - "max_number_masks_for_filter_based_specaug": 75, - }, - ), - "peakToAverageDifference_based_specaug": dict( - returnn_args={ - **base_returnn_args, - "specaug_config": { - "steps_per_epoch": 4100, - "enable_sorting": False, - "filter_based_masking_strategy": "peakToAverageDifference", - "enable_logging": True, - "filter_factor": 0.5, - "max_number_masks_for_filter_based_specaug": 75, - }, - }, - feature_args=feature_args, - lr_args=lr_args, - report_args={ - "batch_size": "2x5k", - "filter_based_masking_strategy": "peakToAverageDifference", + "filter_based_masking_strategy": "peakToAverage", "filter_factor": 0.5, "max_number_masks_for_filter_based_specaug": 75, }, diff --git a/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug_configurable.py 
b/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug_configurable.py index 8382e12d3..5f4fdf8cc 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug_configurable.py +++ b/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug_configurable.py @@ -371,7 +371,7 @@ def get_masked(): probs = variance / tf.reduce_sum(variance) uniform_probs = tf.ones_like(probs) / tf.cast(n_features, tf.float32) final_probs = filter_factor * probs + (1 - filter_factor) * uniform_probs - elif config["filter_based_masking_strategy"] == "peakToAverageRatio": + elif config["filter_based_masking_strategy"] == "peakToAverage": # Get peak to average ratio for each filter f_resp = get_frequency_response(filter_layer) peak = tf.reduce_max(f_resp, axis=0) @@ -382,17 +382,6 @@ def get_masked(): probs = ratio / tf.reduce_sum(ratio) uniform_probs = tf.ones_like(probs) / tf.cast(n_features, tf.float32) final_probs = filter_factor * probs + (1 - filter_factor) * uniform_probs - elif config["filter_based_masking_strategy"] == "peakToAverageDifference": - # Get peak to average ratio for each filter - f_resp = get_frequency_response(filter_layer) - peak = tf.reduce_max(f_resp, axis=0) - average = tf.reduce_mean(f_resp, axis=0) - ratio = peak - average - n_features = tf.shape(x)[data.feature_dim_axis] - # Normalize the ratio to get probabilities - probs = ratio / tf.reduce_sum(ratio) - uniform_probs = tf.ones_like(probs) / tf.cast(n_features, tf.float32) - final_probs = filter_factor * probs + (1 - filter_factor) * uniform_probs enable_logging = tf.convert_to_tensor(config["enable_logging"], dtype=tf.bool) From 965205a182fc4d5c4493f53c962202eaea4f930a Mon Sep 17 00:00:00 2001 From: Max-Ryujin Date: Mon, 20 Jan 2025 13:11:05 +0100 Subject: [PATCH 3/9] make masking type independent --- .../experiments/switchboard/ctc/feat/network_helpers/specaug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug.py b/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug.py index ec93fe6e7..c460b0a9e 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug.py +++ b/users/vieting/experiments/switchboard/ctc/feat/network_helpers/specaug.py @@ -67,7 +67,7 @@ def _mask(x, batch_axis, axis, pos, max_amount): ) from TFUtil import where_bc - x = where_bc(cond, 0.0, x) + x = where_bc(cond, tf.constant(0.0, dtype=x.dtype), x) return x From 7395623e331bb8975e4b81b22a0a4b57cc5bb4b4 Mon Sep 17 00:00:00 2001 From: Max-Ryujin Date: Mon, 20 Jan 2025 16:35:18 +0100 Subject: [PATCH 4/9] added experiments --- .../switchboard/ctc/feat/experiments.py | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/users/vieting/experiments/switchboard/ctc/feat/experiments.py b/users/vieting/experiments/switchboard/ctc/feat/experiments.py index c0f04a15b..589367375 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/experiments.py +++ b/users/vieting/experiments/switchboard/ctc/feat/experiments.py @@ -1122,6 +1122,131 @@ def run_mel_audio_perturbation_from_checkpoint(): return report +def run_stft_experiments(): + gs.ALIAS_AND_OUTPUT_SUBDIR = "experiments/switchboard/ctc/feat/" + + ( + returnn_datasets, + rasr_loss_corpus_path, + rasr_loss_corpus_segments, + rasr_loss_lexicon_path, + dev_corpora, + ) = get_datasets() + returnn_args = { + "batch_size": 5000, + "rasr_binary_path": RASR_BINARY_PATH, + "rasr_loss_corpus_path": rasr_loss_corpus_path, + "rasr_loss_corpus_segments": rasr_loss_corpus_segments, + "rasr_loss_lexicon_path": rasr_loss_lexicon_path, + "datasets": returnn_datasets, + "extra_args": { + "accum_grad_multiple_step": 2, + "conv_pad_seq_len_to_power": 1.5, + }, + "conformer_type": "wei", + } + feature_args = {"class": "ScfNetwork", "size_tf": 256 // 2, "stride_tf": 10 // 2, "preemphasis": 0.97} + feature_args_lgm = {"class": "LogMelNetwork", 
"wave_norm": True, "frame_size": 200, "frame_shift": 80, "fft_size": 256} + lr_args = { + "peak_lr": 4e-4, + "start_lr": 1.325e-05, + "end_lr": 1e-5, + "increase_epochs": 180, + "decrease_epochs": 180, + "final_epochs": 0, + } + + nn_args, report_args_collection = get_nn_args_baseline( + nn_base_args={ + "bs2x5k_scf_stft_time_only": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 0, "max_feature_num": 0, "stft": True }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_scf_stft_mask_1_1": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 1, "max_feature_num": 1, "stft": True }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_scf_stft_mask_2_4": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 4, "max_feature_num": 2, "stft": True }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_scf_stft_mask_5_8": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 8, "stft": True }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_scf_stft_mask_5_8_checkpoint": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 8, "stft": True }, + "extra_args": + { + **returnn_args["extra_args"], + "dummy": "checkpoint", + } + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_scf_stft_mask_5_15": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 15, "stft": True }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_mel_stft_mask_5_8": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 8, "stft": True }, + }, + 
feature_args=feature_args_lgm, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + }, + num_epochs=450, + evaluation_epochs=[350, 390, 400, 410, 450], + prefix="conformer_", + ) + + returnn_root = CloneGitRepositoryJob( + "https://github.com/rwth-i6/returnn", + commit="c4d36d06f6465e82a50d400d114259e07b8b0709", + ).out_repository + returnn_root.hash_overwrite = "returnn_conv_padding" + report, ctc_nn_system = run_nn_args( + nn_args, + report_args_collection, + dev_corpora, + "report_stft", + returnn_root=returnn_root, + recog_args={"epochs": [350, 390, 400, 410, 450]}, + ) + return report, ctc_nn_system + def py(): """ called if the file is passed to sis manager, used to run all experiments (replacement for main) From 898e23480640142522450c59fed1048780404cda Mon Sep 17 00:00:00 2001 From: Max-Ryujin Date: Mon, 20 Jan 2025 19:12:27 +0100 Subject: [PATCH 5/9] report --- users/vieting/experiments/switchboard/ctc/feat/experiments.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/users/vieting/experiments/switchboard/ctc/feat/experiments.py b/users/vieting/experiments/switchboard/ctc/feat/experiments.py index 589367375..a565f6b73 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/experiments.py +++ b/users/vieting/experiments/switchboard/ctc/feat/experiments.py @@ -1256,6 +1256,7 @@ def py(): report_scf_specaug_sort = run_scf_specaug_sort() report_scf_audio_perturbation_from_checkpoint = run_scf_audio_perturbation_from_checkpoint() report_mel_audio_perturbation_from_checkpoint = run_mel_audio_perturbation_from_checkpoint() + report_stft = run_stft_experiments() report_base = Report( columns_start=["train_name", "batch_size"], @@ -1269,6 +1270,7 @@ def py(): report_scf_specaug_sort, report_scf_audio_perturbation_from_checkpoint, report_mel_audio_perturbation_from_checkpoint, + report_stft, ] ) tk.register_report( From 9c31747361de7dc641b4b1ff9ef993dffe826f7e Mon Sep 17 00:00:00 2001 From: Max-Ryujin Date: Mon, 20 Jan 2025 20:40:07 
+0100 Subject: [PATCH 6/9] added layer configuration --- .../switchboard/ctc/feat/baseline_args.py | 3 + .../switchboard/ctc/feat/experiments.py | 120 +++++++++--------- .../ctc/feat/fullsum_ctc_raw_samples.py | 55 ++++++-- 3 files changed, 107 insertions(+), 71 deletions(-) diff --git a/users/vieting/experiments/switchboard/ctc/feat/baseline_args.py b/users/vieting/experiments/switchboard/ctc/feat/baseline_args.py index 1fb03e574..5589c1a9f 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/baseline_args.py +++ b/users/vieting/experiments/switchboard/ctc/feat/baseline_args.py @@ -225,12 +225,15 @@ def get_returnn_config( if audio_perturbation: prolog += get_code_for_perturbation() for layer in list(network.keys()): + if layer in ("stft", "istft", "wave_input"): + continue if network[layer]["from"] == "data": network[layer]["from"] = "features" elif isinstance(network[layer]["from"], list) and "data" in network[layer]["from"]: assert len(network[layer]["from"]) == 1 network[layer]["from"] = "features" network["features"] = feature_net + feature_net["from"] = "wave_input" if recognition: for layer in list(network.keys()): if "aux" in layer: diff --git a/users/vieting/experiments/switchboard/ctc/feat/experiments.py b/users/vieting/experiments/switchboard/ctc/feat/experiments.py index a565f6b73..8755f4d2b 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/experiments.py +++ b/users/vieting/experiments/switchboard/ctc/feat/experiments.py @@ -1158,74 +1158,74 @@ def run_stft_experiments(): nn_args, report_args_collection = get_nn_args_baseline( nn_base_args={ - "bs2x5k_scf_stft_time_only": dict( - returnn_args={ - **returnn_args, - "specaug_old": {"max_feature": 0, "max_feature_num": 0, "stft": True }, - }, - feature_args=feature_args, - lr_args=lr_args, - report_args={"batch_size": "2x5k", "stft": True}, - ), - "bs2x5k_scf_stft_mask_1_1": dict( - returnn_args={ - **returnn_args, - "specaug_old": {"max_feature": 1, "max_feature_num": 1, "stft": True }, 
- }, - feature_args=feature_args, - lr_args=lr_args, - report_args={"batch_size": "2x5k", "stft": True}, - ), - "bs2x5k_scf_stft_mask_2_4": dict( - returnn_args={ - **returnn_args, - "specaug_old": {"max_feature": 4, "max_feature_num": 2, "stft": True }, - }, - feature_args=feature_args, - lr_args=lr_args, - report_args={"batch_size": "2x5k", "stft": True}, - ), - "bs2x5k_scf_stft_mask_5_8": dict( - returnn_args={ - **returnn_args, - "specaug_old": {"max_feature": 8, "stft": True }, - }, - feature_args=feature_args, - lr_args=lr_args, - report_args={"batch_size": "2x5k", "stft": True}, - ), - "bs2x5k_scf_stft_mask_5_8_checkpoint": dict( - returnn_args={ - **returnn_args, - "specaug_old": {"max_feature": 8, "stft": True }, - "extra_args": - { - **returnn_args["extra_args"], - "dummy": "checkpoint", - } - }, - feature_args=feature_args, - lr_args=lr_args, - report_args={"batch_size": "2x5k", "stft": True}, - ), + # "bs2x5k_scf_stft_time_only": dict( + # returnn_args={ + # **returnn_args, + # "specaug_old": {"max_feature": 0, "max_feature_num": 0, "stft": True }, + # }, + # feature_args=feature_args, + # lr_args=lr_args, + # report_args={"batch_size": "2x5k", "stft": True}, + # ), + # "bs2x5k_scf_stft_mask_1_1": dict( + # returnn_args={ + # **returnn_args, + # "specaug_old": {"max_feature": 1, "max_feature_num": 1, "stft": True }, + # }, + # feature_args=feature_args, + # lr_args=lr_args, + # report_args={"batch_size": "2x5k", "stft": True}, + # ), + # "bs2x5k_scf_stft_mask_2_4": dict( + # returnn_args={ + # **returnn_args, + # "specaug_old": {"max_feature": 4, "max_feature_num": 2, "stft": True }, + # }, + # feature_args=feature_args, + # lr_args=lr_args, + # report_args={"batch_size": "2x5k", "stft": True}, + # ), + # "bs2x5k_scf_stft_mask_5_8": dict( + # returnn_args={ + # **returnn_args, + # "specaug_old": {"max_feature": 8, "stft": True }, + # }, + # feature_args=feature_args, + # lr_args=lr_args, + # report_args={"batch_size": "2x5k", "stft": True}, + # ), + # 
"bs2x5k_scf_stft_mask_5_8_checkpoint": dict( + # returnn_args={ + # **returnn_args, + # "specaug_old": {"max_feature": 8, "stft": True }, + # "extra_args": + # { + # **returnn_args["extra_args"], + # "dummy": "checkpoint", + # } + # }, + # feature_args=feature_args, + # lr_args=lr_args, + # report_args={"batch_size": "2x5k", "stft": True}, + # ), "bs2x5k_scf_stft_mask_5_15": dict( returnn_args={ **returnn_args, - "specaug_old": {"max_feature": 15, "stft": True }, + "specaug_old": {"max_feature": 15}, }, feature_args=feature_args, lr_args=lr_args, report_args={"batch_size": "2x5k", "stft": True}, ), - "bs2x5k_mel_stft_mask_5_8": dict( - returnn_args={ - **returnn_args, - "specaug_old": {"max_feature": 8, "stft": True }, - }, - feature_args=feature_args_lgm, - lr_args=lr_args, - report_args={"batch_size": "2x5k", "stft": True}, - ), + # "bs2x5k_mel_stft_mask_5_8": dict( + # returnn_args={ + # **returnn_args, + # "specaug_old": {"max_feature": 8, "stft": True }, + # }, + # feature_args=feature_args_lgm, + # lr_args=lr_args, + # report_args={"batch_size": "2x5k", "stft": True}, + # ), }, num_epochs=450, evaluation_epochs=[350, 390, 400, 410, 450], diff --git a/users/vieting/experiments/switchboard/ctc/feat/fullsum_ctc_raw_samples.py b/users/vieting/experiments/switchboard/ctc/feat/fullsum_ctc_raw_samples.py index a81890005..a40e5520f 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/fullsum_ctc_raw_samples.py +++ b/users/vieting/experiments/switchboard/ctc/feat/fullsum_ctc_raw_samples.py @@ -158,19 +158,52 @@ def make_conformer_fullsum_ctc_model( if recognition: python_code = [] + network["wave_input"] = {"class": "copy", "from": "data"} else: if specaug_old is not None: - assert specaug_config is None - sort_layer2 = specaug_old.pop("sort_layer2", False) - specaug_func = add_specaug_layer_sort_layer2 if sort_layer2 else add_specaug_layer - specaug_old_args = { - "max_time_num": 1, - "max_time": 15, - "max_feature_num": 5, - "max_feature": 4, - **specaug_old, 
- } - from_list, python_code = specaug_func(network, from_list=from_list, **specaug_old_args) + if specaug_old.get("stft", False): + specaug_old_args = { + "max_time_num": 1, + "max_time": 15, + "max_feature_num": 5, + "max_feature": 4, + **{k: v for k, v in specaug_old.items() if k != "stft"}, + } + # Add STFT layer + network["stft"] = { + "class": "stft", + "from": ["data"], + "frame_size": 400, + "frame_shift": 160, + "fft_size": 512, + } + from_list = ["stft"] + + specaug_func = add_specaug_layer + from_list, python_code = specaug_func(network, from_list=from_list, **specaug_old_args) + + # Add iSTFT layer + network["istft"] = { + "class": "istft", + "from": from_list, + "frame_size": 400, + "frame_shift": 160, + "fft_size": 512, + } + network["wave_input"] = {"class": "copy", "from": "istft"} + else: + assert specaug_config is None + sort_layer2 = specaug_old.pop("sort_layer2", False) + specaug_func = add_specaug_layer_sort_layer2 if sort_layer2 else add_specaug_layer + specaug_old_args = { + "max_time_num": 1, + "max_time": 15, + "max_feature_num": 5, + "max_feature": 4, + **specaug_old, + } + from_list, python_code = specaug_func(network, from_list=from_list, **specaug_old_args) + network["wave_input"] = {"class": "copy", "from": "data"} elif specaug_config is not None: assert specaug_old is None from_list, python_code = add_specaug_layer_configurable(network, from_list=from_list, num_epochs=num_epochs, config=specaug_config) From 25b0fd1ad84a0da8b1c73e57d3f877e45c2e1797 Mon Sep 17 00:00:00 2001 From: Max-Ryujin Date: Mon, 20 Jan 2025 20:42:08 +0100 Subject: [PATCH 7/9] uncomment --- .../switchboard/ctc/feat/experiments.py | 120 +++++++++--------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/users/vieting/experiments/switchboard/ctc/feat/experiments.py b/users/vieting/experiments/switchboard/ctc/feat/experiments.py index 8755f4d2b..a565f6b73 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/experiments.py +++ 
b/users/vieting/experiments/switchboard/ctc/feat/experiments.py @@ -1158,74 +1158,74 @@ def run_stft_experiments(): nn_args, report_args_collection = get_nn_args_baseline( nn_base_args={ - # "bs2x5k_scf_stft_time_only": dict( - # returnn_args={ - # **returnn_args, - # "specaug_old": {"max_feature": 0, "max_feature_num": 0, "stft": True }, - # }, - # feature_args=feature_args, - # lr_args=lr_args, - # report_args={"batch_size": "2x5k", "stft": True}, - # ), - # "bs2x5k_scf_stft_mask_1_1": dict( - # returnn_args={ - # **returnn_args, - # "specaug_old": {"max_feature": 1, "max_feature_num": 1, "stft": True }, - # }, - # feature_args=feature_args, - # lr_args=lr_args, - # report_args={"batch_size": "2x5k", "stft": True}, - # ), - # "bs2x5k_scf_stft_mask_2_4": dict( - # returnn_args={ - # **returnn_args, - # "specaug_old": {"max_feature": 4, "max_feature_num": 2, "stft": True }, - # }, - # feature_args=feature_args, - # lr_args=lr_args, - # report_args={"batch_size": "2x5k", "stft": True}, - # ), - # "bs2x5k_scf_stft_mask_5_8": dict( - # returnn_args={ - # **returnn_args, - # "specaug_old": {"max_feature": 8, "stft": True }, - # }, - # feature_args=feature_args, - # lr_args=lr_args, - # report_args={"batch_size": "2x5k", "stft": True}, - # ), - # "bs2x5k_scf_stft_mask_5_8_checkpoint": dict( - # returnn_args={ - # **returnn_args, - # "specaug_old": {"max_feature": 8, "stft": True }, - # "extra_args": - # { - # **returnn_args["extra_args"], - # "dummy": "checkpoint", - # } - # }, - # feature_args=feature_args, - # lr_args=lr_args, - # report_args={"batch_size": "2x5k", "stft": True}, - # ), + "bs2x5k_scf_stft_time_only": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 0, "max_feature_num": 0, "stft": True }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_scf_stft_mask_1_1": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 1, "max_feature_num": 1, 
"stft": True }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_scf_stft_mask_2_4": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 4, "max_feature_num": 2, "stft": True }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_scf_stft_mask_5_8": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 8, "stft": True }, + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), + "bs2x5k_scf_stft_mask_5_8_checkpoint": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 8, "stft": True }, + "extra_args": + { + **returnn_args["extra_args"], + "dummy": "checkpoint", + } + }, + feature_args=feature_args, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), "bs2x5k_scf_stft_mask_5_15": dict( returnn_args={ **returnn_args, - "specaug_old": {"max_feature": 15}, + "specaug_old": {"max_feature": 15, "stft": True }, }, feature_args=feature_args, lr_args=lr_args, report_args={"batch_size": "2x5k", "stft": True}, ), - # "bs2x5k_mel_stft_mask_5_8": dict( - # returnn_args={ - # **returnn_args, - # "specaug_old": {"max_feature": 8, "stft": True }, - # }, - # feature_args=feature_args_lgm, - # lr_args=lr_args, - # report_args={"batch_size": "2x5k", "stft": True}, - # ), + "bs2x5k_mel_stft_mask_5_8": dict( + returnn_args={ + **returnn_args, + "specaug_old": {"max_feature": 8, "stft": True }, + }, + feature_args=feature_args_lgm, + lr_args=lr_args, + report_args={"batch_size": "2x5k", "stft": True}, + ), }, num_epochs=450, evaluation_epochs=[350, 390, 400, 410, 450], From bf67072d0cfcf37c446186ed7ed57d45f9538840 Mon Sep 17 00:00:00 2001 From: Max-Ryujin Date: Tue, 21 Jan 2025 09:59:29 +0100 Subject: [PATCH 8/9] black --- .../switchboard/ctc/feat/experiments.py | 28 +++++++++++-------- 
.../ctc/feat/fullsum_ctc_raw_samples.py | 6 ++-- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/users/vieting/experiments/switchboard/ctc/feat/experiments.py b/users/vieting/experiments/switchboard/ctc/feat/experiments.py index a565f6b73..693492784 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/experiments.py +++ b/users/vieting/experiments/switchboard/ctc/feat/experiments.py @@ -1146,7 +1146,13 @@ def run_stft_experiments(): "conformer_type": "wei", } feature_args = {"class": "ScfNetwork", "size_tf": 256 // 2, "stride_tf": 10 // 2, "preemphasis": 0.97} - feature_args_lgm = {"class": "LogMelNetwork", "wave_norm": True, "frame_size": 200, "frame_shift": 80, "fft_size": 256} + feature_args_lgm = { + "class": "LogMelNetwork", + "wave_norm": True, + "frame_size": 200, + "frame_shift": 80, + "fft_size": 256, + } lr_args = { "peak_lr": 4e-4, "start_lr": 1.325e-05, @@ -1161,7 +1167,7 @@ def run_stft_experiments(): "bs2x5k_scf_stft_time_only": dict( returnn_args={ **returnn_args, - "specaug_old": {"max_feature": 0, "max_feature_num": 0, "stft": True }, + "specaug_old": {"max_feature": 0, "max_feature_num": 0, "stft": True}, }, feature_args=feature_args, lr_args=lr_args, @@ -1170,7 +1176,7 @@ def run_stft_experiments(): "bs2x5k_scf_stft_mask_1_1": dict( returnn_args={ **returnn_args, - "specaug_old": {"max_feature": 1, "max_feature_num": 1, "stft": True }, + "specaug_old": {"max_feature": 1, "max_feature_num": 1, "stft": True}, }, feature_args=feature_args, lr_args=lr_args, @@ -1179,7 +1185,7 @@ def run_stft_experiments(): "bs2x5k_scf_stft_mask_2_4": dict( returnn_args={ **returnn_args, - "specaug_old": {"max_feature": 4, "max_feature_num": 2, "stft": True }, + "specaug_old": {"max_feature": 4, "max_feature_num": 2, "stft": True}, }, feature_args=feature_args, lr_args=lr_args, @@ -1188,7 +1194,7 @@ def run_stft_experiments(): "bs2x5k_scf_stft_mask_5_8": dict( returnn_args={ **returnn_args, - "specaug_old": {"max_feature": 8, "stft": True }, + 
"specaug_old": {"max_feature": 8, "stft": True}, }, feature_args=feature_args, lr_args=lr_args, @@ -1197,12 +1203,11 @@ def run_stft_experiments(): "bs2x5k_scf_stft_mask_5_8_checkpoint": dict( returnn_args={ **returnn_args, - "specaug_old": {"max_feature": 8, "stft": True }, - "extra_args": - { + "specaug_old": {"max_feature": 8, "stft": True}, + "extra_args": { **returnn_args["extra_args"], "dummy": "checkpoint", - } + }, }, feature_args=feature_args, lr_args=lr_args, @@ -1211,7 +1216,7 @@ def run_stft_experiments(): "bs2x5k_scf_stft_mask_5_15": dict( returnn_args={ **returnn_args, - "specaug_old": {"max_feature": 15, "stft": True }, + "specaug_old": {"max_feature": 15, "stft": True}, }, feature_args=feature_args, lr_args=lr_args, @@ -1220,7 +1225,7 @@ def run_stft_experiments(): "bs2x5k_mel_stft_mask_5_8": dict( returnn_args={ **returnn_args, - "specaug_old": {"max_feature": 8, "stft": True }, + "specaug_old": {"max_feature": 8, "stft": True}, }, feature_args=feature_args_lgm, lr_args=lr_args, @@ -1247,6 +1252,7 @@ def run_stft_experiments(): ) return report, ctc_nn_system + def py(): """ called if the file is passed to sis manager, used to run all experiments (replacement for main) diff --git a/users/vieting/experiments/switchboard/ctc/feat/fullsum_ctc_raw_samples.py b/users/vieting/experiments/switchboard/ctc/feat/fullsum_ctc_raw_samples.py index a40e5520f..36fa992ee 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/fullsum_ctc_raw_samples.py +++ b/users/vieting/experiments/switchboard/ctc/feat/fullsum_ctc_raw_samples.py @@ -167,7 +167,7 @@ def make_conformer_fullsum_ctc_model( "max_time": 15, "max_feature_num": 5, "max_feature": 4, - **{k: v for k, v in specaug_old.items() if k != "stft"}, + **{k: v for k, v in specaug_old.items() if k != "stft"}, } # Add STFT layer network["stft"] = { @@ -206,7 +206,9 @@ def make_conformer_fullsum_ctc_model( network["wave_input"] = {"class": "copy", "from": "data"} elif specaug_config is not None: assert specaug_old 
is None - from_list, python_code = add_specaug_layer_configurable(network, from_list=from_list, num_epochs=num_epochs, config=specaug_config) + from_list, python_code = add_specaug_layer_configurable( + network, from_list=from_list, num_epochs=num_epochs, config=specaug_config + ) else: from_list, python_code = add_specaug_layer_v2(network, from_list=from_list) From cfbfcbea12b82abe8a83c3dc6b988074073c916c Mon Sep 17 00:00:00 2001 From: Max-Ryujin Date: Tue, 21 Jan 2025 11:12:09 +0100 Subject: [PATCH 9/9] removed redundant experiment --- .../experiments/switchboard/ctc/feat/experiments.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/users/vieting/experiments/switchboard/ctc/feat/experiments.py b/users/vieting/experiments/switchboard/ctc/feat/experiments.py index 693492784..a923b6aad 100644 --- a/users/vieting/experiments/switchboard/ctc/feat/experiments.py +++ b/users/vieting/experiments/switchboard/ctc/feat/experiments.py @@ -1200,19 +1200,6 @@ def run_stft_experiments(): lr_args=lr_args, report_args={"batch_size": "2x5k", "stft": True}, ), - "bs2x5k_scf_stft_mask_5_8_checkpoint": dict( - returnn_args={ - **returnn_args, - "specaug_old": {"max_feature": 8, "stft": True}, - "extra_args": { - **returnn_args["extra_args"], - "dummy": "checkpoint", - }, - }, - feature_args=feature_args, - lr_args=lr_args, - report_args={"batch_size": "2x5k", "stft": True}, - ), "bs2x5k_scf_stft_mask_5_15": dict( returnn_args={ **returnn_args,