Revert "Update users/berger"
This reverts commit bbe56ae.
Simon Berger committed Mar 18, 2024
1 parent a6df1f5 commit b4b6ec3
Showing 19 changed files with 130 additions and 1,294 deletions.
11 changes: 4 additions & 7 deletions common/baselines/librispeech/default_tools.py
@@ -8,17 +8,14 @@
version listed here. Nevertheless, the most recent "head" should be safe to be used as well
"""
from i6_experiments.common.tools.rasr import compile_rasr_binaries_apptainer
from sisyphus import tk
from i6_experiments.common.tools.audio import compile_ffmpeg_binary
from i6_experiments.common.tools.rasr import compile_rasr_binaries_i6mode
from i6_experiments.common.tools.sctk import compile_sctk

# RASR_BINARY_PATH = None
# RASR_BINARY_PATH = compile_rasr_binaries_i6mode(commit="907eec4f4e36c11153f6ab6b5dd7675116f909f6") # use tested RASR
# RASR_BINARY_PATH = compile_rasr_binaries_i6mode() # use most recent RASR
RASR_BINARY_PATH = compile_rasr_binaries_apptainer(
"2023-05-08_tensorflow-2.8_v1", commit="a1218e196557aa6d02570bbb38767e987b7a77a2"
)
# , branch="apptainer_tf_2_8", commit="9dcef411b27a4b302698c83c0af81789ef4de2c2"
# )
RASR_BINARY_PATH = compile_rasr_binaries_i6mode() # use most recent RASR
assert RASR_BINARY_PATH, "Please set a specific RASR_BINARY_PATH before running the pipeline"
RASR_BINARY_PATH.hash_overwrite = "LIBRISPEECH_DEFAULT_RASR_BINARY_PATH"

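Side note on the `hash_overwrite` line that survives the revert: pinning the hash means downstream sisyphus jobs keep their hashes even when the way the RASR binaries are built changes (i6mode build vs. apptainer build). A minimal sketch, assuming a hypothetical prebuilt binary directory:

    from sisyphus import tk

    # Hypothetical prebuilt RASR binaries; the path would normally hash
    # differently per build, so the overwrite keeps downstream job hashes stable.
    RASR_BINARY_PATH = tk.Path("/opt/rasr/arch/linux-x86_64-standard")
    RASR_BINARY_PATH.hash_overwrite = "LIBRISPEECH_DEFAULT_RASR_BINARY_PATH"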
2 changes: 1 addition & 1 deletion common/baselines/tedlium2/lm/ngram_config.py
@@ -54,7 +54,7 @@ def run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping: bool = False, alias_p
srilm_path=SRILM_PATH,
ngram_rqmt=None,
perplexity_rqmt=None,
mail_address=gs.MAIL_ADDRESS if hasattr(gs, "MAIL_ADDRESS") else None,
mail_address=gs.MAIL_ADDRESS,
)
ngram_system.run_training()

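Aside: the guard removed here can be written more compactly with `getattr`. A one-line sketch of the equivalent, not part of the commit:

    mail_address = getattr(gs, "MAIL_ADDRESS", None)  # None when the global setting is absent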
6 changes: 1 addition & 5 deletions common/setups/lm/srilm_system.py
@@ -318,11 +318,7 @@ def _format_report_perplexities(self, ppl_dict: Dict[str, Union[str, tk.Variable
for order in self.ngram_order:
out_str = str(order).ljust(len(order_header))
for eval_name in self.eval_data.keys():
var = ppl_dict[f"{train_name}_{order}gram_{eval_name}"]
if not var.is_set():
out_str += "None".ljust(max_size)
else:
out_str += f"{var.get():.2f}".ljust(max_size)
out_str += f'{ppl_dict[f"{train_name}_{order}gram_{eval_name}"].get():.2f}'.ljust(max_size)
out_str += " "
out.append(out_str)

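For reference, the deleted guard handled sisyphus variables whose jobs have not run yet; the same logic, condensed to one line:

    var = ppl_dict[f"{train_name}_{order}gram_{eval_name}"]
    out_str += ("None" if not var.is_set() else f"{var.get():.2f}").ljust(max_size)

The restored single-line version assumes every perplexity variable is already set when the report is formatted.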
12 changes: 4 additions & 8 deletions common/setups/returnn_common/serialization.py
@@ -43,7 +43,6 @@
"""

from __future__ import annotations
import copy
from typing import Any, List, Union, Optional, Dict, Set
from dataclasses import dataclass, asdict
import os
@@ -175,11 +174,8 @@ def get(self) -> str:
assert False, "invalid type for packages"
target_package_path = os.path.join(out_dir, package_path)
pathlib.Path(os.path.dirname(target_package_path)).mkdir(parents=True, exist_ok=True)
try:
shutil.copytree(os.path.join(self.root_path, package_path), target_package_path)
except FileExistsError:
pass
content.append(f"sys.path.insert(0, os.path.dirname(__file__))\n")
shutil.copytree(os.path.join(self.root_path, package_path), target_package_path)
content.append(f"sys.path.insert(0, os.path.dirname(__file__))\n")
else:
content.append(f"sys.path.insert(0, {self.root_path!r})\n")

@@ -327,14 +323,14 @@ def __init__(
"""
super().__init__()
self.net_func_name = net_func_name
self.net_kwargs = copy.deepcopy(net_kwargs)
self.net_kwargs = net_kwargs
self.net_kwargs.update({k: CodeWrapper(v) for k, v in net_func_map.items()})

def get(self):
"""get"""
return string.Template(self.TEMPLATE).substitute(
{
"NETWORK_KWARGS": str(instanciate_delayed(self.net_kwargs)),
"NETWORK_KWARGS": str(self.net_kwargs),
"FUNCTION_NAME": self.net_func_name,
}
)
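Note on the two reverted lines in this file: without `copy.deepcopy`, the `self.net_kwargs.update(...)` call mutates the dict the caller passed in, and without `instanciate_delayed`, delayed sisyphus values would land in the config unresolved. A toy sketch of the mutation issue, independent of the i6 classes:

    import copy

    def make_net(net_kwargs: dict) -> dict:
        kwargs = copy.deepcopy(net_kwargs)  # defensive copy, as in the reverted code
        kwargs.update({"func": "wrapped"})  # safe: the caller's dict is untouched
        return kwargs

    caller_kwargs = {"dim": 512}
    make_net(caller_kwargs)
    assert "func" not in caller_kwargs      # holds only because of the copy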
96 changes: 39 additions & 57 deletions common/setups/returnn_pytorch/serialization.py
@@ -13,7 +13,7 @@
import torch
from i6_core.util import instanciate_delayed
from sisyphus import gs, tk
from sisyphus.delayed_ops import DelayedBase, DelayedFormat
from sisyphus.delayed_ops import DelayedBase
from sisyphus.hash import sis_hash_helper

if TYPE_CHECKING:
@@ -163,108 +163,90 @@ def build_config_constructor_serializers(
"""
from i6_models.config import ModelConfiguration, ModuleFactoryV1

def serialize_value(value: Any) -> Tuple[Union[str, DelayedBase], List[Import]]:
# Switch over serialization logic for different subtypes
# Import the class of <cfg>
imports = [
Import(
code_object_path=f"{type(cfg).__module__}.{type(cfg).__name__}", unhashed_package_root=unhashed_package_root
)
]

call_kwargs = []

# Iterate over all dataclass fields
for key in fields(type(cfg)):
# Value corresponding to dataclass field name
value = getattr(cfg, key.name)

# Switch over serialization logic for different subtypes
if isinstance(value, ModelConfiguration):
# Example:
# ConformerBlockConfig(mhsa_config=ConformerMHSAConfig(...))
# -> Sub-Constructor-Call and imports for ConformerMHSAConfig
return build_config_constructor_serializers(value, unhashed_package_root=unhashed_package_root)
subcall, subimports = build_config_constructor_serializers(value)
imports += subimports
call_kwargs.append((key.name, subcall))
elif isinstance(value, ModuleFactoryV1):
# Example:
# ConformerEncoderConfig(
# frontend=ModuleFactoryV1(module_class=VGGFrontend, cfg=VGGFrontendConfig(...)))
# -> Import classes ModuleFactoryV1, VGGFrontend and VGGFrontendConfig
# -> Sub-Constructor-Call for VGGFrontendConfig
subcall, subimports = build_config_constructor_serializers(value.cfg, unhashed_package_root=unhashed_package_root)
subimports.append(
subcall, subimports = build_config_constructor_serializers(value.cfg)
imports += subimports
imports.append(
Import(
code_object_path=f"{value.module_class.__module__}.{value.module_class.__name__}",
unhashed_package_root=unhashed_package_root,
)
)
subimports.append(
imports.append(
Import(
code_object_path=f"{ModuleFactoryV1.__module__}.{ModuleFactoryV1.__name__}",
unhashed_package_root=unhashed_package_root,
)
)
return Call(
callable_name=ModuleFactoryV1.__name__,
kwargs=[("module_class", value.module_class.__name__), ("cfg", subcall)],
), subimports
call_kwargs.append(
(
key.name,
Call(
callable_name=ModuleFactoryV1.__name__,
kwargs=[("module_class", value.module_class.__name__), ("cfg", subcall)],
),
)
)
elif isinstance(value, torch.nn.Module):
# Example:
# ConformerConvolutionConfig(norm=BatchNorm1d(...))
# -> Import class BatchNorm1d
# -> Sub-serialization of BatchNorm1d object.
# The __str__ function of torch.nn.Module already does this in the way we want.
return str(value), [
imports.append(
Import(
code_object_path=f"{value.__module__}.{type(value).__name__}",
unhashed_package_root=unhashed_package_root,
)
]
)
call_kwargs.append((key.name, str(value)))
elif isfunction(value):
# Example:
# ConformerConvolutionConfig(activation=torch.nn.functional.silu)
# -> Import function silu
# Builtins (e.g. 'sum') do not need to be imported
if value.__module__ != "builtins":
subimports = [
imports.append(
Import(
code_object_path=f"{value.__module__}.{value.__name__}",
unhashed_package_root=unhashed_package_root,
)
]
else:
subimports = []
return value.__name__, subimports
elif isinstance(value, list):
# -> Serialize list values individually, collect subimports
list_items = []
list_imports = []
for item in value:
item_serialized, item_imports = serialize_value(item)
list_items.append(item_serialized)
list_imports += item_imports
return DelayedFormat(f"[{', '.join(['{}'] * len(list_items))}]", *list_items), list_imports
elif isinstance(value, dict):
# -> Serialize dict values individually, collect subimports
dict_items = [] # Will alternatingly contain key and value of all dict items
dict_imports = []
for key, val in value.items():
val_serialized, item_imports = serialize_value(val)
dict_items += [key, val_serialized]
dict_imports += item_imports
return DelayedFormat(f"{{{', '.join(['{}: {}'] * len(dict_items))}}}", *dict_items), dict_imports
)
call_kwargs.append((key.name, value.__name__))
elif isinstance(value, DelayedBase):
# sisyphus variables are just given as-is and will be instanciated only when calling "get".
return value, []
call_kwargs.append((key.name, value))
else:
# No special case (usually python primitives)
# -> Just get string representation
return str(value), []


# Import the class of <cfg>
imports = [
Import(
code_object_path=f"{type(cfg).__module__}.{type(cfg).__name__}", unhashed_package_root=unhashed_package_root
)
]

call_kwargs = []

# Iterate over all dataclass fields
for key in fields(type(cfg)):
# Value corresponding to dataclass field name
value = getattr(cfg, key.name)

serialized_value, value_imports = serialize_value(value)
call_kwargs.append((key.name, serialized_value))
imports += value_imports
call_kwargs.append((key.name, str(value)))

imports = list(OrderedDict.fromkeys(imports)) # remove duplications

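The revert replaces a recursive `serialize_value` helper, which also covered lists and dicts via `DelayedFormat`, with an inline if/elif chain over the dataclass fields. A toy sketch of the recursive shape the removed helper had; illustrative only, not the i6_experiments implementation (it returns plain strings instead of delayed objects):

    from inspect import isfunction
    from typing import Any, List, Tuple

    def serialize_value(value: Any) -> Tuple[str, List[str]]:
        """Return (code_string, required_imports) for a config value."""
        if isinstance(value, list):
            parts = [serialize_value(v) for v in value]
            items = ", ".join(p for p, _ in parts)
            return f"[{items}]", [i for _, imps in parts for i in imps]
        if isinstance(value, dict):
            parts = {k: serialize_value(v) for k, v in value.items()}
            items = ", ".join(f"{k!r}: {p}" for k, (p, _) in parts.items())
            return f"{{{items}}}", [i for _, (_, imps) in parts.items() for i in imps]
        if isfunction(value) and value.__module__ != "builtins":
            return value.__name__, [f"{value.__module__}.{value.__name__}"]
        return repr(value), []

The recursion is what lets container types reuse the scalar cases for their elements; the restored inline chain handles each field's value exactly once and has no list or dict branch.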
19 changes: 0 additions & 19 deletions common/tools/rasr.py
@@ -35,22 +35,3 @@ def compile_rasr_binaries_i6mode(
)
make_job.rqmt["mem"] = 8
return make_job.out_links["binaries"]


def compile_rasr_binaries_apptainer(
apptainer_image_version: str, # Most recent: 2023-05-08_tensorflow-2.8_v1
branch: Optional[str] = None,
commit: Optional[str] = None,
rasr_git_repository: str = "https://github.com/rwth-i6/rasr",
rasr_arch: str = "linux-x86_64-standard",
) -> tk.Path:
rasr_repo = CloneGitRepositoryJob(rasr_git_repository, branch=branch, commit=commit).out_repository
make_job = MakeJob(
folder=rasr_repo,
make_sequence=["build", "install"],
configure_opts=[f"--apptainer-setup={apptainer_image_version}"],
num_processes=8,
link_outputs={"binaries": f"arch/{rasr_arch}/"},
)
make_job.rqmt["mem"] = 8
return make_job.out_links["binaries"]
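For context, before this revert the removed helper was invoked from default_tools.py roughly like this (arguments as they appear in the first file of this diff):

    RASR_BINARY_PATH = compile_rasr_binaries_apptainer(
        "2023-05-08_tensorflow-2.8_v1",
        commit="a1218e196557aa6d02570bbb38767e987b7a77a2",
    )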
7 changes: 2 additions & 5 deletions pyproject.toml
@@ -1,7 +1,4 @@
[tool.black]
line-length = 120
target-version = ["py38"]
# exclude = 'users'

[tool.ruff]
line-length = 120
target-version = ["py37"]
exclude = 'users'
@@ -17,15 +17,12 @@

# from .config_02e_transducer_rasr_features_tinaconf_old import py as py_02e_old
from .config_02e_transducer_rasr_features_tinaconf import py as py_02e
from .config_02e_transducer_rasr_features_tinaconf_rtf import py as py_02e_rtf

# from .config_02c_transducer_wei import py as py_02c
# from .config_02d_transducer_rasr_features_dc import py as py_02d
from .config_03a_transducer_fullsum_raw_samples import py as py_03a
from .config_03b_transducer_fullsum_rasr_features import py as py_03b

from .config_04b_transducer_fullsum_from_scratch_rasr_features import py as py_04b

# from .config_03c_transducer_fullsum_wei import py as py_03c

# from .config_test_1 import py as py_test_1
@@ -54,18 +51,16 @@ def main() -> SummaryReport:
sub_reports.append(copy.deepcopy(py_02b()[0]))
# sub_reports.append(copy.deepcopy(py_02e_old()))
sub_reports.append(copy.deepcopy(py_02e()))
sub_reports.append(copy.deepcopy(py_02e_rtf()))
sub_reports.append(copy.deepcopy(py_03a()))
sub_reports.append(copy.deepcopy(py_03b()))
sub_reports.append(copy.deepcopy(py_04b()))

for report in sub_reports:
report.collapse(
[SummaryKey.CORPUS.value], best_selector_key=SummaryKey.ERR.value
) # Keep one row for each recognition corpus
summary_report.merge_report(report, update_structure=True)

summary_report.set_col_sort_key([SummaryKey.ERR.value, SummaryKey.CORPUS.value])
summary_report.set_col_sort_key([SummaryKey.ERR.value, SummaryKey.WER.value, SummaryKey.CORPUS.value])

tk.register_report("summary.report", summary_report)
