
Mlflow benchmark profiler update #38

Open
wants to merge 32 commits into base: develop
6507424
fix: saving frequency bug for inference checkpoints
anaprietonem Aug 20, 2024
7d2d620
Merge branch 'develop' into 257-bug-inference-checkpoints-saving-freq…
anaprietonem Aug 20, 2024
0027046
chore: update CHANGELOG
anaprietonem Aug 20, 2024
8cf698b
feat: add anemoi profiler with mlflow compatibility
anaprietonem Aug 20, 2024
d647bf9
fix: format error
anaprietonem Aug 20, 2024
352cd29
fix: removed atos path from noteook and fixed update_paths function
anaprietonem Aug 23, 2024
c7ab208
add hta functionality in documentation
anaprietonem Oct 7, 2024
ebe33bd
updating docs for profiler
anaprietonem Oct 7, 2024
9c67f3e
update profiler docs
anaprietonem Oct 7, 2024
2bcf957
update profiler docs
anaprietonem Oct 7, 2024
2e6a168
update profiler docs
anaprietonem Oct 7, 2024
29232ce
update profiler docs
anaprietonem Oct 7, 2024
c646e38
update profiler docs
anaprietonem Oct 7, 2024
4d9610b
update profiler docs
anaprietonem Oct 7, 2024
0a4070c
update profiler docs
anaprietonem Oct 7, 2024
45e7a7b
update profiler docs
anaprietonem Oct 7, 2024
3cea9d9
update profiler docs
anaprietonem Oct 7, 2024
3c2f2d9
update profiler docs
anaprietonem Oct 7, 2024
b8fcf99
update profiler docs
anaprietonem Oct 7, 2024
80e5522
Merge branch 'develop' into mlflow_benchmark_profiler_update
anaprietonem Oct 7, 2024
990aea9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 7, 2024
5aeeca4
fixing pre-commits on docs
anaprietonem Oct 7, 2024
b85eac2
fix pre-commit docs
anaprietonem Oct 7, 2024
ef54ffb
fix pre-commit docs
anaprietonem Oct 7, 2024
56e222f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 7, 2024
4aa225a
minor updates
anaprietonem Oct 7, 2024
81b57d8
Merge branch 'mlflow_benchmark_profiler_update' of github.com:ecmwf/a…
anaprietonem Oct 7, 2024
86e58ba
added docs for anemoi profiler
anaprietonem Oct 7, 2024
e943782
add section about profiling in overview
anaprietonem Oct 8, 2024
e177bd6
add section about profiling in overview
anaprietonem Oct 8, 2024
328ca19
add comment to avoid confussion with profiler for troubleshooting
anaprietonem Oct 8, 2024
702287e
added note about limit batches
anaprietonem Oct 9, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -26,6 +26,8 @@ Keep it human-readable, your future self will thank you!
- Feature: Add configurable models [#50](https://github.com/ecmwf/anemoi-training/pulls/50)
- Feature: Support training for datasets with missing time steps [#48](https://github.com/ecmwf/anemoi-training/pulls/48)
- Long Rollout Plots
- Feat: Anemoi Profiler compatible with mlflow and using Pytorch (Kineto) Profiler for memory report


### Fixed

Binary file added docs/images/profiler/anemoi_profiler_config.png
Binary file added docs/images/profiler/example_memory_report.png
Binary file added docs/images/profiler/example_memory_timeline.png
Binary file added docs/images/profiler/example_model_summary.png
Binary file added docs/images/profiler/example_model_summary_2.png
Binary file added docs/images/profiler/example_system_report.png
Binary file added docs/images/profiler/example_time_report.png
Binary file added docs/images/profiler/idle_time_breakdown.png
Binary file added docs/images/profiler/kernel_breakdown_dfs.png
Binary file added docs/images/profiler/kernel_breakdown_plots.png
Binary file added docs/images/profiler/memory_snapshot_output.png
Binary file added docs/images/profiler/temporal_breakdown.png
1 change: 1 addition & 0 deletions docs/index.rst
@@ -43,6 +43,7 @@ This package provides the *Anemoi* training functionality.
user-guide/training
user-guide/models
user-guide/tracking
user-guide/benchmarking
user-guide/distributed
user-guide/debugging

12 changes: 12 additions & 0 deletions docs/overview.rst
@@ -91,6 +91,18 @@ and resolve issues during the training process, including:
- Debug configurations for quick error identification
- Guidance on isolating and addressing common problems

8. Benchmarking and HPC Profiling
=================================

Anemoi Training offers tools and configurations to support benchmarking
and High-Performance Computing (HPC) profiling, allowing users to
optimize training performance. This includes:

- Benchmarking configurations for evaluating training efficiency across
different hardware setups.
- Profiling tools for monitoring resource utilization (CPU, GPU,
memory) and identifying performance bottlenecks.

**************************
Components and Structure
**************************
746 changes: 746 additions & 0 deletions docs/user-guide/benchmarking.rst

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions pyproject.toml
@@ -76,6 +76,13 @@ optional-dependencies.docs = [
  "sphinx-argparse",
  "sphinx-rtd-theme",
]
optional-dependencies.profile = [
  "holistictraceanalysis>=0.2",
  "pandas>=1.3.2",
  "rich>=13.6",
  "tabulate>=0.9",
]

optional-dependencies.tests = [ "hypothesis", "pytest", "pytest-mock" ]

urls.Changelog = "https://github.com/ecmwf/anemoi-training/CHANGELOG.md"
47 changes: 47 additions & 0 deletions src/anemoi/training/commands/profiler.py
@@ -0,0 +1,47 @@
# (C) Copyright 2024 ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

from __future__ import annotations

import logging
import sys
from typing import TYPE_CHECKING

from anemoi.training.commands import Command

if TYPE_CHECKING:
    import argparse

LOGGER = logging.getLogger(__name__)


class Profile(Command):
    """Commands to profile Anemoi models."""

    accept_unknown_args = True

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        return parser

    @staticmethod
    def run(args: list[str], unknown_args: list[str] | None = None) -> None:
        del args

        # Forward any unrecognised arguments to the wrapped entry point via sys.argv.
        if unknown_args is not None:
            sys.argv = [sys.argv[0], *unknown_args]
        else:
            sys.argv = [sys.argv[0]]

        LOGGER.info("Running anemoi profiling command with overrides: %s", sys.argv[1:])
        from anemoi.training.train.profiler import main as anemoi_profile

        anemoi_profile()


command = Profile
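The `Profile.run` method hands unrecognised CLI arguments to the wrapped Hydra-style entry point by splicing them into `sys.argv`. A minimal sketch of that hand-off pattern, with a stand-in `main` (hypothetical, for illustration only — the real entry point is `anemoi.training.train.profiler.main`):

```python
from __future__ import annotations

import sys


def main() -> list[str]:
    # Stand-in for a Hydra-style entry point, which reads its
    # configuration overrides from sys.argv.
    return sys.argv[1:]


def run(unknown_args: list[str] | None = None) -> list[str]:
    # Splice the pass-through arguments into sys.argv before delegating,
    # so the wrapped entry point sees them as its own command line.
    if unknown_args is not None:
        sys.argv = [sys.argv[0], *unknown_args]
    else:
        sys.argv = [sys.argv[0]]
    return main()


print(run(["diagnostics.benchmark_profiler.memory.enabled=False"]))
```

This is why the command can accept arbitrary `key=value` overrides without declaring them in its own argparse parser: `add_arguments` returns the parser untouched and `accept_unknown_args = True` lets everything flow through.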
22 changes: 22 additions & 0 deletions src/anemoi/training/config/diagnostics/eval_rollout.yaml
@@ -57,6 +57,28 @@ debug:
  # remember to also activate the tensorboard logger (below)
  profiler: False

# Use anemoi-profile to profile the training process
benchmark_profiler:
  memory:
    enabled: True
    steps: 5 # wait warmup steps and then do steps (too many steps would lead to a big file)
    warmup: 2
    extra_plots: False
    trace_rank0_only: False # set to True to profile rank 0 only; reads SLURM_PROC_ID, so it won't work when not running via Slurm
  time:
    enabled: True
    verbose: False # if True, output every action the profiler captures; otherwise output the subset defined in PROFILER_ACTIONS at the top of aifs/diagnostics/profiler.py
  speed:
    enabled: True
  system:
    enabled: True
  model_summary:
    enabled: True
  snapshot:
    enabled: True
    steps: 4 # wait warmup steps and then do steps
    warmup: 0

checkpoint:
  every_n_minutes:
    save_frequency: 30 # approximate, as this is checked at the end of training steps
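Each profiler section above pairs a `warmup` count (discarded steps) with a `steps` count (recorded steps). The mapping from that pair to a per-step decision can be sketched as below, mirroring `torch.profiler.schedule` semantics; the names here are illustrative, not the actual anemoi-training scheduler:

```python
from enum import Enum


class Action(Enum):
    WARMUP = "warmup"
    RECORD = "record"
    NONE = "none"


def schedule(step: int, *, warmup: int, active: int) -> Action:
    # The first `warmup` steps are discarded, the next `active` steps are
    # recorded, and later steps are skipped (keeping trace files small).
    if step < warmup:
        return Action.WARMUP
    if step < warmup + active:
        return Action.RECORD
    return Action.NONE


# With the memory section above (warmup: 2, steps: 5): steps 0-1 warm up,
# steps 2-6 are recorded, and nothing is recorded from step 7 onwards.
print([schedule(s, warmup=2, active=5).value for s in range(8)])
```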
2 changes: 2 additions & 0 deletions src/anemoi/training/config/training/default.yaml
@@ -19,6 +19,8 @@ multistep_input: 2
# the effective batch size becomes num-devices * batch_size * k
accum_grad_batches: 1

num_sanity_val_steps: 6

# clip gradients; 0: don't clip. Default algorithm: norm, alternative: value
gradient_clip:
  val: 32.
66 changes: 66 additions & 0 deletions src/anemoi/training/diagnostics/callbacks/__init__.py
@@ -37,6 +37,7 @@
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.utilities.types import STEP_OUTPUT

from anemoi.training.diagnostics.plots import init_plot_settings
from anemoi.training.diagnostics.plots import plot_graph_features
@@ -870,6 +871,71 @@ def on_load_checkpoint(
pl_module.hparams["metadata"]["parent_uuid"] = checkpoint["hyper_parameters"]["metadata"]["uuid"]


class MemorySnapshotRecorder(Callback):
    """Record a memory snapshot using torch.cuda.memory._record_memory_history()."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dirpath = Path(self.config.hardware.paths.profiler)

        self.warmup = self.config.diagnostics.benchmark_profiler.snapshot.warmup
        if not self.warmup:
            self.warmup = 0
        # Be consistent with the profiler scheduler: record for `steps` steps after `warmup` steps.
        self.num_steps = self.config.diagnostics.benchmark_profiler.snapshot.steps + self.warmup
        self.status = False

        assert (
            self.num_steps % self.config.dataloader.batch_size.training == 0
        ), "Snapshot steps is not a multiple of batch size"
        assert (
            self.warmup % self.config.dataloader.batch_size.training == 0
        ), "Snapshot warmup steps is not a multiple of batch size"

    @rank_zero_only
    def _start_snapshot_recording(self):
        LOGGER.info("Starting snapshot record_memory_history")
        torch.cuda.memory._record_memory_history()
        self.status = True

    @rank_zero_only
    def _save_snapshot(self):
        self.memory_snapshot_fname = self.dirpath / "memory_snapshot.pickle"
        try:
            LOGGER.info("Saving memory snapshot to %s", self.memory_snapshot_fname)
            torch.cuda.memory._dump_snapshot(f"{self.memory_snapshot_fname}")
        except Exception:
            LOGGER.exception("Failed to capture memory snapshot")

    @rank_zero_only
    def stop_record_memory_history(self) -> None:
        LOGGER.info("Stopping snapshot record_memory_history")
        torch.cuda.memory._record_memory_history(enabled=None)

    def on_train_batch_start(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int
    ) -> None:
        if trainer.global_step == self.warmup:
            self._start_snapshot_recording()

    def on_train_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: STEP_OUTPUT,
        batch: Any,
        batch_idx: int,
    ) -> None:
        if trainer.global_step == self.num_steps:
            if self.status is True:
                self._save_snapshot()
                self.stop_record_memory_history()
            else:
                LOGGER.info("Snapshot recording was not started so no snapshot was saved")

class AnemoiCheckpoint(ModelCheckpoint):
    """A checkpoint callback that saves the model after every validation epoch."""
