Distributed ml on k8s - first tutorial (#280)
* First pod poc

* Update with torchrun

* Update README

* First working version of CPU-only distrib ML

* Cleanup Dockerfile

* fix linter

* Small fixes

* Add Kubeflow tutorial to list

* Polished tutorial

* Fix note

* cleanup spaces

* improve text

* improve text

* Update distributed.py

* Update README.md

* Update README.md

* Update README.md

* Update train-cpu.py

* Update distributed.py

* fix linter

* clarify salloc
matbun authored Jan 9, 2025
1 parent 7ab952e commit 3fc10ef
Showing 14 changed files with 2,677 additions and 1,211 deletions.
1 change: 1 addition & 0 deletions .github/workflows/lint.yml
@@ -54,6 +54,7 @@ jobs:
VALIDATE_PYTHON_PYINK: false
VALIDATE_JSON_PRETTIER: false
VALIDATE_CSS_PRETTIER: false
VALIDATE_KUBERNETES_KUBECONFORM: false

# Only check new or edited files
VALIDATE_ALL_CODEBASE: false
13 changes: 12 additions & 1 deletion docs/getting-started/slurm.rst
@@ -65,7 +65,18 @@ Once resources are available, the command will return a ``JOBID``. Use it to jum
# Check that you are in the compute node and you have 4 GPUs
nvidia-smi
Eventually, remember to load the correct environment modules before activating the python virtual environment.
Remember to load the correct environment modules before activating the python virtual environment.

Alternatively, if you don’t need to open an interactive shell on the compute node allocated
with the ``salloc`` command,
you can directly run a command on the allocated node(s) by prefixing your command with ``srun``.
This approach ensures that your command is executed on the compute node rather than on the login node.

Example:

.. code-block:: bash

   srun YOUR_COMMAND

Environment variables
---------------------
25 changes: 25 additions & 0 deletions docs/tutorials/distrib-ml/torch_tutorial_kubeflow_1.rst
@@ -0,0 +1,25 @@
Tutorial on Kubeflow and TorchTrainer class
===========================================

.. include:: ../../../tutorials/distributed-ml/torch-kubeflow-1/README.md
   :parser: myst_parser.sphinx_


train-cpu.py
++++++++++++

.. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/train-cpu.py
   :language: python


cpu.yaml
++++++++

.. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/cpu.yaml
   :language: yaml

Dockerfile
++++++++++

.. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/Dockerfile
   :language: dockerfile
1 change: 1 addition & 0 deletions docs/tutorials/tutorials.rst
@@ -21,6 +21,7 @@ Distributed ML with PyTorch
   distrib-ml/torch-tutorial-GAN
   distrib-ml/torch_scaling_test
   distrib-ml/torch-tutorial-containers
   distrib-ml/torch_tutorial_kubeflow_1.rst


Distributed ML with TensorFlow
120 changes: 53 additions & 67 deletions pyproject.toml
@@ -15,76 +15,69 @@ requires-python = ">=3.10"
license = { file = "LICENSE" }
keywords = ["ml", "ai", "hpc"]
maintainers = [
{ name = "Matteo Bunino", email = "matteo.bunino@cern.ch" },
{ name = "Jarl Sondre Sæther", email = "jarl.sondre.saether@cern.ch" },
{ name = "Anna Lappe", email = "anna.elisa.lappe@cern.ch" },
{ name = "Matteo Bunino", email = "matteo.bunino@cern.ch" },
{ name = "Jarl Sondre Sæther", email = "jarl.sondre.saether@cern.ch" },
{ name = "Anna Lappe", email = "anna.elisa.lappe@cern.ch" },
]
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Development Status :: 4 - Beta",
"Programming Language :: Python",
]

dependencies = [
"rich>=13.5.3",
"typer>=0.9.0",
"numpy<2.0.0",
"wandb>=0.18.7",
"mlflow>=2.17.2",
"wheel>=0.45.0",
"seaborn>=0.13.2",
"py-cpuinfo>=9.0.0",
"packaging",
"pydantic",
"pyyaml>=6.0.2",
"omegaconf>=2.3.0",
"jsonargparse[signatures]>=4.34.0",
"matplotlib>=3.9.2",
"pip>=24.3.1",
"prov4ml@git+https://github.com/matbun/ProvML@new-main",
"ray[default,train,tune]",
"prov",
"rich>=13.5.3",
"typer>=0.9.0",
"numpy<2.0.0",
"wandb>=0.18.7",
"mlflow>=2.17.2",
"wheel>=0.45.0",
"seaborn>=0.13.2",
"py-cpuinfo>=9.0.0",
"packaging>=24.2",
"pydantic>=2.10.2",
"pyyaml>=6.0.2",
"omegaconf>=2.3.0",
"jsonargparse[signatures]>=4.34.0",
"matplotlib>=3.9.2",
"pip>=24.3.1",
"prov4ml@git+https://github.com/matbun/ProvML@new-main",
"ray[default,train,tune]>=2.39.0",
"prov>=2.0.1",
]

[project.optional-dependencies]
torch = [
"torch==2.4.*",
"lightning==2.*",
"torchmetrics",
"torchvision",
"torchaudio",
]
tf = [
"tensorflow==2.16.*",
"tf_keras==2.16.*"
]
tf-cuda = [
"tensorflow[and-cuda]==2.16.*",
"tf_keras==2.16.*"
"torch==2.4.*",
"lightning>=2",
"torchmetrics>=1.6.0",
"torchvision>=0.16.2",
"torchaudio>=2.4.0",
]
tf = ["tensorflow==2.16.*", "tf_keras==2.16.*"]
tf-cuda = ["tensorflow[and-cuda]==2.16.*", "tf_keras==2.16.*"]

dev = [
"pytest>=7.4.2",
"pytest-mock>=3.11.1",
"pytest-cov>=4.1.0",
"ipykernel>=6.29.5",
"ipython",
"isort>=5.13.2",
"ipython>=8.30.0",
"ruff>=0.8.3",
]
docs = [
"sphinx-rtd-theme==2.0.0",
"nbsphinx==0.9.4",
"myst-parser==2.0.0",
"IPython",
"tensorflow==2.16.*",
"sphinx-tabs",
"sphinx-rtd-theme>=2.0.0",
"nbsphinx>=0.9.4",
"myst-parser>=2.0.0",
"IPython>=8.30.0",
"tensorflow==2.16.*",
"sphinx-tabs>=3.4.7",
]
hpo = [
"bayesian-optimization==2.0.*",
"hyperopt==0.2.*",
"ConfigSpace==1.2.*",
"hpbandster==0.7.*",
"scipy==1.14.*",
"GPy==1.13.*"
"bayesian-optimization>=2.0.0",
"hyperopt>=0.2.0",
"ConfigSpace>=1.2.0",
"hpbandster>=0.7.0",
"gpy>=1.13.2",
]

macos = ["prov4ml[apple]@git+https://github.com/matbun/ProvML@new-main"]
@@ -108,32 +101,25 @@ itwinai = "itwinai.cli:app"

[tool.pytest.ini_options]
markers = [
"integration: integration tests (deselect with '-m \"not integration\"')",
"hpc: needs SLURM and HPC resources (multiple GPUs/nodes). (deselect with '-m \"not hpc\"')",
"torch_dist: runs with torch DDP distributed strategy. (deselect with '-m \"not torch_dist\"')",
"deepspeed_dist: runs with torch DeepSpeed distributed strategy. (deselect with '-m \"not deepspeed_dist\"')",
"horovod_dist: runs with torch Horovod distributed strategy. (deselect with '-m \"not horovod_dist\"')",
"functional: functional tests. (deselect with '-m \"not functional\"')",
"memory_heavy: high memory footprint. (deselect with '-m \"not heavy_memory\"')",
"integration: integration tests (deselect with '-m \"not integration\"')",
"hpc: needs SLURM and HPC resources (multiple GPUs/nodes). (deselect with '-m \"not hpc\"')",
"torch_dist: runs with torch DDP distributed strategy. (deselect with '-m \"not torch_dist\"')",
"deepspeed_dist: runs with torch DeepSpeed distributed strategy. (deselect with '-m \"not deepspeed_dist\"')",
"horovod_dist: runs with torch Horovod distributed strategy. (deselect with '-m \"not horovod_dist\"')",
"functional: functional tests. (deselect with '-m \"not functional\"')",
"memory_heavy: high memory footprint. (deselect with '-m \"not heavy_memory\"')",
]


[tool.uv]
# This is how you can force uv to accept conflicting extras
conflicts = [
[
{ extra = "tf-cuda" },
{ extra = "torch" },
],
]
conflicts = [[{ extra = "tf-cuda" }, { extra = "torch" }]]

# Use PyTorch with CUDA for anything that is not macos
[tool.uv.sources]
torch = [
{ index = "pytorch-cu121", marker = "platform_system != 'Darwin'"},
]
torch = [{ index = "pytorch-cu121", marker = "platform_system != 'Darwin'" }]
torchvision = [
{ index = "pytorch-cu121", marker = "platform_system != 'Darwin'"},
{ index = "pytorch-cu121", marker = "platform_system != 'Darwin'" },
]

# Specific index for pytorch
33 changes: 33 additions & 0 deletions src/itwinai/distributed.py
@@ -11,6 +11,7 @@
import builtins as __builtin__
import functools
import os
import sys
from typing import Any, Callable

from pydantic import BaseModel
@@ -121,3 +122,35 @@ def wrapper(*args, **kwargs) -> Any:
        return result

    return wrapper


def suppress_workers_output(func):
    """Decorator to suppress ``stdout`` and ``stderr`` in workers having global rank
    different from 0.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Save the original stdout and stderr
        original_stdout = sys.stdout
        original_stderr = sys.stderr

        # Get global rank
        dist_grank = detect_distributed_environment().global_rank
        try:
            if dist_grank == 0:
                # If on main worker
                return func(*args, **kwargs)

            # If not on main worker, redirect stdout and stderr to devnull
            with open(os.devnull, "w") as devnull:
                sys.stdout = devnull
                sys.stderr = devnull
                # Execute the wrapped function
                return func(*args, **kwargs)
        finally:
            # Restore original stdout and stderr
            sys.stdout = original_stdout
            sys.stderr = original_stderr

    return wrapper
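
A minimal usage sketch of the new decorator follows. It assumes the decorator is imported from ``itwinai.distributed`` (the module edited above); ``train_epoch`` and its print statement are hypothetical and only illustrate the behavior, they are not part of the commit.

# Hedged usage sketch: `train_epoch` is a hypothetical function used only to
# illustrate the decorator added in this commit.
from itwinai.distributed import suppress_workers_output


@suppress_workers_output
def train_epoch(epoch: int) -> None:
    # On global rank 0 this reaches the console; on every other worker,
    # stdout and stderr are redirected to os.devnull for the duration of the call.
    print(f"Starting epoch {epoch}")


train_epoch(0)
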
5 changes: 4 additions & 1 deletion src/itwinai/torch/config.py
@@ -44,7 +44,6 @@ class TrainingConfiguration(Configuration):
    ``TrainingConfiguration(lr=0.005)`` in the configuration you will now have both
    ``optim_lr`` (created by default) and ``lr`` (created by you). This may create
    confusion and potentially (and silently) break the logic in your code.
    """

    #: Batch size. In a distributed environment it is usually the
@@ -89,3 +88,7 @@ class TrainingConfiguration(Configuration):
    #: gradients before adding them up.
    #: Defaults to 1.0.
    gradient_predivide_factor: float = 1.0
    #: Torch distributed
    #: `backend <https://pytorch.org/docs/stable/distributed.html#backends>`_.
    #: Defaults to ``nccl``.
    dist_backend: Literal["nccl", "gloo", "mpi"] = "nccl"
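
As a hedged illustration of the new field (assuming ``TrainingConfiguration`` is imported from ``itwinai.torch.config`` as edited above), a CPU-only run such as the Kubeflow tutorial would select the ``gloo`` backend, while keeping to the documented ``optim_`` prefix for optimizer hyperparameters:

# Hedged sketch; field values are illustrative. Use the documented field names
# (e.g. `optim_lr`) rather than ad-hoc ones like `lr`, which would silently
# coexist with `optim_lr`, as warned in the docstring above.
from itwinai.torch.config import TrainingConfiguration

config = TrainingConfiguration(optim_lr=0.005, dist_backend="gloo")  # "nccl" for NVIDIA GPUs
print(config.dist_backend)  # gloo
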