Distributed ml on k8s - first tutorial (#280)
* First pod poc

* Update with torchrun

* Update README

* First working version of CPU-only distrib ML

* Cleanup Dockerfile

* fix linter

* Small fixes

* Add Kubeflow tutorial to list

* Polished tutorial

* Fix note

* cleanup spaces

* improve text

* improve text

* Update distributed.py

* Update README.md

* Update README.md

* Update README.md

* Update train-cpu.py

* Update distributed.py

* fix linter

* clarify salloc
matbun authored Jan 9, 2025
1 parent 7ab952e commit 3fc10ef
Showing 14 changed files with 2,677 additions and 1,211 deletions.
1 change: 1 addition & 0 deletions .github/workflows/lint.yml
@@ -54,6 +54,7 @@ jobs:
VALIDATE_PYTHON_PYINK: false
VALIDATE_JSON_PRETTIER: false
VALIDATE_CSS_PRETTIER: false
VALIDATE_KUBERNETES_KUBECONFORM: false

# Only check new or edited files
VALIDATE_ALL_CODEBASE: false
13 changes: 12 additions & 1 deletion docs/getting-started/slurm.rst
@@ -65,7 +65,18 @@ Once resources are available, the command will return a ``JOBID``. Use it to jum
# Check that you are in the compute node and you have 4 GPUs
nvidia-smi
Eventually, remember to load the correct environment modules before activating the python virtual environment.
Remember to load the correct environment modules before activating the python virtual environment.

Alternatively, if you don’t need to open an interactive shell on the compute node allocated
with the ``salloc`` command,
you can directly run a command on the allocated node(s) by prefixing your command with ``srun``.
This approach ensures that your command is executed on the compute node rather than on the login node.

Example:

.. code-block:: bash

   srun YOUR_COMMAND

Environment variables
---------------------
25 changes: 25 additions & 0 deletions docs/tutorials/distrib-ml/torch_tutorial_kubeflow_1.rst
@@ -0,0 +1,25 @@
Tutorial on Kubeflow and TorchTrainer class
===========================================

.. include:: ../../../tutorials/distributed-ml/torch-kubeflow-1/README.md
   :parser: myst_parser.sphinx_


train-cpu.py
++++++++++++

.. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/train-cpu.py
   :language: python


cpu.yaml
++++++++

.. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/cpu.yaml
   :language: yaml

Dockerfile
++++++++++

.. literalinclude:: ../../../tutorials/distributed-ml/torch-kubeflow-1/Dockerfile
   :language: dockerfile
1 change: 1 addition & 0 deletions docs/tutorials/tutorials.rst
@@ -21,6 +21,7 @@ Distributed ML with PyTorch
   distrib-ml/torch-tutorial-GAN
   distrib-ml/torch_scaling_test
   distrib-ml/torch-tutorial-containers
   distrib-ml/torch_tutorial_kubeflow_1.rst


Distributed ML with TensorFlow
120 changes: 53 additions & 67 deletions pyproject.toml
@@ -15,76 +15,69 @@ requires-python = ">=3.10"
license = { file = "LICENSE" }
keywords = ["ml", "ai", "hpc"]
maintainers = [
{ name = "Matteo Bunino", email = "matteo.bunino@cern.ch" },
{ name = "Jarl Sondre Sæther", email = "jarl.sondre.saether@cern.ch" },
{ name = "Anna Lappe", email = "anna.elisa.lappe@cern.ch" },
{ name = "Matteo Bunino", email = "matteo.bunino@cern.ch" },
{ name = "Jarl Sondre Sæther", email = "jarl.sondre.saether@cern.ch" },
{ name = "Anna Lappe", email = "anna.elisa.lappe@cern.ch" },
]
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Development Status :: 4 - Beta",
"Programming Language :: Python",
]

dependencies = [
"rich>=13.5.3",
"typer>=0.9.0",
"numpy<2.0.0",
"wandb>=0.18.7",
"mlflow>=2.17.2",
"wheel>=0.45.0",
"seaborn>=0.13.2",
"py-cpuinfo>=9.0.0",
"packaging",
"pydantic",
"pyyaml>=6.0.2",
"omegaconf>=2.3.0",
"jsonargparse[signatures]>=4.34.0",
"matplotlib>=3.9.2",
"pip>=24.3.1",
"prov4ml@git+https://github.com/matbun/ProvML@new-main",
"ray[default,train,tune]",
"prov",
"rich>=13.5.3",
"typer>=0.9.0",
"numpy<2.0.0",
"wandb>=0.18.7",
"mlflow>=2.17.2",
"wheel>=0.45.0",
"seaborn>=0.13.2",
"py-cpuinfo>=9.0.0",
"packaging>=24.2",
"pydantic>=2.10.2",
"pyyaml>=6.0.2",
"omegaconf>=2.3.0",
"jsonargparse[signatures]>=4.34.0",
"matplotlib>=3.9.2",
"pip>=24.3.1",
"prov4ml@git+https://github.com/matbun/ProvML@new-main",
"ray[default,train,tune]>=2.39.0",
"prov>=2.0.1",
]

[project.optional-dependencies]
torch = [
"torch==2.4.*",
"lightning==2.*",
"torchmetrics",
"torchvision",
"torchaudio",
]
tf = [
"tensorflow==2.16.*",
"tf_keras==2.16.*"
]
tf-cuda = [
"tensorflow[and-cuda]==2.16.*",
"tf_keras==2.16.*"
"torch==2.4.*",
"lightning>=2",
"torchmetrics>=1.6.0",
"torchvision>=0.16.2",
"torchaudio>=2.4.0",
]
tf = ["tensorflow==2.16.*", "tf_keras==2.16.*"]
tf-cuda = ["tensorflow[and-cuda]==2.16.*", "tf_keras==2.16.*"]

dev = [
"pytest>=7.4.2",
"pytest-mock>=3.11.1",
"pytest-cov>=4.1.0",
"ipykernel>=6.29.5",
"ipython",
"isort>=5.13.2",
"ipython>=8.30.0",
"ruff>=0.8.3",
]
docs = [
"sphinx-rtd-theme==2.0.0",
"nbsphinx==0.9.4",
"myst-parser==2.0.0",
"IPython",
"tensorflow==2.16.*",
"sphinx-tabs",
"sphinx-rtd-theme>=2.0.0",
"nbsphinx>=0.9.4",
"myst-parser>=2.0.0",
"IPython>=8.30.0",
"tensorflow==2.16.*",
"sphinx-tabs>=3.4.7",
]
hpo = [
"bayesian-optimization==2.0.*",
"hyperopt==0.2.*",
"ConfigSpace==1.2.*",
"hpbandster==0.7.*",
"scipy==1.14.*",
"GPy==1.13.*"
"bayesian-optimization>=2.0.0",
"hyperopt>=0.2.0",
"ConfigSpace>=1.2.0",
"hpbandster>=0.7.0",
"gpy>=1.13.2",
]

macos = ["prov4ml[apple]@git+https://github.com/matbun/ProvML@new-main"]
@@ -108,32 +101,25 @@ itwinai = "itwinai.cli:app"

[tool.pytest.ini_options]
markers = [
"integration: integration tests (deselect with '-m \"not integration\"')",
"hpc: needs SLURM and HPC resources (multiple GPUs/nodes). (deselect with '-m \"not hpc\"')",
"torch_dist: runs with torch DDP distributed strategy. (deselect with '-m \"not torch_dist\"')",
"deepspeed_dist: runs with torch DeepSpeed distributed strategy. (deselect with '-m \"not deepspeed_dist\"')",
"horovod_dist: runs with torch Horovod distributed strategy. (deselect with '-m \"not horovod_dist\"')",
"functional: functional tests. (deselect with '-m \"not functional\"')",
"memory_heavy: high memory footprint. (deselect with '-m \"not heavy_memory\"')",
"integration: integration tests (deselect with '-m \"not integration\"')",
"hpc: needs SLURM and HPC resources (multiple GPUs/nodes). (deselect with '-m \"not hpc\"')",
"torch_dist: runs with torch DDP distributed strategy. (deselect with '-m \"not torch_dist\"')",
"deepspeed_dist: runs with torch DeepSpeed distributed strategy. (deselect with '-m \"not deepspeed_dist\"')",
"horovod_dist: runs with torch Horovod distributed strategy. (deselect with '-m \"not horovod_dist\"')",
"functional: functional tests. (deselect with '-m \"not functional\"')",
"memory_heavy: high memory footprint. (deselect with '-m \"not heavy_memory\"')",
]


[tool.uv]
# This is how you can force uv to accept conflicting extras
conflicts = [
[
{ extra = "tf-cuda" },
{ extra = "torch" },
],
]
conflicts = [[{ extra = "tf-cuda" }, { extra = "torch" }]]

# Use PyTorch with CUDA for anything that is not macos
[tool.uv.sources]
torch = [
{ index = "pytorch-cu121", marker = "platform_system != 'Darwin'"},
]
torch = [{ index = "pytorch-cu121", marker = "platform_system != 'Darwin'" }]
torchvision = [
{ index = "pytorch-cu121", marker = "platform_system != 'Darwin'"},
{ index = "pytorch-cu121", marker = "platform_system != 'Darwin'" },
]

# Specific index for pytorch
33 changes: 33 additions & 0 deletions src/itwinai/distributed.py
@@ -11,6 +11,7 @@
import builtins as __builtin__
import functools
import os
import sys
from typing import Any, Callable

from pydantic import BaseModel
@@ -121,3 +122,35 @@ def wrapper(*args, **kwargs) -> Any:
        return result

    return wrapper


def suppress_workers_output(func):
    """Decorator to suppress ``stdout`` and ``stderr`` in workers having global rank
    different from 0.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Save the original stdout and stderr
        original_stdout = sys.stdout
        original_stderr = sys.stderr

        # Get global rank
        dist_grank = detect_distributed_environment().global_rank
        try:
            if dist_grank == 0:
                # If on main worker
                return func(*args, **kwargs)

            # If not on main worker, redirect stdout and stderr to devnull
            with open(os.devnull, "w") as devnull:
                sys.stdout = devnull
                sys.stderr = devnull
                # Execute the wrapped function
                return func(*args, **kwargs)
        finally:
            # Restore original stdout and stderr
            sys.stdout = original_stdout
            sys.stderr = original_stderr

    return wrapper
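
A minimal usage sketch of the new decorator follows. It assumes the decorator is imported from ``itwinai.distributed`` (the module edited above); ``train_epoch`` and its print statement are hypothetical and only illustrate the behavior, they are not part of the commit.

# Hedged usage sketch: `train_epoch` is a hypothetical function used only to
# illustrate the decorator added in this commit.
from itwinai.distributed import suppress_workers_output


@suppress_workers_output
def train_epoch(epoch: int) -> None:
    # On global rank 0 this reaches the console; on every other worker,
    # stdout and stderr are redirected to os.devnull for the duration of the call.
    print(f"Starting epoch {epoch}")


train_epoch(0)
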
5 changes: 4 additions & 1 deletion src/itwinai/torch/config.py
@@ -44,7 +44,6 @@ class TrainingConfiguration(Configuration):
    ``TrainingConfiguration(lr=0.005)`` in the configuration you will now have both
    ``optim_lr`` (created by default) and ``lr`` (created by you). This may create
    confusion and potentially (and silently) break the logic in your code.
    """

    #: Batch size. In a distributed environment it is usually the
@@ -89,3 +88,7 @@ class TrainingConfiguration(Configuration):
    #: gradients before adding them up.
    #: Defaults to 1.0.
    gradient_predivide_factor: float = 1.0
    #: Torch distributed
    #: `backend <https://pytorch.org/docs/stable/distributed.html#backends>`_.
    #: Defaults to ``nccl``.
    dist_backend: Literal["nccl", "gloo", "mpi"] = "nccl"
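
As a hedged illustration of the new field (assuming ``TrainingConfiguration`` is imported from ``itwinai.torch.config`` as edited above), a CPU-only run such as the Kubeflow tutorial would select the ``gloo`` backend, while keeping to the documented ``optim_`` prefix for optimizer hyperparameters:

# Hedged sketch; field values are illustrative. Use the documented field names
# (e.g. `optim_lr`) rather than ad-hoc ones like `lr`, which would silently
# coexist with `optim_lr`, as warned in the docstring above.
from itwinai.torch.config import TrainingConfiguration

config = TrainingConfiguration(optim_lr=0.005, dist_backend="gloo")  # "nccl" for NVIDIA GPUs
print(config.dist_backend)  # gloo
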