Merge branch 'develop' into fix-model-kwargs-in-parallelize

axonn-ai · Oct 20, 2024 · 52cde06
2 parents ea9600c + 464e4a8
commit 52cde06
Show file tree

Hide file tree

Showing 39 changed files with 822 additions and 414 deletions.
diff --git a/.github/workflows/nvidia-rtx-3090-tests.yaml b/.github/workflows/nvidia-rtx-3090-tests.yaml
@@ -29,7 +29,7 @@ jobs:
         export G_inter=${{ matrix.ginter }}
         export G_data=$(( 2 / G_inter ))
         echo "training with G_inter = ${G_inter}, G_data = $(( 2 / G_inter  )) ${{ matrix.memopt }}" 
-        mpirun -n 2 pytest --with-mpi ./axonn/tests/test_vit.py 
+        PYTHONPATH="." mpirun -n 2 pytest --with-mpi ./axonn/tests/test_vit.py 
     - name: Uninstall AxoNN
       run: |
         pip uninstall --yes axonn
@@ -46,13 +46,13 @@ jobs:
     - name: Run intra-layer FC unit tests 
       run: |
         torchrun --nproc_per_node 2 --no_python python -m pytest  ./axonn/tests/test_intra_layer_fc.py 
-    - name: Run intra-layer Conv unit tests 
-      run: |
-        torchrun --nproc_per_node 2 --no_python python -m pytest  ./axonn/tests/test_intra_layer_conv.py 
-    - name: Run intra-layer Embedding unit tests
-      run: |
-        torchrun --nproc_per_node 2 --no_python python -m pytest ./axonn/tests/test_intra_layer_emb.py -k bw_pass
-        torchrun --nproc_per_node 2 --no_python python -m pytest  ./axonn/tests/test_intra_layer_emb.py -k fw_pass
+    # - name: Run intra-layer Conv unit tests
+    #   run: |
+    #     torchrun --nproc_per_node 2 --no_python python -m pytest  ./axonn/tests/test_intra_layer_conv.py
+    # - name: Run intra-layer Embedding unit tests
+    #   run: |
+    #     torchrun --nproc_per_node 2 --no_python python -m pytest ./axonn/tests/test_intra_layer_emb.py -k bw_pass
+    #     torchrun --nproc_per_node 2 --no_python python -m pytest  ./axonn/tests/test_intra_layer_emb.py -k fw_pass
     - name: Uninstall AxoNN
       run: |
         pip uninstall --yes axonn
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -1,11 +1,15 @@
 version: 2
 
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.8"  # Python version defined for the build environment
+
 # Build from the docs/ directory with Sphinx
 sphinx:
   configuration: docs/conf.py
 
 # Python requirements for the docs build (the interpreter version is set under build.tools)
 python:
-  version: 3.8
   install:
     - requirements: docs/requirements.txt
diff --git a/README.md b/README.md
@@ -1,6 +1,7 @@
-# <img src="https://github.com/axonn-ai/axonn/blob/c356b821c2020c7dcd2181dfacc226619bfd5240/logo.png" width="64" valign="middle" alt="hatchet"/> AxoNN
+# <img src="https://github.com/axonn-ai/axonn/blob/c356b821c2020c7dcd2181dfacc226619bfd5240/logo.png" width="64" valign="middle" alt="axonn"/> AxoNN
+
+[![nvidia-rtx-3090-tests](https://github.com/axonn-ai/axonn/actions/workflows/ci.yaml/badge.svg)](https://github.com/axonn-ai/axonn/actions/workflows/ci.yaml)
 
-[![rtx-3090 tests](https://github.com/hpcgroup/axonn/actions/workflows/nvidia-tests.yaml/badge.svg)](https://github.com/hpcgroup/axonn/actions/workflows/nvidia-tests.yaml)
 [![docs](https://readthedocs.org/projects/axonn/badge/?version=latest)](https://axonn.readthedocs.io/en/latest/?badge=latest)
 [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Join slack](https://img.shields.io/badge/slack-axonn--users-blue)](https://join.slack.com/t/axonn-users/shared_invite/zt-2itbahk29-_Ig1JasFxnuVyfMtcC4GnA)

diff --git a/axonn/__init__.py b/axonn/__init__.py
@@ -1,5 +1,4 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
 # See the top-level LICENSE file for details.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# from . import models  # noqa: F401
diff --git a/axonn/axonn.py b/axonn/axonn.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
 # See the top-level LICENSE file for details.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -9,14 +9,6 @@
 from .communication import communication_handle
 import torch
 
-try:
-    import mpi4py
-
-    MPI4PY = True
-    mpi4py.rc.initialize = False  # do not initialize MPI automatically
-except ImportError:
-    MPI4PY = False
-
 # True when init has been called
 is_initialized = False
 # Communication handle for point-to-point (MPI) and collective (NCCL) communication

diff --git a/axonn/checkpoint.py b/axonn/checkpoint.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2022-2024 Parallel Software and Systems Group, University of Maryland.
 # See the top-level LICENSE file for details.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

diff --git a/axonn/communication.py b/axonn/communication.py
@@ -1,12 +1,11 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
 # See the top-level LICENSE file for details.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import os
 
 try:
-    # from mpi4py import MPI
     import mpi4py
 
     MPI4PY = True
@@ -112,7 +111,7 @@ def __init__(
         if not torch.distributed.is_initialized():
             init_method = "tcp://"
             master_ip = os.getenv("MASTER_ADDR", "localhost")
-            master_port = os.getenv("MASTER_PORT", "6000")
+            master_port = os.getenv("MASTER_PORT", "29500")
             init_method += master_ip + ":" + master_port
             torch.distributed.init_process_group(
                 backend="nccl",

diff --git a/axonn/config.py b/axonn/config.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
 # See the top-level LICENSE file for details.
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

diff --git a/axonn/inter_layer.py b/axonn/inter_layer.py
@@ -1,11 +1,16 @@
+# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# from . import models  # noqa: F401
+
+
 from enum import Enum
 from dataclasses import dataclass
 from axonn import axonn as ax
 from mpi4py import MPI
-from axonn.intra_layer import (
-    sync_gradients_data_parallel,
-    sync_gradients_depth_parallel,
-)
+from axonn.intra_layer import sync_gradients
+
 import torch
 import numpy as np
 
@@ -418,8 +423,7 @@ def forward_backward_optimizer(
             assert not eval_mode
             post_bw_hook(self.model)
 
-        sync_gradients_depth_parallel(self.model, mean=True)
-        sync_gradients_data_parallel(self.model, mean=True)
+        sync_gradients(self.model, mean=True, expert_mode=True)
         if self.computation_dtype == torch.float16:
             global_overflow = self._unscale_gradients()
             if not global_overflow: