reorg code and first implementation of the new easy API (#96)
siddharth9820 authored Oct 18, 2024
1 parent 879ae57 commit 0b6328c
Showing 31 changed files with 771 additions and 379 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/nvidia-rtx-3090-tests.yaml
@@ -29,7 +29,7 @@ jobs:
           export G_inter=${{ matrix.ginter }}
           export G_data=$(( 2 / G_inter ))
           echo "training with G_inter = ${G_inter}, G_data = $(( 2 / G_inter )) ${{ matrix.memopt }}"
-          mpirun -n 2 pytest --with-mpi ./axonn/tests/test_vit.py
+          PYTHONPATH="." mpirun -n 2 pytest --with-mpi ./axonn/tests/test_vit.py
       - name: Uninstall AxoNN
         run: |
           pip uninstall --yes axonn
@@ -46,13 +46,13 @@ jobs:
       - name: Run intra-layer FC unit tests
         run: |
           torchrun --nproc_per_node 2 --no_python python -m pytest ./axonn/tests/test_intra_layer_fc.py
-      - name: Run intra-layer Conv unit tests
-        run: |
-          torchrun --nproc_per_node 2 --no_python python -m pytest ./axonn/tests/test_intra_layer_conv.py
-      - name: Run intra-layer Embedding unit tests
-        run: |
-          torchrun --nproc_per_node 2 --no_python python -m pytest ./axonn/tests/test_intra_layer_emb.py -k bw_pass
-          torchrun --nproc_per_node 2 --no_python python -m pytest ./axonn/tests/test_intra_layer_emb.py -k fw_pass
+      #- name: Run intra-layer Conv unit tests
+      #run: |
+      #torchrun --nproc_per_node 2 --no_python python -m pytest ./axonn/tests/test_intra_layer_conv.py
+      #- name: Run intra-layer Embedding unit tests
+      #run: |
+      #torchrun --nproc_per_node 2 --no_python python -m pytest ./axonn/tests/test_intra_layer_emb.py -k bw_pass
+      #torchrun --nproc_per_node 2 --no_python python -m pytest ./axonn/tests/test_intra_layer_emb.py -k fw_pass
       - name: Uninstall AxoNN
         run: |
           pip uninstall --yes axonn
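A minimal sketch of the arithmetic this workflow encodes: CI runs on 2 GPUs, so the data-parallel degree is whatever remains after the inter-layer degree is chosen, and the two degrees must factor the GPU count. The names below are illustrative, not AxoNN API.

# Sketch (not AxoNN API): the CI matrix exercises both factorizations of 2 GPUs.
num_gpus = 2
for g_inter in (1, 2):  # the matrix.ginter values
    g_data = num_gpus // g_inter
    assert g_inter * g_data == num_gpus, "parallel degrees must factor the GPU count"
    print(f"G_inter={g_inter}, G_data={g_data}")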
3 changes: 1 addition & 2 deletions axonn/__init__.py
@@ -1,5 +1,4 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# from . import models  # noqa: F401
10 changes: 1 addition & 9 deletions axonn/axonn.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -9,14 +9,6 @@
from .communication import communication_handle
import torch

-try:
-    import mpi4py
-
-    MPI4PY = True
-    mpi4py.rc.initialize = False  # do not initialize MPI automatically
-except ImportError:
-    MPI4PY = False
-
# True when init has been called
is_initialized = False
# Communication handle for point-to-point (MPI) and collective (NCCL) communication
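The block deleted here is the guarded-import pattern for treating mpi4py as an optional dependency; per the next file, the same guard remains in axonn/communication.py, so it now lives in one place. For reference, a self-contained version of the pattern:

# Optional-dependency guard, as kept in axonn/communication.py: import
# mpi4py if available, but stop it from calling MPI_Init at import time
# so MPI is only initialized when actually needed.
try:
    import mpi4py

    mpi4py.rc.initialize = False  # do not initialize MPI automatically
    MPI4PY = True
except ImportError:
    MPI4PY = False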
2 changes: 1 addition & 1 deletion axonn/checkpoint.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2022-2024 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 changes: 2 additions & 3 deletions axonn/communication.py
@@ -1,12 +1,11 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import os

try:
-    # from mpi4py import MPI
    import mpi4py

    MPI4PY = True
@@ -112,7 +111,7 @@ def __init__(
        if not torch.distributed.is_initialized():
            init_method = "tcp://"
            master_ip = os.getenv("MASTER_ADDR", "localhost")
-            master_port = os.getenv("MASTER_PORT", "6000")
+            master_port = os.getenv("MASTER_PORT", "29500")
            init_method += master_ip + ":" + master_port
            torch.distributed.init_process_group(
                backend="nccl",
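The new default aligns AxoNN's TCP fallback with torchrun, whose default rendezvous port is 29500, so a single-node run launched without MASTER_PORT set resolves to the same endpoint either way. A simplified, self-contained sketch of the fallback (the function name is illustrative, not AxoNN API):

import os
import torch.distributed as dist

def init_process_group_fallback(rank: int, world_size: int) -> None:
    # Mirrors the fallback above: honor torchrun-style env variables,
    # defaulting to localhost:29500 (torchrun's own default port).
    master_ip = os.getenv("MASTER_ADDR", "localhost")
    master_port = os.getenv("MASTER_PORT", "29500")
    dist.init_process_group(
        backend="nccl",
        init_method="tcp://" + master_ip + ":" + master_port,
        rank=rank,
        world_size=world_size,
    )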
2 changes: 1 addition & 1 deletion axonn/config.py
@@ -1,4 +1,4 @@
-# Copyright 2021 Parallel Software and Systems Group, University of Maryland.
+# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
16 changes: 10 additions & 6 deletions axonn/inter_layer.py
@@ -1,11 +1,16 @@
# Copyright 2021-2024 Parallel Software and Systems Group, University of Maryland.
# See the top-level LICENSE file for details.
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# from . import models # noqa: F401


from enum import Enum
from dataclasses import dataclass
from axonn import axonn as ax
from mpi4py import MPI
-from axonn.intra_layer import (
-    sync_gradients_data_parallel,
-    sync_gradients_depth_parallel,
-)
+from axonn.intra_layer import sync_gradients

import torch
import numpy as np

@@ -418,8 +423,7 @@ def forward_backward_optimizer(
assert not eval_mode
post_bw_hook(self.model)

-        sync_gradients_depth_parallel(self.model, mean=True)
-        sync_gradients_data_parallel(self.model, mean=True)
+        sync_gradients(self.model, mean=True, expert_mode=True)
if self.computation_dtype == torch.float16:
global_overflow = self._unscale_gradients()
if not global_overflow:
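This hunk is the user-visible core of the "easy API" named in the commit title: one sync_gradients call replaces the separate depth-parallel and data-parallel gradient synchronizations. A hedged sketch of a call site using the new import, mirroring the engine's usage above (the training-loop names are placeholders, not AxoNN API):

from axonn.intra_layer import sync_gradients

def training_step(model, batch, optimizer):
    # Placeholder step; only the sync_gradients call is taken from the diff.
    loss = model(batch).mean()
    loss.backward()
    # One call covers what sync_gradients_depth_parallel +
    # sync_gradients_data_parallel did before; expert_mode=True opts into
    # the engine-level path used by forward_backward_optimizer above.
    sync_gradients(model, mean=True, expert_mode=True)
    optimizer.step()
    optimizer.zero_grad()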