Commit 454f031
ILP Conv Layer: Fixed the overlapping all_gather optimisation; renamed Tensor_Parallel_Conv2d to Conv2d
prajwal1210 committed Jan 17, 2024
1 parent b653760 commit 454f031
Showing 3 changed files with 6 additions and 6 deletions.
4 changes: 2 additions & 2 deletions axonn/intra_layer/__init__.py
@@ -1,6 +1,6 @@
 from contextlib import contextmanager
 from .fully_connected import Linear  # noqa: F401
-from .conv import Conv2d as Tensor_Parallel_Conv2d  # noqa: F401
+from .conv import Conv2d  # noqa: F401

 from .communication import Drop, Gather
 from .gradient_normalization import clip_grad_norm_  # noqa: F401
@@ -86,7 +86,7 @@ def clear_weights_cache():
 def trigger_async_all_gathers(model):
     global weights_cache
     for module in model.modules():
-        if isinstance(module, Linear):
+        if isinstance(module, Linear) or isinstance(module, Conv2d):
             weight = module.weight
             if weight not in weights_cache:
                 # only trigger all gathers if not in cache
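The hunk above is the functional fix: before this commit, trigger_async_all_gathers matched only Linear, so Conv2d weights never had their all-gathers pre-triggered and cached. For orientation, here is a minimal sketch of the cache-and-overlap pattern using raw torch.distributed; apart from the Linear/Conv2d check and the weights_cache dict mirrored from the hunk, the buffer handling and helper names are assumptions, not AxoNN's actual internals.

import torch
import torch.distributed as dist
from torch.nn import Conv2d, Linear

weights_cache = {}  # weight tensor -> (gathered buffer, async work handle)

def trigger_async_all_gathers(model, process_group=None):
    for module in model.modules():
        # The commit's fix: match Conv2d as well as Linear, so conv
        # weights also get a pre-triggered, cached all-gather.
        if isinstance(module, (Linear, Conv2d)):
            weight = module.weight
            if weight not in weights_cache:
                # only trigger all gathers if not in cache
                world_size = dist.get_world_size(process_group)
                buf = torch.empty(
                    world_size * weight.numel(),
                    dtype=weight.dtype,
                    device=weight.device,
                )
                # async_op=True returns a work handle immediately; the
                # gather proceeds in the background and overlaps with
                # compute until handle.wait() is called.
                handle = dist.all_gather_into_tensor(
                    buf, weight.contiguous().view(-1),
                    group=process_group, async_op=True,
                )
                weights_cache[weight] = (buf, handle)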
2 changes: 1 addition & 1 deletion axonn/intra_layer/conv.py
@@ -167,7 +167,7 @@ def forward(
         if self.bias is None:
             return h
         else:
-            bias = self.bias  # Why do we need this extra copy?
+            bias = self.bias
             if gather_output:
                 bias = Gather.apply(bias, self.outer_group)
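For context, Gather here is a torch.autograd.Function from axonn.intra_layer.communication, used above to reassemble bias shards when gather_output is set. A sketch of what such a gather-with-backward typically looks like follows; it is illustrative only and not AxoNN's implementation (the dim-0 sharding and the slice-based backward are assumptions).

import torch
import torch.distributed as dist

class Gather(torch.autograd.Function):
    """All-gather shards along dim 0; backward keeps the local shard."""

    @staticmethod
    def forward(ctx, x, group=None):
        ctx.group = group
        world_size = dist.get_world_size(group)
        shards = [torch.empty_like(x) for _ in range(world_size)]
        dist.all_gather(shards, x.contiguous(), group=group)
        return torch.cat(shards, dim=0)

    @staticmethod
    def backward(ctx, grad_output):
        world_size = dist.get_world_size(ctx.group)
        rank = dist.get_rank(ctx.group)
        # The gradient of an all-gather w.r.t. the local shard is the
        # matching slice of the output gradient.
        return grad_output.chunk(world_size, dim=0)[rank], None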
6 changes: 3 additions & 3 deletions axonn/tests/test_intra_layer_conv.py
@@ -3,7 +3,7 @@
 from axonn import axonn as ax
 from axonn.intra_layer.communication import _drop, _gather
 from axonn.intra_layer import (
-    Tensor_Parallel_Conv2d,
+    Conv2d,
     optimize_communication,
     clear_weights_cache,
     sync_gradients,
@@ -76,7 +76,7 @@ def test_fw_pass(G_intra_r, G_intra_c, G_intra_d, B, H, W, C, easy_tp, bias):
     else:
         X_local = X

-    layer = Tensor_Parallel_Conv2d(
+    layer = Conv2d(
         in_channels=C, out_channels=2 * C, kernel_size=5, bias=bias
     ).cuda()

@@ -155,7 +155,7 @@ def test_bw_pass(
     depth_group = ax.comm_handle.depth_intra_layer_parallel_group

     # parallel backward pass
-    layer = Tensor_Parallel_Conv2d(
+    layer = Conv2d(
         in_channels=C, out_channels=2 * C, kernel_size=5, bias=bias
     ).cuda()

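With the rename in place, tests and user code import Conv2d straight from axonn.intra_layer, as the updated imports above show. A hedged end-to-end sketch mirroring the layer construction in the tests: the ax.init arguments and the optimize_communication keyword arguments are assumptions for illustration, not AxoNN's documented API.

import torch
from axonn import axonn as ax
from axonn.intra_layer import Conv2d, optimize_communication, clear_weights_cache

# Assumed initialisation: a 2x2 intra-layer tensor-parallel config.
ax.init(G_inter=1, G_data=1, G_intra_r=2, G_intra_c=2)

layer = Conv2d(in_channels=64, out_channels=128, kernel_size=5, bias=True).cuda()
X_local = torch.randn(4, 64, 32, 32, device="cuda")

# With this commit, the overlap/caching path also covers Conv2d weights.
with optimize_communication(overlap_all_gather=True, model=layer):
    Y_local = layer(X_local)

clear_weights_cache()  # release the cached gathered weights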
