
Commit 486a4d1

merge main

2 parents d264f75 + c15f74d

5 files changed: +39 -9 lines

src/zeroband/comms.py
Lines changed: 11 additions & 0 deletions

@@ -273,6 +273,11 @@ def _resolve_world(self):

     def maybe_reinit_global_pg(self):
         """Reinitialize the global_pg if there are joiners or dead nodes."""
+
+        if self.world_info.global_world_size == 1:
+            # no op if we only have one node
+            return
+
         time_start = time.perf_counter()
         self._logger.debug("Resolving world")

@@ -334,6 +339,12 @@ def maybe_reinit_global_pg(self):

         self.live_recovery.init_background_loop()

+    def get_global_pg(self, maybe_reinit: bool = False) -> dist.ProcessGroup:
+        """Get the global process group. If maybe_reinit is True, reinitialize the global process group if needed."""
+        if maybe_reinit:
+            self.maybe_reinit_global_pg()
+        return self.global_pg
+

 class LiveRecoveryModel(BaseModel):
     dest_rank: int
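
The new `get_global_pg` accessor lets callers re-resolve the world (admitting joiners, dropping dead nodes) right before they use the group, instead of holding a possibly stale reference. A minimal sketch of the intended call pattern; the `average_across_nodes` helper and the `mesh` variable are illustrative, not part of the commit:

```python
import torch
import torch.distributed as dist

def average_across_nodes(mesh, tensor: torch.Tensor) -> torch.Tensor:
    # Hypothetical caller: `mesh` is an initialized ElasticDeviceMesh.
    # maybe_reinit=True re-resolves the world before the collective,
    # so the group reflects the current set of live nodes.
    global_pg = mesh.get_global_pg(maybe_reinit=True)
    tensor = tensor / global_pg.size()  # gloo has no AVG; divide, then SUM
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=global_pg)
    return tensor
```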

src/zeroband/diloco.py
Lines changed: 6 additions & 4 deletions

@@ -2,6 +2,7 @@
 from pydantic_config import BaseConfig
 import torch
 from torch import nn
+from zeroband.comms import ElasticDeviceMesh
 from zeroband.utils.world_info import get_world_info
 from zeroband.utils.logging import get_logger
 from torch.distributed.fsdp import ShardingStrategy

@@ -44,11 +45,11 @@ def __init__(
         config: DilocoConfig,
         model: nn.Module,
         fsdp_sharding_strategy: ShardingStrategy,
-        global_pg: dist.ProcessGroup,
+        elastic_device_mesh: ElasticDeviceMesh,
     ):
         self.config = config
         self.fsdp_sharding_strategy = fsdp_sharding_strategy
-        self.global_pg = global_pg
+        self.elastic_device_mesh = elastic_device_mesh

         self._logger = get_logger()
         self.world_info = get_world_info()

@@ -70,14 +71,15 @@ def sync_pseudo_gradient(self, model: nn.Module):
         Sync the pseudo gradient from the local process group to the global process group
         """
         self._logger.debug("sync pseudo gradient")
+        global_pg = self.elastic_device_mesh.get_global_pg(maybe_reinit=True)
         for param_offloaded, param in zip(self.param_list_cpu, model.parameters()):
             if param.shape[0] == 0:
                 continue
             param_offloaded.grad = param_offloaded.data - param.data.to(param_offloaded.device)

             # gloo does not support AVG
-            param_offloaded.grad = param_offloaded.grad / self.global_pg.size()
-            dist.all_reduce(param_offloaded.grad, op=dist.ReduceOp.SUM, group=self.global_pg)
+            param_offloaded.grad = param_offloaded.grad / global_pg.size()
+            dist.all_reduce(param_offloaded.grad, op=dist.ReduceOp.SUM, group=global_pg)
             # todo async here

     def sync_inner_model(self, model: nn.Module):
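
Passing the whole mesh instead of a frozen `dist.ProcessGroup` means each outer synchronization can pick up nodes that joined or died during the inner steps. A sketch of the resulting training-loop shape; `num_outer_steps` and `inner_train_step` are illustrative names, not code from this commit:

```python
# Sketch only: `diloco`, `model`, and `diloco_config` are assumed to be
# constructed as in train.py below; the helpers here are hypothetical.
for outer_step in range(num_outer_steps):
    for _ in range(diloco_config.inner_steps):
        inner_train_step(model)  # local updates on this node's shard
    # sync_pseudo_gradient() calls get_global_pg(maybe_reinit=True) first,
    # so the all-reduce runs over the current set of live nodes.
    diloco.sync_pseudo_gradient(model)
```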

src/zeroband/train.py
Lines changed: 4 additions & 1 deletion

@@ -175,7 +175,7 @@ def train(config: Config):
     )

     if config.diloco is not None:
-        diloco = Diloco(config.diloco, model, sharding_strategy, elastic_device_mesh.global_pg)
+        diloco = Diloco(config.diloco, model, sharding_strategy, elastic_device_mesh)

     scheduler = get_cosine_schedule_with_warmup(
         inner_optimizer,

@@ -364,6 +364,9 @@ def train(config: Config):
     metric_logger.finish()

     ckpt_manager.wait_async_save_process()
+
+    del elastic_device_mesh  # allow to clean up for smoother tests transition
+
     logger.info("Training finished, exiting ...")
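The explicit `del elastic_device_mesh` drops the last reference so the mesh can tear down its distributed state before the next test in the same session initializes new groups. This relies on the mesh cleaning up when garbage-collected, e.g. via a destructor along these lines (a hypothetical sketch, not code from this commit):

```python
import torch.distributed as dist

class ElasticDeviceMesh:
    # ... initialization elided ...

    def __del__(self):
        # Hypothetical teardown: destroy the global group so its ports
        # and stores are released before a new mesh is created.
        dist.destroy_process_group(self.global_pg)
```
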
tests/test_dist/test_comms.py
Lines changed: 10 additions & 2 deletions

@@ -21,6 +21,8 @@ def foo(**kwargs):
         dist.all_reduce(a, op=dist.ReduceOp.SUM, group=edm.global_pg)
         assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints]))

+        del edm
+
     processes = []
     for rank in range(world_size):
         processes.append(

@@ -64,6 +66,8 @@ def foo(**kwargs):
         sum_ints = global_world_size * (global_world_size + 1) // 2
         assert torch.allclose(a, torch.tensor([0, sum_ints, 2 * sum_ints]) + rank * global_world_size)

+        del edm
+
     global_ports = [i for i in range(21970, 21970 + world_size)]
     master_ports = [i for i in range(31000, 31000 + global_world_size)]
     processes = []

@@ -96,8 +100,8 @@ def foo(**kwargs):
             pytest.fail(f"Process {p.pid} failed with exit code {p.exitcode}")


-@pytest.mark.parametrize("world_size", [1, 2, 8])
-@pytest.mark.parametrize("global_world_size", [2, 8])
+@pytest.mark.parametrize("world_size", [1, 2])
+@pytest.mark.parametrize("global_world_size", [2, 4])
 def test_elastic_device_mesh_on_off_ramp(world_size: int, global_world_size: int, mock_env):
     ready_event = mp.Event()

@@ -136,6 +140,8 @@ def foo(**kwargs):

         dist.barrier(edm.global_pg)

+        del edm
+
     def bar(**kwargs):
         with mock_env(**kwargs):
             test_value = int(kwargs["TEST_VALUE"])

@@ -163,6 +169,8 @@ def bar(**kwargs):

         dist.barrier(edm.global_pg)

+        del edm
+
     global_ports = [i for i in range(21970, 21970 + world_size)]
     master_ports = [i for i in range(31000, 31000 + global_world_size + 1)]
     processes = []
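
For orientation, these tests spawn one OS process per rank and fail the test if any rank exits non-zero. A sketch of that launcher pattern, reconstructed from the fragments above; the `run_ranks` helper and the exact environment keys are assumptions:

```python
import multiprocessing as mp
import pytest

def run_ranks(target, world_size: int, common_kwargs: dict):
    # One process per rank, mirroring the tests' launcher fragments above.
    processes = [
        mp.Process(target=target, kwargs={**common_kwargs, "RANK": str(rank)})
        for rank in range(world_size)
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
        if p.exitcode != 0:
            pytest.fail(f"Process {p.pid} failed with exit code {p.exitcode}")
```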

tests/test_dist/test_diloco.py
Lines changed: 8 additions & 2 deletions

@@ -21,6 +21,13 @@ def test_diloco_all_reduce(world_size, random_available_port, dist_environment):
     if it is done correclty.
     """

+    class FakeElasticDeviceMesh:
+        def __init__(self):
+            self.global_pg = dist.new_group(backend="gloo")
+
+        def get_global_pg(self, maybe_reinit: bool = False) -> dist.ProcessGroup:
+            return self.global_pg
+
     def all_reduce(rank: int, world_size: int):
         with dist_environment(random_available_port, rank=rank, world_size=world_size, global_unique_id=str(rank)):
             diloco_config = DilocoConfig(inner_steps=10)

@@ -31,8 +38,7 @@ def all_reduce(rank: int, world_size: int):
             for param in model.parameters():
                 param.data = (rank + 1) * torch.ones_like(param.data).to("cuda")

-            global_pg = dist.new_group(backend="gloo")
-            diloco = Diloco(diloco_config, model, ShardingStrategy.FULL_SHARD, global_pg)
+            diloco = Diloco(diloco_config, model, ShardingStrategy.FULL_SHARD, FakeElasticDeviceMesh())

             # simulate inner model updates
             for param in model.parameters():
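
The fake mesh works by duck typing: `Diloco` only touches the one entry point `get_global_pg`, so the test double can return a plain static gloo group and simply ignore `maybe_reinit`, since a fixed group never needs re-resolution in a single-shot test. This keeps the all-reduce path under test without standing up a real elastic mesh.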
