
Commit 21a222b

fsdp(jit(model)) + parameter sharing - dont duplicate allgather (#602)
1 parent 71466cc commit 21a222b

File tree (3 files changed: +49, -3 lines)

  thunder/distributed/__init__.py
  thunder/distributed/transforms/fsdp_v2.py
  thunder/tests/distributed/test_ddp.py

thunder/distributed/__init__.py

Lines changed: 17 additions & 0 deletions
@@ -453,6 +453,19 @@ def fsdp_transform_module(
     # Key to this dictionary is the original parameter from the user's Module.
     # Values are the copied and sharded parameter for the thunder module and meta-data related to sharding.
     shared_params = WeakTensorKeyDictionary()
+
+    # NOTE: Shared Parameters in Trace
+    # Shared parameters in PyTorch eager are module parameters that have different names but share the same underlying tensor.
+    # For a shared parameter, we replace every occurrence of the shared parameter with its corresponding `base` parameter.
+    # In our implementation, the `base` parameter is the parameter (and corresponding name) that we see first while
+    # iterating over the parameters (see below). Subsequent parameters that share the underlying tensor with this `base`
+    # parameter are tracked in the `shared_params_name` dictionary.
+    # Then, while transforming the trace - see `FSDPTraceTransform.transform_traces` - we replace every proxy of a shared
+    # parameter with the corresponding proxy of its base parameter in the computation trace.
+
+    # This is used to track the shared parameters when the transform is applied.
+    # key - parameter name, value - `base` parameter name.
+    shared_params_name: dict[str, str] = {}
     for module_name, _ in thunder_model._model.named_modules():
         submodule = thunder_model.get_submodule(module_name)
 
@@ -500,6 +513,8 @@ def fsdp_transform_module(
             # If there are shared params in the original user Module, we reuse the sharded copy created from the original parameter below.
             # This way we re-create parameter sharing in thunder's copy of the Module.
             if p in shared_params:
+                # Shared param names: current param -> base param
+                shared_params_name[pn] = shared_params[p]["param_name"]
                 # Re-use the previous copy of this parameter.
                 thunder_model._overrides_parameters[pn] = shared_params[p]["param_copy"]
                 sharded_params[pn] = shared_params[p]["param_shard_meta"]
@@ -520,11 +535,13 @@ def fsdp_transform_module(
             shared_params[p] = {
                 "param_copy": thunder_model._overrides_parameters[pn],
                 "param_shard_meta": sharded_params[pn],
+                "param_name": pn,
             }
 
     early_transform_from_trace_to_fsdp_trace = FSDPTraceTransform(
         sharded_params=sharded_params,
         process_group=process_group,
+        shared_params_name=shared_params_name,
     )
     # add prologue + compute transform
     thunder_model = add_transform(thunder_model, early_transform=early_transform_from_trace_to_fsdp_trace)
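
To make the bookkeeping above concrete, here is a minimal, self-contained sketch (not the thunder implementation itself) of how a `shared_params_name` map can be built: the first name seen for a given tensor is treated as the `base`, and every later alias of that tensor is mapped to it. The `fc1`/`fc2` names are illustrative and mirror the test further below.

```python
import torch.nn as nn
from torch.utils.weak import WeakTensorKeyDictionary

# Two linear layers that share one weight tensor (weight tying).
model = nn.ModuleDict({"fc1": nn.Linear(16, 16, bias=False), "fc2": nn.Linear(16, 16, bias=False)})
model.fc2.weight = model.fc1.weight

first_seen = WeakTensorKeyDictionary()   # tensor -> name of its `base` parameter
shared_params_name: dict[str, str] = {}  # alias name -> base name

# remove_duplicate=False keeps every alias in the iteration instead of deduplicating shared tensors.
for name, p in model.named_parameters(remove_duplicate=False):
    if p in first_seen:
        shared_params_name[name] = first_seen[p]
    else:
        first_seen[p] = name

print(shared_params_name)  # {'fc2.weight': 'fc1.weight'}
```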

thunder/distributed/transforms/fsdp_v2.py

Lines changed: 12 additions & 0 deletions
@@ -27,6 +27,7 @@
 class FSDPTraceTransform(EarlyTransform):
     sharded_params: dict[str, Any]
     process_group: ProcessGroup
+    shared_params_name: dict[str, str]
 
     def transform_traces(self, prologue_trace, computation_trace, epilogue_trace, **kwargs):
         from thunder.distributed import prims as dist_prims
@@ -49,13 +50,15 @@ def transform_traces(self, prologue_trace, computation_trace, epilogue_trace, **
         computation_trace.push_scope([])
 
         synchronized_parameters = []
+        param_name_to_comp_trc_proxy = {}  # Map each param_name to its corresponding proxy in computation_trc.
         # todo: deal with epilogue output
         for pro_out_p, comp_inp_p in zip(prologue_trace.output, computation_trace.args):
             bsym = prologue_producers[pro_out_p]
             if bsym.sym == prims.unpack_parameter:
                 param_thunder_module, param_name = bsym.args
                 assert param_thunder_module is thunder_module_proxy
                 if param_name in self.sharded_params:
+                    param_name_to_comp_trc_proxy[param_name] = comp_inp_p
                     old_shape, new_shape, new_torch_device = self.sharded_params[param_name]
                     thunder_device = devices.to_device(new_torch_device)
                     thunder_device_str = str(thunder_device)
@@ -91,6 +94,15 @@ def transform_traces(self, prologue_trace, computation_trace, epilogue_trace, **
 
         proxies_to_replace = {id(bsym.args[0]): bsym.output for bsym in new_scope}
 
+        # See NOTE: Shared Parameters in Trace
+        for param_name, base_param in self.shared_params_name.items():
+            param_proxy = param_name_to_comp_trc_proxy[param_name]
+            base_param_proxy = param_name_to_comp_trc_proxy[base_param]
+            allgather_base_param_proxy = proxies_to_replace[id(base_param_proxy)]
+            # Update `proxies_to_replace` so that every use of `param_proxy` is replaced
+            # with the output of the `AllGather` on `base_param_proxy`.
+            proxies_to_replace[id(param_proxy)] = allgather_base_param_proxy
+
         new_computation_trace = from_trace(computation_trace)
         for idx, bsym in enumerate(computation_trace.bound_symbols):
             if bsym.sym != prims.unpack_trivial:
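
The proxy redirection added above can be illustrated with a small stand-alone sketch. The `Proxy` class and every name in it are hypothetical stand-ins, not thunder's internal types; the point is that once the alias's proxy id points at the base parameter's all-gather output, the subsequent trace rewrite reuses a single AllGather for both parameters.

```python
class Proxy:
    """Toy stand-in for a trace proxy: just a named object."""
    def __init__(self, name: str) -> None:
        self.name = name

fc1_weight = Proxy("t_fc1_weight")                # base parameter proxy
fc2_weight = Proxy("t_fc2_weight")                # alias that shares fc1's tensor
gathered_fc1 = Proxy("all_gather_t_fc1_weight")   # output of AllGather(fc1_weight)

# State after the AllGather insertion pass: only the base parameter was gathered.
proxies_to_replace = {id(fc1_weight): gathered_fc1}
shared_params_name = {"t_fc2_weight": "t_fc1_weight"}
param_name_to_proxy = {"t_fc1_weight": fc1_weight, "t_fc2_weight": fc2_weight}

# Redirect each alias to the base parameter's gathered output (mirrors the loop above).
for alias, base in shared_params_name.items():
    proxies_to_replace[id(param_name_to_proxy[alias])] = proxies_to_replace[id(param_name_to_proxy[base])]

assert proxies_to_replace[id(fc2_weight)] is gathered_fc1
```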

thunder/tests/distributed/test_ddp.py

Lines changed: 20 additions & 3 deletions
@@ -960,7 +960,7 @@ def __init__(self) -> None:
             def forward(self, x):
                 return self.fc1(x) + self.fc2(x)
 
-        def _test_model_output_and_gradients(model, x):
+        def _test_model_output_and_gradients(model, x, duplicate_all_gather):
             output = model(x)
             with device:
                 grad_output = torch.ones_like(output)
@@ -985,6 +985,23 @@ def _test_model_output_and_gradients(model, x):
             expected_grad = 2 * (grad_output.T @ x)
             torch.testing.assert_close(actual_grad_gathered, expected_grad)
 
+            forward_exec_trace = thunder.last_traces(model)[-1]
+            gathered_params = set()
+            for bsym in forward_exec_trace.bound_symbols:
+                if bsym.sym.id in (
+                    thunder.distributed.prims.PrimIDs.ALL_GATHER,
+                    thunder.executors.torchex.all_gather_prim_impl.id,
+                ):
+                    gathered_params.add(bsym.args[0].name)
+
+            # Check the trace to verify we don't emit duplicate AllGathers for shared parameters.
+            if duplicate_all_gather:
+                # Both params are gathered.
+                assert "t_fc1_weight" in gathered_params and "t_fc2_weight" in gathered_params
+            else:
+                # Exactly one of the two params is gathered, not both.
+                assert ("t_fc1_weight" in gathered_params) ^ ("t_fc2_weight" in gathered_params)
+
         with device:
             jit_fsdp_model = Model()
             fsdp_jit_model = Model()
@@ -995,14 +1012,14 @@ def _test_model_output_and_gradients(model, x):
 
             jit_fsdp_model = thunder.jit(thunder.distributed.fsdp(jit_fsdp_model), executors=["torch"])
 
-            _test_model_output_and_gradients(jit_fsdp_model, x)
+            _test_model_output_and_gradients(jit_fsdp_model, x, duplicate_all_gather=True)
 
             # Check `fsdp(jit(model))` works
             fsdp_jit_model.fc1.weight = fsdp_jit_model.fc2.weight
 
             fsdp_jit_model = thunder.distributed.fsdp(thunder.jit(fsdp_jit_model, executors=["torch"]))
 
-            _test_model_output_and_gradients(fsdp_jit_model, x)
+            _test_model_output_and_gradients(fsdp_jit_model, x, duplicate_all_gather=False)
 
 
 common_utils.instantiate_parametrized_tests(CompileDDPTest)
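
For completeness, a hedged usage sketch of the path this commit targets: tie two weights, wrap the module with `fsdp(jit(model))`, and count AllGather symbols in the final forward trace. It assumes a multi-GPU process group is already initialized (for example via `torchrun` plus `torch.distributed.init_process_group`); `TiedModel` and the tensor shapes are placeholders that mirror the test's `Model`.

```python
import torch
import torch.distributed
import torch.nn as nn

import thunder
import thunder.distributed
import thunder.distributed.prims
import thunder.executors.torchex


class TiedModel(nn.Module):  # placeholder module mirroring the test's Model
    def __init__(self) -> None:
        super().__init__()
        self.fc1 = nn.Linear(16, 16, bias=False)
        self.fc2 = nn.Linear(16, 16, bias=False)

    def forward(self, x):
        return self.fc1(x) + self.fc2(x)


device = torch.device("cuda", torch.distributed.get_rank())
with device:
    model = TiedModel()
    x = torch.randn(4, 16)

# Share the underlying weight tensor between the two linear layers.
model.fc1.weight = model.fc2.weight

# fsdp(jit(model)) - the ordering this commit fixes for shared parameters.
tmodel = thunder.distributed.fsdp(thunder.jit(model, executors=["torch"]))
tmodel(x)

# Inspect the final forward execution trace and count AllGather symbols.
forward_exec_trace = thunder.last_traces(tmodel)[-1]
num_all_gathers = sum(
    bsym.sym.id
    in (
        thunder.distributed.prims.PrimIDs.ALL_GATHER,
        thunder.executors.torchex.all_gather_prim_impl.id,
    )
    for bsym in forward_exec_trace.bound_symbols
)
print(num_all_gathers)  # expected: 1, the tied weight is gathered only once
```

With the `jit(fsdp(model))` ordering exercised by the first test call, both tied names are still gathered, which is why the test passes `duplicate_all_gather=True` there and `duplicate_all_gather=False` for `fsdp(jit(model))`.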
