use bfloat16 in grad reduction (= reduce-scatter / all-reduce)
crcrpar committed Aug 16, 2024
1 parent 43b6345 commit e0fd16c
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions thunder/benchmarks/benchmark_litgpt.py
@@ -319,14 +319,17 @@ def setup_distributed(self, model):
                 transformer_block,
                 mesh=mesh,
                 reshard_after_forward=reshard_after_forward,
-                mp_policy=MixedPrecisionPolicy(),
+                mp_policy=MixedPrecisionPolicy(
+                    param_dtype=torch.bfloat16,
+                    reduce_dtype=torch.bfloat16,
+                ),
             )
 
         fully_shard(
             model,
             mesh=mesh,
             reshard_after_forward=reshard_after_forward,
-            mp_policy=MixedPrecisionPolicy(),
+            mp_policy=MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16),
         )
         model.to_empty(device=self.device)
         model.apply(model._init_weights)
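For readers less familiar with FSDP2, below is a minimal, self-contained sketch (not part of this commit) of the pattern the diff configures: MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16) makes fully_shard all-gather parameters in bfloat16 for compute and reduce-scatter / all-reduce gradients in bfloat16 as well. The toy two-layer model, the script name, and the torchrun launch are illustrative assumptions, not taken from the repository; the import path is the PyTorch 2.4-era torch.distributed._composable.fsdp.

import torch
import torch.nn as nn
from torch.distributed._composable.fsdp import MixedPrecisionPolicy, fully_shard
from torch.distributed.device_mesh import init_device_mesh


def main():
    # Assumes a distributed launch, e.g. `torchrun --nproc_per_node=2 sketch.py`.
    torch.distributed.init_process_group("nccl")
    torch.cuda.set_device(torch.distributed.get_rank())
    mesh = init_device_mesh("cuda", (torch.distributed.get_world_size(),))

    # Hypothetical stand-in for the benchmark's transformer blocks.
    model = nn.Sequential(nn.Linear(1024, 1024), nn.Linear(1024, 1024)).cuda()

    # Same policy as the commit: bf16 compute and bf16 gradient reduction.
    mp_policy = MixedPrecisionPolicy(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.bfloat16,
    )
    # Shard each block first, then the root, mirroring the diff's structure.
    for block in model:
        fully_shard(block, mesh=mesh, mp_policy=mp_policy)
    fully_shard(model, mesh=mesh, mp_policy=mp_policy)

    out = model(torch.randn(8, 1024, device="cuda"))
    out.sum().backward()  # gradients are reduce-scattered in bfloat16 here

    torch.distributed.destroy_process_group()


if __name__ == "__main__":
    main()

With the default MixedPrecisionPolicy(), both dtypes are None, so no casting is applied and gradients are reduced in the parameters' own dtype; pinning reduce_dtype to bfloat16 makes the reduction precision explicit regardless of how the parameters are stored.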
