Skip to content

Commit e6efaf9

Browse files
authored
Merge branch 'master' into linear
2 parents 5ee9a87 + fa8db5c commit e6efaf9

File tree

4 files changed

+13
-9
lines changed

4 files changed

+13
-9
lines changed

.github/workflows/nv-ds-chat.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ jobs:
       - name: Install deepspeed
         run: |
-          pip install transformers==4.45.2
+          pip install transformers
           pip install .[dev]
           ds_report

SECURITY.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,7 @@ We prefer all communications to be in English.
 Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).

 <!-- END MICROSOFT SECURITY.MD BLOCK -->
+
+---
+
+Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.

deepspeed/module_inject/layers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def __init__(self, weight_shape=None, weight=None, bias=None):
         self.offset = 2
         super().__init__(weight_shape, weight=weight)

-    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
+    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0, position_ids: int = 0):
         """`input_ids_shape` is expected to be [bsz x seqlen]."""
         attention_mask = attention_mask.long()

deepspeed/module_inject/replace_module.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -342,13 +342,11 @@ def set_lm_head(module):
             module.lm_head, "weight") and module.lm_head.weight.is_meta:
         module.lm_head.weight = embedding_weight
     # enable tensor parallel for the last linear
-    if hasattr(module, "lm_head") and hasattr(module.lm_head,
-                                              "weight") and not module.lm_head.weight.is_meta and isinstance(
-                                                  module.lm_head, torch.nn.Linear):
+    if hasattr(module, "lm_head") and hasattr(module.lm_head, "weight") and isinstance(
+            module.lm_head, torch.nn.Linear):
         module = replace_wo_policy(module, ("lm_head", ), 0, "lm_head")
-    elif hasattr(module, "embed_out") and hasattr(module.embed_out,
-                                                  "weight") and not module.embed_out.weight.is_meta and isinstance(
-                                                      module.embed_out, torch.nn.Linear):
+    elif hasattr(module, "embed_out") and hasattr(module.embed_out, "weight") and isinstance(
+            module.embed_out, torch.nn.Linear):
         module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out")
     elif hasattr(module, "language_model") and hasattr(module.language_model, "lm_head"):
         module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head")
@@ -389,7 +387,6 @@ def conv2d_parallel_shard_weights(model, rank, world_size):
                                           checkpoint=checkpoint_file)
                 pbar.update(1)
             gc.collect()
-        replaced_module = set_lm_head(replaced_module)
         # conv2d tp module replace
         # Now is for yuan model. Add model list and conv policy to decide whether to replace conv.
         if 'Yuan' in str(replaced_module):
@@ -399,6 +396,9 @@ def conv2d_parallel_shard_weights(model, rank, world_size):
                                        orig_class=orig_layer_impl,
                                        replace_fn=replace_fn,
                                        _replace_policy=config.injection_policy_tuple)
+    # AutoTP default set lm_head tp
+    if not config.replace_with_kernel_inject:
+        replaced_module = set_lm_head(replaced_module)

     quantizer = GroupQuantizer(q_int8=quantize)
     world_size = dist.get_world_size() if dist.is_initialized() else 1

0 commit comments

Comments (0)