
Commit

Merge branch 'master' into loadams/fix-ds-chat-transformers
loadams authored Jan 7, 2025
2 parents 1b85e81 + c348c5b commit 143647a
Showing 28 changed files with 421 additions and 301 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/hpu-gaudi2-nightly.yml
@@ -21,7 +21,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
-      image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+      image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
2 changes: 1 addition & 1 deletion .github/workflows/hpu-gaudi2.yml
@@ -39,7 +39,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
-      image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+      image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
2 changes: 1 addition & 1 deletion accelerator/hpu_accelerator.py
@@ -21,8 +21,8 @@ def __init__(self):
self.apply_hpu_workarounds()
try:
import habana_frameworks.torch.hpu as hpu
-            hpu.setDeterministic(True)
self.hpu = hpu
+            torch.use_deterministic_algorithms(True)
except ImportError as e:
raise ValueError(
f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
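The accelerator now requests determinism through PyTorch's generic switch instead of the Habana-specific helper. A minimal sketch of the resulting behavior, assuming the Habana stack is installed (the helper name below is illustrative, not part of the accelerator class):

    import torch

    def init_hpu_with_determinism():
        # Mirrors HPU_Accelerator.__init__ above: the import fails on machines
        # without habana_frameworks, which the real code converts to a ValueError.
        import habana_frameworks.torch.hpu as hpu
        # Old (removed): hpu.setDeterministic(True)
        # New: the framework-agnostic determinism flag.
        torch.use_deterministic_algorithms(True)
        return hpu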
50 changes: 28 additions & 22 deletions deepspeed/comm/torch.py
@@ -20,6 +20,12 @@
DS_COMM_REDUCE_OFF = False


+def disable_compiler_collective(func):
+    if required_torch_version(min_version=2.3):
+        return func
+    return compiler.disable(func)
+
+
def build_shm_op():
builder = get_accelerator().create_op_builder("ShareMemCommBuilder")
if builder is None or not deepspeed.ops.__compatible_ops__[builder.NAME]:
@@ -114,7 +120,7 @@ def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='
self.shm_comm_op.initialize(self.get_world_size(), self.get_rank())

@classmethod
-    @compiler.disable
+    @disable_compiler_collective
def get_all_gather_function(self):
if hasattr(torch.distributed, "all_gather_into_tensor"):
return torch.distributed.all_gather_into_tensor
@@ -123,7 +129,7 @@ def get_all_gather_function(self):
return None

@classmethod
-    @compiler.disable
+    @disable_compiler_collective
def get_reduce_scatter_function(self):
if hasattr(torch.distributed, "reduce_scatter_tensor"):
return torch.distributed.reduce_scatter_tensor
@@ -146,7 +152,7 @@ def init_process_group(self, backend, timeout, init_method, rank, world_size):
world_size=world_size)
self.using_mpi = torch.distributed.get_backend() == 'mpi'

-    @compiler.disable
+    @disable_compiler_collective
def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False):
op = self._reduce_op(op)
return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op)
@@ -158,7 +164,7 @@ def inference_all_reduce(self, tensor, op, group=None):
else:
return torch.ops.deepspeed.inference_all_reduce_(tensor)

-    @compiler.disable
+    @disable_compiler_collective
def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False):
""" proxy func to torch.distributed.all_reduce_coalesced,
which is included in PyTorch 1.13 and above
@@ -169,15 +175,15 @@ def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group
op = self._reduce_op(op)
return torch.distributed.all_reduce_coalesced(tensors=tensors, op=op, group=group, async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=False):
if DS_COMM_REDUCE_OFF:
if int(os.getenv('RANK', '0')) == 0:
utils.logger.warning("REDUCE is OFF")
return Noop()
return torch.distributed.reduce(tensor=tensor, dst=dst, op=self._reduce_op(op), group=group, async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_op=False):
if DS_COMM_REDUCE_SCATTER_OFF:
if int(os.getenv('RANK', '0')) == 0:
@@ -190,7 +196,7 @@ def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_
group=group,
async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def broadcast(self, tensor, src, group=None, async_op=False):
if DS_COMM_BROADCAST_OFF:
if int(os.getenv('RANK', '0')) == 0:
@@ -199,7 +205,7 @@ def broadcast(self, tensor, src, group=None, async_op=False):
else:
return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def all_gather(self, tensor_list, tensor, group=None, async_op=False):
if DS_COMM_ALL_GATHER_OFF:
if int(os.getenv('RANK', '0')) == 0:
@@ -208,15 +214,15 @@ def all_gather(self, tensor_list, tensor, group=None, async_op=False):
else:
return torch.distributed.all_gather(tensor_list=tensor_list, tensor=tensor, group=group, async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def all_gather_into_tensor(self, output_tensor, input_tensor, group=None, async_op=False):
if self.has_all_gather_into_tensor():
return self.all_gather_function(output_tensor=output_tensor,
input_tensor=input_tensor,
group=group,
async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=False):
if DS_COMM_ALL_GATHER_OFF:
if int(os.getenv('RANK', '0')) == 0:
@@ -234,7 +240,7 @@ def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=Fals
"please consider upgrading your pytorch installation.")
pass

-    @compiler.disable
+    @disable_compiler_collective
def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_op=False):
""""""
assert len(output_tensors) == len(input_tensors), ""
@@ -258,7 +264,7 @@ def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_
else:
reqs[-1].wait()

-    @compiler.disable
+    @disable_compiler_collective
def reduce_scatter_tensor(self, output_tensor, input_tensor, op=ReduceOp.SUM, group=None, async_op=False):
if self.has_reduce_scatter_tensor():
return self.reduce_scatter_function(output_tensor,
@@ -272,7 +278,7 @@ def reduce_scatter_tensor(self, output_tensor, input_tensor, op=ReduceOp.SUM, gr
"please consider upgrading your pytorch installation.")
pass

-    @compiler.disable
+    @disable_compiler_collective
def all_to_all_single(self,
output,
input,
@@ -287,49 +293,49 @@ def all_to_all_single(self,
group=group,
async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def all_to_all(self, output_tensor_list, input_tensor_list, group=None, async_op=False):
return torch.distributed.all_to_all(output_tensor_list, input_tensor_list, group=group, async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def send(self, tensor, dst, group=None, tag=0):
return torch.distributed.send(tensor=tensor, dst=dst, group=group, tag=tag)

-    @compiler.disable
+    @disable_compiler_collective
def recv(self, tensor, src=None, group=None, tag=0):
return torch.distributed.recv(tensor=tensor, src=src, group=group, tag=tag)

-    @compiler.disable
+    @disable_compiler_collective
def isend(self, tensor, dst, group=None, tag=0):
return torch.distributed.isend(tensor=tensor, dst=dst, group=group, tag=tag)

-    @compiler.disable
+    @disable_compiler_collective
def irecv(self, tensor, src=None, group=None, tag=0):
return torch.distributed.irecv(tensor=tensor, src=src, group=group, tag=tag)

-    @compiler.disable
+    @disable_compiler_collective
def gather(self, tensor, gather_list=None, dst=0, group=None, async_op=False):
return torch.distributed.gather(tensor=tensor,
gather_list=gather_list,
dst=dst,
group=group,
async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def scatter(self, tensor, scatter_list=None, src=0, group=None, async_op=False):
return torch.distributed.scatter(tensor=tensor,
scatter_list=scatter_list,
src=src,
group=group,
async_op=async_op)

-    @compiler.disable
+    @disable_compiler_collective
def barrier(self, group=torch.distributed.GroupMember.WORLD, async_op=False, device_ids=None):
if group is None:
group = torch.distributed.GroupMember.WORLD
return torch.distributed.barrier(group=group, async_op=async_op, device_ids=device_ids)

-    @compiler.disable
+    @disable_compiler_collective
def monitored_barrier(self, group=torch.distributed.GroupMember.WORLD, timeout=None, wait_all_ranks=False):
if group is None:
group = torch.distributed.GroupMember.WORLD
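The new disable_compiler_collective helper defined at the top of this file replaces the unconditional @compiler.disable on every collective: on PyTorch 2.3 and newer the wrapped method is returned unchanged so torch.compile can trace the collective, while older versions still fall back to disabling compilation. A standalone sketch of the same pattern, assuming a PyTorch new enough to provide torch.compiler.disable (the real code uses DeepSpeed's required_torch_version helper and its compiler shim):

    from packaging import version

    import torch
    import torch.distributed as dist

    def disable_compiler_collective(func):
        # PyTorch >= 2.3 can trace these collectives under torch.compile, so pass through.
        if version.parse(torch.__version__.split("+")[0]) >= version.parse("2.3"):
            return func
        # Older PyTorch: keep the previous behavior and exclude the call from compilation.
        return torch.compiler.disable(func)

    class ExampleBackend:
        @disable_compiler_collective
        def all_reduce(self, tensor, group=None, async_op=False):
            return dist.all_reduce(tensor, group=group, async_op=async_op)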
7 changes: 1 addition & 6 deletions deepspeed/ops/fp_quantizer/__init__.py
@@ -4,9 +4,4 @@
# DeepSpeed Team

from .quantize import FP_Quantize, Quantizer

-try:
-    import triton
-    from .fp8_gemm import matmul_fp8
-except ImportError:
-    pass
+from .fp8_gemm import matmul_fp8
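With fp8_gemm.py no longer importing triton at module scope (next file), the guarded import is unnecessary and matmul_fp8 can be re-exported unconditionally. A rough smoke test of the idea, assuming DeepSpeed is installed but triton is not:

    # Hypothetical check: the package import should now succeed even without triton,
    # since any triton-backed kernel is only imported lazily inside matmul_fp8.
    from deepspeed.ops.fp_quantizer import FP_Quantize, Quantizer, matmul_fp8

    print(callable(matmul_fp8))  # True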
163 changes: 10 additions & 153 deletions deepspeed/ops/fp_quantizer/fp8_gemm.py
@@ -11,161 +11,18 @@
###################################

import torch
-import triton
-import triton.language as tl


-@triton.jit
-def matmul_kernel_fp8_bf16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk,
-                           stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
-                           BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr,
-                           quantization_group_size: tl.constexpr):
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-
-    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-
-    inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
-    weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
-    weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + (
-        (pid_n * BLOCK_SIZE_N) // quantization_group_size)
-
-    weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0)
-    scale = tl.load(scale_ptr + weight_ptrs_offset)
-
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
-        # Dequantize weight (fp8 -> bf16)
-        w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 4)).to(tl.uint16)
-        w = (w + 0x3C00).to(tl.uint16)
-        w = (w.to(tl.bfloat16, bitcast=True) * scale).to(tl.bfloat16)
-
-        inp_data += BLOCK_SIZE_K * stride_ak
-        weight_data += BLOCK_SIZE_K * stride_bk
-        weight_mask = offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K
-        weight = tl.load(weight_data, mask=weight_mask, other=0.0)
-        scale = tl.load(scale_ptr + (weight_ptrs_offset +
-                                     (((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size)),
-                        mask=weight_mask,
-                        other=0.0)
-
-        accumulator += tl.dot(inp, w)
-
-    out = accumulator.to(tl.bfloat16)
-
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-    tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N))
-
-
-@triton.jit
-def matmul_kernel_fp8_fp16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk,
-                           stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
-                           BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr,
-                           quantization_group_size: tl.constexpr):
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-
-    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
-    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-
-    inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
-    weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
-    weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + (
-        (pid_n * BLOCK_SIZE_N) // quantization_group_size)
-
-    weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0)
-    scale = tl.load(scale_ptr + weight_ptrs_offset)
-
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
-        # Dequantize weight (fp8 -> fp16)
-        w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 7)).to(tl.uint16)
-        w = (w + 0x2000).to(tl.uint16)
-        w = (w.to(tl.float16, bitcast=True) * scale).to(tl.float16)
-
-        inp_data += BLOCK_SIZE_K * stride_ak
-        weight_data += BLOCK_SIZE_K * stride_bk
-
-        weight = tl.load(weight_data, mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, other=0.0)
-        scale = tl.load(scale_ptr + (weight_ptrs_offset +
-                                     (((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size)))
-
-        accumulator += tl.dot(inp, w)
-
-    out = accumulator.to(tl.float16)
-
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-    tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N))
-
-
-def matmul_fp8(inp, weight, scale, quantization_group_size):
-
-    assert inp.shape[1] == weight.shape[0], \
-        f"Incompatible dimensions (input: {inp.shape}, weight: {weight.shape})"
-
-    M, K = inp.shape
-    K, N = weight.shape
-
-    out = torch.empty((M, N), device=inp.device, dtype=inp.dtype)
-
-    # GEMM tuning parameters!
-    # TODO: Add a more configurable tuning for selecting the best GeMM
-    BLOCK_SIZE_M = 16 if M <= 16 else 32 if M <= 32 else 64 if M <= 64 else 128
-    BLOCK_SIZE_N = 64
-    BLOCK_SIZE_K = max(64, quantization_group_size)
-    GROUP_SIZE_M = 8
-    num_stages = 4
-    num_warps = 4
-    if M >= 256:
-        BLOCK_SIZE_M = 256
-        BLOCK_SIZE_N = 128
-        BLOCK_SIZE_K = max(128, quantization_group_size)
-        num_stages = 3
-        num_warps = 8
-
-    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
-    kernel = matmul_kernel_fp8_bf16 if inp.dtype == torch.bfloat16 else matmul_kernel_fp8_fp16
-    kernel[grid](inp,
-                 weight,
-                 out,
-                 scale,
-                 M,
-                 N,
-                 K,
-                 inp.stride(0),
-                 inp.stride(1),
-                 weight.stride(0),
-                 weight.stride(1),
-                 out.stride(0),
-                 out.stride(1),
-                 quantization_group_size=quantization_group_size,
-                 BLOCK_SIZE_M=BLOCK_SIZE_M,
-                 BLOCK_SIZE_N=BLOCK_SIZE_N,
-                 BLOCK_SIZE_K=BLOCK_SIZE_K,
-                 GROUP_SIZE_M=GROUP_SIZE_M,
-                 num_stages=num_stages,
-                 num_warps=num_warps)
-    return out
+def matmul_fp8(inp, weight, scale, quantization_group_size, quantizer):
+    from deepspeed import get_accelerator
+
+    if not get_accelerator().is_triton_supported():
+        return matmul_fp8_fallback(inp, weight, scale, quantization_group_size, quantizer)
+    else:
+        # Import dynamically to prevent failures on systems without triton.
+        from .fp8_gemm_triton import matmul_fp8_triton
+        return matmul_fp8_triton(inp, weight, scale, quantization_group_size)
+
+
+def matmul_fp8_fallback(inp, weight, scale, quantization_group_size, quantizer):
+    return torch.matmul(inp, quantizer.dequantize(weight, scale=scale))
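The refactor moves the triton kernels into a separate fp8_gemm_triton module and turns matmul_fp8 into a thin dispatcher: accelerators that report triton support keep the fast path, and everything else dequantizes the fp8 weight and falls back to a plain torch.matmul. A self-contained sketch of that dispatch-plus-fallback structure with stand-in names (DummyQuantizer and the triton_supported flag are illustrative, not DeepSpeed APIs):

    import torch

    class DummyQuantizer:
        # Stand-in exposing the dequantize(weight, scale=...) hook the fallback relies on.
        def dequantize(self, weight, scale=None):
            return weight * scale

    def matmul_fp8_sketch(inp, weight, scale, group_size, quantizer, triton_supported=False):
        if not triton_supported:
            # Fallback path: dequantize the weight, then run an ordinary GEMM.
            return torch.matmul(inp, quantizer.dequantize(weight, scale=scale))
        raise NotImplementedError("the triton path lives in fp8_gemm_triton in the real code")

    inp = torch.randn(4, 8, dtype=torch.bfloat16)
    weight = torch.randn(8, 8, dtype=torch.bfloat16)  # stands in for the fp8-quantized weight
    scale = torch.ones(1, dtype=torch.bfloat16)
    out = matmul_fp8_sketch(inp, weight, scale, group_size=8, quantizer=DummyQuantizer())
    print(out.shape)  # torch.Size([4, 8])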