Commit 93aeb57

Merge pull request #48 from flash-algo/add-dot-triton-kernel
[PERFORMANCE OPTIMIZATION] add dot triton kernel
2 parents 3ac79a2 + e606b30 commit 93aeb57

2 files changed: +71 -1 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ The following common BLAS kernels have been implemented in multiple frameworks.
| [swap](./docs/swap.md) | swap vectors | $x \leftrightarrow y$ | $0$ | $4n$ | [](./kernel_course/python_ops/swap.py) | [](./kernel_course/pytorch_ops/swap.py) | [](./kernel_course/triton_ops/swap.py) || [](./tests/test_swap.py) |
| [scal](./docs/scal.md) | scale vector | $y = \alpha y$ | $n$ | $2n$ | [](./kernel_course/python_ops/scal.py) | [](./kernel_course/pytorch_ops/scal.py) | [](./kernel_course/triton_ops/scal.py) || [](./tests/test_scal.py) |
| [axpby](./docs/axpby.md) | update vector | $y = \alpha x + \beta y$ | $3n$ | $3n$ | [](./kernel_course/python_ops/axpby.py) | [](./kernel_course/pytorch_ops/axpby.py) | [](./kernel_course/triton_ops/axpby.py) || [](./tests/test_axpby.py) |
-| [dot](./docs/dot.md) | dot product | $z = x^\top y$ | $2n$ | $2n$ | [](./kernel_course/python_ops/dot.py) | [](./kernel_course/pytorch_ops/dot.py) | |||
+| [dot](./docs/dot.md) | dot product | $z = x^\top y$ | $2n$ | $2n$ | [](./kernel_course/python_ops/dot.py) | [](./kernel_course/pytorch_ops/dot.py) | [](./kernel_course/triton_ops/dot.py) |||
| gemv | general matrix-vector multiply | $y = \alpha A x + \beta y$ | $2mn$ | $mn + n + 2m$ ||||||
| geru | general rank-1 update | $A = A + \alpha x y^\top$ | $2mn$ | $2mn + m + n$ ||||||
| gemm | general matrix-matrix multiply | $C = \alpha A B + \beta C$ | $2mnk$ | $mk + nk + 2mn$ ||||||
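
As a quick sanity check on the $2n$ / $2n$ figures in the new `dot` row: counting one multiply and one add per element, and one read of each input vector, gives

$$
\text{flops} = n\ \text{multiplies} + (n-1)\ \text{adds} \approx 2n,
\qquad
\text{memory} = n\ (\text{read } x) + n\ (\text{read } y) = 2n.
$$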

kernel_course/triton_ops/dot.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
import torch
import triton
import triton.language as tl


@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 1024}, num_stages=4, num_warps=4),
        triton.Config({"BLOCK_SIZE": 2048}, num_stages=4, num_warps=8),
    ],
    key=["n_elements"],
)
@triton.jit
def dot_kernel(
    x_ptr,
    y_ptr,
    z_ptr,
    n_elements,
    BLOCK_SIZE: tl.constexpr,
):
    # There are multiple programs, each processing a different block of data.
    # We identify which program we are in using program_id.
    pid = tl.program_id(axis=0)
    # This program processes inputs that are offset from the initial pointer.
    # For example, for a vector of size 256 and a block size of 64, the programs
    # would access the elements [0:64], [64:128], [128:192], and [192:256].
    # Note that `offsets` is a block of pointers.
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask to guard memory operations against out-of-bounds accesses.
    mask = offsets < n_elements
    # Load x and y from DRAM, masking out any extra elements in case the input is
    # not a multiple of the block size; masked-out lanes contribute 0 to the sum.
    x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
    y = tl.load(y_ptr + offsets, mask=mask, other=0.0)
    # Compute this program's partial dot product over its block.
    partial = tl.sum(x * y, axis=0)
    # Accumulate the partial sum into the single-element output in DRAM.
    tl.atomic_add(z_ptr, partial)


def dot(
    x: torch.Tensor,
    y: torch.Tensor,
) -> torch.Tensor:
    """
    Computes the dot product of two tensors `x` and `y` using a Triton kernel.

    Args:
        x (torch.Tensor): First tensor.
        y (torch.Tensor): Second tensor.

    Returns:
        torch.Tensor: A one-element tensor containing the dot product of `x` and `y`.
    """

    # Calculate the number of elements in the input tensors
    n_elements = x.numel()

    # Allocate the output tensor, zero-initialized because each program
    # atomically accumulates its partial sum into it
    z = torch.zeros(1, device=x.device, dtype=x.dtype)

    # The SPMD launch grid denotes the number of kernel instances that run in parallel.
    # It is analogous to CUDA launch grids and can be either Tuple[int] or Callable(metaparameters) -> Tuple[int].
    # In this case, we use a 1D grid whose size is the number of blocks needed to cover all elements.
    def grid(meta):
        return (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)

    dot_kernel[grid](x, y, z, n_elements)

    return z
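
As a quick usage sketch (not part of this commit; the import path assumes the repository root is on `PYTHONPATH`, and the sizes and tolerances below are arbitrary), the wrapper can be checked against `torch.dot` on a CUDA device:

import torch

from kernel_course.triton_ops.dot import dot

x = torch.randn(1_000_000, device="cuda", dtype=torch.float32)
y = torch.randn(1_000_000, device="cuda", dtype=torch.float32)

z_triton = dot(x, y)       # one-element tensor
z_torch = torch.dot(x, y)  # zero-dim tensor

# Atomic accumulation order is non-deterministic, so allow a small tolerance.
assert torch.allclose(z_triton.squeeze(), z_torch, rtol=1e-4, atol=1e-4)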
