
Commit dcbc17a

small notes, enable other tile sizes

Signed-off-by: Raayan Dhar <raayan.dhar@gmail.com>

1 parent 8a58e45 commit dcbc17a

5 files changed: +87 -69 lines changed

csrc/bf16_gemm_cutlass.jinja

Lines changed: 4 additions & 4 deletions
@@ -19,9 +19,9 @@
 namespace flashinfer {
 namespace gemm {
 INSTANCE_BF16_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 1, 1, 1, _1SM);
-// INSTANCE_BF16_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 1, 2, 1, _1SM);
-// INSTANCE_BF16_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 1, 4, 1, _1SM);
-// INSTANCE_BF16_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 2, 1, 1, _2SM);
-// INSTANCE_BF16_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 2, 2, 1, _2SM);
+INSTANCE_BF16_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 1, 2, 1, _1SM);
+INSTANCE_BF16_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 1, 4, 1, _1SM);
+INSTANCE_BF16_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 2, 1, 1, _2SM);
+INSTANCE_BF16_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 2, 2, 1, _2SM);
 } // namespace gemm
 } // namespace flashinfer
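
For context on how these placeholders get filled: gen_gemm_sm100_module_cutlass_bf16 in flashinfer/jit/gemm/core.py (further down in this commit) renders this template once per (dtype, CTA tile) pair, and each line stamps out one explicit instantiation per cluster shape. Below is a minimal, self-contained C++ sketch of that instantiation pattern; all names here are illustrative stand-ins, not flashinfer's actual launcher or macro.

#include <cstddef>
#include <cstdio>

// Mock launcher standing in for the real templated GEMM kernel launcher.
// Each explicit instantiation bakes one (CTA tile, cluster shape) choice
// into the compiled object, so the runtime dispatcher can pick among them.
template <typename T, int CtaM, int CtaN, int CtaK, int ClsM, int ClsN, int ClsK>
std::size_t mock_gemm_launcher() {
  std::printf("tile %dx%dx%d, cluster %dx%dx%d\n", CtaM, CtaN, CtaK, ClsM, ClsN, ClsK);
  return 0;
}

// Stand-in for INSTANCE_BF16_GEMM_TEMPLATE_SM100: an explicit instantiation macro.
#define INSTANCE_MOCK(T, M, N, K, CM, CN, CK) \
  template std::size_t mock_gemm_launcher<T, M, N, K, CM, CN, CK>();

// Roughly what one rendered copy of the template expands to:
INSTANCE_MOCK(float, 64, 128, 128, 1, 1, 1)
INSTANCE_MOCK(float, 64, 128, 128, 1, 2, 1)
INSTANCE_MOCK(float, 64, 128, 128, 2, 2, 1)

int main() { return static_cast<int>(mock_gemm_launcher<float, 64, 128, 128, 1, 2, 1>()); }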

flashinfer/gemm/gemm_base.py

Lines changed: 24 additions & 2 deletions
@@ -211,7 +211,18 @@ def mm_bf16(
     torch.Tensor
         Out tensor, shape (m, n), bf16 or fp16.

-    # Note: add Examples section here
+    Examples
+    --------
+    >>> import torch
+    >>> import torch.nn.functional as F
+    >>> import flashinfer
+    >>> input = torch.randn([48, 64], device="cuda", dtype=torch.bfloat16)
+    >>> weight = torch.randn([80, 64], device="cuda", dtype=torch.bfloat16).transpose(-2, -1)
+    >>> out = flashinfer.mm_bf16(input, weight)
+    >>> print(out.shape)
+    torch.Size([48, 80])
+    >>> out.dtype
+    torch.bfloat16
     """
     if backend != "cutlass":
         raise ValueError(f"Unsupported backend: {backend}. Only cutlass is available.")
@@ -277,7 +288,18 @@ def bmm_bf16(
     torch.Tensor
         Out tensor, shape (b, m, n), bf16 or fp16.

-    # Note: add Examples section here
+    Examples
+    --------
+    >>> import torch
+    >>> import torch.nn.functional as F
+    >>> import flashinfer
+    >>> input = torch.randn([16, 48, 64], device="cuda", dtype=torch.bfloat16)
+    >>> weight = torch.randn([16, 80, 64], device="cuda", dtype=torch.bfloat16).transpose(-2, -1)
+    >>> out = flashinfer.bmm_bf16(input, weight)
+    >>> print(out.shape)
+    torch.Size([16, 48, 80])
+    >>> out.dtype
+    torch.bfloat16
     """
     if backend != "cutlass":
         raise ValueError(f"Unsupported backend: {backend}. Only cutlass is available.")

flashinfer/jit/gemm/core.py

Lines changed: 4 additions & 5 deletions
@@ -202,11 +202,10 @@ def gen_gemm_sm100_module_cutlass_bf16() -> JitSpec:
     dtype_list = ["__nv_bfloat16", "half"]
     cta_m_n_k_list = [
         (64, 64, 128),
-        # (64, 128, 128),
-        # (64, 256, 128),
-        # (128, 64, 128),
-        # (128, 128, 128),
-        # (128, 256, 128),
+        (64, 128, 128),
+        (64, 256, 128),
+        (128, 64, 128),
+        (128, 128, 128),
     ]
     for cta_m, cta_n, cta_k in cta_m_n_k_list:
         for dtype in dtype_list:

include/flashinfer/gemm/bf16_gemm_cutlass_template.h

Lines changed: 42 additions & 51 deletions
@@ -64,26 +64,26 @@ size_t dispatchGemmClusterShapeSm100(__nv_bfloat16 const* A, __nv_bfloat16 const
                                    _1SM>(A, B, D, m, n, k, b, gemmConfig, workspacePtr,
                                          workspaceBytes, stream);
       break;
-    // case ClusterShape::ClusterShape_1x2x1:
-    //   return genericBf16GemmKernelLauncherSm100<T, arch, CTA_M_, CTA_N_, CTA_K_,
-    //                                             Shape<_1, _2, _1>, _1SM>(
-    //       A, B, D, m, n, k, b, gemmConfig, workspacePtr, workspaceBytes, stream);
-    //   break;
-    // case ClusterShape::ClusterShape_1x4x1:
-    //   return genericBf16GemmKernelLauncherSm100<T, arch, CTA_M_, CTA_N_, CTA_K_,
-    //                                             Shape<_1, _4, _1>, _1SM>(
-    //       A, B, D, m, n, k, b, gemmConfig, workspacePtr, workspaceBytes, stream);
-    //   break;
-    // case ClusterShape::ClusterShape_2x1x1:
-    //   return genericBf16GemmKernelLauncherSm100<T, arch, CTA_M_, CTA_N_, CTA_K_,
-    //                                             Shape<_2, _1, _1>, _2SM>(
-    //       A, B, D, m, n, k, b, gemmConfig, workspacePtr, workspaceBytes, stream);
-    //   break;
-    // case ClusterShape::ClusterShape_2x2x1:
-    //   return genericBf16GemmKernelLauncherSm100<T, arch, CTA_M_, CTA_N_, CTA_K_,
-    //                                             Shape<_2, _2, _1>, _2SM>(
-    //       A, B, D, m, n, k, b, gemmConfig, workspacePtr, workspaceBytes, stream);
-    //   break;
+    case ClusterShape::ClusterShape_1x2x1:
+      return genericBf16GemmKernelLauncherSm100<T, arch, CTA_M_, CTA_N_, CTA_K_, Shape<_1, _2, _1>,
+                                                _1SM>(A, B, D, m, n, k, b, gemmConfig, workspacePtr,
+                                                      workspaceBytes, stream);
+      break;
+    case ClusterShape::ClusterShape_1x4x1:
+      return genericBf16GemmKernelLauncherSm100<T, arch, CTA_M_, CTA_N_, CTA_K_, Shape<_1, _4, _1>,
+                                                _1SM>(A, B, D, m, n, k, b, gemmConfig, workspacePtr,
+                                                      workspaceBytes, stream);
+      break;
+    case ClusterShape::ClusterShape_2x1x1:
+      return genericBf16GemmKernelLauncherSm100<T, arch, CTA_M_, CTA_N_, CTA_K_, Shape<_2, _1, _1>,
+                                                _2SM>(A, B, D, m, n, k, b, gemmConfig, workspacePtr,
+                                                      workspaceBytes, stream);
+      break;
+    case ClusterShape::ClusterShape_2x2x1:
+      return genericBf16GemmKernelLauncherSm100<T, arch, CTA_M_, CTA_N_, CTA_K_, Shape<_2, _2, _1>,
+                                                _2SM>(A, B, D, m, n, k, b, gemmConfig, workspacePtr,
+                                                      workspaceBytes, stream);
+      break;
     default:
       throw std::runtime_error("invalid config for bf16 gemm");
       break;
@@ -101,31 +101,22 @@ size_t dispatchToArch(__nv_bfloat16 const* A, __nv_bfloat16 const* B, void* D, i
       return dispatchGemmClusterShapeSm100<T, arch, 64, 64, 128>(
           B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes, stream);
       break;
-    // case CutlassTileConfigSM100::CtaShape64x128x128B:
-    //   return dispatchGemmClusterShapeSm100<T, arch, 64, 128, 128>(
-    //       B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes,
-    //       stream);
-    //   break;
-    // case CutlassTileConfigSM100::CtaShape64x256x128B:
-    //   return dispatchGemmClusterShapeSm100<T, arch, 64, 256, 128>(
-    //       B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes,
-    //       stream);
-    //   break;
-    // case CutlassTileConfigSM100::CtaShape128x64x128B:
-    //   return dispatchGemmClusterShapeSm100<T, arch, 128, 64, 128>(
-    //       B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes,
-    //       stream);
-    //   break;
-    // case CutlassTileConfigSM100::CtaShape128x128x128B:
-    //   return dispatchGemmClusterShapeSm100<T, arch, 128, 128, 128>(
-    //       B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes,
-    //       stream);
-    //   break;
-    // case CutlassTileConfigSM100::CtaShape128x256x128B:
-    //   return dispatchGemmClusterShapeSm100<T, arch, 128, 256, 128>(
-    //       B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes,
-    //       stream);
-    //   break;
+    case CutlassTileConfigSM100::CtaShape64x128x128B:
+      return dispatchGemmClusterShapeSm100<T, arch, 64, 128, 128>(
+          B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes, stream);
+      break;
+    case CutlassTileConfigSM100::CtaShape64x256x128B:
+      return dispatchGemmClusterShapeSm100<T, arch, 64, 256, 128>(
+          B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes, stream);
+      break;
+    case CutlassTileConfigSM100::CtaShape128x64x128B:
+      return dispatchGemmClusterShapeSm100<T, arch, 128, 64, 128>(
+          B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes, stream);
+      break;
+    case CutlassTileConfigSM100::CtaShape128x128x128B:
+      return dispatchGemmClusterShapeSm100<T, arch, 128, 128, 128>(
+          B, A, static_cast<T*>(D), n, m, k, b, gemmConfig, workspacePtr, workspaceBytes, stream);
+      break;

     default:
       throw std::runtime_error("unsupported tile config for bf16 gemm");
@@ -189,15 +180,15 @@ std::vector<CutlassGemmConfig> CutlassBf16GemmRunner<T>::getConfigs() const {
   std::vector<CutlassGemmConfig> candidate_configs;

   std::vector<CutlassTileConfigSM100> tilesSm100 = {
-      CutlassTileConfigSM100::CtaShape64x64x128B,  // CutlassTileConfigSM100::CtaShape64x128x128B,
-      // CutlassTileConfigSM100::CtaShape64x256x128B, CutlassTileConfigSM100::CtaShape128x64x128B,
-      // CutlassTileConfigSM100::CtaShape128x128x128B, CutlassTileConfigSM100::CtaShape128x256x128B,
+      CutlassTileConfigSM100::CtaShape64x64x128B,   CutlassTileConfigSM100::CtaShape64x128x128B,
+      CutlassTileConfigSM100::CtaShape64x256x128B,  CutlassTileConfigSM100::CtaShape128x64x128B,
+      CutlassTileConfigSM100::CtaShape128x128x128B,
   };

   std::vector<ClusterShape> clusterShapes = {
-      ClusterShape::ClusterShape_1x1x1,  // ClusterShape::ClusterShape_1x2x1,
-      // ClusterShape::ClusterShape_1x4x1, ClusterShape::ClusterShape_2x1x1,
-      // ClusterShape::ClusterShape_2x2x1,
+      ClusterShape::ClusterShape_1x1x1, ClusterShape::ClusterShape_1x2x1,
+      ClusterShape::ClusterShape_1x4x1, ClusterShape::ClusterShape_2x1x1,
+      ClusterShape::ClusterShape_2x2x1,
   };

   for (auto const& tile_config : tilesSm100) {
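
The two dispatch functions restored above follow a common CUTLASS-style pattern: nested switches lower runtime enums (tile config first, then cluster shape) onto compile-time template parameters, so each case lands on one of the explicitly instantiated kernels. A compilable mock of that pattern follows, with illustrative names only; the real code dispatches on CutlassTileConfigSM100 and ClusterShape.

#include <cstdio>
#include <stdexcept>

// Runtime configuration enums (stand-ins for the real tile/cluster enums).
enum class Tile { T64x64, T64x128 };
enum class Cluster { C1x1, C1x2 };

// Stand-in for genericBf16GemmKernelLauncherSm100: fully specialized at compile time.
template <int CtaM, int CtaN, int ClsM, int ClsN>
void launch() {
  std::printf("CTA %dx%d, cluster %dx%d\n", CtaM, CtaN, ClsM, ClsN);
}

// Inner switch: runtime cluster shape -> compile-time cluster parameters.
template <int CtaM, int CtaN>
void dispatchCluster(Cluster c) {
  switch (c) {
    case Cluster::C1x1: return launch<CtaM, CtaN, 1, 1>();
    case Cluster::C1x2: return launch<CtaM, CtaN, 1, 2>();
    default: throw std::runtime_error("invalid cluster shape");
  }
}

// Outer switch: runtime tile config -> compile-time CTA tile parameters.
void dispatchTile(Tile t, Cluster c) {
  switch (t) {
    case Tile::T64x64:  return dispatchCluster<64, 64>(c);
    case Tile::T64x128: return dispatchCluster<64, 128>(c);
    default: throw std::runtime_error("unsupported tile config");
  }
}

int main() { dispatchTile(Tile::T64x128, Cluster::C1x2); }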

include/flashinfer/gemm/bf16_gemm_template_sm100.h

Lines changed: 13 additions & 7 deletions
@@ -151,25 +151,31 @@ size_t genericBf16GemmKernelLauncherSm100(__nv_bfloat16 const* A, __nv_bfloat16

   Gemm gemm;

-  CUTLASS_CHECK(gemm.can_implement(arguments));
+  // Return workspace size
+  if (!A && !B && !D) {
+    return gemm.get_workspace_size(arguments);
+  }

-  size_t workspace_size = gemm.get_workspace_size(arguments);
-  if (workspace_size > workspaceBytes) {
+  if (gemm.get_workspace_size(arguments) > workspaceBytes) {
     throw std::runtime_error("[Bf16 Gemm Runner] insufficient workspace");
   }

-  // NOTE: These can also be simplified using CUTLASS_CHECK. Same goes for some of the other files.
-  cutlass::Status initStatus = gemm.initialize(arguments, workspacePtr, stream);
+  auto can_implement = gemm.can_implement(arguments);
+  if (can_implement != cutlass::Status::kSuccess) {
+    throw std::runtime_error("[Bf16 Gemm Runner] cutlass kernel not implemented given the params");
+  }
+
+  auto initStatus = gemm.initialize(arguments, workspacePtr);
   if (initStatus != cutlass::Status::kSuccess) {
     throw std::runtime_error("[Bf16 Gemm Runner] failed to initialize");
   }

-  cutlass::Status runStatus = gemm.run(stream);
+  auto runStatus = gemm.run(stream);
   if (runStatus != cutlass::Status::kSuccess) {
     throw std::runtime_error("[Bf16 Gemm Runner] failed to run");
   }

-  return workspace_size;
+  return gemm.get_workspace_size(arguments);
 }

 } // namespace gemm
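
The restructured launcher above now doubles as a workspace-size query: calling it with null A, B, and D returns gemm.get_workspace_size(arguments) without checking, initializing, or running anything. Below is a hedged, self-contained sketch of that calling convention, with mock names and a made-up size constant in place of the CUTLASS gemm object.

#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Mock of the query-then-run convention (all names are illustrative).
// A null (A, B, D) call returns the required workspace size; the caller
// allocates that much and calls again with real pointers.
std::size_t mock_launcher(const float* A, const float* B, float* D,
                          void* workspace, std::size_t workspaceBytes) {
  const std::size_t needed = 1024;  // stand-in for gemm.get_workspace_size(arguments)
  if (!A && !B && !D) return needed;  // size-query mode: no kernel work
  if (needed > workspaceBytes) throw std::runtime_error("insufficient workspace");
  std::printf("running with %zu-byte workspace\n", workspaceBytes);
  return needed;
}

int main() {
  // First call: query the size. Second call: execute with an adequate buffer.
  std::size_t bytes = mock_launcher(nullptr, nullptr, nullptr, nullptr, 0);
  std::vector<unsigned char> workspace(bytes);
  float a = 0.f, b = 0.f, d = 0.f;  // dummy operands; only their addresses matter here
  mock_launcher(&a, &b, &d, workspace.data(), workspace.size());
}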
