Enable SplitK and fix autotuner for trtllm fp4 fused moe (#1548)

stslxg-nv · web-flow · commit 375a26b65505 · 2025-09-05T00:48:15.000-07:00
Enable split-K for the trtllm-gen fused MoE. Make the autotuner for the trtllm-gen FP4 fused MoE more robust. Enable the autotuner in the trtllm-gen fused MoE test.

## 📌 Description

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes
diff --git a/csrc/trtllm_batched_gemm_runner.cu b/csrc/trtllm_batched_gemm_runner.cu
@@ -104,13 +104,6 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(
         tileSize == mOptions.tileSize &&
         options.mUseShuffledMatrixA == mOptions.useShuffledMatrixA &&
         options.mLayoutA == mOptions.weightLayout) {
-      // FIXME: Disable split-k for swiglu for now.
-      if (static_cast<batchedGemm::gemmGatedAct::ActType>(mOptions.actType) ==
-              batchedGemm::gemmGatedAct::ActType::SwiGlu &&
-          options.mClusterDimZ != 1) {
-        continue;
-      }
-
       if (options.mFusedAct) {
         if (options.mActType != static_cast<batchedGemm::gemmGatedAct::ActType>(mOptions.actType)) {
           continue;
diff --git a/flashinfer/artifacts.py b/flashinfer/artifacts.py
@@ -111,7 +111,7 @@ def get_available_cubin_files(source, retries=3, delay=5, timeout=10):
 class ArtifactPath:
     TRTLLM_GEN_FMHA: str = "037e528e719ec3456a7d7d654f26b805e44c63b1/fmha/trtllm-gen/"
     TRTLLM_GEN_BMM: str = (
-        "037e528e719ec3456a7d7d654f26b805e44c63b1/batched_gemm-8704aa4-ba3b00d/"
+        "e6f22dcc3fdeb29ff87af2f4a2cb3d30b8d273e0/batched_gemm-45beda1-ee6a802/"
     )
     TRTLLM_GEN_GEMM: str = (
         "037e528e719ec3456a7d7d654f26b805e44c63b1/gemm-8704aa4-f91dc9e/"
@@ -125,7 +125,7 @@ class MetaInfoHash:
         "0ff77215b86997665cf75973e13cd2932f551d46b4e008f851d32d47e1d9560f"
     )
     TRTLLM_GEN_BMM: str = (
-        "34bdfe7acfd49f5fb8b48e06d56e6a5ad88b951c730552f228fc5f614f7632a8"
+        "c98b4ce69a39fd41556d67033c30ea814ef76b0a2fe16e798e55baf0104acc34"
     )
     DEEPGEMM: str = "69aa277b7f3663ed929e73f9c57301792b8c594dac15a465b44a5d151b6a1d50"
     TRTLLM_GEN_GEMM: str = (
diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py
@@ -29,6 +29,7 @@
     TunableRunner,
     TuningConfig,
 )
+from ..jit.core import logger
 from ..jit import JitSpec
 from ..jit import env as jit_env
 from ..jit import (
@@ -1104,9 +1105,14 @@ def get_valid_tactics(
                 num_tokens,
             )
             if instance_key not in MoERunner.valid_tactics_dict:
-                MoERunner.valid_tactics_dict[instance_key] = (
-                    moe_op.trtllm_get_valid_moe_configs(*instance_key)
-                )
+                try:
+                    valid_tactics = moe_op.trtllm_get_valid_moe_configs(*instance_key)
+                except Exception as e:
+                    logger.debug(
+                        f"[Autotuner]: Failed to get valid tactics for {instance_key}. Error occurred: {e}"
+                    )
+                    return []
+                MoERunner.valid_tactics_dict[instance_key] = valid_tactics
             return MoERunner.valid_tactics_dict[instance_key]
 
         def forward(
diff --git a/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/GemmOptions.h b/include/flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/GemmOptions.h
@@ -1085,22 +1085,34 @@ inline bool checkAndUpdateGemmOptions(GemmOptions& options, bool isBlackwell, in
                      ")");
   }
 
+  // Number of iterations in K dimension after padding.
+  // Note the perCtaK in each CTA in the splitK group are padded to the same number of iterations.
+  // E.g., K = 512, TileK = 128, numSlicesForSplitK = 3. Then the padded K is
+  //
+  //   ceil(512 / (128*3)) * (128*3) = 768
+  //
+  int const paddedK = divUpMul(options.mK, options.mTileK * options.mNumSlicesForSplitK);
+  int const perCtaK = paddedK / options.mNumSlicesForSplitK;
+  // However, number of iterations is clamped to multiples of tileK within individual CTAs
+  // E.g., K = 448, TileK = 64, numSlicesForSplitK = 4.
+  //
+  //   paddedK                        = 512
+  //   perCtaK                        = 128
+  //   clampedPerCtaK for CTA 0, 1, 2 = 128
+  //   clampedPerCtaK for CTA 3       = 64
+  int const paddingForK = paddedK - options.mK;
+  int const clampedAndPaddedPerCtaK = divUpMul(perCtaK - paddingForK, options.mTileK);
   if (options.mUseUnrollLoop2xForMma) {
-    // Number of iterations in K dimension after padding.
-    // Note the perCtaK in each CTA in the splitK group are padded to the same number of iterations.
-    // E.g., K = 512, TileK = 128, numSlicesForSplitK = 3. Then the padded K is
-    //
-    //   ceil(512 / (128*3)) * (128*3) = 768
+    // Check that the padded K and clamped padded K (K rounded to next multiple of tileK) is a
+    // multiple of 2*TileK when UnrollLoop2x is enabled. This is to avoid deadlock when mma runs
+    // even-numbered loop while the other warps run odd-numbered loop.
     //
-    int paddedK = divUpMul(options.mK, options.mTileK * options.mNumSlicesForSplitK);
-    // Check that the padded K (K rounded to next multiple of tileK) is a multiple of 2*TileK when
-    // UnrollLoop2x is enabled. This is to avoid deadlock when mma runs even-numbered loop while the
-    // other warps run odd-numbered loop.
-    //
-    bool notSupported = (paddedK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0;
+    bool notSupported = (perCtaK % (options.mTileK * 2) != 0) ||
+                        (clampedAndPaddedPerCtaK % (options.mTileK * 2) != 0);
     if (notSupported) {
       TLLM_LOG_WARNING("Size K / splitK must be a multiple of TileK * 2. Found TileK=",
                        options.mTileK, " and K=", options.mK, " (paddedK=", paddedK,
+                       " clampedAndPaddedPerCtaK=", clampedAndPaddedPerCtaK,
                        ") and numSlicesForSplitK=", options.mNumSlicesForSplitK,
                        ". Disabling unrollLoop2xForMma.");
       if (updateOptions) {
@@ -1110,6 +1122,11 @@ inline bool checkAndUpdateGemmOptions(GemmOptions& options, bool isBlackwell, in
       }
     }
   }
+  if (options.mNumSlicesForSplitK > 1) {
+    TLLM_CHECK_ERROR(
+        perCtaK * (options.mNumSlicesForSplitK - 1) < options.mK,
+        "K must be greater than perCtaK * (numSlicesForSplitK - 1) to ensure each CTA has work");
+  }
 
   if (!isBlackwell && options.mTileScheduler == TileScheduler::Persistent) {
     // TODO(anchengc): will be supported in upcoming MRs.
diff --git a/tests/test_trtllm_gen_fused_moe.py b/tests/test_trtllm_gen_fused_moe.py
@@ -33,6 +33,7 @@
     reorder_rows_for_gated_act_gemm,
     shuffle_matrix_a,
 )
+from flashinfer.autotuner import autotune
 from flashinfer.fp4_quantization import block_scale_interleave
 from flashinfer.fused_moe import (
     WeightLayout,
@@ -105,7 +106,7 @@ def capture(self, hidden_states_sample, **runtime_args):
         self.input_tensor = hidden_states_sample.clone()
 
         # Warmup
-        with torch.cuda.stream(torch_stream):
+        with torch.cuda.stream(torch_stream), autotune(True):
             for _ in range(1):
                 self._run_moe_computation(runtime_args)
 

Original file line number	Diff line number	Diff line change
`@@ -111,7 +111,7 @@ def get_available_cubin_files(source, retries=3, delay=5, timeout=10):`
`111`	`111`	`class ArtifactPath:`
`112`	`112`	`TRTLLM_GEN_FMHA: str = "037e528e719ec3456a7d7d654f26b805e44c63b1/fmha/trtllm-gen/"`
`113`	`113`	`TRTLLM_GEN_BMM: str = (`
`114`		`- "037e528e719ec3456a7d7d654f26b805e44c63b1/batched_gemm-8704aa4-ba3b00d/"`
	`114`	`+ "e6f22dcc3fdeb29ff87af2f4a2cb3d30b8d273e0/batched_gemm-45beda1-ee6a802/"`
`115`	`115`	`)`
`116`	`116`	`TRTLLM_GEN_GEMM: str = (`
`117`	`117`	`"037e528e719ec3456a7d7d654f26b805e44c63b1/gemm-8704aa4-f91dc9e/"`
`@@ -125,7 +125,7 @@ class MetaInfoHash:`
`125`	`125`	`"0ff77215b86997665cf75973e13cd2932f551d46b4e008f851d32d47e1d9560f"`
`126`	`126`	`)`
`127`	`127`	`TRTLLM_GEN_BMM: str = (`
`128`		`- "34bdfe7acfd49f5fb8b48e06d56e6a5ad88b951c730552f228fc5f614f7632a8"`
	`128`	`+ "c98b4ce69a39fd41556d67033c30ea814ef76b0a2fe16e798e55baf0104acc34"`
`129`	`129`	`)`
`130`	`130`	`DEEPGEMM: str = "69aa277b7f3663ed929e73f9c57301792b8c594dac15a465b44a5d151b6a1d50"`
`131`	`131`	`TRTLLM_GEN_GEMM: str = (`