[CK TILE] Implement CShuffle algorithm (#1842)

* [CK TILE] Implement CShuffle algorithm * Rebase * Vector store size fixes * fixes * Fixes * fixes * fmha fix * fixes * fixes of fixes
ROCm · Jan 30, 2025 · 25e2e0f · 25e2e0f
1 parent c5fff07
commit 25e2e0f
Show file tree

Hide file tree

Showing 18 changed files with 403 additions and 366 deletions.
diff --git a/example/ck_tile/03_gemm/gemm_basic.cpp b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -20,10 +20,6 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
     constexpr bool kPadN = false;
     constexpr bool kPadK = false;
 
-    constexpr bool kTilePermute = false;
-    // The rank and permutation will also be generate out by the CodeGen part.
-    constexpr ck_tile::index_t kOutputRank = 2;
-
     constexpr int kBlockPerCu = 1;
 
     // This part comes from the Codegen
@@ -39,38 +35,31 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
     constexpr ck_tile::index_t N_Warp_Tile = 32;
     constexpr ck_tile::index_t K_Warp_Tile = 8;
 
-    // Whether doing the CShuffle (transpose before the global memory), depending on the output
-    // layout.
-    constexpr bool CShuffleEpilogue =
-        std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
-
     using CodegenGemmShape =
         ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
                                ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
                                ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
 
     using TilePartitioner = ck_tile::GemmTile2DPartitioner<CodegenGemmShape>;
 
-    using GemmEpilogue = std::conditional_t<
-        CShuffleEpilogue,
-        ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
-                                                                   CDataType,
-                                                                   kPadM,
-                                                                   kPadN,
-                                                                   kTilePermute,
-                                                                   kOutputRank,
-                                                                   1,
-                                                                   0,
-                                                                   TilePartitioner::MPerBlock,
-                                                                   TilePartitioner::NPerBlock>>,
-        ck_tile::Default2DEpilogue<
-            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>>;
-
     using CodegenGemmTraits =
         ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
     using CodegenPipelineProblem = ck_tile::
         GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
     using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+    using GemmEpilogue        = ck_tile::CShuffleEpilogue<
+        ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                         CDataType,
+                                         CLayout,
+                                         CodegenPipelineProblem::kBlockSize,
+                                         TilePartitioner::MPerBlock,
+                                         TilePartitioner::NPerBlock,
+                                         M_Warp,
+                                         N_Warp,
+                                         M_Warp_Tile,
+                                         N_Warp_Tile,
+                                         K_Warp_Tile,
+                                         CodegenPipelineProblem::TransposeC>>;
     // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
     // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
     using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;

diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <hip/hip_runtime.h>
 
@@ -60,9 +60,6 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
                                ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
     using TilePartitioner = ck_tile::GemmTile2DPartitioner<GemmShape>;
 
-    using GemmEpilogue = ck_tile::Default2DEpilogue<
-        ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>;
-
     using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
     using GemmUniversalTraits = ck_tile::
         TileGemmUniversalTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout, TransposeC>;
@@ -95,6 +92,19 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
 
         using GemmPipeline =
             GEMM_PIPELINE<UniversalGemmProblem, ck_tile::UniversalGemmPipelineAgBgCrPolicy>;
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                             CDataType,
+                                             CLayout,
+                                             GemmPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC>>;
         using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
         auto kargs   = Kernel::MakeKernelArgs(args);
 

diff --git a/example/ck_tile/16_batched_gemm/batched_gemm.cpp b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -19,12 +19,9 @@ template <typename ALayout, typename BLayout, typename CLayout>
 float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stream_config& s)
 {
     // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadM        = false;
-    constexpr bool kPadN        = false;
-    constexpr bool kPadK        = false;
-    constexpr bool kTilePermute = false;
-    // The rank and permutation will also be generate out by the CodeGen part.
-    constexpr ck_tile::index_t kOutputRank = 2;
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
 
     constexpr int kBlockPerCu = 1;
 
@@ -41,38 +38,31 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre
     constexpr ck_tile::index_t N_Warp_Tile = 32;
     constexpr ck_tile::index_t K_Warp_Tile = 8;
 
-    // Whether doing the CShuffle (transpose before the global memory), depending on the output
-    // layout.
-    constexpr bool CShuffleEpilogue =
-        std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
-
     using CodegenGemmShape =
         ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
                                ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
                                ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
 
     using TilePartitioner = ck_tile::GemmTile2DPartitioner<CodegenGemmShape>;
 
-    using GemmEpilogue = std::conditional_t<
-        CShuffleEpilogue,
-        ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
-                                                                   CDataType,
-                                                                   kPadM,
-                                                                   kPadN,
-                                                                   kTilePermute,
-                                                                   kOutputRank,
-                                                                   1,
-                                                                   0,
-                                                                   TilePartitioner::MPerBlock,
-                                                                   TilePartitioner::NPerBlock>>,
-        ck_tile::Default2DEpilogue<
-            ck_tile::Default2DEpilogueProblem<AccDataType, CDataType, kPadM, kPadN>>>;
-
     using CodegenGemmTraits =
         ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
     using CodegenPipelineProblem = ck_tile::
         GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
     using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+    using GemmEpilogue        = ck_tile::CShuffleEpilogue<
+        ck_tile::CShuffleEpilogueProblem<AccDataType,
+                                         CDataType,
+                                         CLayout,
+                                         CodegenPipelineProblem::kBlockSize,
+                                         TilePartitioner::MPerBlock,
+                                         TilePartitioner::NPerBlock,
+                                         M_Warp,
+                                         N_Warp,
+                                         M_Warp_Tile,
+                                         N_Warp_Tile,
+                                         K_Warp_Tile,
+                                         CodegenPipelineProblem::TransposeC>>;
     // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
     // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
     using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;

diff --git a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <hip/hip_runtime.h>
 
@@ -20,12 +20,9 @@ namespace {
 
 struct GroupedGemmKernelParam
 {
-    static const bool kPadM        = false;
-    static const bool kPadN        = false;
-    static const bool kPadK        = false;
-    static const bool kTilePermute = false;
-
-    static const ck_tile::index_t kOutputRank = 2;
+    static const bool kPadM = false;
+    static const bool kPadN = false;
+    static const bool kPadK = false;
 
     static const int kBlockPerCu         = 1;
     static const ck_tile::index_t M_Tile = 128;
@@ -54,24 +51,6 @@ using CodegenGemmShape =
 
 using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
 
-template <typename CLayout>
-using GemmEpilogue = std::conditional_t<
-    std::is_same_v<CLayout, ck_tile::tensor_layout::gemm::ColumnMajor>,
-    ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<AccDataType,
-                                                               CDataType,
-                                                               GroupedGemmKernelParam::kPadM,
-                                                               GroupedGemmKernelParam::kPadN,
-                                                               GroupedGemmKernelParam::kTilePermute,
-                                                               GroupedGemmKernelParam::kOutputRank,
-                                                               1,
-                                                               0,
-                                                               TilePartitioner::MPerBlock,
-                                                               TilePartitioner::NPerBlock>>,
-    ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<AccDataType,
-                                                                 CDataType,
-                                                                 GroupedGemmKernelParam::kPadM,
-                                                                 GroupedGemmKernelParam::kPadN>>>;
-
 template <typename ALayout, typename BLayout, typename CLayout>
 using CodegenGemmTraits = ck_tile::TileGemmTraits<GroupedGemmKernelParam::kPadM,
                                                   GroupedGemmKernelParam::kPadN,
@@ -92,10 +71,25 @@ template <typename ALayout, typename BLayout, typename CLayout>
 using CodegenGemmPipeline =
     ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem<ALayout, BLayout, CLayout>>;
 
+template <typename ALayout, typename BLayout, typename CLayout>
+using GemmEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+    AccDataType,
+    CDataType,
+    CLayout,
+    CodegenPipelineProblem<ALayout, BLayout, CLayout>::kBlockSize,
+    TilePartitioner::MPerBlock,
+    TilePartitioner::NPerBlock,
+    GroupedGemmKernelParam::M_Warp,
+    GroupedGemmKernelParam::N_Warp,
+    GroupedGemmKernelParam::M_Warp_Tile,
+    GroupedGemmKernelParam::N_Warp_Tile,
+    GroupedGemmKernelParam::K_Warp_Tile,
+    CodegenPipelineProblem<ALayout, BLayout, CLayout>::TransposeC>>;
+
 template <typename ALayout, typename BLayout, typename CLayout>
 using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner,
                                           CodegenGemmPipeline<ALayout, BLayout, CLayout>,
-                                          GemmEpilogue<CLayout>>;
+                                          GemmEpilogue<ALayout, BLayout, CLayout>>;
 }; // namespace
 
 std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)