From 3b75e94324254f646d1eff9b6e4a17928961d40c Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <nirvedh@gmail.com>
Date: Fri, 10 Jan 2025 11:17:58 -0600
Subject: [PATCH] Match TileAndFuse Matmul Heuristics to VectorDistribute and
 raise limit of TileLargeTensorsPass

Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
---
 .../iree/compiler/Codegen/Common/Passes.td    |  2 +-
 .../Common/test/tile_large_tensors.mlir       | 30 +++++++++----------
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   | 16 +++++-----
 .../compiler/Codegen/LLVMGPU/KernelConfig.cpp |  3 ++
 .../ROCDL/config_igemm_tile_and_fuse.mlir     | 12 ++++----
 .../test/ROCDL/config_tile_and_fuse.mlir      |  6 ++--
 .../test/ROCDL/pipeline_tile_and_fuse.mlir    |  5 ++--
 .../LLVMGPU/test/config_custom_op.mlir        |  2 +-
 8 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index 7188de257ca8..245b07f6deaa 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -654,7 +654,7 @@ def TileLargeTensorsPass :
   ];
   let options = [
     Option<"maxVectorSize", "max-vector-size", "int64_t",
-           /*default=*/"64",
+           /*default=*/"256",
            "Maximum static size to tile to (i.e. all remaining ops will be smaller)">,
   ];
 }
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir
index 66c73da981c0..3bb51a2d6d0c 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir
@@ -3,22 +3,22 @@
 // RUN:   FileCheck %s
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @simple_generic(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>, %5: tensor<64x256xf32>) -> tensor<64x256xf32> {
+func.func @simple_generic(%3: tensor<64x512xf32>, %4: tensor<64x512xf32>, %5: tensor<64x512xf32>) -> tensor<64x512xf32> {
   %6 = linalg.generic {
     indexing_maps = [#map, #map, #map],
     iterator_types = ["parallel", "parallel"]
-    } ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>) outs(%5 : tensor<64x256xf32>) {
+    } ins(%3, %4 : tensor<64x512xf32>, tensor<64x512xf32>) outs(%5 : tensor<64x512xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %7 = arith.addf %in, %in_0 : f32
     linalg.yield %7 : f32
-  } -> tensor<64x256xf32>
-  return %6 : tensor<64x256xf32>
+  } -> tensor<64x512xf32>
+  return %6 : tensor<64x512xf32>
 }
 
 // CHECK-LABEL: func.func @simple_generic
 //       CHECK:   scf.for %{{.*}} = %c0 to %c64 step %c1
-//       CHECK:     scf.for %{{.*}} = %c0 to %c256 step %c64
-//       CHECK:       linalg.generic {{.*}} outs({{.*}}: tensor<1x64xf32>)
+//       CHECK:     scf.for %{{.*}} = %c0 to %c512 step %c256
+//       CHECK:       linalg.generic {{.*}} outs({{.*}}: tensor<1x256xf32>)
 
 // -----
 
@@ -65,21 +65,21 @@ func.func @in_nested_region(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: te
 
 // -----
 
-func.func @multiple_use_tilable_op(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>) -> (tensor<64x256xf32>, tensor<256x64xf32>) {
-  %add_empty = tensor.empty() : tensor<64x256xf32>
+func.func @multiple_use_tilable_op(%3: tensor<64x512xf32>, %4: tensor<64x512xf32>) -> (tensor<64x512xf32>, tensor<512x64xf32>) {
+  %add_empty = tensor.empty() : tensor<64x512xf32>
   %6 = linalg.add
-    ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>)
-    outs(%add_empty : tensor<64x256xf32>) -> tensor<64x256xf32>
-  %transpose_empty = tensor.empty() : tensor<256x64xf32>
+    ins(%3, %4 : tensor<64x512xf32>, tensor<64x512xf32>)
+    outs(%add_empty : tensor<64x512xf32>) -> tensor<64x512xf32>
+  %transpose_empty = tensor.empty() : tensor<512x64xf32>
   %7 = linalg.transpose
-    ins(%6 : tensor<64x256xf32>)
-    outs(%transpose_empty : tensor<256x64xf32>) permutation = [1, 0]
-  return %6, %7 : tensor<64x256xf32>, tensor<256x64xf32>
+    ins(%6 : tensor<64x512xf32>)
+    outs(%transpose_empty : tensor<512x64xf32>) permutation = [1, 0]
+  return %6, %7 : tensor<64x512xf32>, tensor<512x64xf32>
 }
 
 // CHECK-LABEL: func.func @multiple_use_tilable_op
 //       CHECK:   %[[ADD_TILING:.+]] = scf.for
-//       CHECK:     linalg.add {{.*}} -> tensor<1x64xf32>
+//       CHECK:     linalg.add {{.*}} -> tensor<1x256xf32>
 //       CHECK:   %[[T_TILING:.+]] = scf.for
 //       CHECK:     %[[FUSED_ADD:.+]] = linalg.add {{.*}} -> tensor<64x1xf32>
 //       CHECK:     linalg.transpose ins(%[[FUSED_ADD]]
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 084ba36e6721..fe18a4bc6fad 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -149,25 +149,27 @@ static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
     seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
              /*bestMNTileCountPerSubgroup=*/4,
              /*bestKTileCountPerSubgroup=*/8,
-             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
+             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits * 2 /
+                 inBitWidth};
   } else {
     seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
              /*bestMNTileCountPerSubgroup=*/16,
              /*bestKTileCountPerSubgroup=*/4,
-             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / 2 /
-                 inBitWidth};
+             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
   }
 
-  int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes();
+  // We target slightly below the full available shared memory to leave room for
+  // `GPUReduceBankConflictsPass`, which pads shared memory without keeping
+  // track of usage. We can drop this after solving
+  // https://github.com/iree-org/iree/issues/19675
+  int64_t maxSharedMemoryBytes =
+      target.getWgp().getMaxWorkgroupMemoryBytes() - 64 * inBitWidth;
 
   // First try to find a schedule with an exactly matching intrinsic.
   std::optional<GPUMMASchedule> schedule = deduceMMASchedule(
       problem, intrinsics, seeds, maxSharedMemoryBytes, targetSubgroupSize,
       transposedLhs, transposedRhs, /*canUpcastAcc=*/false,
       /*mustBeAligned*/ mustBeAligned, doCPromotion);
-  // TODO (nirvedhmeshram) : Add support for upcasting accumulator schedule.
-  // Currently we dont have this for TileAndFuse path, see
-  // https://github.com/iree-org/iree/issues/19532
   return schedule;
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 2ba637f9889c..8687bf5238d3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -620,6 +620,9 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
                           /*canUpcastAcc=*/true);
   }
 
+  LDBG("transposedLhs: " << transposedLhs);
+  LDBG("transposedRhs: " << transposedRhs);
+
   // Only batch_matmul is supported in the LLVMGPUPadAndVectorDistribute
   // pipeline.
   // TODO(hanchung): Support cases that there are fused producers.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
index d8af22e58664..c1be57b94903 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir
@@ -24,7 +24,7 @@ func.func @nhwc_conv_mfma() {
 //       CHECK:   linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
 //  CHECK-SAME:     mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
 //  CHECK-SAME:     promote_operands = [0, 1]
-//  CHECK-SAME:     reduction = [0, 0, 0, 0, 8]
+//  CHECK-SAME:     reduction = [0, 0, 0, 0, 16]
 //  CHECK-SAME:     subgroup = [1, 2, 2, 1, 0]
 //  CHECK-SAME:     workgroup = [1, 2, 32, 64, 0]
 
@@ -53,7 +53,7 @@ func.func @nchw_conv_mfma() {
 //       CHECK:   linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
 //  CHECK-SAME:     mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
 //  CHECK-SAME:     promote_operands = [0, 1]
-//  CHECK-SAME:     reduction = [0, 0, 0, 0, 8]
+//  CHECK-SAME:     reduction = [0, 0, 0, 0, 16]
 //  CHECK-SAME:     subgroup = [1, 2, 2, 1, 0]
 //  CHECK-SAME:     workgroup = [1, 64, 2, 32, 0]
 
@@ -81,9 +81,9 @@ func.func @nhwc_conv_unaligned_mfma() {
 
 //       CHECK:   linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
 //  CHECK-SAME:     mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
-//  CHECK-SAME:     padding = [2, 1, 32, 64, 32]
+//  CHECK-SAME:     padding = [2, 1, 32, 64, 64]
 //  CHECK-SAME:     promote_operands = [0, 1, 2]
-//  CHECK-SAME:     reduction = [0, 0, 0, 0, 8]
+//  CHECK-SAME:     reduction = [0, 0, 0, 0, 16]
 //  CHECK-SAME:     subgroup = [2, 1, 2, 1, 0]
 //  CHECK-SAME:     workgroup = [2, 1, 32, 64, 0]
 
@@ -111,8 +111,8 @@ func.func @nchw_conv_unaligned_mfma() {
 
 //       CHECK:   linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
 //  CHECK-SAME:     mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
-//  CHECK-SAME:     padding = [1, 64, 2, 32, 32]
+//  CHECK-SAME:     padding = [1, 64, 2, 32, 64]
 //  CHECK-SAME:     promote_operands = [0, 1, 2]
-//  CHECK-SAME:     reduction = [0, 0, 0, 0, 8]
+//  CHECK-SAME:     reduction = [0, 0, 0, 0, 16]
 //  CHECK-SAME:     subgroup = [1, 2, 2, 1, 0]
 //  CHECK-SAME:     workgroup = [1, 64, 2, 32, 0]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index c714a6a63b5b..d29336a64f27 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -39,7 +39,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 //       CHECK:   linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 //  CHECK-SAME:     mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 //  CHECK-SAME:     promote_operands = [0, 1]
-//  CHECK-SAME:     reduction = [0, 0, 0, 0, 4]
+//  CHECK-SAME:     reduction = [0, 0, 0, 0, 8]
 //  CHECK-SAME:     subgroup = [1, 1, 4, 1, 0]
 //  CHECK-SAME:     workgroup = [1, 1, 64, 64, 0]
 
@@ -74,7 +74,7 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
 //       CHECK:   linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 //  CHECK-SAME:     mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 //  CHECK-SAME:     promote_operands = [0, 1]
-//  CHECK-SAME:     reduction = [0, 0, 0, 0, 4, 1]
+//  CHECK-SAME:     reduction = [0, 0, 0, 0, 8, 1]
 //  CHECK-SAME:     subgroup = [2, 2, 1, 1, 0, 0]
 //  CHECK-SAME:     workgroup = [2, 2, 32, 32, 0, 0]
 
@@ -136,7 +136,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 //       CHECK:   linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
 //  CHECK-SAME:     mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 //  CHECK-SAME:     promote_operands = [0, 1]
-//  CHECK-SAME:     reduction = [0, 0, 2]
+//  CHECK-SAME:     reduction = [0, 0, 4]
 //  CHECK-SAME:     subgroup = [4, 4, 0]
 //  CHECK-SAME:     workgroup = [128, 128, 0]
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 1a521e61ebd9..6c9e4d5f752d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -1013,9 +1013,8 @@ hal.executable public @main {
 //       CHECK:         scf.yield %[[REDUCE]]
 
 //       CHECK:   scf.for %{{.*}} = %{{.*}} to %c16 step %c1
-//       CHECK:     scf.for
-// CHECK-COUNT-4:     arith.addf {{.*}} : vector<9xf32>
-//       CHECK:       vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>
+// CHECK-COUNT-4:     arith.addf {{.*}} : vector<9x9xf32>
+//       CHECK:       vector.transfer_write {{.*}} vector<9x9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>
 
 // -----
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir
index bea2f2abe738..d553fb67d11b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir
@@ -40,7 +40,7 @@ func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>,
 // CHECK-SAME:       lowering_config = #[[CONFIG]]
 //      CHECK:   ^bb
 //      CHECK:     linalg.matmul
-// CHECK-SAME:         lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], reduction = [0, 0, 8], subgroup = [2, 2, 0], workgroup = [64, 64, 0]}>
+// CHECK-SAME:         lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], reduction = [0, 0, 16], subgroup = [2, 2, 0], workgroup = [64, 64, 0]}>
 //      CHECK:   iree_linalg_ext.yield
 
 // -----