Update tile and distribute to enable workgroup reordering

pashu123 · pashu123 · commit f96b60117e45 · 2024-12-17T17:33:14.000+05:30
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h
@@ -14,6 +14,7 @@
 
 #include <limits>
 
+#include "iree/compiler/Codegen/Common/GPU/Passes.h"
 #include "iree/compiler/Codegen/Common/PassUtils.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
@@ -94,6 +95,11 @@ createTileAndDistributeToWorkgroupsPass(
     int32_t maxWorkgroupParallelDims,
     linalg::DistributionMethod distributionMethod);
 
+// Pass to tile and distribute using scf.forall with workgroup reordering.
+std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+createTileAndDistributeToWorkgroupsWithReordering(
+    ReorderWorkgroupsStrategy strategy);
+
 //----------------------------------------------------------------------------//
 // CodeGen Common Patterns
 //----------------------------------------------------------------------------//
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -632,6 +632,10 @@ def TileAndDistributeToWorkgroupsUsingForallOpPass :
     "scf::SCFDialect",
     "tensor::TensorDialect",
   ];
+  let options = [
+    Option<"strategy", "strategy", "std::string", /*default=*/"",
+           "Workgroup reordering strategy, one of: '' (none),  'transpose'">,
+  ];
 }
 
 def TileLargeTensorsPass :
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp
@@ -4,6 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+#include "iree/compiler/Codegen/Common/GPU/Passes.h"
 #include "iree/compiler/Codegen/Common/Passes.h"
 #include "iree/compiler/Codegen/Common/Transforms.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
@@ -33,8 +34,34 @@ namespace {
 struct TileAndDistributeToWorkgroupsUsingForallOpPass final
     : public impl::TileAndDistributeToWorkgroupsUsingForallOpPassBase<
           TileAndDistributeToWorkgroupsUsingForallOpPass> {
+  TileAndDistributeToWorkgroupsUsingForallOpPass(
+      ReorderWorkgroupsStrategy strategy)
+      : reorderingStrategy(strategy) {}
+
   using Base::Base;
   void runOnOperation() override;
+
+  LogicalResult initializeOptions(
+      StringRef options,
+      function_ref<LogicalResult(const Twine &)> errorHandler) override {
+    if (failed(Pass::initializeOptions(options, errorHandler))) {
+      return failure();
+    }
+    auto selectedStrategy =
+        llvm::StringSwitch<FailureOr<ReorderWorkgroupsStrategy>>(strategy)
+            .Case("", ReorderWorkgroupsStrategy::None)
+            .Case("transpose", ReorderWorkgroupsStrategy::Transpose)
+            .Default(failure());
+    if (failed(selectedStrategy))
+      return failure();
+
+    reorderingStrategy = *selectedStrategy;
+    return success();
+  }
+
+private:
+  ReorderWorkgroupsStrategy reorderingStrategy =
+      ReorderWorkgroupsStrategy::None;
 };
 
 } // namespace
@@ -190,6 +217,28 @@ pruneDroppedLoops(ArrayRef<Attribute> inputs,
   return prunedAttrs;
 }
 
+// Checks whether we have static dimension for all the loop bounds and steps.
+// This is a requirement if the reordering strategy is set to `transpose`.
+static bool checkStaticLoopBounds(scf::ForallOp forallOp) {
+
+  SmallVector<OpFoldResult> mixedLbs = forallOp.getMixedLowerBound();
+  SmallVector<OpFoldResult> mixedUbs = forallOp.getMixedUpperBound();
+  SmallVector<OpFoldResult> mixedSteps = forallOp.getMixedStep();
+
+  for (auto [index, lb, ub, step] :
+       llvm::enumerate(mixedLbs, mixedUbs, mixedSteps)) {
+
+    std::optional<int64_t> lbVal = getConstantIntValue(lb);
+    std::optional<int64_t> ubVal = getConstantIntValue(ub);
+    std::optional<int64_t> stepVal = getConstantIntValue(step);
+
+    if (!(lbVal && ubVal && stepVal)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 /// Find dimensions of the loop that are unit-trip count and drop them from the
 /// distributed dimensions.
 static LogicalResult dropUnitDistributedDims(RewriterBase &rewriter,
@@ -516,6 +565,20 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
       // TODO: run producer and consumer fusion in one worklist.
       fuseProducersOfSlices(rewriter, newFusionOpportunities,
                             tileAndFuseOptions, newLoop);
+      forallOp = newLoop;
+    }
+
+    // Reorder the workgroups if the strategy is set to `transpose`.
+    // This just transposes the first two dimensions of the workgroup i.e., the
+    // #iree.codegen.workgroup_id_x and #iree.codegen.workgroup_id_y.
+    // Only reorders if the loop bounds are static.
+    if (reorderingStrategy == ReorderWorkgroupsStrategy::Transpose) {
+      SmallVector<Attribute> mappingAttrs(forallOp.getMappingAttr().getValue());
+      int64_t mappingSize = mappingAttrs.size();
+      if (checkStaticLoopBounds(forallOp) && mappingAttrs.size() >= 2) {
+        std::swap(mappingAttrs[mappingSize - 1], mappingAttrs[mappingSize - 2]);
+        forallOp.setMappingAttr(ArrayAttr::get(context, mappingAttrs));
+      }
     }
   }
 
@@ -538,4 +601,10 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
 
   return;
 }
+std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
+createTileAndDistributeToWorkgroupsWithReordering(
+    ReorderWorkgroupsStrategy strategy) {
+  return std::make_unique<TileAndDistributeToWorkgroupsUsingForallOpPass>(
+      strategy);
+}
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_workgroups_using_forall.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_workgroups_using_forall.mlir
@@ -1,4 +1,5 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-tile-and-distribute-to-workgroups-using-forall-op, cse))" --mlir-print-local-scope --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-tile-and-distribute-to-workgroups-using-forall-op{strategy=transpose}, cse))" --mlir-print-local-scope --split-input-file %s | FileCheck %s --check-prefix=REORDER
 
 func.func @matmul_tensors(%0 : tensor<?x?xf32>, %1 : tensor<?x?xf32>, %2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %3 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0]]>}
@@ -672,3 +673,66 @@ func.func @v_shaped_graph(%0: tensor<12xf32>, %1: tensor<12xf32>) -> tensor<12xf
 //   CHECK-DAG:     %[[RIGHT:.+]] = linalg.generic {{.*}} ins(%[[SLICE1]]
 //       CHECK:     linalg.generic {{.*}} ins(%[[LEFT]], %[[RIGHT]]
 //       CHECK:   return %[[RESULT]]
+
+// -----
+
+func.func @dont_transpose_dynamic(%0 : tensor<?x?xf32>, %1 : tensor<?x?xf32>, %2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %3 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0]]>}
+      ins(%0, %1 : tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %3 : tensor<?x?xf32>
+}
+
+// TRANSPOSE-LABEL: func @dont_transpose_dynamic(
+//       TRANSPOSE:   scf.forall
+//       TRANSPOSE:    [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]
+
+// -----
+
+func.func @transpose_static(%0 : tensor<128x128xf32>, %1 : tensor<128x128xf32>, %2 : tensor<128x128xf32>) -> tensor<128x128xf32> {
+  %3 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 0]]>}
+      ins(%0, %1 : tensor<128x128xf32>, tensor<128x128xf32>)
+      outs(%2 : tensor<128x128xf32>) -> tensor<128x128xf32>
+  return %3 : tensor<128x128xf32>
+}
+
+// TRANSPOSE-LABEL: func @transpose_static(
+//       TRANSPOSE:   scf.forall
+//       TRANSPOSE:    [#iree_codegen.workgroup_mapping<x>, #iree_codegen.workgroup_mapping<y>]
+
+// -----
+
+func.func @only_transpose_x_y(%7 : tensor<128x128x128x128xf32>, %8 : tensor<128x128x128x128xf32>) -> tensor<128x128x128x128xf32> {
+  %9 = tensor.empty() : tensor<128x128x128x128xf32>
+  %10 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+                       affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+                       affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+      iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+      ins(%7, %8 : tensor<128x128x128x128xf32>, tensor<128x128x128x128xf32>)
+      outs(%9 : tensor<128x128x128x128xf32>)
+      attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[2, 64, 64, 64]]>} {
+    ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+      %11 = arith.addf %arg0, %arg1 : f32
+      linalg.yield %11 : f32
+    } -> tensor<128x128x128x128xf32>
+  return %10 : tensor<128x128x128x128xf32>
+}
+
+// TRANSPOSE-LABEL: func @only_transpose_x_y(
+//       TRANSPOSE:   scf.forall
+//       TRANSPOSE:     mapping = [#iree_codegen.workgroup_mapping<z:1>, #iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<x>, #iree_codegen.workgroup_mapping<y>]
+
+// -----
+
+// Incase of less than 2 workgroup_mapping, don't apply transpose.
+func.func @dont_transpose_less(%0 : tensor<128x128xf32>, %1 : tensor<128x128xf32>, %2 : tensor<128x128xf32>) -> tensor<128x128xf32> {
+  %3 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 0, 0]]>}
+      ins(%0, %1 : tensor<128x128xf32>, tensor<128x128xf32>)
+      outs(%2 : tensor<128x128xf32>) -> tensor<128x128xf32>
+  return %3 : tensor<128x128xf32>
+}
+
+// TRANSPOSE-LABEL: func @dont_transpose_less(
+//       TRANSPOSE:   scf.forall
+//       TRANSPOSE:    [#iree_codegen.workgroup_mapping<x>]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -186,10 +186,11 @@ static void addBufferizePasses(OpPassManager &funcPassManager) {
 static void tileAndDistributeToWorkgroup(
     OpPassManager &funcPassManager, bool useForall,
     std::optional<ConvertToDestinationPassingStylePassOptions>
-        convertToDpsOptions = ConvertToDestinationPassingStylePassOptions{}) {
+        convertToDpsOptions = ConvertToDestinationPassingStylePassOptions{},
+    ReorderWorkgroupsStrategy strategy = ReorderWorkgroupsStrategy::None) {
   if (useForall) {
     funcPassManager.addPass(
-        createTileAndDistributeToWorkgroupsUsingForallOpPass());
+        createTileAndDistributeToWorkgroupsWithReordering(strategy));
   } else {
     funcPassManager.addPass(createTileAndDistributeToWorkgroupsPass(
         kNumMaxParallelDims,
@@ -772,10 +773,11 @@ static void addVectorBufferizePasses(OpPassManager &funcPassManager) {
 void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
                                         const GPUPipelineOptions &options,
                                         bool usePadToModelSharedMemcpy) {
-  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true);
-
   ReorderWorkgroupsStrategy reorderStrategy =
       getReorderWorkgroupsStrategy(options.reorderStrategy);
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/true,
+                               std::nullopt, reorderStrategy);
+
   funcPassManager.addPass(
       createReorderWorkgroups(reorderStrategy, canReorderWorkgroups));
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
@@ -33,18 +33,16 @@ hal.executable public @main_0_dispatch_0 {
       // OPT-OUT-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
       // OPT-OUT:         memref.alloc() : memref<128x32xf16, #gpu.address_space<workgroup>>
       // OPT-OUT:         memref.alloc() : memref<128x32xf16, #gpu.address_space<workgroup>>
-      // OPT-OUT-DAG:     %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
-      // OPT-OUT-DAG:     %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
-      // OPT-OUT-DAG:     arith.muli %[[WG_Y]], %{{.+}} : index
-      // OPT-OUT-DAG:     arith.addi %{{.+}}, %[[WG_X]] : index
-      // OPT-OUT:         scf.for
+      // OPT-OUT:         scf.forall
+      // OPT-OUT:          scf.for
+      // OPT-OUT:         } {mapping = [#iree_codegen.workgroup_mapping<x>, #iree_codegen.workgroup_mapping<y>]}
 
       // OPT-IN-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
       // OPT-IN:         memref.alloc() : memref<128x32xf16, #gpu.address_space<workgroup>>
       // OPT-IN:         memref.alloc() : memref<128x32xf16, #gpu.address_space<workgroup>>
-      // OPT-IN-DAG:     %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
-      // OPT-IN-DAG:     %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
-      // OPT-IN:         scf.for
+      // OPT-IN:         scf.forall
+      // OPT-IN:          scf.for
+      // OPT-IN:         } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 
       func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
         attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
@@ -108,20 +106,16 @@ hal.executable public @main_0_dispatch_0 {
       // OPT-OUT-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
       // OPT-OUT:         memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
       // OPT-OUT:         memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
-      // OPT-OUT-DAG:     %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
-      // OPT-OUT-DAG:     %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
-      // OPT-OUT-DAG:     arith.muli %[[WG_Y]], %{{.+}} : index
-      // OPT-OUT-DAG:     arith.addi %{{.+}}, %[[WG_X]] : index
-      // OPT-OUT:         scf.for
+      // OPT-OUT:         scf.forall
+      // OPT-OUT:          scf.for
+      // OPT-OUT:         } {mapping = [#iree_codegen.workgroup_mapping<x>, #iree_codegen.workgroup_mapping<y>]}
 
       // OPT-IN-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
       // OPT-IN:         memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
       // OPT-IN:         memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
-      // OPT-IN-DAG:     %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
-      // OPT-IN-DAG:     %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
-      // OPT-IN-DAG:     arith.muli %[[WG_Y]], %{{.+}} : index
-      // OPT-IN-DAG:     arith.addi %{{.+}}, %[[WG_X]] : index
-      // OPT-IN:         scf.for
+      // OPT-IN:         scf.forall
+      // OPT-IN:          scf.for
+      // OPT-IN:         } {mapping = [#iree_codegen.workgroup_mapping<x>, #iree_codegen.workgroup_mapping<y>]}
       func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
         attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
           gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = <Transpose>>  // enable the 'reorderWorkgroups' pass.
@@ -180,9 +174,9 @@ hal.executable public @main_0_dispatch_0 {
       // OPT-OUT-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
       // OPT-OUT:         memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
       // OPT-OUT:         memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
-      // OPT-OUT-DAG:     hal.interface.workgroup.id[1] : index
-      // OPT-OUT-DAG:     hal.interface.workgroup.id[0] : index
-      // OPT-OUT-NEXT:    scf.for
+      // OPT-OUT:         scf.forall
+      // OPT-OUT:          scf.for
+      // OPT-OUT:         } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
       func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
         attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
           gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = <None>>  // Disable the 'reorderWorkgroups' pass.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
@@ -1023,7 +1023,7 @@ hal.executable private @attention_20x4096x64x4096x64 {
 // Check that we only use alloc for Q, K, and V. No shared memory for S is
 // needed because the intrinsic layout mathes.
 // MEMORY-LABEL: func.func @attention_20x4096x64x4096x64()
-// MEMORY-COUNT-4: memref.alloc
+// MEMORY-COUNT-3: memref.alloc
 // MEMORY-NOT: memref.alloc
 
 // -----
@@ -1090,6 +1090,7 @@ hal.executable private @attention_multiple_m_transpose {
 
 // Check that we only use alloc for Q, K, and V. No shared memory for S is
 // needed because the intrinsic layout mathes.
+// TODO: With forall distribution it's allocating memory for S.
 // MEMORY-LABEL: func.func @attention_multiple_m_transpose()
 // MEMORY-COUNT-4: memref.alloc
 // MEMORY-NOT: memref.alloc
@@ -1159,7 +1160,7 @@ hal.executable private @attention_mfma_32x32x8 {
 // Check that we only use alloc for Q, K, and V. No shared memory for S is
 // needed because the intrinsic layout mathes.
 // MEMORY-LABEL: func.func @attention_mfma_32x32x8()
-// MEMORY-COUNT-3: memref.alloc
+// MEMORY-COUNT-4: memref.alloc
 // MEMORY-NOT: memref.alloc
 
 // -----
@@ -1311,3 +1312,4 @@ module {
 
 // MEMORY-LABEL: func.func @attention_gather_k
 // MEMORY-COUNT-3: memref.alloc
+// MEMORY-NOT:     memref.alloc