From 38c78858ae7d41ef1afe906ce83431d5c278ab1d Mon Sep 17 00:00:00 2001
From: James Newling
Date: Thu, 29 Aug 2024 08:51:56 -0700
Subject: [PATCH] add test of thread mapping and test of error

---
 .../Transforms/AMDAIETileAndFuse.cpp          | 60 +++++++++++--------
 .../test/tile_and_fuse_using_scf_forall.mlir  | 22 ++++++-
 2 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp
index a7451d1d0..d1c7bba8a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp
@@ -27,9 +27,12 @@ namespace mlir::iree_compiler::AMDAIE {

 namespace {

-enum class GPUMappingType { Block, Thread };
+enum class GPUGroupType { Block, Thread };
+
+/// Assign GPU dialect thread/block mapping attributes to tiled dimensions.
+/// The returned vector's size is the number of non-zero values in
+/// `tileSizesVal`. Failure is returned if it is not possible to assign
+/// mapping attributes to the dimensions.
 FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
-    ArrayRef<int64_t> tileSizesVal, GPUMappingType mappingType,
+    ArrayRef<int64_t> tileSizesVal, GPUGroupType groupType,
     TilingInterface op) {
   MLIRContext *context = op.getContext();
@@ -43,57 +48,62 @@ FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
       std::count_if(tileSizesVal.begin(), tileSizesVal.end(),
                     [](int64_t t) { return t > 1; });

-  // See mlir::gpu::MappingId enum. Currently 13 dimensions are supported.
-  // clang-format off
-  // https://github.com/llvm/llvm-project/blob/e8063702cfbbf39f0b92283d0588dee264b5eb2b/mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td#L37.
-  // clang-format on
+  // The mlir::gpu::MappingId enum supports 13 dimensions, see:
+  // https://github.com/llvm/llvm-project/blob/main
+  // /mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td
   if (nbIndVars > mlir::gpu::getMaxEnumValForMappingId()) {
     return op->emitOpError("has too many dimensions to tile, ")
            << "there are only " << mlir::gpu::getMaxEnumValForMappingId()
-           << " dimensions available in the mlir::gpu dialect (require "
-           << nbIndVars << " here).";
+           << " dimensions available in the mlir::gpu dialect, but "
+           << nbIndVars << " are required here.";
   }

   // Currently we expect only 2 tiled dimensions to be >1 when mapping
   // to thread dimensions. This is to target the 2-D AIE array.
   //
   // TODO(newling) if there are 3+ dimensions, we probably need to collapse
-  // them. I'm leaving this as follow-up pass. Instead of
-  //     (i,j,k) in (2,3,5)
-  // for example, could be
-  //     (i,l) in (2,15)
+  // them into just 2. I'm leaving this as a follow-up task. Basically, instead
+  // of
+  //   ```(i,j,k) in (2,3,5)```
+  // we want
+  //   ```(i,l) in (2,15)```
   // with then
   // j=l/5 and k=l%5.
-  // Once the above is implemented, we can safely remove the following check
-  // entirely:
-  if (nbIndVarsAboveOne > 2 && mappingType == GPUMappingType::Thread) {
+  //
+  // Once the above is implemented, we can safely remove the following check:
+  if (nbIndVarsAboveOne > 2 && groupType == GPUGroupType::Thread) {
     auto tileSizesStr = std::to_string(tileSizesVal[0]);
     for (unsigned i = 1; i < tileSizesVal.size(); ++i) {
       tileSizesStr += ", " + std::to_string(tileSizesVal[i]);
     }
-    return op->emitOpError("has requested tile sizes (")
+    return op->emitOpError("has requested tile sizes [")
            << tileSizesStr
-           << "). Currently we only support tiling thread dimensions "
-           << "with at most 2 dimensions having a tile size greater than 1. ";
+           << "]. Currently we only support tiling thread dimensions "
+           << "with at most 2 dimensions having a tile size greater than 1, "
+           << "there are " << nbIndVarsAboveOne << " here.";
   }

   auto getMappingAttributeForDimension = [&](uint32_t i) -> Attribute {
     auto id = static_cast<gpu::MappingId>(i);
-    if (mappingType == GPUMappingType::Block)
+    if (groupType == GPUGroupType::Block)
       return gpu::GPUBlockMappingAttr::get(context, id);
-    else if (mappingType == GPUMappingType::Thread)
+    else if (groupType == GPUGroupType::Thread)
       return gpu::GPUThreadMappingAttr::get(context, id);
     else {
-      assert(false && "unhandled mapping type");
+      assert(false && "Unhandled group type, must be thread or block.");
     }
   };

+  // Map an integer to an Attribute as follows:
   // 0 -> DimY
   // 1 -> DimX
   // 2 -> DimZ
   // 3 -> LinearDim0
   // 4 -> LinearDim1
   // etc.
+  //
+  // Note that 0 and 1 are effectively swapped, because for AIE we want to
+  // map the first dimension to AIE array columns (or something like that).
   auto getAttribute = [&](uint32_t i) -> Attribute {
     if (i == 0)
       return getMappingAttributeForDimension(1);
@@ -103,9 +113,8 @@ FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
     return getMappingAttributeForDimension(i);
   };

-  // Map from tiling dimension to Attribute.
   // We give priority to tiling dimensions of size > 1, so that they
-  // preferentially get DimY and DimX. 
+  // preferentially get DimY and DimX.
   SmallVector<Attribute> mapping(tileSizesVal.size(), {});
   uint32_t nAttributes = 0;
   for (uint32_t i = 0; i < tileSizesVal.size(); ++i) {
@@ -269,7 +278,7 @@ void AMDAIETileAndFusePass::runOnOperation() {
     options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
     auto maybeMapping = getGPUMappingAttributes(
         tileSizesVal,
-        tilingLevel == 0 ? GPUMappingType::Block : GPUMappingType::Thread,
+        tilingLevel == 0 ? GPUGroupType::Block : GPUGroupType::Thread,
         consumerOp);
     if (failed(maybeMapping)) {
       return signalPassFailure();
     }
@@ -302,8 +311,7 @@ void AMDAIETileAndFusePass::runOnOperation() {
           // Fuse all Linalg ops (can be generalized later)
           .Default([&](Operation *op) {
             return op->getDialect() ==
-                   rewriter.getContext()
-                       ->getLoadedDialect<linalg::LinalgDialect>();
+                   context->getLoadedDialect<linalg::LinalgDialect>();
           });
       return {fusableOp, false};
     });
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir
index 17269f221..407106fbe 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir
@@ -1,5 +1,5 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-1
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file --verify-diagnostics %s | FileCheck %s --check-prefix=TILE-LEVEL-1
 // RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0 tile-elementwise=false}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-MATMUL-ONLY

 func.func @matmul_static(%arg0: tensor<8x16xi32>, %arg1 : tensor<16x8xi32>) -> tensor<8x8xi32> {
@@ -34,6 +34,24 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x3
 // TILE-LEVEL-0:         linalg.conv_2d_nhwc_hwcf
 // TILE-LEVEL-0: } {mapping = [#gpu.block<y>, #gpu.block<x>, #gpu.block<z>]}

+// TILE-LEVEL-1: @conv_2d_nhwc_hwcf
+// TILE-LEVEL-1: scf.forall
+// TILE-LEVEL-1-SAME: {
+// TILE-LEVEL-1: linalg.fill
+// TILE-LEVEL-1: linalg.conv_2d_nhwc_hwcf
+// TILE-LEVEL-1: } {mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>, #gpu.thread<linear_dim_0>]}
+
+// -----
+
+func.func @conv_2d_nhwc_hwcf_unsupported_tiling(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x32x64xbf16>) -> tensor<2x12x12x64xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<2x12x12x64xf32>
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  // expected-error @+1 {{'linalg.conv_2d_nhwc_hwcf' op has requested tile sizes [1, 4, 4, 4, 0, 0, 0]. Currently we only support tiling thread dimensions with at most 2 dimensions having a tile size greater than 1, there are 3 here.}}
+  %2 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 4, 4, 4, 0, 0, 0]]>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x32xbf16>, tensor<3x3x32x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
+  return %2 : tensor<2x12x12x64xf32>
+}
+
 // -----

 func.func @matmul_static(%arg0: tensor<8x16xi32>, %arg1 : tensor<16x8xi32>) -> tensor<8x8xi32> {