diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp index a7451d1d0..d1c7bba8a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp @@ -27,9 +27,14 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -enum class GPUMappingType { Block, Thread }; +enum class GPUGroupType { Block, Thread }; + +/// Assign GPU dialect thread/block mapping attributes to tiled dimensions. +/// The returned vector's size is the number of non-zero values in +/// `tileSizesVal`. Failure is returned if it is not possible to assign +/// mapping attributes to the dimensions. FailureOr> getGPUMappingAttributes( - ArrayRef tileSizesVal, GPUMappingType mappingType, + ArrayRef tileSizesVal, GPUGroupType groupType, TilingInterface op) { MLIRContext *context = op.getContext(); @@ -43,57 +48,62 @@ FailureOr> getGPUMappingAttributes( std::count_if(tileSizesVal.begin(), tileSizesVal.end(), [](int64_t t) { return t > 1; }); - // See mlir::gpu::MappingId enum. Currently 13 dimensions are supported. - // clang-format off - // https://github.com/llvm/llvm-project/blob/e8063702cfbbf39f0b92283d0588dee264b5eb2b/mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td#L37. - // clang-format on + // The mlir::gpu::MappingId enum supports 13 dimensions, see: + // https://github.com/llvm/llvm-project/blob/main + // /mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td if (nbIndVars > mlir::gpu::getMaxEnumValForMappingId()) { return op->emitOpError("has too many dimensions to tile, ") << "there are only " << mlir::gpu::getMaxEnumValForMappingId() - << " dimensions available in the mlir::gpu dialect (require " - << nbIndVars << " here)."; + << " dimensions available in the mlir::gpu dialect, but " + << nbIndVars << " are required here.."; } // Currently we expect only 2 tiled dimensions to be >1 when mapping // to thread dimensions. This is to target the 2-D AIE array. // // TODO(newling) if there are 3+ dimensions, we probably need to collapse - // them. I'm leaving this as follow-up pass. Instead of - // (i,j,k) in (2,3,5) - // for example, could be - // (i,l) in (2,15) + // them into just 2. I'm leaving this as a follow-up task. Basically, instead + // of + // ```(i,j,k) in (2,3,5)``` + // we want + // ```(i,l) in (2,15)``` // with then // j=l/5 and k=l%5. - // Once the above is implemented, we can safely remove the following check - // entirely: - if (nbIndVarsAboveOne > 2 && mappingType == GPUMappingType::Thread) { + // + // Once the above is implemented, we can safely remove the following check: + if (nbIndVarsAboveOne > 2 && groupType == GPUGroupType::Thread) { auto tileSizesStr = std::to_string(tileSizesVal[0]); for (unsigned i = 1; i < tileSizesVal.size(); ++i) { tileSizesStr += ", " + std::to_string(tileSizesVal[i]); } - return op->emitOpError("has requested tile sizes (") + return op->emitOpError("has requested tile sizes [") << tileSizesStr - << "). Currently we only support tiling thread dimensions " - << "with at most 2 dimensions having a tile size greater than 1. "; + << "]. Currently we only support tiling thread dimensions " + << "with at most 2 dimensions having a tile size greater than 1, " + << "there are " << nbIndVarsAboveOne << " here."; } auto getMappingAttributeForDimension = [&](uint32_t i) -> Attribute { auto id = static_cast(i); - if (mappingType == GPUMappingType::Block) + if (groupType == GPUGroupType::Block) return gpu::GPUBlockMappingAttr::get(context, id); - else if (mappingType == GPUMappingType::Thread) + else if (groupType == GPUGroupType::Thread) return gpu::GPUThreadMappingAttr::get(context, id); else { - assert(false && "unhandled mapping type"); + assert(false && "Unhandled group type, must be thread or block."); } }; + // Map an integer to an Attribute as follows: // 0 -> DimY // 1 -> DimX // 2 -> DimZ // 3 -> LinearDim0 // 4 -> LinearDim1 // etc. + // + // Note that 0 and 1 are effectively swapped, because for AIE we want to + // map the first dimension to AIE array columns (or something like that). auto getAttribute = [&](uint32_t i) -> Attribute { if (i == 0) return getMappingAttributeForDimension(1); @@ -103,9 +113,8 @@ FailureOr> getGPUMappingAttributes( return getMappingAttributeForDimension(i); }; - // Map from tiling dimension to Attribute. // We give priority to tiling dimensions of size > 1, so that they - // preferentially get DimY and DimX. + // preferentially get DimY and DimX. SmallVector mapping(tileSizesVal.size(), {}); uint32_t nAttributes = 0; for (uint32_t i = 0; i < tileSizesVal.size(); ++i) { @@ -269,7 +278,7 @@ void AMDAIETileAndFusePass::runOnOperation() { options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); auto maybeMapping = getGPUMappingAttributes( tileSizesVal, - tilingLevel == 0 ? GPUMappingType::Block : GPUMappingType::Thread, + tilingLevel == 0 ? GPUGroupType::Block : GPUGroupType::Thread, consumerOp); if (failed(maybeMapping)) { return signalPassFailure(); @@ -302,8 +311,7 @@ void AMDAIETileAndFusePass::runOnOperation() { // Fuse all Linalg ops (can be generalized later) .Default([&](Operation *op) { return op->getDialect() == - rewriter.getContext() - ->getLoadedDialect(); + context->getLoadedDialect(); }); return {fusableOp, false}; }); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir index 17269f221..407106fbe 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/tile_and_fuse_using_scf_forall.mlir @@ -1,5 +1,5 @@ -// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0 -// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-1 +// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0 +// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file --verify-diagnostics %s | FileCheck %s --check-prefix=TILE-LEVEL-1 // RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0 tile-elementwise=false}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-MATMUL-ONLY func.func @matmul_static(%arg0: tensor<8x16xi32>, %arg1 : tensor<16x8xi32>) -> tensor<8x8xi32> { @@ -34,6 +34,24 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x3 // TILE-LEVEL-0: linalg.conv_2d_nhwc_hwcf // TILE-LEVEL-0: } {mapping = [#gpu.block, #gpu.block, #gpu.block]} +// TILE-LEVEL-1: @conv_2d_nhwc_hwcf +// TILE-LEVEL-1: scf.forall +// TILE-LEVEL-1-SAME: { +// TILE-LEVEL-1: linalg.fill +// TILE-LEVEL-1: linalg.conv_2d_nhwc_hwcf +// TILE-LEVEL-1: } {mapping = [#gpu.thread, #gpu.thread, #gpu.thread, #gpu.thread]} + +// ----- + +func.func @conv_2d_nhwc_hwcf_unsupported_tiling(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x32x64xbf16>) -> tensor<2x12x12x64xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<2x12x12x64xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> + // expected-error @+1 {{'linalg.conv_2d_nhwc_hwcf' op has requested tile sizes [1, 4, 4, 4, 0, 0, 0]. Currently we only support tiling thread dimensions with at most 2 dimensions having a tile size greater than 1, there are 3 here.}} + %2 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x32xbf16>, tensor<3x3x32x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32> + return %2 : tensor<2x12x12x64xf32> +} + // ----- func.func @matmul_static(%arg0: tensor<8x16xi32>, %arg1 : tensor<16x8xi32>) -> tensor<8x8xi32> {