diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp index 055e22069..0152db206 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIETileAndFuse.cpp @@ -16,12 +16,12 @@ #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/IR/Iterators.h" #include "mlir/IR/PatternMatch.h" #define DEBUG_TYPE "iree-amdaie-tile-and-fuse" - namespace mlir::iree_compiler::AMDAIE { namespace { @@ -157,27 +157,43 @@ void AMDAIETileAndFusePass::runOnOperation() { SmallVector tileSizes = getAsIndexOpFoldResult(context, tileSizesVal); + auto options = scf::SCFTilingOptions().setTileSizes(tileSizes); // When tiling using scf.for we do not need to set any mapping. if (!useSCFFor) { options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); - // Here we assume there are always two levels of parallel (scf.forall) - // loops, and the first level of tiling is always using scf.forall and - // mapped to blocks. Currently we are not using mapping attributes for - // Conv2d ops, because there could be four parallel tiling dimensions. - // TODO (vivian): create AIE specific mapping attributes. - if (!isa(consumerOp.getOperation())) { - if (tilingLevel == 0) { - options.setMapping( - {gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY), - gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimX)}); - } else { - options.setMapping( - {gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimY), - gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimX)}); - } + + // There is one induction variable in the scf.forall for each of the + // non-zero tile sizes. 
Recall that a '0' tile size corresponds to 'do + // not tile'. + uint32_t nbIndVars = std::count_if(tileSizesVal.begin(), tileSizesVal.end(), + [](int64_t t) { return t != 0; }); + + // See mlir::gpu::MappingId enum: there are currently 13 values. + if (nbIndVars > mlir::gpu::getMaxEnumValForMappingId()) { + LLVM_DEBUG(llvm::dbgs() << "----- skip, too many loops to tile -----\n"); + return; + } + auto getMappingAttributeForDimension = [&](uint32_t i) -> Attribute { + auto id = static_cast(i); + if (tilingLevel == 0) return gpu::GPUBlockMappingAttr::get(context, id); + return gpu::GPUThreadMappingAttr::get(context, id); + }; + + SmallVector mapping; + mapping.reserve(nbIndVars); + for (uint32_t i = 0; i < nbIndVars; ++i) { + // DimY and DimX are swapped, so that DimX goes to AIE array columns and + // DimY goes to AIE array rows (or something like that...) + if (i == 0) + mapping.push_back(getMappingAttributeForDimension(1)); + else if (i == 1) + mapping.push_back(getMappingAttributeForDimension(0)); + else + mapping.push_back(getMappingAttributeForDimension(i)); } + options.setMapping(mapping); } IRRewriter rewriter(context); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index a1bd27d91..4c42d7a7a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -633,6 +633,24 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass()); passManager.addPass(memref::createFoldMemRefAliasOpsPass()); passManager.addPass(createAMDAIEBridgeToAIRPass()); + + // Running canonicalization for all pipelines here results in failures. + // Example + // ``` + // 'memref.cast' op is an unsupported operation. This pass currently only + // supports AllocOp and SubViewOp as inputs. 
+ // ``` + // It is currently required for the convolution pipeline though, to remove the + // extra (size-1) thread- and group- dimensions. + // + // TODO(newling) there are better solutions like: + // 1) make canonicalization work for scf.forall + // 2) pass to collapse rank-4 scf.foralls to rank-2 scf.foralls. + // 3) resolve above 'unsupported operation' error. + if (clUseTilePipeline == TilePassPipeline::ConvDecomposePipeline) { + passManager.addPass(createCanonicalizerPass()); + } + // TODO (Erwei): Figure out a way to work with AMDAIEPackToDmaPass. if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline) passManager.addPass(createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass());