From 53b96d5e967358134bd1e5ba62283c6d5be4cac4 Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Thu, 9 Jan 2025 23:12:08 +0100 Subject: [PATCH 1/2] [KernelDispatch] Add matmul RHS outer permutation (#1016) --- .../Transforms/KernelDispatch.cpp | 60 +++++++++++++++---- .../test/lowering_strategy_air.mlir | 10 ++-- .../test/lowering_strategy_generic.mlir | 4 +- .../lowering_strategy_objectfifo_npu1.mlir | 30 +++++----- .../lowering_strategy_objectfifo_npu4.mlir | 2 +- 5 files changed, 72 insertions(+), 34 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp index 8b0b5c95b..6d72a91e5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp @@ -316,6 +316,23 @@ FailureOr ParameterSetting::create( } } // namespace +/// Utility to set the packing inner permutation for A/LHS so that is packed as +/// [? ? m k] in case of matmul and [? ? ? m k] in case of batch_matmul. +static SmallVector setInnerPermA(bool isMatmulTransposeA) { + SmallVector innerPerm; + if (isMatmulTransposeA) { + innerPerm = {1, 0}; + } else { + innerPerm = {0, 1}; + } + return innerPerm; +} + +/// Utility to set the packing inner permutation for B/RHS so that is packed as +/// - [? ? k n] in case of matmul +/// - [? ? ? k n] in case of batch_matmul +/// - [? ? n k] in case of matmul_transpose_b +/// - [? ? ? n k] in case of batch_matmul_transpose_b. static SmallVector setInnerPermB(bool isMatmulTransposeB) { SmallVector innerPerm; if (isMatmulTransposeB) { @@ -326,14 +343,34 @@ static SmallVector setInnerPermB(bool isMatmulTransposeB) { return innerPerm; } -static SmallVector setInnerPermA(bool isMatmulTransposeA) { - SmallVector innerPerm; +/// Utility to set the packing outer permutation for A/LHS so that is packed as +/// [M K ? ?] in case of matmul and [Batch M K ? 
?] in case of batch_matmul. +static SmallVector setOuterPermA(bool isMatmulTransposeA, + bool isBatchMatmul) { + SmallVector outerPerm; if (isMatmulTransposeA) { - innerPerm = {1, 0}; + outerPerm = isBatchMatmul ? SmallVector{0, 2, 1} + : SmallVector{1, 0}; } else { - innerPerm = {0, 1}; + outerPerm = isBatchMatmul ? SmallVector{0, 1, 2} + : SmallVector{0, 1}; } - return innerPerm; + return outerPerm; +} + +/// Utility to set the packing outer permutation for B/RHS so that is packed as +/// [N K ? ?] in case of matmul and [Batch N K ? ?] in case of batch_matmul. +static SmallVector setOuterPermB(bool isMatmulTransposeB, + bool isBatchMatmul) { + SmallVector outerPerm; + if (isMatmulTransposeB) { + outerPerm = isBatchMatmul ? SmallVector{0, 1, 2} + : SmallVector{0, 1}; + } else { + outerPerm = isBatchMatmul ? SmallVector{0, 2, 1} + : SmallVector{1, 0}; + } + return outerPerm; } //===----------------------------------------------------------------------===// @@ -362,7 +399,7 @@ static LogicalResult setRootConfigForPackPeelPipeline( packedSizesL0.insert(packedSizesL0.begin(), 0); } - // For matmul, transpose B matrix from [K N n k] to [K N k n] + // For matmul, transpose B matrix from [K N n k] to [N K k n] // For matmul_transpose_b, we don't have to transpose the B matrix, // since it is already [N K n k] SmallVector transposePackIndices = {0, 1}; @@ -372,11 +409,12 @@ static LogicalResult setRootConfigForPackPeelPipeline( SmallVector innerPermA = setInnerPermA(isMatmulTransposeA(linalgOp)); SmallVector innerPermB = setInnerPermB(isMatmulTransposeB(linalgOp)); SmallVector> innerPerm = {innerPermA, innerPermB}; - SmallVector outerPermVec = {0, 1}; - if (isa(linalgOp)) { - outerPermVec.push_back(2); - } - SmallVector> outerPerm = {outerPermVec, outerPermVec}; + bool isBatchMatmul = isa(linalgOp); + SmallVector outerPermA = + setOuterPermA(isMatmulTransposeA(linalgOp), isBatchMatmul); + SmallVector outerPermB = + setOuterPermB(isMatmulTransposeB(linalgOp), 
isBatchMatmul); + SmallVector> outerPerm = {outerPermA, outerPermB}; if (isObjectFifo) { // Add outer permutation for unpack. NOTE: This currently fails for some // tests in the AIR pipeline. diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir index 56e05d1a0..93445cd35 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir @@ -2,7 +2,7 @@ // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{use-lower-to-aie-pipeline=air use-tile-pipeline=pack-peel})' %s | FileCheck %s --check-prefix=CHECK-PACK-PEEL // CHECK-PAD-PACK{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PAD-PACK{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PAD-PACK{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -29,7 +29,7 @@ builtin.module { // ----- // CHECK-PAD-PACK{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PAD-PACK{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PAD-PACK{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -190,7 +190,7 @@ builtin.module { // ----- // CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PACK-PEEL{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -217,7 +217,7 @@ builtin.module { // ----- // CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PACK-PEEL{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -245,7 +245,7 @@ module { // CHECK-PAD-PACK{LITERAL}: #config = #iree_codegen.lowering_config // 
CHECK-PAD-PACK{LITERAL}: #packingConfig = #amdaie.packing_config // CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PACK-PEEL{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir index c5b66817f..0e8e80a01 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir @@ -3,7 +3,7 @@ // Test generic version of matmul. // CHECK{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK{LITERAL}: #amdaie.packing_config +// CHECK{LITERAL}: #amdaie.packing_config module { func.func @matmul_generic_128x128x256_i32() { %c0_i32 = arith.constant 0 : i32 @@ -63,7 +63,7 @@ module { // Test generic version of matmul_transpose_a. 
// CHECK{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK{LITERAL}: #amdaie.packing_config +// CHECK{LITERAL}: #amdaie.packing_config module { func.func @matmul_transpose_a_generic_128x128x256_i32() { %c0_i32 = arith.constant 0 : i32 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir index ff53e7862..0638b7af9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir @@ -3,13 +3,13 @@ // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu1_4col})' %s | FileCheck %s --check-prefix=CHECK-4x4 // CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -36,13 +36,13 @@ module { // ----- // CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -69,13 +69,13 @@ module { // ----- // 
CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -102,13 +102,13 @@ module { // ----- // CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , @@ -168,13 +168,13 @@ module { // ----- // CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-2x2{LITERAL}: #amdaie.packing_config +// CHECK-2x2{LITERAL}: #amdaie.packing_config // CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x2{LITERAL}: #amdaie.packing_config +// CHECK-4x2{LITERAL}: #amdaie.packing_config // CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-4x4{LITERAL}: #amdaie.packing_config +// CHECK-4x4{LITERAL}: #amdaie.packing_config #pipeline_layout = #hal.pipeline.layout, , diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir index 2450a69d0..3863e4e27 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir +++ 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir @@ -7,7 +7,7 @@ // CHECK-SAME: innerPerm = [ // CHECK-SAME: [0, 1], [1, 0], [0, 1] // CHECK-SAME: ], outerPerm = [ -// CHECK-SAME: [0, 1], [0, 1], [1, 0] +// CHECK-SAME: [0, 1], [1, 0], [1, 0] // CHECK-SAME: ]}, {packedSizes = [0, 0, 0, 8, 8, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], // CHECK-SAME: innerPerm = [ // CHECK-SAME: [0, 1], [1, 0], [0, 1] From 75ea24b5fbf4fd6350147e25d97fcb28fdb7039e Mon Sep 17 00:00:00 2001 From: Zhewen Yu Date: Thu, 9 Jan 2025 23:20:59 +0000 Subject: [PATCH 2/2] Add a new pass to generate column-wise control overlay (#1012) Adapted from https://github.com/Xilinx/mlir-aie/pull/1705. Introduces a new pass to automatically insert the following two groups of flows: 1. `route-shim-to-tct`, circuit flows from shim `CTRL` to shim `SOUTH` ports, for sending TCTs. 2. `route-shim-to-tile-ctrl`, packet flows from shim `DMA` to shim/mem/compute tile `CTRL` ports, for sending control packets. 
--- .../AMDAIEGenerateControlOverlay.cpp | 192 ++++++++++++++++++ .../Transforms/AMDAIELowerToAIE.cpp | 5 +- .../iree-amd-aie/Transforms/CMakeLists.txt | 1 + .../iree-amd-aie/Transforms/PassDetail.h | 1 + .../iree-amd-aie/Transforms/Passes.cpp | 5 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 5 + .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 11 + .../Transforms/test/CMakeLists.txt | 1 + .../test/generate_control_overlay.mlir | 102 ++++++++++ .../aie_runtime/iree_aie_configure.cc | 2 +- 10 files changed, 322 insertions(+), 3 deletions(-) create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEGenerateControlOverlay.cpp create mode 100644 compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEGenerateControlOverlay.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEGenerateControlOverlay.cpp new file mode 100644 index 000000000..4087ee6b1 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEGenerateControlOverlay.cpp @@ -0,0 +1,192 @@ +// Copyright 2025 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h" +#include "iree-amd-aie/aie_runtime/Utils/ChannelGenerator.h" + +#define DEBUG_TYPE "iree-amdaie-generate-control-overlay" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +/// Initializes the channel generators for the shim tiles, excluding any +/// channels that are already in use by existing circuit flows. 
+LogicalResult initializeChannelsGenerators( + AMDAIE::WorkgroupOp workgroupOp, const AMDAIEDeviceModel &deviceModel, + const DenseSet &shimTileOps, + DenseMap &shimTileToGeneratorMap) { + // Check the number of DMA channels available for the shim tile. + uint8_t numShimDmaChannels = deviceModel.getDmaProp( + AMDAIETileType::SHIMNOC, AMDAIEDmaProp::NumChannels); + std::for_each(shimTileOps.begin(), shimTileOps.end(), [&](TileOp shimTileOp) { + shimTileToGeneratorMap[shimTileOp.getResult()] = + ChannelGenerator(numShimDmaChannels, numShimDmaChannels); + }); + // Exclude those channels that are already used by a circuit flow. + workgroupOp->walk([&](AMDAIE::FlowOp flowOp) { + if (flowOp.getIsPacketFlow()) return WalkResult::advance(); + SmallVector sourceChannels; + for (Value source : flowOp.getSources()) { + if (auto channelOp = + dyn_cast(source.getDefiningOp())) { + sourceChannels.push_back(channelOp); + } + } + for (AMDAIE::ChannelOp channelOp : sourceChannels) { + AMDAIE::TileOp tileOp = channelOp.getTileOp(); + uint8_t channel = channelOp.getValue(); + StrmSwPortType portType = channelOp.getPortType(); + AMDAIE::DMAChannelDir direction = channelOp.getDirection(); + if (shimTileOps.contains(tileOp) && portType == StrmSwPortType::DMA) { + // Assign to exclude. + if (direction == AMDAIE::DMAChannelDir::MM2S) { + shimTileToGeneratorMap[tileOp.getResult()].assignProducerDMAChannel( + channel); + } else if (direction == AMDAIE::DMAChannelDir::S2MM) { + shimTileToGeneratorMap[tileOp.getResult()].assignConsumerDMAChannel( + channel); + } else { + assert(false && "unexpected DMA channel direction"); + } + } + } + return WalkResult::advance(); + }); + return success(); +} + +LogicalResult generateControlOverlay(AMDAIE::WorkgroupOp workgroupOp, + bool routeShimToTileCtrl, + bool routeShimCtrlToTct) { + // Get the device model. 
+ std::optional device = getConfigAMDAIEDevice(workgroupOp); + if (!device) { + return workgroupOp->emitOpError() + << "could not find an AMDAIEDevice attribute"; + } + AMDAIEDeviceModel deviceModel = AMDAIE::getDeviceModel(device.value()); + + IRRewriter rewriter(workgroupOp->getContext()); + DenseSet occupiedCols; + DenseMap columnToShimTile; + workgroupOp->walk([&](AMDAIE::TileOp tileOp) { + uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); + uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); + occupiedCols.insert(col); + if (deviceModel.isShimNOCTile(col, row)) columnToShimTile[col] = tileOp; + }); + + // If the column is occupied, but the shim tile op is not present, then create + // one. + rewriter.setInsertionPoint(workgroupOp.getControlCode()); + for (uint32_t col : occupiedCols) { + if (!columnToShimTile.count(col)) { + auto colIndex = rewriter.create( + rewriter.getUnknownLoc(), col); + auto rowIndex = + rewriter.create(rewriter.getUnknownLoc(), 0); + columnToShimTile[col] = rewriter.create( + rewriter.getUnknownLoc(), colIndex, rowIndex); + } + } + + // Create a packet flow from the shim DMA to the tile CTRL, for sending + // control packets. + if (routeShimToTileCtrl) { + DenseMap shimTileToGeneratorMap; + DenseSet shimTileOps; + for (const auto &pair : columnToShimTile) shimTileOps.insert(pair.second); + if (failed(initializeChannelsGenerators( + workgroupOp, deviceModel, shimTileOps, shimTileToGeneratorMap))) { + return failure(); + } + WalkResult res = workgroupOp->walk([&](AMDAIE::TileOp tileOp) { + uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); + TileOp shimTileOp = columnToShimTile[col]; + // Get the available channel, but do not assign it. Allow it to be + // shared across multiple packet flows as needed. 
+ std::optional maybeChannel = + shimTileToGeneratorMap[shimTileOp.getResult()] + .getProducerDMAChannel(); + if (!maybeChannel) { + shimTileOp.emitOpError() << "no producer DMA channel available"; + return WalkResult::interrupt(); + } + auto shimDmaChannelOp = rewriter.create( + rewriter.getUnknownLoc(), shimTileOp, maybeChannel.value(), + StrmSwPortType::DMA, AMDAIE::DMAChannelDir::MM2S); + auto tileCtrlChannelOp = rewriter.create( + rewriter.getUnknownLoc(), tileOp, 0, StrmSwPortType::CTRL, + AMDAIE::DMAChannelDir::S2MM); + rewriter.create( + rewriter.getUnknownLoc(), ValueRange{shimDmaChannelOp}, + ValueRange{tileCtrlChannelOp}, + /*isPacketFlow*/ true, /*packetId*/ nullptr); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + } + + // Create a circuit flow from the shim CTRL to the shim SOUTH 0, for sending + // Task Completion Tokens (TCTs). + if (routeShimCtrlToTct) { + for (auto [_, shimTileOp] : columnToShimTile) { + auto shimCtrlChannelOp = rewriter.create( + rewriter.getUnknownLoc(), shimTileOp, 0, StrmSwPortType::CTRL, + AMDAIE::DMAChannelDir::MM2S); + auto shimSouthChannelOp = rewriter.create( + rewriter.getUnknownLoc(), shimTileOp, 0, StrmSwPortType::SOUTH, + AMDAIE::DMAChannelDir::S2MM); + rewriter.create( + rewriter.getUnknownLoc(), ValueRange{shimCtrlChannelOp}, + ValueRange{shimSouthChannelOp}, + /*isPacketFlow*/ false, /*packetId*/ nullptr); + } + } + + return success(); +} + +class AMDAIEGenerateControlOverlayPass + : public impl::AMDAIEGenerateControlOverlayBase< + AMDAIEGenerateControlOverlayPass> { + public: + AMDAIEGenerateControlOverlayPass( + const AMDAIEGenerateControlOverlayOptions &options) + : AMDAIEGenerateControlOverlayBase(options) {} + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override; +}; + +void AMDAIEGenerateControlOverlayPass::runOnOperation() { + Operation *parentOp = getOperation(); + WalkResult res = 
parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { + if (failed(generateControlOverlay(workgroupOp, routeShimToTileCtrl, + routeShimCtrlToTct))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIEGenerateControlOverlayPass( + AMDAIEGenerateControlOverlayOptions options) { + return std::make_unique(options); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index a4eda0494..7a0ce4ed2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -300,8 +300,9 @@ SmallVector AIEDeviceBuilder::createFlowOps( for (AMDAIE::ChannelOp consumerChannel : consumerChannels) { Value aieConsumerTile = mapper.lookup(consumerChannel.getTile()); AIE::FlowOp flowOp = rewriter.create( - rewriter.getUnknownLoc(), aieProducerTile, AIE::WireBundle::DMA, - producerChannel.getValue(), aieConsumerTile, AIE::WireBundle::DMA, + rewriter.getUnknownLoc(), aieProducerTile, + producerChannel.getPortType(), producerChannel.getValue(), + aieConsumerTile, consumerChannel.getPortType(), consumerChannel.getValue()); flowOps.push_back(flowOp.getOperation()); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index 012c004c0..b261103aa 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -75,6 +75,7 @@ iree_cc_library( "AMDAIEFuseConsumerIntoLoop.cpp" "AMDAIEFuseFillIntoForall.cpp" "AMDAIEFusePackIntoLoop.cpp" + "AMDAIEGenerateControlOverlay.cpp" 
"AMDAIEHoistForAffineApply.cpp" "AMDAIEHoistLogicalObjFifo.cpp" "AMDAIEInsertCores.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 0fbed8c81..01db8faa2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -56,6 +56,7 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DEF_AMDAIEFUSECONSUMERINTOLOOP #define GEN_PASS_DEF_AMDAIEFUSEFILLINTOFORALL #define GEN_PASS_DEF_AMDAIEFUSEPACKINTOLOOP +#define GEN_PASS_DEF_AMDAIEGENERATECONTROLOVERLAY #define GEN_PASS_DEF_AMDAIEHOISTFORLOOPAFFINEAPPLY #define GEN_PASS_DEF_AMDAIEHOISTLOGICALOBJFIFO #define GEN_PASS_DEF_AMDAIEINSERTAIEWORKGROUP diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 62fbe0813..33bbcd318 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -670,6 +670,11 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createAMDAIEObjFifoBufferizationPass()); passManager.addPass(createAMDAIETemporaryAllocBufferizationPass()); passManager.addPass(createAMDAIEConnectionToFlowPass()); + + passManager.addPass(createAMDAIEGenerateControlOverlayPass()); + passManager.addPass(createCSEPass()); + passManager.addPass(createCanonicalizerPass()); + passManager.addPass(createAMDAIEAssignPacketIdsPass()); passManager.addPass(createAMDAIENpuDmaToHalfDmaCpyNdPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index f1bc76649..805bc6740 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -195,6 
+195,11 @@ std::unique_ptr createAMDAIEFuseConsumerIntoLoopPass( /// Create a pass to fuse the linalg.fill into the forall loops. std::unique_ptr createAMDAIEFuseFillIntoForallPass(); +/// Create pass to generate packet-flow routings for control packets entering or +/// leaving each tile. +std::unique_ptr createAMDAIEGenerateControlOverlayPass( + AMDAIEGenerateControlOverlayOptions options = {}); + /// Hoist an affine.apply op on a scf.for op's induction variable. std::unique_ptr createAMDAIEHoistForLoopAffineApplyPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index bcfed8890..87029784f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -330,6 +330,17 @@ def AMDAIEFusePackIntoLoop : ]; } +def AMDAIEGenerateControlOverlay : Pass<"iree-amdaie-generate-control-overlay"> { + let summary = "Spawn a streaming interconnect network for CTRL ports."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEGenerateControlOverlayPass()"; + let options = [ + Option<"routeShimCtrlToTct", "route-shim-to-tct", "bool", /*default=*/"true", + "Flag to generate TCT routing between tile CTRL and shim SOUTH ports.">, + Option<"routeShimToTileCtrl", "route-shim-to-tile-ctrl", "bool", /*default=*/"false", + "Flag to generate routing between shim dma DMA and tile CTRL ports, for configuration."> + ]; +} + def AMDAIEHoistForLoopAffineApply : Pass<"iree-amdaie-hoist-for-affine-apply"> { let summary = "Hoist an affine apply op on a scf.for op's induction variable."; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEHoistForLoopAffineApplyPass()"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index e4f4cbf37..224def85a 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -48,6 +48,7 @@ iree_lit_test_suite( "fuse_consumer_into_loop.mlir" "fuse_fill_into_forall.mlir" "fuse_pack_into_loop.mlir" + "generate_control_overlay.mlir" "hoist_for_affine_apply.mlir" "hoist_logical_obj_fifo.mlir" "insert_cores.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir new file mode 100644 index 000000000..daf8ca6db --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/generate_control_overlay.mlir @@ -0,0 +1,102 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-generate-control-overlay{route-shim-to-tct=true route-shim-to-tile-ctrl=true},canonicalize,cse))" --split-input-file --verify-diagnostics %s | FileCheck %s + +// Device attribute is required for route-shim-to-tile-ctrl. +module { + func.func @no_amdaie_device() { + // expected-error @+1 {{could not find an AMDAIEDevice attribute}} + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// Shim tile (0, 0) has two producer (MM2S) channels, +// both of which are already utilized by existing circuit flows. +// No producer DMA channel is available for route-shim-to-tile-ctrl. 
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @no_available_channel() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + // expected-error @+1 {{no producer DMA channel available}} + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %channel_0 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %flow_0 = amdaie.flow({%channel_0} -> {%channel_1}) {is_packet_flow = false} + %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM) + %flow_1 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} + amdaie.controlcode { + amdaie.end + } + } + return + } +} + + +// ----- + +// Successfully inserted six packet flows from shim DMA channels to tile CTRL channels, +// and one circuit flow from shim CTRL to shim SOUTH 0. 
+// CHECK-LABEL: @column_control_overlay +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C3:.*]] = arith.constant 3 : index +// CHECK: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: amdaie.workgroup { +// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK: %[[TILE_0_4:.*]] = amdaie.tile(%[[C0]], %[[C4]]) +// CHECK: %[[TILE_0_5:.*]] = amdaie.tile(%[[C0]], %[[C5]]) +// CHECK: %[[CHANNEL_0:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = CTRL, direction = S2MM) +// CHECK: %[[FLOW_0:.*]] = amdaie.flow({%[[CHANNEL_0]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = true} +// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_0]], 1, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = CTRL, direction = S2MM) +// CHECK: %[[FLOW_1:.*]] = amdaie.flow({%[[CHANNEL_2]]} -> {%[[CHANNEL_3]]}) {is_packet_flow = true} +// CHECK: %[[CHANNEL_4:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = CTRL, direction = S2MM) +// CHECK: %[[FLOW_2:.*]] = amdaie.flow({%[[CHANNEL_0]]} -> {%[[CHANNEL_4]]}) {is_packet_flow = true} +// CHECK: %[[CHANNEL_5:.*]] = amdaie.channel(%[[TILE_0_3]], 0, port_type = CTRL, direction = S2MM) +// CHECK: %[[FLOW_3:.*]] = amdaie.flow({%[[CHANNEL_2]]} -> {%[[CHANNEL_5]]}) {is_packet_flow = true} +// CHECK: %[[CHANNEL_6:.*]] = amdaie.channel(%[[TILE_0_4]], 0, port_type = CTRL, direction = S2MM) +// CHECK: %[[FLOW_4:.*]] = amdaie.flow({%[[CHANNEL_0]]} -> {%[[CHANNEL_6]]}) {is_packet_flow = true} +// CHECK: %[[CHANNEL_7:.*]] = amdaie.channel(%[[TILE_0_5]], 0, 
port_type = CTRL, direction = S2MM) +// CHECK: %[[FLOW_5:.*]] = amdaie.flow({%[[CHANNEL_2]]} -> {%[[CHANNEL_7]]}) {is_packet_flow = true} +// CHECK: %[[CHANNEL_8:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = CTRL, direction = MM2S) +// CHECK: %[[CHANNEL_9:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = SOUTH, direction = S2MM) +// CHECK: %[[FLOW_6:.*]] = amdaie.flow({%[[CHANNEL_8]]} -> {%[[CHANNEL_9]]}) {is_packet_flow = false} +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @column_control_overlay() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_2 = amdaie.tile(%c0, %c2) + %tile_0_3 = amdaie.tile(%c0, %c3) + %tile_0_4 = amdaie.tile(%c0, %c4) + %tile_0_5 = amdaie.tile(%c0, %c5) + amdaie.controlcode { + amdaie.end + } + } + return + } +} diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_configure.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_configure.cc index b89346b46..c64591db9 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_configure.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_configure.cc @@ -267,7 +267,7 @@ LogicalResult configureStreamSwitch(const AMDAIEDeviceModel &deviceModel, const TileLoc &tileLoc, const std::vector &connects) { auto devInst = const_cast(&deviceModel.devInst); - // FIXME hack for TCT routing + // mlir-air legacy, hack for TCT routing // TODO copy-pasted: Support both channels // TODO(max): find a way to keep track so that multiple calls don't // rewrite/overwrite with same data.