diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index d15e210d7..89bc0fd05 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -5,7 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include "iree-amd-aie/IR/AMDAIEOps.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h" +#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "mlir/IR/Iterators.h" #include "mlir/Pass/Pass.h" @@ -18,20 +21,114 @@ namespace { /// Utility struct to represent DMA split information. struct DmaSplitInfo { size_t sourceSplitDim{0}; + int64_t newSourceStride{1}; size_t targetSplitDim{0}; + int64_t newTargetStride{1}; + int64_t splitSize{1}; +}; + +/// Utility struct to represent objectFifo split information. +struct ObjFifoSplitInfo { + size_t splitDim{0}; + int64_t splitSize{1}; + int64_t splitStride{1}; }; using DmaObjFifoPairT = std::pair; +/// Utility to derive the split stride to be used from a vector of DMA ops by +/// analyzing the offset scales. Will fail if the provided DMA ops don't have a +/// consistent offset scale. 
+template +FailureOr getSplitStride(ArrayRef dmaOps, + int64_t sizeAfterSplit) { + int64_t splitStride{-1}; + for (AMDAIE::DmaCpyNdOp dmaOp : dmaOps) { + SmallVector offsets; + SmallVector strides; + if constexpr (OperateOn == CopyOpOperateOn::Source) { + offsets = dmaOp.getSourceMixedOffsets(); + strides = dmaOp.getSourceMixedStrides(); + } else if constexpr (OperateOn == CopyOpOperateOn::Target) { + offsets = dmaOp.getTargetMixedOffsets(); + strides = dmaOp.getTargetMixedStrides(); + } else { + assert(false && "Function can only operate on Source or Target"); + } + SmallVector splitIndices = + getStrideIndicesWithDynamicOrNonZeroOffset(offsets, strides, + sizeAfterSplit); + if (splitIndices.size() > 1) + return dmaOp.emitError() << "multiple split indices found"; + int64_t step{-1}; + if (splitIndices.empty()) { + step = 1; + } else { + // splitIndices.size() == 1 + size_t splitIdx = splitIndices[0]; + OpFoldResult offset = offsets[splitIdx]; + + if (std::optional staticOffset = getConstantIntValue(offset); + staticOffset.has_value()) { + if (staticOffset.value() == 0) continue; + step = 1; + } else if (auto offsetValue = dyn_cast_if_present(offset)) { + if (isa_and_present( + offsetValue.getDefiningOp())) { + auto applyOp = + cast(offsetValue.getDefiningOp()); + if (applyOp.getNumOperands() != 1) + return applyOp.emitError() << "mulptiple operands is not supported"; + AffineMap affineMap = applyOp.getAffineMap(); + RetrieveScaleAndBias retriever; + if (failed(retriever.visit(affineMap.getResult(0)))) { + return applyOp.emitError() + << "could not retrieve scale and bias from expression: " + << *applyOp.getOperation(); + } + if (!retriever.scale.has_value()) { + return applyOp.emitError() + << "expected a scale for: " << *applyOp.getOperation(); + } + step = retriever.scale.value(); + } else if (auto blockArg = dyn_cast(offsetValue); + blockArg && isa( + blockArg.getOwner()->getParentOp())) { + step = 1; + } else { + return dmaOp.emitOpError() + << "has an offset 
value that is neither an " + "induction variable nor an affine expression"; + } + } else { + return dmaOp.emitOpError() + << "has an offset that is neither a constant nor an affine " + "expression, which is not supported"; + } + } + if (splitStride == -1) { + splitStride = step; + } else if (step != splitStride) { + return dmaOp.emitOpError() << "has an offset step: " << step + << ", which is different from " + "previous offset steps: " + << splitStride; + } + } + // If all offsets are zero (or no split index found). + if (splitStride == -1) return 1; + return splitStride; +} + /// Find the logical objectFifo and DMA source/target splitting dimensions for /// each DMA and objectFifo pair. /// /// Each pair is handled in the following way: -/// First, compute the objectFifo splitting dimension as the last non-unit shape -/// dimension. Afterwards, depending on which logical objectFifo is being -/// split on, find the outermost dimension in either the source or -/// target access pattern that has: +/// First, compute the objectFifo splitting dimension based on the last non-unit +/// shape dimension and the number of available columns. Afterwards, depending +/// on which logical objectFifo is being split on, find the outermost dimension +/// in either the source or target access pattern that has: /// - stride == sizeAfterSplit /// - size != 1 /// This is the splitting dimension to be used on the respective side of the DMA @@ -44,8 +141,9 @@ using DmaObjFifoPairT = LogicalResult collectSplittingDims( const SmallVector &dmaObjFifoPairs, DenseMap &dmaSplitInfoMap, - DenseMap - &objFifoSplitDimMap) { + DenseMap + &objFifoSplitInfoMap, + int64_t numCols) { for (auto [dmaOp, objFifo] : dmaObjFifoPairs) { LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n"); LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n"); @@ -62,10 +160,19 @@ LogicalResult collectSplittingDims( // If all dimensions are unit (1), no splitting can be done, so continue to // the next pair. 
if (objFifoSplitDim >= memrefShape.size()) continue; + int64_t splitDimSize = memrefShape[objFifoSplitDim]; int64_t sizeAfterSplit = std::accumulate(memrefShape.begin() + objFifoSplitDim + 1, memrefShape.end(), 1, std::multiplies<>()); + // Get the producers and consumers of the current objectFifoOp. + SmallVector producers; + SmallVector consumers; + if (failed(getDmaCpyNdOpProducersAndConsumers(objFifo, producers, + consumers))) { + return failure(); + } + size_t sourceSplitDim{0}; size_t targetSplitDim{0}; if (dmaOp.getTargetObjectFifo() == objFifo) { @@ -101,6 +208,27 @@ LogicalResult collectSplittingDims( break; } } + FailureOr maybeSplitStride = + getSplitStride(consumers, sizeAfterSplit); + if (failed(maybeSplitStride)) { + return objFifo.emitOpError() + << "could not retrieve a split stride from the consumer DMA ops"; + } + int64_t splitStride = maybeSplitStride.value(); + // Calculate the new source stride to be used for splitting the DMA. + int64_t newSourceStride = + splitStride != 1 ? splitDimSize / splitStride : 1; + LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n"); + LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n"); + LLVM_DEBUG(llvm::dbgs() + << "newSourceStride: " << newSourceStride << "\n"); + LLVM_DEBUG(llvm::dbgs() + << "objFifoSplitDim: " << objFifoSplitDim << "\n"); + LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n"); + LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n"); + dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim, + 1, numCols}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride}; } else if (dmaOp.getSourceObjectFifo() == objFifo) { // Find outermost dimension in the access pattern that has stride == // sizeAfterSplit and size != 1.
@@ -136,13 +264,28 @@ LogicalResult collectSplittingDims( break; } } + FailureOr maybeSplitStride = + getSplitStride(producers, sizeAfterSplit); + if (failed(maybeSplitStride)) { + return objFifo.emitOpError() + << "could not retrieve a split stride from the producer DMA ops"; + } + int64_t splitStride = maybeSplitStride.value(); + // Calculate the new target stride to be used for splitting the DMA. + int64_t newTargetStride = + splitStride != 1 ? splitDimSize / splitStride : 1; + LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n"); + LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n"); + LLVM_DEBUG(llvm::dbgs() + << "newTargetStride: " << newTargetStride << "\n"); + LLVM_DEBUG(llvm::dbgs() + << "objFifoSplitDim: " << objFifoSplitDim << "\n"); + LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n"); + LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n"); + dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim, + newTargetStride, numCols}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride}; } - LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n"); - LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n"); - LLVM_DEBUG(llvm::dbgs() << "objFifoSplitDim: " << objFifoSplitDim << "\n"); - DmaSplitInfo dmaSplitInfo = {sourceSplitDim, targetSplitDim}; - dmaSplitInfoMap[dmaOp] = std::move(dmaSplitInfo); - objFifoSplitDimMap[objFifo] = objFifoSplitDim; } return success(); } @@ -157,9 +300,6 @@ class AMDAIESplitLogicalObjFifosPass AMDAIESplitLogicalObjFifosPass() = default; AMDAIESplitLogicalObjFifosPass(const AMDAIESplitLogicalObjFifosPass &pass){}; - AMDAIESplitLogicalObjFifosPass( - const AMDAIESplitLogicalObjFifosOptions &options) - : AMDAIESplitLogicalObjFifosBase(options) {} void runOnOperation() override; }; @@ -168,6 +308,18 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { MLIRContext *context = &getContext(); IRRewriter
rewriter(context); + // Retrieve the device model. + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(moduleOp); + std::optional maybeNumColumns = getConfigNumColumns(targetAttr); + if (!maybeNumColumns) { + moduleOp.emitOpError() << "has no number of columns specified in the " + "target attribute configuration. This " + "device-specific information is required to " + "correctly split logical objectFifos."; + return signalPassFailure(); + } + int64_t numColumns = maybeNumColumns.value(); + // Walk and collect all dma ops between L3 and L2. SmallVector l3L2DmaOps; SmallVector dmaObjFifoPairs; @@ -189,9 +341,10 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { // Collect the split dimensions for all DMA and ojectFifo pairs. DenseMap dmaSplitInfoMap; - DenseMap objFifoSplitDimMap; + DenseMap + objFifoSplitInfoMap; if (failed(collectSplittingDims(dmaObjFifoPairs, dmaSplitInfoMap, - objFifoSplitDimMap))) { + objFifoSplitInfoMap, numColumns))) { return signalPassFailure(); } @@ -200,16 +353,19 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) { auto stridedOp = cast(dmaOp.getOperation()); - if (failed(splitDoublyStridedOp(rewriter, stridedOp, - dmaSplitInfo.sourceSplitDim, - dmaSplitInfo.targetSplitDim, numCols))) { + if (failed(splitDoublyStridedOp( + rewriter, stridedOp, dmaSplitInfo.sourceSplitDim, + dmaSplitInfo.targetSplitDim, dmaSplitInfo.splitSize, + dmaSplitInfo.newSourceStride, dmaSplitInfo.newTargetStride))) { LLVM_DEBUG(llvm::dbgs() << "Failed to perform splitting of the DMA op: " << dmaOp); return signalPassFailure(); } } - for (auto &&[objFifo, splitDim] : objFifoSplitDimMap) { - if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitDim, numCols))) { + for (auto &&[objFifo, splitInfo] : objFifoSplitInfoMap) { + if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitInfo.splitDim, + splitInfo.splitSize, + splitInfo.splitStride))) { LLVM_DEBUG(llvm::dbgs() << "Failed to 
perform splitting of objectFifo op"); return signalPassFailure(); @@ -219,9 +375,8 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() { } // namespace -std::unique_ptr createAMDAIESplitLogicalObjFifosPass( - AMDAIESplitLogicalObjFifosOptions options) { - return std::make_unique(options); +std::unique_ptr createAMDAIESplitLogicalObjFifosPass() { + return std::make_unique(); } } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 2728c4633..0ded0b8fa 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -599,13 +599,8 @@ void addAMDAIEObjectFifoLoweringPasses( passManager.addPass(createAMDAIESplitLogicalObjFifosForConnectionReusePass()); // Currently, SplitLogicalObjFifos pass only works for matmul-like ops. - { - if (useTilePipeline == TilePassPipeline::PackPeelPipeline) { - AMDAIESplitLogicalObjFifosOptions splitOptions; - splitOptions.numCols = numCols; - passManager.addPass(createAMDAIESplitLogicalObjFifosPass(splitOptions)); - } - } + if (useTilePipeline == TilePassPipeline::PackPeelPipeline) + passManager.addPass(createAMDAIESplitLogicalObjFifosPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index dd660646d..4963b1709 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -291,8 +291,7 @@ std::unique_ptr createAMDAIERemoveMemorySpacePass(); std::unique_ptr createAMDAIESinkIntoCorePass(); /// Create a pass to split logicalobjectfifos for shimTile/memTile distribution. 
-std::unique_ptr createAMDAIESplitLogicalObjFifosPass( - AMDAIESplitLogicalObjFifosOptions options = {}); +std::unique_ptr createAMDAIESplitLogicalObjFifosPass(); /// Create a pass to split logicalobjectfifos for connection reuse. std::unique_ptr createAMDAIESplitLogicalObjFifosForConnectionReusePass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index f7f3db246..1aa0abc31 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -715,10 +715,6 @@ def AMDAIESplitLogicalObjFifos : `[1, 2, 32, 32]`, will be split to two `[1, 1, 32, 32]` buffers. }]; let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjFifosPass()"; - let options = [ - Option<"numCols", "num-cols", "uint32_t", /*default=*/"4", - "Number of columns used in an AIE core array"> - ]; } def AMDAIESplitLogicalObjFifosForConnectionReuse : diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp index ebddb881d..db100c61c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -8,6 +8,7 @@ #include +#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h" #include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" @@ -502,8 +503,6 @@ LogicalResult splitLogicalObjectFifoForElementwiseOp( return success(); } -/// Utility to get the `DmaCpyNdOp` producers and consumers of a given -/// objectFifo op. 
LogicalResult getDmaCpyNdOpProducersAndConsumers( AMDAIE::LogicalObjectFifoFromMemrefOp op, SmallVector &producers, @@ -527,54 +526,119 @@ LogicalResult getDmaCpyNdOpProducersAndConsumers( return success(); } -using OffsetIndexAndNewOffsetT = std::tuple, int64_t>; - -/// Utility to return the index of the offsets array that refers to newly -/// splitted objectFifo and the respective offset value. Note that there might -/// not be a dimension with `stride == sizeAfterSplit`, in which case an offset -/// index can't be returned and the correct offset is `0`. -FailureOr getOffsetIndexAndOffset( - ArrayRef offsets, ArrayRef sizes, - ArrayRef strides, size_t sizeAfterSplit, - function_ref emitError) { - SmallVector offsetIndices; +/// Utility to return the indices of the dimensions with stride equal to the +/// expected stride and with dynamic or non-zero offsets. +SmallVector getStrideIndicesWithDynamicOrNonZeroOffset( + ArrayRef offsets, ArrayRef strides, + size_t expectedStride) { + SmallVector indices; for (auto iter : llvm::enumerate(llvm::zip(strides, offsets))) { - std::optional maybeStride = - getConstantIntValue(std::get<0>(iter.value())); std::optional maybeOffset = getConstantIntValue(std::get<1>(iter.value())); - if (maybeStride.has_value() && maybeOffset.has_value() && - maybeStride.value() == sizeAfterSplit && maybeOffset.value() != 0) { - offsetIndices.push_back(iter.index()); + bool strideEqualToSizeAfterSplit = + isConstantIntValue(std::get<0>(iter.value()), expectedStride); + bool dynamicOrNonZeroOffset = + !maybeOffset.has_value() || maybeOffset.value() != 0; + if (strideEqualToSizeAfterSplit && dynamicOrNonZeroOffset) { + indices.push_back(iter.index()); } } + return indices; +} + +/// Utility struct for new offset configurations. +struct OffsetConfig { + /// The optional index of the offset to be updated. + std::optional maybeOffsetIdx{std::nullopt}; + /// The index of the new objectFifo to be used. 
+ size_t objFifoIndex{0}; + /// The new offset to be used. + OpFoldResult newOffset; +}; + +/// Utility to compute the new offset configuration to be used for updating +/// consumer/producer DMA operations. +/// First, find the dimension for which the `stride` is equal to +/// `sizeAfterSplit` and with a dynamic or non-zero offset. This is the new +/// offset index. Afterwards, check the offset value at this dimension and +/// compute the `newOffset` value and `objFifoIndex`. +FailureOr getNewOffsetConfig( + RewriterBase &rewriter, ArrayRef offsets, + ArrayRef strides, size_t sizeAfterSplit, int64_t splitDimSize, + int64_t splitStride, int64_t splitFactor, + function_ref emitError) { + SmallVector offsetIndices = + getStrideIndicesWithDynamicOrNonZeroOffset(offsets, strides, + sizeAfterSplit); if (offsetIndices.size() > 1) return emitError() << "multiple offset indices found"; - - int64_t offset{0}; - std::optional maybeOffsetIdx; - if (offsetIndices.size() == 1) { - size_t offsetIdx = offsetIndices[0]; - maybeOffsetIdx = offsetIdx; - std::optional maybeSize = getConstantIntValue(sizes[offsetIdx]); - std::optional maybeOffset = - getConstantIntValue(offsets[offsetIdx]); - if (!maybeSize || !maybeOffset) { - return emitError() - << "expected a static target offset and size on index: " - << offsetIdx; + if (offsetIndices.empty()) + return OffsetConfig{std::nullopt, 0, OpFoldResult{nullptr}}; + // Else, offsetIndices.size() == 1 + size_t offsetIdx = offsetIndices[0]; + size_t objFifoIndex{0}; + OpFoldResult newOffsetValue; + if (auto offsetValue = dyn_cast_if_present(offsets[offsetIdx])) { + if (isa_and_present(offsetValue.getDefiningOp())) { + auto applyOp = cast(offsetValue.getDefiningOp()); + if (applyOp.getNumOperands() != 1) { + return emitError() + << "AffineApplyOp with mulptiple operands is not supported"; + } + Value operand = applyOp.getMapOperands()[0]; + AffineMap affineMap = applyOp.getAffineMap(); + RetrieveScaleAndBias retriever; + if
(failed(retriever.visit(affineMap.getResult(0)))) { + return emitError() + << "could not retrieve scale and bias from expression: " + << *applyOp.getOperation(); + } + if (!retriever.scale) { + return emitError() << "expected a scale for: " + << *applyOp.getOperation(); + } + objFifoIndex = retriever.bias.has_value() ? retriever.bias.value() : 0; + // In case of a unit stride, the index of the new objectFifo needs to be + // calculated by dividing by `splitDimSize`. For example, for + // `splitDimSize == 2`, offsets 0 and 1 are mapped to objectFifo 0 and + // offsets 2 and 3 are mapped to objectFifo 1. + if (splitStride == 1) objFifoIndex /= splitDimSize; + newOffsetValue = operand; + } else if (auto blockArg = dyn_cast(offsetValue); + blockArg && + isa(blockArg.getOwner()->getParentOp())) { + objFifoIndex = 0; + newOffsetValue = offsetValue; + } else { + return emitError() << "expected an affine expression or induction " + "variable for the offset on index: " + << offsetIdx; } - offset = maybeOffset.value(); + } else if (std::optional maybeOffset = + getConstantIntValue(offsets[offsetIdx])) { + // In case of a unit stride, the index of the new objectFifo needs to be + // calculated by dividing by `splitDimSize`. For example, for + // `splitDimSize == 2`, offsets 0 and 1 are mapped to objectFifo 0 and + // offsets 2 and 3 are mapped to objectFifo 1. + objFifoIndex = maybeOffset.value() / splitDimSize; + newOffsetValue = rewriter.getIndexAttr(maybeOffset.value() % splitDimSize); + + } else { + return emitError() + << "expected a static or affine expression offset on index: " + << offsetIdx; } - return OffsetIndexAndNewOffsetT{maybeOffsetIdx, offset}; + return OffsetConfig{offsetIdx, objFifoIndex, newOffsetValue}; } /// Split a logical objectFifo on the provided split dimension with the -/// specified splitting factor. +/// specified splitting factor and stride. 
LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter, AMDAIE::LogicalObjectFifoFromMemrefOp op, size_t splitDim, - std::optional maybeSplitFactor) { + std::optional maybeSplitFactor, + int64_t splitStride) { + OpBuilder::InsertionGuard g(rewriter); SmallVector memrefShape = llvm::to_vector(op.getMemrefType().getShape()); int64_t splitFactor = maybeSplitFactor.has_value() ? maybeSplitFactor.value() @@ -586,6 +650,7 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter, memrefShape[splitDim] % splitFactor == 0 && "the target size for splitting is not divisible by the splitting factor"); memrefShape[splitDim] /= splitFactor; + int64_t splitDimSize = memrefShape[splitDim]; // Create `splitFactor` number of objectFifo ops. SmallVector newObjFifos; @@ -611,30 +676,31 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter, SmallVector targetOffsets = producer.getTargetMixedOffsets(); SmallVector targetSizes = producer.getTargetMixedSizes(); SmallVector targetStrides = producer.getTargetMixedStrides(); - std::optional maybeOffsetIdx; - int64_t targetOffset{0}; - FailureOr maybeOffsetIdxAndNewOffset = - getOffsetIndexAndOffset(targetOffsets, targetSizes, targetStrides, - sizeAfterSplit, - [&]() { return producer.emitOpError(); }); - if (failed(maybeOffsetIdxAndNewOffset)) { + FailureOr maybeOffsetConfig = getNewOffsetConfig( + rewriter, targetOffsets, targetStrides, sizeAfterSplit, splitDimSize, + splitStride, splitFactor, [&]() { return producer.emitOpError(); }); + if (failed(maybeOffsetConfig)) { return producer.emitOpError() << "failed to find an offset index and new offset"; } - std::tie(maybeOffsetIdx, targetOffset) = maybeOffsetIdxAndNewOffset.value(); - - // Adjust offset if the new shape of the split dimension is larger than 1. 
- int64_t newOffset = 0; - if (memrefShape[splitDim] > 1) { - newOffset = targetOffset % splitFactor; - targetOffset /= splitFactor; - } - assert(targetOffset < newObjFifos.size() && + OffsetConfig offsetConfig = maybeOffsetConfig.value(); + assert(offsetConfig.objFifoIndex < newObjFifos.size() && "the targetOffset should be smaller than the number of objectFifos"); - if (maybeOffsetIdx.has_value()) - targetOffsets[maybeOffsetIdx.value()] = rewriter.getIndexAttr(newOffset); + if (offsetConfig.maybeOffsetIdx.has_value()) { + size_t offsetIdx = offsetConfig.maybeOffsetIdx.value(); + targetOffsets[offsetIdx] = offsetConfig.newOffset; + int64_t offsetIdxStride = + getConstantIndexOrAssert(targetStrides[offsetIdx]); + for (size_t i = 0; i < offsetIdx; i++) { + int64_t stride = getConstantIndexOrAssert(targetStrides[i]); + if (stride > offsetIdxStride && + !isConstantIntValue(targetSizes[i], 1)) { + targetStrides[i] = rewriter.getIndexAttr(stride / splitFactor); + } + } + } AMDAIE::LogicalObjectFifoFromMemrefOp newObjFifo = - newObjFifos[targetOffset]; + newObjFifos[offsetConfig.objFifoIndex]; rewriter.setInsertionPoint(producer); auto newDmaOp = rewriter.create( producer.getLoc(), newObjFifo, targetOffsets, targetSizes, @@ -648,30 +714,31 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter, SmallVector sourceOffsets = consumer.getSourceMixedOffsets(); SmallVector sourceSizes = consumer.getSourceMixedSizes(); SmallVector sourceStrides = consumer.getSourceMixedStrides(); - std::optional maybeOffsetIdx; - int64_t sourceOffset{0}; - FailureOr maybeOffsetIdxAndNewOffset = - getOffsetIndexAndOffset(sourceOffsets, sourceSizes, sourceStrides, - sizeAfterSplit, - [&]() { return consumer.emitOpError(); }); - if (failed(maybeOffsetIdxAndNewOffset)) { + FailureOr maybeOffsetConfig = getNewOffsetConfig( + rewriter, sourceOffsets, sourceStrides, sizeAfterSplit, splitDimSize, + splitStride, splitFactor, [&]() { return consumer.emitOpError(); }); + if 
(failed(maybeOffsetConfig)) { return consumer.emitOpError() << "failed to find an offset index and offset"; } - std::tie(maybeOffsetIdx, sourceOffset) = maybeOffsetIdxAndNewOffset.value(); - - // Adjust offset if the new shape of the split dimension is larger than 1. - int64_t newOffset = 0; - if (memrefShape[splitDim] > 1) { - newOffset = sourceOffset % splitFactor; - sourceOffset /= splitFactor; + OffsetConfig offsetConfig = maybeOffsetConfig.value(); + assert(offsetConfig.objFifoIndex < newObjFifos.size() && + "the objFifoIndex should be smaller than the number of objectFifos"); + if (offsetConfig.maybeOffsetIdx.has_value()) { + size_t offsetIdx = offsetConfig.maybeOffsetIdx.value(); + sourceOffsets[offsetIdx] = offsetConfig.newOffset; + int64_t offsetIdxStride = + getConstantIndexOrAssert(sourceStrides[offsetIdx]); + for (size_t i = 0; i < offsetIdx; i++) { + int64_t stride = getConstantIndexOrAssert(sourceStrides[i]); + if (stride > offsetIdxStride && + !isConstantIntValue(sourceSizes[i], 1)) { + sourceStrides[i] = rewriter.getIndexAttr(stride / splitFactor); + } + } } - assert(sourceOffset < newObjFifos.size() && - "the sourceOffset should be smaller than the number of objectFifos"); - if (maybeOffsetIdx.has_value()) - sourceOffsets[maybeOffsetIdx.value()] = rewriter.getIndexAttr(newOffset); AMDAIE::LogicalObjectFifoFromMemrefOp newObjFifo = - newObjFifos[sourceOffset]; + newObjFifos[offsetConfig.objFifoIndex]; rewriter.setInsertionPoint(consumer); auto newDmaOp = rewriter.create( consumer.getLoc(), consumer.getTarget(), @@ -688,7 +755,9 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter, LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, size_t sourceSplitDim, size_t targetSplitDim, - std::optional maybeSplitFactor) { + std::optional maybeSplitFactor, + int64_t sourceSplitStride, + int64_t targetSplitStride) { if (!op->use_empty()) return op.emitOpError() << "can't be split because it has uses"; 
SmallVector sourceOffsets = op.getSourceMixedOffsets(); @@ -703,6 +772,20 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, assert(targetSplitDim < targetOffsets.size() && "the dimension to be split on should be smaller than the number of " "target dimensions"); + std::optional maybeSourceStride = + getConstantIntValue(sourceStrides[sourceSplitDim]); + std::optional maybeTargetStride = + getConstantIntValue(targetStrides[targetSplitDim]); + if (!maybeSourceStride) { + return op.emitOpError() + << "does not have a static source stride on dim: " << sourceSplitDim; + } + if (!maybeTargetStride) { + return op.emitOpError() + << "does not have a static target stride on dim: " << targetSplitDim; + } + int64_t sourceStride = maybeSourceStride.value(); + int64_t targetStride = maybeTargetStride.value(); std::optional maybeSourceSize = getConstantIntValue(sourceSizes[sourceSplitDim]); std::optional maybeTargetSize = @@ -715,15 +798,11 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, return op.emitOpError() << "does not have a static target size on dim: " << targetSplitDim; } - int64_t sourceSize = maybeSourceSize.value(); int64_t targetSize = maybeTargetSize.value(); int64_t splitFactor = maybeSplitFactor.has_value() ? 
maybeSplitFactor.value() : std::gcd(sourceSize, targetSize); - if (sourceSize < splitFactor || targetSize < splitFactor) { - splitFactor = std::gcd(sourceSize, targetSize); - } if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { return op.emitOpError() << "the target or source size is not divisible by " "the provided splitting factor: " @@ -734,12 +813,38 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, int64_t newTargetSize = targetSize / splitFactor; sourceSizes[sourceSplitDim] = rewriter.getIndexAttr(newSourceSize); targetSizes[targetSplitDim] = rewriter.getIndexAttr(newTargetSize); + if (sourceSplitStride != 1) { + sourceSizes[sourceSplitDim] = + rewriter.getIndexAttr(newSourceSize / sourceSplitStride); + sourceSizes.insert(sourceSizes.begin() + sourceSplitDim, + rewriter.getIndexAttr(sourceSplitStride)); + sourceStrides.insert( + sourceStrides.begin() + sourceSplitDim, + rewriter.getIndexAttr(sourceSize / sourceSplitStride * sourceStride)); + sourceOffsets.insert(sourceOffsets.begin() + sourceSplitDim, + rewriter.getIndexAttr(0)); + sourceSplitDim++; + } + if (targetSplitStride != 1) { + targetSizes[targetSplitDim] = + rewriter.getIndexAttr(newTargetSize / targetSplitStride); + targetSizes.insert(targetSizes.begin() + targetSplitDim, + rewriter.getIndexAttr(targetSplitStride)); + targetStrides.insert( + targetStrides.begin() + targetSplitDim, + rewriter.getIndexAttr(targetSize / targetSplitStride * targetStride)); + targetOffsets.insert(targetOffsets.begin() + targetSplitDim, + rewriter.getIndexAttr(0)); + targetSplitDim++; + } rewriter.setInsertionPoint(op); for (int i = 0; i < splitFactor; ++i) { - FailureOr newSourceOffset = addToOffset( - rewriter, sourceOffsets[sourceSplitDim], newSourceSize); // i * - FailureOr newTargetOffset = addToOffset( - rewriter, targetOffsets[targetSplitDim], newTargetSize); // i * + FailureOr newSourceOffset = + addToOffset(rewriter, sourceOffsets[sourceSplitDim], + newSourceSize / 
sourceSplitStride); + FailureOr newTargetOffset = + addToOffset(rewriter, targetOffsets[targetSplitDim], + newTargetSize / targetSplitStride); if (failed(newSourceOffset)) return op.emitOpError() << "could not create a new source offset"; if (failed(newTargetOffset)) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h index c470d917b..fee4e510c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h @@ -11,6 +11,19 @@ namespace mlir::iree_compiler::AMDAIE { +/// Utility to get the `DmaCpyNdOp` producers and consumers of a given +/// objectFifo op. +LogicalResult getDmaCpyNdOpProducersAndConsumers( + AMDAIE::LogicalObjectFifoFromMemrefOp op, + SmallVector &producers, + SmallVector &consumers); + +/// Utility to return the indices of the dimensions with stride equal to the +/// expected stride and with dynamic or non-zero offsets. +SmallVector getStrideIndicesWithDynamicOrNonZeroOffset( + ArrayRef offsets, ArrayRef strides, + size_t expectedStride); + /// Utility to split logicalobjectfifos given a vector of L2->L1 dma ops. LogicalResult splitLogicalObjectFifoForElementwiseOp( IRRewriter &rewriter, SmallVector &l2ToL1DmaOps, @@ -21,7 +34,8 @@ LogicalResult splitLogicalObjectFifoForElementwiseOp( /// objectFifo will be split on the size of the dimension being split. LogicalResult splitLogicalObjectFifo( IRRewriter &rewriter, AMDAIE::LogicalObjectFifoFromMemrefOp op, - size_t splitDim = 0, std::optional splitFactor = std::nullopt); + size_t splitDim = 0, std::optional splitFactor = std::nullopt, + int64_t splitStride = 1); /// Split doubly strided operations on a source and target split dimension with /// the provided split factor. 
If no split factor is provided, the doubly @@ -29,7 +43,8 @@ LogicalResult splitLogicalObjectFifo( LogicalResult splitDoublyStridedOp( IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, size_t sourceSplitDim = 0, size_t targetSplitDim = 0, - std::optional splitFactor = std::nullopt); + std::optional splitFactor = std::nullopt, + int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1); } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp index 497c44990..f14cc171e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp @@ -13,12 +13,20 @@ namespace mlir::iree_compiler::AMDAIE { -std::optional getConfigAMDAIEDevice( - IREE::HAL::ExecutableTargetAttr targetAttr) { +template +std::optional getConfigAttr(IREE::HAL::ExecutableTargetAttr targetAttr, + StringRef name) { if (!targetAttr) return std::nullopt; auto config = targetAttr.getConfiguration(); if (!config) return std::nullopt; - std::optional attr = config.getAs("target_device"); + std::optional attr = config.getAs(name); + return attr; +} + +std::optional getConfigAMDAIEDevice( + IREE::HAL::ExecutableTargetAttr targetAttr) { + std::optional attr = + getConfigAttr(targetAttr, "target_device"); if (!attr) return std::nullopt; return AMDAIE::symbolizeEnum(attr.value().getValue()); } @@ -29,6 +37,24 @@ std::optional getConfigAMDAIEDevice(Operation *op) { return getConfigAMDAIEDevice(targetAttr); } +/// Utility that returns the number of columns being targeted. +std::optional getConfigNumColumns( + IREE::HAL::ExecutableTargetAttr targetAttr) { + std::optional attr = + getConfigAttr(targetAttr, "num_cols"); + if (!attr) return std::nullopt; + return attr->getInt(); +} + +/// Utility that returns the number of rows being targeted. 
+std::optional getConfigNumRows( + IREE::HAL::ExecutableTargetAttr targetAttr) { + std::optional attr = + getConfigAttr(targetAttr, "num_rows"); + if (!attr) return std::nullopt; + return attr->getInt(); +} + /// Utility to retrieve a constant index from an OpFoldResult. int64_t getConstantIndexOrAssert(OpFoldResult ofr) { std::optional res = getConstantIntValue(ofr); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h index cbb9cbd63..21d2b32b0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h @@ -22,6 +22,14 @@ std::optional getConfigAMDAIEDevice( /// attr in the AST. std::optional getConfigAMDAIEDevice(Operation *op); +/// Returns the number of columns being targeted. +std::optional getConfigNumColumns( + IREE::HAL::ExecutableTargetAttr targetAttr); + +/// Returns the number of rows being targeted. +std::optional getConfigNumRows( + IREE::HAL::ExecutableTargetAttr targetAttr); + /// Utility to retrieve a constant index from an OpFoldResult. 
int64_t getConstantIndexOrAssert(OpFoldResult ofr); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir index af17a7e5f..40f38621e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -1,4 +1,13 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos{num-cols=2})" --split-input-file --verify-diagnostics %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{has no number of columns specified in the target attribute configuration. This device-specific information is required to correctly split logical objectFifos}} +module { + func.func @no_device(%arg0: memref<128x128xi32>) { + return + } +} + +// ----- // Test of splitting matmul lhs input objectFifo and dma operations on 2x2 AIE array. 
@@ -29,7 +38,8 @@ // CHECK: memref.dealloc %[[ALLOC_A0]] : memref<1x1x32x32xi32, 1 : i32> // CHECK: memref.dealloc %[[ALLOC_A1]] : memref<1x1x32x32xi32, 1 : i32> #map = affine_map<(d0) -> (d0 * 64)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 2 : i32, target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_input_lhs(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> @@ -79,7 +89,8 @@ module { // CHECK: memref.dealloc %[[ALLOC_B0]] : memref<1x1x32x32xi32, 1 : i32> // CHECK: memref.dealloc %[[ALLOC_B1]] : memref<1x1x32x32xi32, 1 : i32> #map = affine_map<(d0) -> (d0 * 64)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 2 : i32, target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_input_rhs(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> @@ -136,7 +147,8 @@ module { // CHECK: memref.dealloc %[[ALLOC_C0]] : memref<1x2x32x32xi32, 1 : i32> // CHECK: memref.dealloc %[[ALLOC_C1]] : memref<1x2x32x32xi32, 1 : i32> #map = affine_map<(d0) -> (d0 * 64)> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_output(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> @@ -189,7 +201,8 @@ module { // 
CHECK-SAME: %[[OBJ_L2_A1]][1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1] // CHECK: memref.dealloc %[[ALLOC_A0]] : memref<2x1x32x32xi32, 1 : i32> // CHECK: memref.dealloc %[[ALLOC_A1]] : memref<2x1x32x32xi32, 1 : i32> -module { +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_input_lhs_on_4x2_array(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32> @@ -208,3 +221,173 @@ module { return } } + +// ----- + +// Tests splitting with the consumer DMA offsets depending on a loop induction variable. +// This results in a splitting factor that is different from the size of the dimension being split. + +// CHECK-LABEL: @split_producer_with_loop_dependency +// CHECK-DAG: %[[OBJ_FIFO_L3:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1]) +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { +// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 
4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> +#map = affine_map<(d0) -> (d0 + 4)> +module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { + func.func @split_producer_with_loop_dependency(%arg0: memref<256x128xi32>) { + %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> + %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1], %0[0, 0] [256, 32] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + scf.forall (%arg1, %arg2) in (2, 4) { + %3 = affine.apply #map(%arg2) + %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> + memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> + return + } +} + +// ----- + +// CHECK-LABEL: 
@split_consumer_with_loop_dependency +// CHECK-DAG: %[[OBJ_FIFO_L3:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { +// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK: } +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) +#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> +#map = affine_map<(d0) -> (d0 + 4)> +module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { + func.func @split_consumer_with_loop_dependency(%arg0: memref<256x128xi32>) { + %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> + %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> + %1 = 
amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg1, %arg2) in (2, 4) { + %3 = affine.apply #map(%arg2) + %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %5 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } {mapping = [#gpu.block, #gpu.block]} + %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> + memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> + return + } +} + +// ----- + +// Tests splitting a producer DMA with the consumer DMAs' offsets depending on a loop induction variable through an affine expression with a scale/stride. +// This results in a splitting factor that is different from the size of the dimension being split and more complex splitting along the stride. 
+// For example, if the data in a 4x4 objectFifo at some point is: +// +// [0, 0, 0, 0] +// [1, 1, 1, 1] +// [2, 2, 2, 2] +// [3, 3, 3, 3] +// +// and for an `index` from 0 -> 2, two consumer DMAs access the following rows: +// +// consumer 1: 2 * `index` (thus rows 0 and 2) +// consumer 2: 2 * `index` + 1 (thus rows 1 and 3) +// +// Therefore, the objectFifo is split into two objectFifos in the following way: +// +// new objectFifo 1: +// +// [0, 0, 0, 0] +// [2, 2, 2, 2] +// +// new objectFifo 2: +// +// [1, 1, 1, 1] +// [3, 3, 3, 3] + +// CHECK-LABEL: @split_producer_with_loop_dependency_and_stride +// CHECK-DAG: %[[OBJ_FIFO_L3:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1]) +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { +// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +#executable_target_amdaie_pdi_fb = 
#hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> +#map = affine_map<(d0) -> (d0 * 2)> +#map1 = affine_map<(d0) -> (d0 * 2 + 1)> +module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { + func.func @split_producer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) { + %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> + %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %2 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1], %0[0, 0] [256, 32] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + scf.forall (%arg1, %arg2) in (2, 4) { + %3 = affine.apply #map(%arg2) + %4 = affine.apply #map1(%arg2) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> + memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> + return + } +} + +// ----- + +// Tests splitting a consumer DMA with the producer DMAs' offsets depending on a loop induction variable through an affine expression with a scale/stride. 
+// This results in a splitting factor that is different from the size of the dimension being split and more complex splitting along the stride. + +// CHECK-LABEL: @split_consumer_with_loop_dependency_and_stride +// CHECK-DAG: %[[OBJ_FIFO_L3:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { +// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK: } +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) +#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> +#map = affine_map<(d0) -> (d0 * 2)> +#map1 = affine_map<(d0) -> (d0 * 2 + 1)> +module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { + func.func @split_consumer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) { + %alloc = 
memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> + %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg1, %arg2) in (2, 4) { + %3 = affine.apply #map(%arg2) + %4 = affine.apply #map1(%arg2) + %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } {mapping = [#gpu.block, #gpu.block]} + %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> + memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> + return + } +}