diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp index eb85e6567..2ed884107 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp @@ -23,27 +23,25 @@ namespace { /// possible to do so without changing the underlying access pattern of `X`. For /// example if /// -/// X : offset = [5] sizes = [1] strides = [6] -/// Y : offset = [a] sizes = [b] strides = [3] +/// X has access pattern (offset: [5] sizes: [1] strides: [6]) and +/// Y has access pattern (offset: [a] sizes: [b] strides: [3]) /// -/// Then the access pattern for X can be changed to have the same stride as Y: +/// Then the access pattern for X can be changed to have access pattern +/// (offset: [10] sizes: [1] strides: [3]) so that its stride matches Y's. /// -/// X : offset = [10] sizes = [1] strides = [3] +/// For this transformation to be possible in dimension `d` it is necessary that /// -/// For this transformation to be possible in dimension `d` we need: -/// -/// 1) the size in dimension `d` of `X` to be 1, and +/// 1) the size in dimension `d` of `X` is 1, and /// 2) the updated offset in `d` of `X` (i.e. offset * strideX / strideY) -/// to be an integer. +/// is an integer. 
/// /// As another example, if we have: /// -/// X : offset = [4, 8] sizes = [1, 1] strides = [3, 8] -/// Y : offset = [a, b] sizes = [d, e] strides = [6, 2] -/// -/// Then this function transforms X to: +/// X with access pattern (offset: [4, 8] sizes: [1, 1] strides: [3, 8]) +/// Y with access pattern (offset: [a, b] sizes: [d, e] strides: [6, 2]) /// -/// X : offset = [2, 32] sizes = [1, 1] strides = [6, 2] +/// then X can be transformed to have access pattern +/// (offset: [2, 32] sizes: [1, 1] strides: [6, 2]) void matchStridesOfUnitDims(MLIRContext *ctx, ArrayRef sizesX, SmallVector &stridesX, SmallVector &offsetsX, @@ -93,7 +91,7 @@ void matchStridesOfUnitDims(MLIRContext *ctx, ArrayRef sizesX, /// /// This function computes the difference between the global offsets of two /// access patterns. If it is not constant, i.e. if the difference contains -/// an MLIR Value which is not a constant, then nullopt is returned. +/// an MLIR value which is not a constant, then nullopt is returned. /// /// This function is useful when determining if the access pattern A, followed /// by the access pattern B, can be merged into a single access pattern. @@ -161,17 +159,14 @@ std::optional getGlobalOffsetDifference( /// singleton dimension into the access pattern for X, at the first dimension /// from the back where the access patterns differ. /// -/// For example if X and Y are: -/// -/// X : offset = [0, 0] sizes = [2, 8] strides = [8, 1] -/// Y : offset = [0, 0, 0] sizes = [2, 4, 8] strides = [8, 16, 1] +/// For example if X and Y have access patterns /// -/// then X becomes: +/// X: (offset: [0, 0] sizes: [2, 8] strides: [8, 1]) +/// Y: (offset: [0, 0, 0] sizes: [2, 4, 8] strides: [8, 16, 1]) /// -/// X : offset = [0, 0, 0] sizes = [2, 1, 8] strides = [8, 16, 1] +/// then X is transformed into /// -/// After this function call, the access patterns for X and Y will have the same -/// rank. 
+/// X: (offset: [0, 0, 0] sizes: [2, 1, 8] strides: [8, 16, 1]) void insertUnitDimension(MLIRContext *ctx, SmallVector &offsetsX, SmallVector &sizesX, SmallVector &stridesX, @@ -189,8 +184,8 @@ void insertUnitDimension(MLIRContext *ctx, SmallVector &offsetsX, offsetsX.insert(offsetsX.begin() + index, zeroFoldResult); } -/// If pattern `A` followed by `B` can be merged, merge `B` into `A` and return -/// true. Otherwise return false. +/// If access pattern `A` followed by `B` can be merged into a single access +/// pattern, merge `B` into `A` and return true. Otherwise return false. bool mergeInFirst(MLIRContext *ctx, SmallVector &offsetsA, SmallVector &sizesA, SmallVector &stridesA, @@ -486,7 +481,7 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock( } /// Try to add `offsetToMerge` to one of the offsets in `offsets`. This -/// function assumes an effective stride for `offsetToMerge` of 1, so to add +/// function assumes an effective stride for `offsetToMerge` of one, so to add /// `offsetToMerge` to the offset in dimension `d`, `offsetToMerge` must be /// divisable by the stride in dimension `d`. /// @@ -512,63 +507,62 @@ bool mergeOffset(MLIRContext *ctx, int64_t offsetToMerge, } /// This function tries to reduce the rank of the access pattern by merging -/// unit dimensions into other dimensions. It also permutes the dimensions so -/// that any remaining unit dimensions appear at the front. +/// unit dimensions into other dimensions. /// -/// Example 1: -/// offset = [2, 2, 15] sizes = [1, 1, 10] strides = [4, 6, 10] -/// becomes -/// offset = [17] sizes = [10] strides = [10] -/// -/// Example 2: -/// offset = [3, 1, 15] sizes = [1, 1, 10] strides = [4, 6, 10]. 
-/// becomes -/// offset = [1, 15] sizes = [1, 10] strides = [18, 10] /// LogicalResult foldUnitDims(MLIRContext *ctx, SmallVector &offsets, SmallVector &sizes, SmallVector &strides) { - auto N = offsets.size(); + auto initialRank = offsets.size(); + + int64_t cumulativeOffset{0}; + int64_t insertionIndex{0}; + + SmallVector newOffsets; + SmallVector newSizes; + SmallVector newStrides; - // All size-1 dimensions with constant offset will be merged into one - // dimension (see example 1 above) or zero dimensions (see example 2 above). - int64_t cumulativeConstantOffset{0}; - int index{0}; for (int i = 0; i < offsets.size(); ++i) { // If in dimension `i` there is constant offset, constant stride, and size - // of 1, then update the cumulative offset. Otherwise push back into new - // offsets, sizes, and strides. + // of 1, then update the cumulative offset. auto cOffset = getConstantIntValue(offsets[i]); auto cSize = getConstantIntValue(sizes[i]); auto cStride = getConstantIntValue(strides[i]); - if (cOffset.has_value() && cStride.has_value() && cSize.has_value() && - cSize.value() == 1) { - cumulativeConstantOffset += cOffset.value() * cStride.value(); + bool isSizeOne = cSize.has_value() && cSize.value() == 1; + if (cOffset.has_value() && cStride.has_value() && isSizeOne) { + cumulativeOffset += cOffset.value() * cStride.value(); } else { - offsets[index] = offsets[i]; - sizes[index] = sizes[i]; - strides[index] = strides[i]; - ++index; + insertionIndex += isSizeOne; + newOffsets.push_back(offsets[i]); + newSizes.push_back(sizes[i]); + newStrides.push_back(strides[i]); } } - offsets.resize(index); - sizes.resize(index); - strides.resize(index); + // This is the case where there are no unit dimensions to fold. + if (newStrides.size() == initialRank) return failure(); + + bool merged = mergeOffset(ctx, cumulativeOffset, newOffsets, newStrides); + + // This is the case where there is one unit dimension, but it cannot be + // merged into another dimension. 
+ if (!merged && (newStrides.size() + 1 == initialRank)) return failure(); - bool mergedIntoExistingDim = - mergeOffset(ctx, cumulativeConstantOffset, offsets, strides); - if (!mergedIntoExistingDim) { - // TODO(newling) comment on this. - index = index > 0 ? index - 1 : 0; + // At this point we know that we will be able to reduce the rank, and so will + // start updating offsets, sizes, and strides. + offsets = newOffsets; + sizes = newSizes; + strides = newStrides; + if (!merged) { OpFoldResult one = getAsIndexOpFoldResult(ctx, 1); - OpFoldResult offsetFoldResult = - getAsIndexOpFoldResult(ctx, cumulativeConstantOffset); - offsets.insert(offsets.begin() + index, one); - sizes.insert(sizes.begin() + index, one); - strides.insert(strides.begin() + index, offsetFoldResult); + OpFoldResult offset = getAsIndexOpFoldResult(ctx, cumulativeOffset); + offsets.insert(offsets.begin() + insertionIndex, one); + sizes.insert(sizes.begin() + insertionIndex, one); + strides.insert(strides.begin() + insertionIndex, offset); } - return offsets.size() < N ? success() : failure(); + + assert(offsets.size() < initialRank && "Rank should have been reduced"); + return success(); } //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h index 219528ed4..ab93804ad 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h @@ -141,36 +141,72 @@ LogicalResult foldSingleDim(SmallVector &offsets, SmallVector &strides); /// Fold unit dimensions within a strided access pattern. Returns `success` if -/// folding took place. There are two cases being handled here: -/// 1. If a dimension has `size == 1` and `offset == 0`, the dimension can be -/// folded entirely. -/// 2. 
If a dimension has `size == 1` and `offset != 0`, it can be folded into -/// another dimension with the same stride if that exists. +/// folding took place. If `success` is returned, then the rank of the access +/// pattern has been reduced. If `failure` is returned, `offsets`, `sizes`, and +/// `strides` are left unchanged. /// -/// Example for case 1: +/// Unit dimensions without any offset can be directly removed. Other unit +/// dimensions with offsets contribute to a global offset, which can be +/// considered an 'initial pointer address'. /// -/// offsets: [0, 0, 0], sizes: [32, 1, 8], strides: [32, 1024, 1] +/// The algorithm works roughly as follows: +/// (1) find all the unit dimensions with constant stride and offset and combine +/// them into a single dimension. +/// (2) try to merge the offset from step (1) into a non-unit dimension. This +/// requires finding a non-unit dimension with a stride that divides the +/// offset in (1). /// -/// will be transformed into: +/// After this function has been called, `sizes` will contain either one or +/// zero unit dimensions `d` where offset[d] and stride[d] are constant. It will +/// contain zero such dimensions if step (2) above was successful, and one if +/// it was not. 
/// -/// offsets: [0, 0], sizes: [32, 8], strides: [32, 1] +/// Example 1: +/// --------- /// -/// Example for case 2: +/// offsets: [0, 0, 0], sizes: [32, 1, 8], strides: [32, 1024, 1] /// -/// offsets: [1, 0, 1, 0], sizes: [1, 32, 1, 8], strides: [1024, 32, 1024, 1] +/// this has a global offset of 0, and will be transformed into: /// -/// will be transformed into: +/// offsets: [0, 0], sizes: [32, 8], strides: [32, 1] +/// +/// Example 2: +/// --------- +/// +/// offsets: [1, 0, 1, 0], sizes: [1, 32, 1, 8], strides: [1024, 32, 1024, 1] +/// +/// this has a global offset of 2048, and will be transformed into: +/// +/// offsets: [64, 0], sizes: [32, 8], strides: [32, 1] +/// +/// it could equally well have been transformed into +/// +/// offsets: [0, 2048], sizes: [32, 8], strides: [32, 1] +/// +/// but the current implementation arbitrarily attempts to merge the offset +/// starting from the left-most dimension. +/// +/// Example 3: +/// --------- +/// +/// offsets: [2, 2, 15], sizes: [1, 1, 10], strides: [4, 6, 10] +/// +/// becomes +/// +/// offset: [17], sizes: [10], strides: [10] +/// +/// Example 4: +/// --------- +/// +/// offset: [3, 1, 15], sizes: [1, 1, 10], strides: [4, 6, 10] +/// +/// becomes /// -/// offsets: [2, 0, 0], sizes: [1, 32, 8], strides: [1024, 32, 1] /// -/// Note that the dimensions are merged into the outermost one. Heuristically, -/// this works out best with other strided access pattern transformations, but -/// could be made an option in the future. -LogicalResult foldUnitDims(MLIRContext *, - // ArrayRef offsets, - // ArrayRef strides, - // ArrayRef sizes, - SmallVector &offsets, +/// In this example, step (2) of the algorithm failed, but `success` is still +/// returned because the rank of the access pattern was reduced. +LogicalResult foldUnitDims(MLIRContext *, SmallVector &offsets, SmallVector &strides, SmallVector &sizes);