diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp index eb85e6567..2ed884107 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp @@ -23,27 +23,25 @@ namespace { /// possible to do so without changing the underlying access pattern of `X`. For /// example if /// -/// X : offset = [5] sizes = [1] strides = [6] -/// Y : offset = [a] sizes = [b] strides = [3] +/// X has access pattern (offset: [5] sizes: [1] strides: [6]) and +/// Y has access pattern (offset: [a] sizes: [b] strides: [3]) /// -/// Then the access pattern for X can be changed to have the same stride as Y: +/// Then the access pattern for X can be changed to have access pattern +/// (offset: [10] sizes: [1] strides: [3]) so that its stride matches Y's. /// -/// X : offset = [10] sizes = [1] strides = [3] +/// For this transformation to be possible in dimension `d` it is necessary that /// -/// For this transformation to be possible in dimension `d` we need: -/// -/// 1) the size in dimension `d` of `X` to be 1, and +/// 1) the size in dimension `d` of `X` is 1, and /// 2) the updated offset in `d` of `X` (i.e. offset * strideX / strideY) -/// to be an integer. +/// is an integer. 
/// /// As another example, if we have: /// -/// X : offset = [4, 8] sizes = [1, 1] strides = [3, 8] -/// Y : offset = [a, b] sizes = [d, e] strides = [6, 2] -/// -/// Then this function transforms X to: +/// X with access pattern (offset: [4, 8] sizes: [1, 1] strides: [3, 8]) +/// Y with access pattern (offset: [a, b] sizes: [d, e] strides: [6, 2]) /// -/// X : offset = [2, 32] sizes = [1, 1] strides = [6, 2] +/// then X can be transformed to have access pattern +/// (offset: [2, 32] sizes: [1, 1] strides: [6, 2]) void matchStridesOfUnitDims(MLIRContext *ctx, ArrayRef sizesX, SmallVector &stridesX, SmallVector &offsetsX, @@ -93,7 +91,7 @@ void matchStridesOfUnitDims(MLIRContext *ctx, ArrayRef sizesX, /// /// This function computes the difference between the global offsets of two /// access patterns. If it is not constant, i.e. if the difference contains -/// an MLIR Value which is not a constant, then nullopt is returned. +/// an MLIR value which is not a constant, then nullopt is returned. /// /// This function is useful when determining if the access pattern A, followed /// by the access pattern B, can be merged into a single access pattern. @@ -161,17 +159,14 @@ std::optional getGlobalOffsetDifference( /// singleton dimension into the access pattern for X, at the first dimension /// from the back where the access patterns differ. /// -/// For example if X and Y are: -/// -/// X : offset = [0, 0] sizes = [2, 8] strides = [8, 1] -/// Y : offset = [0, 0, 0] sizes = [2, 4, 8] strides = [8, 16, 1] +/// For example if X and Y have access patterns /// -/// then X becomes: +/// X: (offset: [0, 0] sizes: [2, 8] strides: [8, 1]) +/// Y: (offset: [0, 0, 0] sizes: [2, 4, 8] strides: [8, 16, 1]) /// -/// X : offset = [0, 0, 0] sizes = [2, 1, 8] strides = [8, 16, 1] +/// then X is transformed into /// -/// After this function call, the access patterns for X and Y will have the same -/// rank. 
+/// X: (offset: [0, 0, 0] sizes: [2, 1, 8] strides: [8, 16, 1]) void insertUnitDimension(MLIRContext *ctx, SmallVector &offsetsX, SmallVector &sizesX, SmallVector &stridesX, @@ -189,8 +184,8 @@ void insertUnitDimension(MLIRContext *ctx, SmallVector &offsetsX, offsetsX.insert(offsetsX.begin() + index, zeroFoldResult); } -/// If pattern `A` followed by `B` can be merged, merge `B` into `A` and return -/// true. Otherwise return false. +/// If access pattern `A` followed by `B` can be merged into a single access +/// pattern, merge `B` into `A` and return true. Otherwise return false. bool mergeInFirst(MLIRContext *ctx, SmallVector &offsetsA, SmallVector &sizesA, SmallVector &stridesA, @@ -486,7 +481,7 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock( } /// Try to add `offsetToMerge` to one of the offsets in `offsets`. This -/// function assumes an effective stride for `offsetToMerge` of 1, so to add +/// function assumes an effective stride for `offsetToMerge` of one, so to add /// `offsetToMerge` to the offset in dimension `d`, `offsetToMerge` must be /// divisable by the stride in dimension `d`. /// @@ -512,63 +507,62 @@ bool mergeOffset(MLIRContext *ctx, int64_t offsetToMerge, } /// This function tries to reduce the rank of the access pattern by merging -/// unit dimensions into other dimensions. It also permutes the dimensions so -/// that any remaining unit dimensions appear at the front. +/// unit dimensions into other dimensions. /// -/// Example 1: -/// offset = [2, 2, 15] sizes = [1, 1, 10] strides = [4, 6, 10] -/// becomes -/// offset = [17] sizes = [10] strides = [10] -/// -/// Example 2: -/// offset = [3, 1, 15] sizes = [1, 1, 10] strides = [4, 6, 10]. 
-/// becomes -/// offset = [1, 15] sizes = [1, 10] strides = [18, 10] /// LogicalResult foldUnitDims(MLIRContext *ctx, SmallVector &offsets, SmallVector &sizes, SmallVector &strides) { - auto N = offsets.size(); + auto initialRank = offsets.size(); + + int64_t cumulativeOffset{0}; + int64_t insertionIndex{0}; + + SmallVector newOffsets; + SmallVector newSizes; + SmallVector newStrides; - // All size-1 dimensions with constant offset will be merged into one - // dimension (see example 1 above) or zero dimensions (see example 2 above). - int64_t cumulativeConstantOffset{0}; - int index{0}; for (int i = 0; i < offsets.size(); ++i) { // If in dimension `i` there is constant offset, constant stride, and size - // of 1, then update the cumulative offset. Otherwise push back into new - // offsets, sizes, and strides. + // of 1, then update the cumulative offset. auto cOffset = getConstantIntValue(offsets[i]); auto cSize = getConstantIntValue(sizes[i]); auto cStride = getConstantIntValue(strides[i]); - if (cOffset.has_value() && cStride.has_value() && cSize.has_value() && - cSize.value() == 1) { - cumulativeConstantOffset += cOffset.value() * cStride.value(); + bool isSizeOne = cSize.has_value() && cSize.value() == 1; + if (cOffset.has_value() && cStride.has_value() && isSizeOne) { + cumulativeOffset += cOffset.value() * cStride.value(); } else { - offsets[index] = offsets[i]; - sizes[index] = sizes[i]; - strides[index] = strides[i]; - ++index; + insertionIndex += isSizeOne; + newOffsets.push_back(offsets[i]); + newSizes.push_back(sizes[i]); + newStrides.push_back(strides[i]); } } - offsets.resize(index); - sizes.resize(index); - strides.resize(index); + // This is the case where there are no unit dimensions to fold. + if (newStrides.size() == initialRank) return failure(); + + bool merged = mergeOffset(ctx, cumulativeOffset, newOffsets, newStrides); + + // This is the case where there is one unit dimension, but it cannot be + // merged into another dimension. 
+ if (!merged && (newStrides.size() + 1 == initialRank)) return failure(); - bool mergedIntoExistingDim = - mergeOffset(ctx, cumulativeConstantOffset, offsets, strides); - if (!mergedIntoExistingDim) { - // TODO(newling) comment on this. - index = index > 0 ? index - 1 : 0; + // At this point we know that we will be able to reduce the rank, and so will + // start updating offsets, sizes, and strides. + offsets = newOffsets; + sizes = newSizes; + strides = newStrides; + if (!merged) { OpFoldResult one = getAsIndexOpFoldResult(ctx, 1); - OpFoldResult offsetFoldResult = - getAsIndexOpFoldResult(ctx, cumulativeConstantOffset); - offsets.insert(offsets.begin() + index, one); - sizes.insert(sizes.begin() + index, one); - strides.insert(strides.begin() + index, offsetFoldResult); + OpFoldResult offset = getAsIndexOpFoldResult(ctx, cumulativeOffset); + offsets.insert(offsets.begin() + insertionIndex, one); + sizes.insert(sizes.begin() + insertionIndex, one); + strides.insert(strides.begin() + insertionIndex, offset); } - return offsets.size() < N ? success() : failure(); + + assert(offsets.size() < initialRank && "Rank should have been reduced"); + return success(); } //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h index 219528ed4..ab93804ad 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h @@ -141,36 +141,72 @@ LogicalResult foldSingleDim(SmallVector &offsets, SmallVector &strides); /// Fold unit dimensions within a strided access pattern. Returns `success` if -/// folding took place. There are two cases being handled here: -/// 1. If a dimension has `size == 1` and `offset == 0`, the dimension can be -/// folded entirely. -/// 2. 
If a dimension has `size == 1` and `offset != 0`, it can be folded into -/// another dimension with the same stride if that exists. +/// folding took place. If `success` is returned, then the rank of the access +/// pattern has been reduced. If `failure` is returned, `offsets`, `sizes`, and +/// `strides` are left unchanged. /// -/// Example for case 1: +/// Unit dimensions without any offset can be directly removed. Other unit +/// dimensions with offsets contribute to a global offset, which can be +/// considered an 'initial pointer address'. /// -/// offsets: [0, 0, 0], sizes: [32, 1, 8], strides: [32, 1024, 1] +/// The algorithm works roughly as follows: +/// (1) find all the unit dimensions with constant stride and offset and combine +/// them into a single dimension. +/// (2) try to merge the offset from step (1) into a non-unit dimension. This +/// requires finding a non-unit dimension with a stride that divides the +/// offset in (1). /// -/// will be transformed into: +/// After this function has been called, `sizes` will contain either one or +/// zero unit dimensions `d` where offset[d] and stride[d] are constant. It will +/// contain zero such dimensions if step (2) above was successful, and one if +/// it was not. 
/// -/// offsets: [0, 0], sizes: [32, 8], strides: [32, 1] +/// Example 1: +/// --------- /// -/// Example for case 2: +/// offsets: [0, 0, 0], sizes: [32, 1, 8], strides: [32, 1024, 1] /// -/// offsets: [1, 0, 1, 0], sizes: [1, 32, 1, 8], strides: [1024, 32, 1024, 1] +/// this has a global offset of 0, and will be transformed into: /// -/// will be transformed into: +/// offsets: [0, 0], sizes: [32, 8], strides: [32, 1] +/// +/// Example 2: +/// --------- +/// +/// offsets: [1, 0, 1, 0], sizes: [1, 32, 1, 8], strides: [1024, 32, 1024, 1] +/// +/// this has a global offset of 2048, and will be transformed into: +/// +/// offsets: [64, 0], sizes: [32, 8], strides: [32, 1] +/// +/// it could equally well have been transformed into +/// +/// offsets: [0, 2048], sizes: [32, 8], strides: [32, 1] +/// +/// but the current implementation arbitrarily attempts to merge the offset +/// starting from the left-most dimension. +/// +/// Example 3: +/// --------- +/// +/// offsets: [2, 2, 15], sizes: [1, 1, 10], strides: [4, 6, 10] +/// +/// becomes +/// +/// offset: [17], sizes: [10], strides: [10] +/// +/// Example 4: +/// --------- +/// +/// offset: [3, 1, 15], sizes: [1, 1, 10], strides: [4, 6, 10] +/// +/// becomes /// -/// offsets: [2, 0, 0], sizes: [1, 32, 8], strides: [1024, 32, 1] /// -/// Note that the dimensions are merged into the outermost one. Heuristically, -/// this works out best with other strided access pattern transformations, but -/// could be made an option in the future. -LogicalResult foldUnitDims(MLIRContext *, - // ArrayRef offsets, - // ArrayRef strides, - // ArrayRef sizes, - SmallVector &offsets, +/// In this example, step (2) of the algorithm failed, but `success` is still +/// returned because the rank of the access pattern was reduced. +LogicalResult foldUnitDims(MLIRContext *, SmallVector &offsets, SmallVector &strides, SmallVector &sizes);