diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index d15e210d7..89bc0fd05 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -5,7 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #include "iree-amd-aie/IR/AMDAIEOps.h"
 #include "iree-amd-aie/Transforms/Passes.h"
+#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h"
 #include "iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h"
+#include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h"
+#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
 #include "mlir/IR/Iterators.h"
 #include "mlir/Pass/Pass.h"
 
@@ -18,20 +21,114 @@ namespace {
 /// Utility struct to represent DMA split information.
 struct DmaSplitInfo {
   size_t sourceSplitDim{0};
+  int64_t newSourceStride{1};  // Source stride handed to the DMA split.
   size_t targetSplitDim{0};
+  int64_t newTargetStride{1};  // Target stride handed to the DMA split.
+  int64_t splitSize{1};        // Split factor handed to the DMA split.
+};
+
+/// Utility struct to represent objectFifo split information.
+struct ObjFifoSplitInfo {
+  size_t splitDim{0};      // Dimension of the memref shape to split on.
+  int64_t splitSize{1};    // Split factor passed to the objectFifo split.
+  int64_t splitStride{1};  // Stride derived from the DMA offset scales.
 };
 
 using DmaObjFifoPairT =
     std::pair<AMDAIE::DmaCpyNdOp, AMDAIE::LogicalObjectFifoFromMemrefOp>;
 
+/// Utility to derive the split stride to be used from a vector of DMA ops by
+/// analyzing the offset scales on the source or target side (per `OperateOn`).
+/// Fails if the provided DMA ops don't have a consistent offset scale.
+template <CopyOpOperateOn OperateOn>
+FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
+                                  int64_t sizeAfterSplit) {
+  int64_t splitStride{-1};
+  for (AMDAIE::DmaCpyNdOp dmaOp : dmaOps) {
+    SmallVector<OpFoldResult> offsets;
+    SmallVector<OpFoldResult> strides;
+    if constexpr (OperateOn == CopyOpOperateOn::Source) {
+      offsets = dmaOp.getSourceMixedOffsets();
+      strides = dmaOp.getSourceMixedStrides();
+    } else if constexpr (OperateOn == CopyOpOperateOn::Target) {
+      offsets = dmaOp.getTargetMixedOffsets();
+      strides = dmaOp.getTargetMixedStrides();
+    } else {
+      assert(false && "Function can only operate on Source or Target");
+    }
+    SmallVector<size_t> splitIndices =
+        getStrideIndicesWithDynamicOrNonZeroOffset(offsets, strides,
+                                                   sizeAfterSplit);
+    if (splitIndices.size() > 1)
+      return dmaOp.emitError() << "multiple split indices found";
+    int64_t step{-1};
+    if (splitIndices.empty()) {
+      step = 1;
+    } else {
+      // Exactly one split index: classify its offset to obtain the step.
+      size_t splitIdx = splitIndices[0];
+      OpFoldResult offset = offsets[splitIdx];
+
+      if (std::optional<int64_t> staticOffset = getConstantIntValue(offset);
+          staticOffset.has_value()) {
+        if (staticOffset.value() == 0) continue;
+        step = 1;
+      } else if (auto offsetValue = dyn_cast_if_present<Value>(offset)) {
+        if (auto applyOp = dyn_cast_if_present<affine::AffineApplyOp>(
+                offsetValue.getDefiningOp())) {
+          // Affine offset: the scale retrieved from the single-operand map
+          // is used as the step.
+          if (applyOp.getNumOperands() != 1)
+            return applyOp.emitError() << "multiple operands are not supported";
+          AffineMap affineMap = applyOp.getAffineMap();
+          RetrieveScaleAndBias retriever;
+          if (failed(retriever.visit(affineMap.getResult(0)))) {
+            return applyOp.emitError()
+                   << "could not retrieve scale and bias from expression: "
+                   << *applyOp.getOperation();
+          }
+          if (!retriever.scale.has_value()) {
+            return applyOp.emitError()
+                   << "expected a scale for: " << *applyOp.getOperation();
+          }
+          step = retriever.scale.value();
+        } else if (auto blockArg = dyn_cast<BlockArgument>(offsetValue);
+                   blockArg && isa<LoopLikeOpInterface>(
+                                   blockArg.getOwner()->getParentOp())) {
+          step = 1;
+        } else {
+          return dmaOp.emitOpError()
+                 << "has an offset value that is neither an "
+                    "induction variable nor an affine expression";
+        }
+      } else {
+        return dmaOp.emitOpError()
+               << "has an offset that is neither a constant nor an affine "
+                  "expression, which is not supported";
+      }
+    }
+    if (splitStride == -1) {
+      splitStride = step;
+    } else if (step != splitStride) {
+      return dmaOp.emitOpError() << "has an offset step: " << step
+                                 << ", which is different from "
+                                    "previous offset steps: "
+                                 << splitStride;
+    }
+  }
+  // No DMA op contributed a step (none given, or all skipped): default to 1.
+  if (splitStride == -1) return 1;
+  return splitStride;
+}
+
 /// Find the logical objectFifo and DMA source/target splitting dimensions for
 /// each DMA and objectFifo pair.
 ///
 /// Each pair is handled in the following way:
-/// First, compute the objectFifo splitting dimension as the last non-unit shape
-/// dimension. Afterwards, depending on which logical objectFifo is being
-/// split on, find the outermost dimension in either the source or
-/// target access pattern that has:
+/// First, compute the objectFifo splitting dimension based on the last non-unit
+/// shape dimension and the number of available columns. Afterwards, depending
+/// on which logical objectFifo is being split on, find the outermost dimension
+/// in either the source or target access pattern that has:
 /// - stride == sizeAfterSplit
 /// - size != 1
 /// This is the splitting dimension to be used on the respective side of the DMA
@@ -44,8 +141,9 @@ using DmaObjFifoPairT =
 LogicalResult collectSplittingDims(
     const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
     DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
-    DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, size_t>
-        &objFifoSplitDimMap) {
+    DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, ObjFifoSplitInfo>
+        &objFifoSplitInfoMap,
+    int64_t numCols) {
   for (auto [dmaOp, objFifo] : dmaObjFifoPairs) {
     LLVM_DEBUG(llvm::dbgs() << "dmaOp: " << dmaOp << "\n");
     LLVM_DEBUG(llvm::dbgs() << "objFifo: " << objFifo << "\n");
@@ -62,10 +160,19 @@ LogicalResult collectSplittingDims(
     // If all dimensions are unit (1), no splitting can be done, so continue to
     // the next pair.
     if (objFifoSplitDim >= memrefShape.size()) continue;
+    int64_t splitDimSize = memrefShape[objFifoSplitDim];
     int64_t sizeAfterSplit =
         std::accumulate(memrefShape.begin() + objFifoSplitDim + 1,
                         memrefShape.end(), 1, std::multiplies<>());
 
+    // Get the producers and consumers of the current objectFifoOp.
+    SmallVector<AMDAIE::DmaCpyNdOp> producers;
+    SmallVector<AMDAIE::DmaCpyNdOp> consumers;
+    if (failed(getDmaCpyNdOpProducersAndConsumers(objFifo, producers,
+                                                  consumers))) {
+      return failure();
+    }
+
     size_t sourceSplitDim{0};
     size_t targetSplitDim{0};
     if (dmaOp.getTargetObjectFifo() == objFifo) {
@@ -101,6 +208,27 @@ LogicalResult collectSplittingDims(
           break;
         }
       }
+      FailureOr<int64_t> maybeSplitStride =
+          getSplitStride<CopyOpOperateOn::Source>(consumers, sizeAfterSplit);
+      if (failed(maybeSplitStride)) {
+        return objFifo.emitOpError()
+               << "could not retrieve a split stride from the consumer DMA ops";
+      }
+      int64_t splitStride = maybeSplitStride.value();
+      // Calculate the new source stride to be used for splitting the DMA.
+      int64_t newSourceStride =
+          splitStride != 1 ? splitDimSize / splitStride : 1;
+      LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
+      LLVM_DEBUG(llvm::dbgs()
+                 << "newSourceStride: " << newSourceStride << "\n");
+      LLVM_DEBUG(llvm::dbgs()
+                 << "objFifoSplitDim: " << objFifoSplitDim << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
+      dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim,
+                                1, numCols};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
     } else if (dmaOp.getSourceObjectFifo() == objFifo) {
       // Find outermost dimension in the access pattern that has stride ==
       // sizeAfterSplit and size != 1.
@@ -136,13 +264,28 @@ LogicalResult collectSplittingDims(
           break;
         }
       }
+      FailureOr<int64_t> maybeSplitStride =
+          getSplitStride<CopyOpOperateOn::Target>(producers, sizeAfterSplit);
+      if (failed(maybeSplitStride)) {
+        return objFifo.emitOpError()
+               << "could not retrieve a split stride from the producer DMA ops";
+      }
+      int64_t splitStride = maybeSplitStride.value();
+      // Calculate the new target stride to be used for splitting the DMA.
+      int64_t newTargetStride =
+          splitStride != 1 ? splitDimSize / splitStride : 1;
+      LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
+      LLVM_DEBUG(llvm::dbgs()
+                 << "newTargetStride: " << newTargetStride << "\n");
+      LLVM_DEBUG(llvm::dbgs()
+                 << "objFifoSplitDim: " << objFifoSplitDim << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
+      dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim,
+                                newTargetStride, numCols};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
     }
-    LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
-    LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
-    LLVM_DEBUG(llvm::dbgs() << "objFifoSplitDim: " << objFifoSplitDim << "\n");
-    DmaSplitInfo dmaSplitInfo = {sourceSplitDim, targetSplitDim};
-    dmaSplitInfoMap[dmaOp] = std::move(dmaSplitInfo);
-    objFifoSplitDimMap[objFifo] = objFifoSplitDim;
   }
   return success();
 }
@@ -157,9 +300,6 @@ class AMDAIESplitLogicalObjFifosPass
 
   AMDAIESplitLogicalObjFifosPass() = default;
   AMDAIESplitLogicalObjFifosPass(const AMDAIESplitLogicalObjFifosPass &pass){};
-  AMDAIESplitLogicalObjFifosPass(
-      const AMDAIESplitLogicalObjFifosOptions &options)
-      : AMDAIESplitLogicalObjFifosBase(options) {}
   void runOnOperation() override;
 };
 
@@ -168,6 +308,18 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
   MLIRContext *context = &getContext();
   IRRewriter rewriter(context);
 
+  // Retrieve the number of columns from the target configuration.
+  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(moduleOp);
+  std::optional<int64_t> maybeNumColumns = getConfigNumColumns(targetAttr);
+  if (!maybeNumColumns) {
+    moduleOp.emitOpError() << "has no number of columns specified in the "
+                              "target attribute configuration. This "
+                              "device-specific information is required to "
+                              "correctly split logical objectFifos.";
+    return signalPassFailure();
+  }
+  int64_t numColumns = maybeNumColumns.value();
+
   // Walk and collect all dma ops between L3 and L2.
   SmallVector<AMDAIE::DmaCpyNdOp> l3L2DmaOps;
   SmallVector<DmaObjFifoPairT> dmaObjFifoPairs;
@@ -189,9 +341,10 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
 
   // Collect the split dimensions for all DMA and ojectFifo pairs.
   DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> dmaSplitInfoMap;
-  DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, size_t> objFifoSplitDimMap;
+  DenseMap<AMDAIE::LogicalObjectFifoFromMemrefOp, ObjFifoSplitInfo>
+      objFifoSplitInfoMap;
   if (failed(collectSplittingDims(dmaObjFifoPairs, dmaSplitInfoMap,
-                                  objFifoSplitDimMap))) {
+                                  objFifoSplitInfoMap, numColumns))) {
     return signalPassFailure();
   }
 
@@ -200,16 +353,19 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
   for (auto &&[dmaOp, dmaSplitInfo] : dmaSplitInfoMap) {
     auto stridedOp =
         cast<AMDAIE::DoublyStridedOpInterface>(dmaOp.getOperation());
-    if (failed(splitDoublyStridedOp(rewriter, stridedOp,
-                                    dmaSplitInfo.sourceSplitDim,
-                                    dmaSplitInfo.targetSplitDim, numCols))) {
+    if (failed(splitDoublyStridedOp(
+            rewriter, stridedOp, dmaSplitInfo.sourceSplitDim,
+            dmaSplitInfo.targetSplitDim, dmaSplitInfo.splitSize,
+            dmaSplitInfo.newSourceStride, dmaSplitInfo.newTargetStride))) {
       LLVM_DEBUG(llvm::dbgs()
                  << "Failed to perform splitting of the DMA op: " << dmaOp);
       return signalPassFailure();
     }
   }
-  for (auto &&[objFifo, splitDim] : objFifoSplitDimMap) {
-    if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitDim, numCols))) {
+  for (auto &&[objFifo, splitInfo] : objFifoSplitInfoMap) {
+    if (failed(splitLogicalObjectFifo(rewriter, objFifo, splitInfo.splitDim,
+                                      splitInfo.splitSize,
+                                      splitInfo.splitStride))) {
       LLVM_DEBUG(llvm::dbgs()
                  << "Failed to perform splitting of objectFifo op");
       return signalPassFailure();
@@ -219,9 +375,8 @@ void AMDAIESplitLogicalObjFifosPass::runOnOperation() {
 
 }  // namespace
 
-std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosPass(
-    AMDAIESplitLogicalObjFifosOptions options) {
-  return std::make_unique<AMDAIESplitLogicalObjFifosPass>(options);
+std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosPass() {
+  return std::make_unique<AMDAIESplitLogicalObjFifosPass>();
 }
 
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 2728c4633..0ded0b8fa 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -599,13 +599,8 @@ void addAMDAIEObjectFifoLoweringPasses(
 
   passManager.addPass(createAMDAIESplitLogicalObjFifosForConnectionReusePass());
   // Currently, SplitLogicalObjFifos pass only works for matmul-like ops.
-  {
-    if (useTilePipeline == TilePassPipeline::PackPeelPipeline) {
-      AMDAIESplitLogicalObjFifosOptions splitOptions;
-      splitOptions.numCols = numCols;
-      passManager.addPass(createAMDAIESplitLogicalObjFifosPass(splitOptions));
-    }
-  }
+  if (useTilePipeline == TilePassPipeline::PackPeelPipeline)
+    passManager.addPass(createAMDAIESplitLogicalObjFifosPass());
 
   passManager.addPass(createCSEPass());
   passManager.addPass(createCanonicalizerPass());
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index dd660646d..4963b1709 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -291,8 +291,7 @@ std::unique_ptr<Pass> createAMDAIERemoveMemorySpacePass();
 std::unique_ptr<Pass> createAMDAIESinkIntoCorePass();
 
 /// Create a pass to split logicalobjectfifos for shimTile/memTile distribution.
-std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosPass(
-    AMDAIESplitLogicalObjFifosOptions options = {});
+std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosPass();
 
 /// Create a pass to split logicalobjectfifos for connection reuse.
 std::unique_ptr<Pass> createAMDAIESplitLogicalObjFifosForConnectionReusePass();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
index f7f3db246..1aa0abc31 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -715,10 +715,6 @@ def AMDAIESplitLogicalObjFifos :
     `[1, 2, 32, 32]`, will be split to two `[1, 1, 32, 32]` buffers.
   }];
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIESplitLogicalObjFifosPass()";
-  let options = [
-    Option<"numCols", "num-cols", "uint32_t", /*default=*/"4",
-      "Number of columns used in an AIE core array">
-  ];
 }
 
 def AMDAIESplitLogicalObjFifosForConnectionReuse :
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
index ebddb881d..db100c61c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -8,6 +8,7 @@
 
 #include <numeric>
 
+#include "iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.h"
 #include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -502,8 +503,6 @@ LogicalResult splitLogicalObjectFifoForElementwiseOp(
   return success();
 }
 
-/// Utility to get the `DmaCpyNdOp` producers and consumers of a given
-/// objectFifo op.
 LogicalResult getDmaCpyNdOpProducersAndConsumers(
     AMDAIE::LogicalObjectFifoFromMemrefOp op,
     SmallVector<AMDAIE::DmaCpyNdOp> &producers,
@@ -527,54 +526,119 @@ LogicalResult getDmaCpyNdOpProducersAndConsumers(
   return success();
 }
 
-using OffsetIndexAndNewOffsetT = std::tuple<std::optional<size_t>, int64_t>;
-
-/// Utility to return the index of the offsets array that refers to newly
-/// splitted objectFifo and the respective offset value. Note that there might
-/// not be a dimension with `stride == sizeAfterSplit`, in which case an offset
-/// index can't be returned and the correct offset is `0`.
-FailureOr<OffsetIndexAndNewOffsetT> getOffsetIndexAndOffset(
-    ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> sizes,
-    ArrayRef<OpFoldResult> strides, size_t sizeAfterSplit,
-    function_ref<InFlightDiagnostic()> emitError) {
-  SmallVector<size_t> offsetIndices;
+/// Return the indices of all dimensions whose stride is the constant
+/// `expectedStride` and whose offset is dynamic or a non-zero constant.
+SmallVector<size_t> getStrideIndicesWithDynamicOrNonZeroOffset(
+    ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> strides,
+    size_t expectedStride) {
+  SmallVector<size_t> indices;
   for (auto iter : llvm::enumerate(llvm::zip(strides, offsets))) {
-    std::optional<int64_t> maybeStride =
-        getConstantIntValue(std::get<0>(iter.value()));
     std::optional<int64_t> maybeOffset =
         getConstantIntValue(std::get<1>(iter.value()));
-    if (maybeStride.has_value() && maybeOffset.has_value() &&
-        maybeStride.value() == sizeAfterSplit && maybeOffset.value() != 0) {
-      offsetIndices.push_back(iter.index());
+    bool strideEqualToSizeAfterSplit =
+        isConstantIntValue(std::get<0>(iter.value()), expectedStride);
+    bool dynamicOrNonZeroOffset =
+        !maybeOffset.has_value() || maybeOffset.value() != 0;
+    if (strideEqualToSizeAfterSplit && dynamicOrNonZeroOffset) {
+      indices.push_back(iter.index());
     }
   }
+  return indices;
+}
+
+/// Describes how a producer/consumer DMA is rewritten after an objectFifo split.
+struct OffsetConfig {
+  /// Index of the offset to overwrite; empty when no offset needs updating.
+  std::optional<size_t> maybeOffsetIdx{std::nullopt};
+  /// Index into the list of newly created objectFifos the DMA should use.
+  size_t objFifoIndex{0};
+  /// Replacement offset for `maybeOffsetIdx` (null when there is no index).
+  OpFoldResult newOffset;
+};
+
+/// Utility to compute the new offset configuration to be used for updating
+/// consumer/producer DMA operations (`splitFactor` is currently unused).
+/// First, find the dimension for which the `stride` is equal to
+/// `sizeAfterSplit` and with a dynamic or non-zero offset. This is the new
+/// offset index. Afterwards, check the offset value at this dimension and
+/// compute the `newOffset` value and `objFifoIndex`.
+FailureOr<OffsetConfig> getNewOffsetConfig(
+    RewriterBase &rewriter, ArrayRef<OpFoldResult> offsets,
+    ArrayRef<OpFoldResult> strides, size_t sizeAfterSplit, int64_t splitDimSize,
+    int64_t splitStride, int64_t splitFactor,
+    function_ref<InFlightDiagnostic()> emitError) {
+  SmallVector<size_t> offsetIndices =
+      getStrideIndicesWithDynamicOrNonZeroOffset(offsets, strides,
+                                                 sizeAfterSplit);
   if (offsetIndices.size() > 1)
     return emitError() << "multiple offset indices found";
-
-  int64_t offset{0};
-  std::optional<size_t> maybeOffsetIdx;
-  if (offsetIndices.size() == 1) {
-    size_t offsetIdx = offsetIndices[0];
-    maybeOffsetIdx = offsetIdx;
-    std::optional<int64_t> maybeSize = getConstantIntValue(sizes[offsetIdx]);
-    std::optional<int64_t> maybeOffset =
-        getConstantIntValue(offsets[offsetIdx]);
-    if (!maybeSize || !maybeOffset) {
-      return emitError()
-             << "expected a static target offset and size on index: "
-             << offsetIdx;
+  if (offsetIndices.empty())
+    return OffsetConfig{std::nullopt, 0, OpFoldResult{nullptr}};
+  // Else, offsetIndices.size() == 1
+  size_t offsetIdx = offsetIndices[0];
+  size_t objFifoIndex{0};
+  OpFoldResult newOffsetValue;
+  if (auto offsetValue = dyn_cast_if_present<Value>(offsets[offsetIdx])) {
+    if (auto applyOp = dyn_cast_if_present<affine::AffineApplyOp>(
+            offsetValue.getDefiningOp())) {
+      if (applyOp.getNumOperands() != 1) {
+        return emitError()
+               << "AffineApplyOp with multiple operands is not supported";
+      }
+      Value operand = applyOp.getMapOperands()[0];
+      AffineMap affineMap = applyOp.getAffineMap();
+      RetrieveScaleAndBias retriever;
+      if (failed(retriever.visit(affineMap.getResult(0)))) {
+        return emitError()
+               << "could not retrieve scale and bias from expression: "
+               << *applyOp.getOperation();
+      }
+      if (!retriever.scale) {
+        return emitError() << "expected a scale for: "
+                           << *applyOp.getOperation();
+      }
+      objFifoIndex = retriever.bias.value_or(0);
+      // In case of a unit stride, the index of the new objectFifo needs to be
+      // calculated by dividing by `splitDimSize`. For example, for
+      // `splitDimSize == 2`, offsets 0 and 1 are mapped to objectFifo 0 and
+      // offsets 2 and 3 are mapped to objectFifo 1.
+      if (splitStride == 1) objFifoIndex /= splitDimSize;
+      newOffsetValue = operand;
+    } else if (auto blockArg = dyn_cast<BlockArgument>(offsetValue);
+               blockArg &&
+               isa<LoopLikeOpInterface>(blockArg.getOwner()->getParentOp())) {
+      objFifoIndex = 0;
+      newOffsetValue = offsetValue;
+    } else {
+      return emitError() << "expected an affine expression or induction "
+                            "variable for the offset on index: "
+                         << offsetIdx;
     }
-    offset = maybeOffset.value();
+  } else if (std::optional<int64_t> maybeOffset =
+                 getConstantIntValue(offsets[offsetIdx])) {
+    // In case of a unit stride, the index of the new objectFifo needs to be
+    // calculated by dividing by `splitDimSize`. For example, for
+    // `splitDimSize == 2`, offsets 0 and 1 are mapped to objectFifo 0 and
+    // offsets 2 and 3 are mapped to objectFifo 1.
+    objFifoIndex = maybeOffset.value() / splitDimSize;
+    newOffsetValue = rewriter.getIndexAttr(maybeOffset.value() % splitDimSize);
+
+  } else {
+    return emitError()
+           << "expected a static or affine expression offset on index: "
+           << offsetIdx;
   }
-  return OffsetIndexAndNewOffsetT{maybeOffsetIdx, offset};
+  return OffsetConfig{offsetIdx, objFifoIndex, newOffsetValue};
 }
 
 /// Split a logical objectFifo on the provided split dimension with the
-/// specified splitting factor.
+/// specified splitting factor and stride.
 LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
                                      AMDAIE::LogicalObjectFifoFromMemrefOp op,
                                      size_t splitDim,
-                                     std::optional<size_t> maybeSplitFactor) {
+                                     std::optional<size_t> maybeSplitFactor,
+                                     int64_t splitStride) {
+  OpBuilder::InsertionGuard g(rewriter);
   SmallVector<int64_t> memrefShape =
       llvm::to_vector(op.getMemrefType().getShape());
   int64_t splitFactor = maybeSplitFactor.has_value() ? maybeSplitFactor.value()
@@ -586,6 +650,7 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
       memrefShape[splitDim] % splitFactor == 0 &&
       "the target size for splitting is not divisible by the splitting factor");
   memrefShape[splitDim] /= splitFactor;
+  int64_t splitDimSize = memrefShape[splitDim];
 
   // Create `splitFactor` number of objectFifo ops.
   SmallVector<AMDAIE::LogicalObjectFifoFromMemrefOp> newObjFifos;
@@ -611,30 +676,31 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
     SmallVector<OpFoldResult> targetOffsets = producer.getTargetMixedOffsets();
     SmallVector<OpFoldResult> targetSizes = producer.getTargetMixedSizes();
     SmallVector<OpFoldResult> targetStrides = producer.getTargetMixedStrides();
-    std::optional<size_t> maybeOffsetIdx;
-    int64_t targetOffset{0};
-    FailureOr<OffsetIndexAndNewOffsetT> maybeOffsetIdxAndNewOffset =
-        getOffsetIndexAndOffset(targetOffsets, targetSizes, targetStrides,
-                                sizeAfterSplit,
-                                [&]() { return producer.emitOpError(); });
-    if (failed(maybeOffsetIdxAndNewOffset)) {
+    FailureOr<OffsetConfig> maybeOffsetConfig = getNewOffsetConfig(
+        rewriter, targetOffsets, targetStrides, sizeAfterSplit, splitDimSize,
+        splitStride, splitFactor, [&]() { return producer.emitOpError(); });
+    if (failed(maybeOffsetConfig)) {
       return producer.emitOpError()
              << "failed to find an offset index and new offset";
     }
-    std::tie(maybeOffsetIdx, targetOffset) = maybeOffsetIdxAndNewOffset.value();
-
-    // Adjust offset if the new shape of the split dimension is larger than 1.
-    int64_t newOffset = 0;
-    if (memrefShape[splitDim] > 1) {
-      newOffset = targetOffset % splitFactor;
-      targetOffset /= splitFactor;
-    }
-    assert(targetOffset < newObjFifos.size() &&
+    OffsetConfig offsetConfig = maybeOffsetConfig.value();
+    assert(offsetConfig.objFifoIndex < newObjFifos.size() &&
            "the targetOffset should be smaller than the number of objectFifos");
-    if (maybeOffsetIdx.has_value())
-      targetOffsets[maybeOffsetIdx.value()] = rewriter.getIndexAttr(newOffset);
+    if (offsetConfig.maybeOffsetIdx.has_value()) {
+      size_t offsetIdx = offsetConfig.maybeOffsetIdx.value();
+      targetOffsets[offsetIdx] = offsetConfig.newOffset;
+      int64_t offsetIdxStride =
+          getConstantIndexOrAssert(targetStrides[offsetIdx]);
+      for (size_t i = 0; i < offsetIdx; i++) {
+        int64_t stride = getConstantIndexOrAssert(targetStrides[i]);
+        if (stride > offsetIdxStride &&
+            !isConstantIntValue(targetSizes[i], 1)) {
+          targetStrides[i] = rewriter.getIndexAttr(stride / splitFactor);
+        }
+      }
+    }
     AMDAIE::LogicalObjectFifoFromMemrefOp newObjFifo =
-        newObjFifos[targetOffset];
+        newObjFifos[offsetConfig.objFifoIndex];
     rewriter.setInsertionPoint(producer);
     auto newDmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
         producer.getLoc(), newObjFifo, targetOffsets, targetSizes,
@@ -648,30 +714,31 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
     SmallVector<OpFoldResult> sourceOffsets = consumer.getSourceMixedOffsets();
     SmallVector<OpFoldResult> sourceSizes = consumer.getSourceMixedSizes();
     SmallVector<OpFoldResult> sourceStrides = consumer.getSourceMixedStrides();
-    std::optional<size_t> maybeOffsetIdx;
-    int64_t sourceOffset{0};
-    FailureOr<OffsetIndexAndNewOffsetT> maybeOffsetIdxAndNewOffset =
-        getOffsetIndexAndOffset(sourceOffsets, sourceSizes, sourceStrides,
-                                sizeAfterSplit,
-                                [&]() { return consumer.emitOpError(); });
-    if (failed(maybeOffsetIdxAndNewOffset)) {
+    FailureOr<OffsetConfig> maybeOffsetConfig = getNewOffsetConfig(
+        rewriter, sourceOffsets, sourceStrides, sizeAfterSplit, splitDimSize,
+        splitStride, splitFactor, [&]() { return consumer.emitOpError(); });
+    if (failed(maybeOffsetConfig)) {
       return consumer.emitOpError()
              << "failed to find an offset index and offset";
     }
-    std::tie(maybeOffsetIdx, sourceOffset) = maybeOffsetIdxAndNewOffset.value();
-
-    // Adjust offset if the new shape of the split dimension is larger than 1.
-    int64_t newOffset = 0;
-    if (memrefShape[splitDim] > 1) {
-      newOffset = sourceOffset % splitFactor;
-      sourceOffset /= splitFactor;
+    OffsetConfig offsetConfig = maybeOffsetConfig.value();
+    assert(offsetConfig.objFifoIndex < newObjFifos.size() &&
+           "the objFifoIndex should be smaller than the number of objectFifos");
+    if (offsetConfig.maybeOffsetIdx.has_value()) {
+      size_t offsetIdx = offsetConfig.maybeOffsetIdx.value();
+      sourceOffsets[offsetIdx] = offsetConfig.newOffset;
+      int64_t offsetIdxStride =
+          getConstantIndexOrAssert(sourceStrides[offsetIdx]);
+      for (size_t i = 0; i < offsetIdx; i++) {
+        int64_t stride = getConstantIndexOrAssert(sourceStrides[i]);
+        if (stride > offsetIdxStride &&
+            !isConstantIntValue(sourceSizes[i], 1)) {
+          sourceStrides[i] = rewriter.getIndexAttr(stride / splitFactor);
+        }
+      }
     }
-    assert(sourceOffset < newObjFifos.size() &&
-           "the sourceOffset should be smaller than the number of objectFifos");
-    if (maybeOffsetIdx.has_value())
-      sourceOffsets[maybeOffsetIdx.value()] = rewriter.getIndexAttr(newOffset);
     AMDAIE::LogicalObjectFifoFromMemrefOp newObjFifo =
-        newObjFifos[sourceOffset];
+        newObjFifos[offsetConfig.objFifoIndex];
     rewriter.setInsertionPoint(consumer);
     auto newDmaOp = rewriter.create<AMDAIE::DmaCpyNdOp>(
         consumer.getLoc(), consumer.getTarget(),
@@ -688,7 +755,9 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
 LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
                                    AMDAIE::DoublyStridedOpInterface op,
                                    size_t sourceSplitDim, size_t targetSplitDim,
-                                   std::optional<size_t> maybeSplitFactor) {
+                                   std::optional<size_t> maybeSplitFactor,
+                                   int64_t sourceSplitStride,
+                                   int64_t targetSplitStride) {
   if (!op->use_empty())
     return op.emitOpError() << "can't be split because it has uses";
   SmallVector<OpFoldResult> sourceOffsets = op.getSourceMixedOffsets();
@@ -703,6 +772,20 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
   assert(targetSplitDim < targetOffsets.size() &&
          "the dimension to be split on should be smaller than the number of "
          "target dimensions");
+  std::optional<int64_t> maybeSourceStride =
+      getConstantIntValue(sourceStrides[sourceSplitDim]);
+  std::optional<int64_t> maybeTargetStride =
+      getConstantIntValue(targetStrides[targetSplitDim]);
+  if (!maybeSourceStride) {
+    return op.emitOpError()
+           << "does not have a static source stride on dim: " << sourceSplitDim;
+  }
+  if (!maybeTargetStride) {
+    return op.emitOpError()
+           << "does not have a static target stride on dim: " << targetSplitDim;
+  }
+  int64_t sourceStride = maybeSourceStride.value();
+  int64_t targetStride = maybeTargetStride.value();
   std::optional<int64_t> maybeSourceSize =
       getConstantIntValue(sourceSizes[sourceSplitDim]);
   std::optional<int64_t> maybeTargetSize =
@@ -715,15 +798,11 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
     return op.emitOpError()
            << "does not have a static target size on dim: " << targetSplitDim;
   }
-
   int64_t sourceSize = maybeSourceSize.value();
   int64_t targetSize = maybeTargetSize.value();
   int64_t splitFactor = maybeSplitFactor.has_value()
                             ? maybeSplitFactor.value()
                             : std::gcd(sourceSize, targetSize);
-  if (sourceSize < splitFactor || targetSize < splitFactor) {
-    splitFactor = std::gcd(sourceSize, targetSize);
-  }
   if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
     return op.emitOpError() << "the target or source size is not divisible by "
                                "the provided splitting factor: "
@@ -734,12 +813,38 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
   int64_t newTargetSize = targetSize / splitFactor;
   sourceSizes[sourceSplitDim] = rewriter.getIndexAttr(newSourceSize);
   targetSizes[targetSplitDim] = rewriter.getIndexAttr(newTargetSize);
+  if (sourceSplitStride != 1) {
+    sourceSizes[sourceSplitDim] =
+        rewriter.getIndexAttr(newSourceSize / sourceSplitStride);
+    sourceSizes.insert(sourceSizes.begin() + sourceSplitDim,
+                       rewriter.getIndexAttr(sourceSplitStride));
+    sourceStrides.insert(
+        sourceStrides.begin() + sourceSplitDim,
+        rewriter.getIndexAttr(sourceSize / sourceSplitStride * sourceStride));
+    sourceOffsets.insert(sourceOffsets.begin() + sourceSplitDim,
+                         rewriter.getIndexAttr(0));
+    sourceSplitDim++;
+  }
+  if (targetSplitStride != 1) {
+    targetSizes[targetSplitDim] =
+        rewriter.getIndexAttr(newTargetSize / targetSplitStride);
+    targetSizes.insert(targetSizes.begin() + targetSplitDim,
+                       rewriter.getIndexAttr(targetSplitStride));
+    targetStrides.insert(
+        targetStrides.begin() + targetSplitDim,
+        rewriter.getIndexAttr(targetSize / targetSplitStride * targetStride));
+    targetOffsets.insert(targetOffsets.begin() + targetSplitDim,
+                         rewriter.getIndexAttr(0));
+    targetSplitDim++;
+  }
   rewriter.setInsertionPoint(op);
   for (int i = 0; i < splitFactor; ++i) {
-    FailureOr<OpFoldResult> newSourceOffset = addToOffset(
-        rewriter, sourceOffsets[sourceSplitDim], newSourceSize);  // i *
-    FailureOr<OpFoldResult> newTargetOffset = addToOffset(
-        rewriter, targetOffsets[targetSplitDim], newTargetSize);  // i *
+    FailureOr<OpFoldResult> newSourceOffset =
+        addToOffset(rewriter, sourceOffsets[sourceSplitDim],
+                    newSourceSize / sourceSplitStride);
+    FailureOr<OpFoldResult> newTargetOffset =
+        addToOffset(rewriter, targetOffsets[targetSplitDim],
+                    newTargetSize / targetSplitStride);
     if (failed(newSourceOffset))
       return op.emitOpError() << "could not create a new source offset";
     if (failed(newTargetOffset))
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
index c470d917b..fee4e510c 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
@@ -11,6 +11,19 @@
 
 namespace mlir::iree_compiler::AMDAIE {
 
+/// Utility to get the `DmaCpyNdOp` producers and consumers of a given
+/// objectFifo op.
+LogicalResult getDmaCpyNdOpProducersAndConsumers(
+    AMDAIE::LogicalObjectFifoFromMemrefOp op,
+    SmallVector<AMDAIE::DmaCpyNdOp> &producers,
+    SmallVector<AMDAIE::DmaCpyNdOp> &consumers);
+
+/// Utility to return the indices of the dimensions with stride equal to the
+/// expected stride and with dynamic or non-zero offsets.
+SmallVector<size_t> getStrideIndicesWithDynamicOrNonZeroOffset(
+    ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> strides,
+    size_t expectedStride);
+
 /// Utility to split logicalobjectfifos given a vector of L2->L1 dma ops.
 LogicalResult splitLogicalObjectFifoForElementwiseOp(
     IRRewriter &rewriter, SmallVector<AMDAIE::DmaCpyNdOp> &l2ToL1DmaOps,
@@ -21,7 +34,8 @@ LogicalResult splitLogicalObjectFifoForElementwiseOp(
 /// objectFifo will be split on the size of the dimension being split.
 LogicalResult splitLogicalObjectFifo(
     IRRewriter &rewriter, AMDAIE::LogicalObjectFifoFromMemrefOp op,
-    size_t splitDim = 0, std::optional<size_t> splitFactor = std::nullopt);
+    size_t splitDim = 0, std::optional<size_t> splitFactor = std::nullopt,
+    int64_t splitStride = 1);
 
 /// Split doubly strided operations on a source and target split dimension with
 /// the provided split factor. If no split factor is provided, the doubly
@@ -29,7 +43,8 @@ LogicalResult splitLogicalObjectFifo(
 LogicalResult splitDoublyStridedOp(
     IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op,
     size_t sourceSplitDim = 0, size_t targetSplitDim = 0,
-    std::optional<size_t> splitFactor = std::nullopt);
+    std::optional<size_t> splitFactor = std::nullopt,
+    int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1);
 
 }  // namespace mlir::iree_compiler::AMDAIE
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp
index 497c44990..f14cc171e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.cpp
@@ -13,12 +13,20 @@
 
 namespace mlir::iree_compiler::AMDAIE {
 
-std::optional<AMDAIEDevice> getConfigAMDAIEDevice(
-    IREE::HAL::ExecutableTargetAttr targetAttr) {
+template <typename T>  // Looks up config entry `name` as attribute type T.
+std::optional<T> getConfigAttr(IREE::HAL::ExecutableTargetAttr targetAttr,
+                               StringRef name) {
   if (!targetAttr) return std::nullopt;
   auto config = targetAttr.getConfiguration();
   if (!config) return std::nullopt;
-  std::optional<StringAttr> attr = config.getAs<StringAttr>("target_device");
+  T attr = config.getAs<T>(name);  // null if `name` is absent or not a `T`
+  return attr ? std::optional<T>(attr) : std::nullopt;
+}
+
+std::optional<AMDAIEDevice> getConfigAMDAIEDevice(
+    IREE::HAL::ExecutableTargetAttr targetAttr) {
+  std::optional<StringAttr> attr =
+      getConfigAttr<StringAttr>(targetAttr, "target_device");
   if (!attr) return std::nullopt;
   return AMDAIE::symbolizeEnum<AMDAIEDevice>(attr.value().getValue());
 }
@@ -29,6 +37,24 @@ std::optional<AMDAIEDevice> getConfigAMDAIEDevice(Operation *op) {
   return getConfigAMDAIEDevice(targetAttr);
 }
 
+/// Returns the target config's `num_cols` value, or std::nullopt if missing.
+std::optional<int64_t> getConfigNumColumns(
+    IREE::HAL::ExecutableTargetAttr targetAttr) {
+  std::optional<IntegerAttr> attr =
+      getConfigAttr<IntegerAttr>(targetAttr, "num_cols");
+  if (!attr || !*attr) return std::nullopt;  // also reject a null attribute
+  return attr->getInt();
+}
+
+/// Returns the target config's `num_rows` value, or std::nullopt if missing.
+std::optional<int64_t> getConfigNumRows(
+    IREE::HAL::ExecutableTargetAttr targetAttr) {
+  std::optional<IntegerAttr> attr =
+      getConfigAttr<IntegerAttr>(targetAttr, "num_rows");
+  if (!attr || !*attr) return std::nullopt;  // also reject a null attribute
+  return attr->getInt();
+}
+
 /// Utility to retrieve a constant index from an OpFoldResult.
 int64_t getConstantIndexOrAssert(OpFoldResult ofr) {
   std::optional<int64_t> res = getConstantIntValue(ofr);
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h
index cbb9cbd63..21d2b32b0 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEUtils.h
@@ -22,6 +22,14 @@ std::optional<AMDAIEDevice> getConfigAMDAIEDevice(
 /// attr in the AST.
 std::optional<AMDAIEDevice> getConfigAMDAIEDevice(Operation *op);
 
+/// Returns the targeted number of columns (`num_cols` in the target config).
+std::optional<int64_t> getConfigNumColumns(
+    IREE::HAL::ExecutableTargetAttr targetAttr);
+
+/// Returns the targeted number of rows (`num_rows` in the target config).
+std::optional<int64_t> getConfigNumRows(
+    IREE::HAL::ExecutableTargetAttr targetAttr);
+
 /// Utility to retrieve a constant index from an OpFoldResult.
 int64_t getConstantIndexOrAssert(OpFoldResult ofr);
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
index af17a7e5f..40f38621e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
@@ -1,4 +1,13 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos{num-cols=2})" --split-input-file --verify-diagnostics %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-split-logical-objectfifos)" --split-input-file --verify-diagnostics %s | FileCheck %s
+
+// expected-error @+1 {{has no number of columns specified in the target attribute configuration. This device-specific information is required to correctly split logical objectFifos}}
+module {
+  func.func @no_device(%arg0: memref<128x128xi32>) {
+    return
+  }
+}
+
+// -----
 
 // Test of splitting matmul lhs input objectFifo and dma operations on 2x2 AIE array.
 
@@ -29,7 +38,8 @@
 //       CHECK:   memref.dealloc %[[ALLOC_A0]] : memref<1x1x32x32xi32, 1 : i32>
 //       CHECK:   memref.dealloc %[[ALLOC_A1]] : memref<1x1x32x32xi32, 1 : i32>
 #map = affine_map<(d0) -> (d0 * 64)>
-module {
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 2 : i32, target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @split_L2_input_lhs(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
@@ -79,7 +89,8 @@ module {
 //       CHECK:   memref.dealloc %[[ALLOC_B0]] : memref<1x1x32x32xi32, 1 : i32>
 //       CHECK:   memref.dealloc %[[ALLOC_B1]] : memref<1x1x32x32xi32, 1 : i32>
 #map = affine_map<(d0) -> (d0 * 64)>
-module {
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 2 : i32, target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @split_L2_input_rhs(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
@@ -136,7 +147,8 @@ module {
 //       CHECK:   memref.dealloc %[[ALLOC_C0]] : memref<1x2x32x32xi32, 1 : i32>
 //       CHECK:   memref.dealloc %[[ALLOC_C1]] : memref<1x2x32x32xi32, 1 : i32>
 #map = affine_map<(d0) -> (d0 * 64)>
-module {
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @split_L2_output(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
@@ -189,7 +201,8 @@ module {
 //  CHECK-SAME:                                   %[[OBJ_L2_A1]][1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]
 //       CHECK:   memref.dealloc %[[ALLOC_A0]] : memref<2x1x32x32xi32, 1 : i32>
 //       CHECK:   memref.dealloc %[[ALLOC_A1]] : memref<2x1x32x32xi32, 1 : i32>
-module {
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}  {
   func.func @split_L2_input_lhs_on_4x2_array(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32>
@@ -208,3 +221,173 @@ module {
     return
   }
 }
+
+// -----
+
+// Tests splitting with the consumer DMA offsets depending on a loop induction variable.
+// This results in a splitting factor that is different from the size of the dimension being split.
+
+// CHECK-LABEL: @split_producer_with_loop_dependency
+// CHECK-DAG:   %[[OBJ_FIFO_L3:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
+// CHECK-DAG:   %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1])
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1])
+// CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
+// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
+#map = affine_map<(d0) -> (d0 + 4)>
+module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
+  func.func @split_producer_with_loop_dependency(%arg0: memref<256x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
+    %2 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1], %0[0, 0] [256, 32] [128, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
+    scf.forall (%arg1, %arg2) in (2, 4) {
+      %3 = affine.apply #map(%arg2)
+      %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
+    return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @split_consumer_with_loop_dependency
+// CHECK-DAG:   %[[OBJ_FIFO_L3:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
+// CHECK-DAG:   %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
+// CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
+// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK:       }
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
+#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
+#map = affine_map<(d0) -> (d0 + 4)>
+module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
+  func.func @split_consumer_with_loop_dependency(%arg0: memref<256x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
+    scf.forall (%arg1, %arg2) in (2, 4) {
+      %3 = affine.apply #map(%arg2)
+      %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %5 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %6 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+    memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
+    return
+  }
+}
+
+// -----
+
+// Tests splitting a producer DMA with the consumer DMAs' offsets depending on a loop induction variable through an affine expression with a scale/stride.
+// This results in a splitting factor that is different from the size of the dimension being split and more complex splitting along the stride.
+// For example, if the data in a 4x4 objectFifo at some point is:
+//
+// [0, 0, 0, 0]
+// [1, 1, 1, 1]
+// [2, 2, 2, 2]
+// [3, 3, 3, 3]
+//
+// and for an `index` in the range [0, 2), two consumer DMAs access the following rows:
+//
+// consumer 1: 2 * `index`  (thus rows 0 and 2)
+// consumer 2: 2 * `index` + 1  (thus rows 1 and 3)
+//
+// Therefore, the objectFifo is split into two objectFifos in the following way:
+//
+// new objectFifo 1:
+//
+// [0, 0, 0, 0]
+// [2, 2, 2, 2]
+//
+// new objectFifo 2:
+//
+// [1, 1, 1, 1]
+// [3, 3, 3, 3]
+
+// CHECK-LABEL: @split_producer_with_loop_dependency_and_stride
+// CHECK-DAG:   %[[OBJ_FIFO_L3:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
+// CHECK-DAG:   %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1])
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1])
+// CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
+// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
+#map = affine_map<(d0) -> (d0 * 2)>
+#map1 = affine_map<(d0) -> (d0 * 2 + 1)>
+module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
+  func.func @split_producer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
+    %2 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1], %0[0, 0] [256, 32] [128, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
+    scf.forall (%arg1, %arg2) in (2, 4) {
+      %3 = affine.apply #map(%arg2)
+      %4 = affine.apply #map1(%arg2)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
+    return
+  }
+}
+
+// -----
+
+// Tests splitting a consumer DMA with the producer DMAs' offsets depending on a loop induction variable through an affine expression with a scale/stride.
+// This results in a splitting factor that is different from the size of the dimension being split and more complex splitting along the stride.
+
+// CHECK-LABEL: @split_consumer_with_loop_dependency_and_stride
+// CHECK-DAG:   %[[OBJ_FIFO_L3:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
+// CHECK-DAG:   %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
+// CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
+// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK:       }
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
+#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
+#map = affine_map<(d0) -> (d0 * 2)>
+#map1 = affine_map<(d0) -> (d0 * 2 + 1)>
+module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
+  func.func @split_consumer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) {
+    %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
+    %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
+    %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
+    scf.forall (%arg1, %arg2) in (2, 4) {
+      %3 = affine.apply #map(%arg2)
+      %4 = affine.apply #map1(%arg2)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+    } {mapping = [#gpu.block<y>, #gpu.block<x>]}
+    %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+    memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
+    memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
+    return
+  }
+}