diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp index f433e681d..476bf5a74 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp @@ -46,10 +46,12 @@ void AMDAIESplitLogicalObjectFifosPass::runOnOperation() { if (l2ToL1DmaOps.size() == 0) return; - SmallVector baseSourceOffsets = l2ToL1DmaOps[0].getSourceOffsets(); + SmallVector baseSourceOffsets = + l2ToL1DmaOps[0].getSourceMixedOffsets(); DenseSet splitDimensionsSet; for (unsigned i = 1, n = l2ToL1DmaOps.size(); i < n; i++) { - SmallVector sourceOffsets = l2ToL1DmaOps[i].getSourceOffsets(); + SmallVector sourceOffsets = + l2ToL1DmaOps[i].getSourceMixedOffsets(); for (unsigned j = 0, m = baseSourceOffsets.size(); j < m; j++) { if (baseSourceOffsets[j] != sourceOffsets[j]) { splitDimensionsSet.insert(j); @@ -100,14 +102,8 @@ void AMDAIESplitLogicalObjectFifosPass::runOnOperation() { cast( l3ToL2DmaOp.getTargetObjectFifo().getMemref().getType()) .getShape()); - llvm::outs() << "FOR " << l2ToL1DmaOp << "\n"; - llvm::outs().flush(); - // SmallVector splitDimensionConstants; DenseMap dimToOffsetMapForL3AsSource; for (unsigned dim : splitDimensionsSet) { - // llvm::outs()<<"\tdim : - // "< constantOffset = getConstantIntValue(staticL2AsSourceOffsets[dim]); if (!constantOffset) { @@ -149,14 +145,19 @@ void AMDAIESplitLogicalObjectFifosPass::runOnOperation() { SmallVector staticL3AsSourceOffsets = l3ToL2DmaOp.getSourceMixedOffsets(); for (auto [dim, offsetToAdd] : dimToOffsetMapForL3AsSource) { - llvm::outs() << "Dim = " << dim << ", offsetToAdd = " << offsetToAdd - << "\n"; - llvm::outs().flush(); auto applyOp = cast( cast(staticL3AsSourceOffsets[dim]).getDefiningOp()); AffineMap affineMap = applyOp.getAffineMap(); - llvm::outs() << "AffineMap = " << affineMap << "\n"; - llvm::outs().flush(); + AffineExpr affineExpr = affineMap.getResult(0); + AffineExpr newAffineExpr = affineExpr + offsetToAdd; + auto newAffineMap = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, + {newAffineExpr}, context); + IRRewriter::InsertPoint oldInsertionPoint = rewriter.saveInsertionPoint(); + rewriter.setInsertionPoint(applyOp); + auto newAffineApplyOp = rewriter.create( + applyOp.getLoc(), newAffineMap, applyOp.getMapOperands()); + rewriter.restoreInsertionPoint(oldInsertionPoint); + staticL3AsSourceOffsets[dim] = newAffineApplyOp.getResult(); } // Create new L3 -> L2 Dma Op. rewriter.setInsertionPoint(l3ToL2DmaOp); @@ -164,7 +165,7 @@ void AMDAIESplitLogicalObjectFifosPass::runOnOperation() { l3ToL2DmaOp.getLoc(), source, llvm::ArrayRef(staticL2AsTargetOffsets), llvm::ArrayRef(staticL2AsTargetSizes), llvm::ArrayRef(staticL2AsTargetStrides), l3ToL2DmaOp.getSource(), - l3ToL2DmaOp.getSourceMixedOffsets(), + llvm::ArrayRef(staticL3AsSourceOffsets), llvm::ArrayRef(staticL2AsTargetSizes), l3ToL2DmaOp.getSourceMixedStrides()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir index fbd4b697b..8441c2683 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir @@ -7,7 +7,7 @@ // Test non-candidate core op. // CHECK-LABEL: @split_l2_buffer_no_candidate_core_op -// CHECK-NOT: memref<1024xi32, 1 : i32> +// CHECK-NOT: memref<1x1x32x32xi32, 1 : i32> #map = affine_map<(d0) -> (d0 * 64)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> @@ -48,140 +48,58 @@ module { // ----- -// Test one candidate core op. - -// CHECK-LABEL: @split_l2_buffer_one_candidate_core_op -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> -// CHECK-DAG: %[[L2_ALLOC:.*]] = memref.alloc() : memref<1024xi32, 1 : i32> -// CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> -// CHECK: %[[TILE:.*]] = amdaie.tile(%[[C1]], %[[C3]]) -// CHECK: %[[L2_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC]], {%[[TILE]]} : -// CHECK-SAME: memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE]]} : -// CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> -// CHECK: scf.forall -// CHECK: %[[DMA_CPY_ND_L3_TO_L2:.*]] = amdaie.dma_cpy_nd( -// CHECK-SAME: %[[L2_OBJECTFIFO]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] -// CHECK-SAME: %[[L3_OBJECTFIFO]] -// CHECK: amdaie.logicalobjectfifo.from_memref -// CHECK: amdaie.logicalobjectfifo.from_memref -// CHECK: amdaie.dma_cpy_nd -// CHECK: amdaie.dma_cpy_nd -// CHECK: %[[L1_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]] -// CHECK: %[[DMA_CPY_ND_L2_TO_L1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO]] -// CHECK-SAME: %[[L2_OBJECTFIFO]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] -// CHECK: amdaie.core(%[[TILE]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1]]], out : -// CHECK: linalg.generic -// CHECK: } -// CHECK: memref.dealloc %[[L2_ALLOC]] : memref<1024xi32, 1 : i32> -#map = affine_map<(d0) -> (d0 * 64)> -#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> -#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> -#map3 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d4, d3, d6, d7)> -#map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)> -module { - func.func @split_l2_buffer_one_candidate_core_op(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { - %c3 = arith.constant 3 : index - %c2 = arith.constant 2 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %alloc = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> - %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> - %alloc_1 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> - %alloc_2 = memref.alloc() : memref<128x128xi32> - %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> - %tile = amdaie.tile(%c1, %c3) - %0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> - scf.forall (%arg4, %arg5) in (2, 2) { - %2 = affine.apply #map(%arg5) - %3 = affine.apply #map(%arg4) - %4 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [2, 2, 32, 32] [2048, 1024, 32, 1], %1[0, 0, %3, %2] [2, 2, 32, 32] [4096, 32, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %tile_4 = amdaie.tile(%c1, %c3) - %5 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> - %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %12 = amdaie.core(%tile_4, in : [%7, %8, %10], out : [%11]) { - %13 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> - %14 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> - %15 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%13, %14 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%15 : memref<1x1x8x8x4x4xi32, 2 : i32>) { - ^bb0(%in: i32, %in_5: i32, %out: i32): - %18 = arith.muli %in, %in_5 : i32 - %19 = arith.addi %out, %18 : i32 - linalg.yield %19 : i32 - } - %16 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - %17 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%15, %16 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%17 : memref<1x1x8x8x4x4xi32, 2 : i32>) { - ^bb0(%in: i32, %in_5: i32, %out: i32): - %18 = arith.addi %in, %in_5 : i32 - linalg.yield %18 : i32 - } - amdaie.end - } - } {mapping = [#gpu.block, #gpu.block]} - memref.dealloc %alloc : memref<2x1x32x32xi32, 1 : i32> - memref.dealloc %alloc_3 : memref<1x1x8x8x4x4xi32, 2 : i32> - memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> - memref.dealloc %alloc_1 : memref<2x2x32x32xi32, 1 : i32> - memref.dealloc %alloc_2 : memref<128x128xi32> - return - } -} - -// ----- - // Test two candidate core ops. -// CHECK-LABEL: @split_l2_buffer_two_core_ops +// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)> +// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)> +// CHECK: @split_l2_buffer_two_core_ops // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> -// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1024xi32, 1 : i32> -// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1024xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> // CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> // CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) // CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) // CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : -// CHECK-SAME: memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : -// CHECK-SAME: memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : // CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> -// CHECK: scf.forall +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) +// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map(%[[IV1]]) +// CHECK-DAG: %[[IV1_32:.*]] = affine.apply #map1(%[[IV1]]) +// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map(%[[IV0]]) +// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]]) // CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( -// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] -// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0:.*]], %[[IV1_0:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] // CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( -// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] -// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32:.*]], %[[IV1_32:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.dma_cpy_nd // CHECK: amdaie.dma_cpy_nd // CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]} -// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_0]] -// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] // CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : // CHECK: linalg.generic // CHECK: } // CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} -// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_1]] -// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] // CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : // CHECK: linalg.generic // CHECK: } -// CHECK: memref.dealloc %[[L2_ALLOC_0]] : memref<1024xi32, 1 : i32> -// CHECK: memref.dealloc %[[L2_ALLOC_1]] : memref<1024xi32, 1 : i32> +// CHECK: memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK: memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32> #map = affine_map<(d0) -> (d0 * 64)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> @@ -212,7 +130,7 @@ module { %7 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 256, 32, 8, 1], %5[1, 0, 0, 0, 0, 0] [1, 1, 4, 8, 4, 8] [1024, 1024, 8, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %9 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.dma_cpy_nd(%9[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %11 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %12 = amdaie.core(%tile_5, in : [%7, %8, %10], out : [%11]) { %16 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> @@ -268,34 +186,40 @@ module { // Test mix of candidate and non-candidate core ops. -// CHECK-LABEL: @split_l2_buffer_mixed_core_ops +// CHECK-DAG: #map = affine_map<(d0) -> (d0 * 64)> +// CHECK-DAG: #map1 = affine_map<(d0) -> (d0 * 64 + 32)> +// CHECK: @split_l2_buffer_mixed_core_ops // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> -// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1024xi32, 1 : i32> -// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1024xi32, 1 : i32> -// CHECK-NOT: memref.alloc() : memref<1024xi32, 1 : i32> -// CHECK-NOT: memref.alloc() : memref<1024xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.alloc() : memref<1x1x32x32xi32, 1 : i32> // CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> // CHECK-DAG: %[[TILE_0:.*]] = amdaie.tile(%[[C1]], %[[C3]]) // CHECK-DAG: %[[TILE_1:.*]] = amdaie.tile(%[[C0]], %[[C2]]) // CHECK-DAG: %[[TILE_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) // CHECK-DAG: %[[TILE_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) // CHECK: %[[L2_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_0]], {%[[TILE_0]]} : -// CHECK-SAME: memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : -// CHECK-SAME: memref<1024xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : // CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> -// CHECK: scf.forall +// CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) +// CHECK-DAG: %[[IV1_0:.*]] = affine.apply #map(%[[IV1]]) +// CHECK-DAG: %[[IV1_32:.*]] = affine.apply #map1(%[[IV1]]) +// CHECK-DAG: %[[IV0_0:.*]] = affine.apply #map(%[[IV0]]) +// CHECK-DAG: %[[IV0_32:.*]] = affine.apply #map1(%[[IV0]]) // CHECK: %[[DMA_CPY_ND_L3_TO_L2_0:.*]] = amdaie.dma_cpy_nd( -// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] -// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0:.*]], %[[IV1_0:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] // CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( -// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] -// CHECK-SAME: %[[L3_OBJECTFIFO]] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32:.*]], %[[IV1_32:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.dma_cpy_nd @@ -305,8 +229,9 @@ module { // CHECK: amdaie.end // CHECK: } // CHECK: %[[L1_OBJECTFIFO_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_1]]} -// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_0]] -// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_0:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] // CHECK: amdaie.core(%[[TILE_1]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_0]]], out : // CHECK: linalg.generic // CHECK: amdaie.end @@ -316,16 +241,17 @@ module { // CHECK: amdaie.end // CHECK: } // CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]} -// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd(%[[L1_OBJECTFIFO_1]] -// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 1024, 1] +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] // CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : // CHECK: linalg.generic // CHECK: amdaie.end // CHECK: } -// CHECK-DAG: memref.dealloc %[[L2_ALLOC_0]] : memref<1024xi32, 1 : i32> -// CHECK-DAG: memref.dealloc %[[L2_ALLOC_1]] : memref<1024xi32, 1 : i32> -// CHECK-NOT: memref.dealloc %{{.*}} : memref<1024xi32, 1 : i32> -// CHECK-NOT: memref.dealloc %{{.*}} : memref<1024xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32> +// CHECK-NOT: memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32> #map = affine_map<(d0) -> (d0 * 64)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d2, d5, d3, d6, d8)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d2, d1, d4, d5, d8, d7)> @@ -365,7 +291,7 @@ module { amdaie.end } %11 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %12 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %13 = amdaie.core(%tile_4, in : [%7, %8, %12], out : [%9]) { %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32>