diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp index 476bf5a74..20b5a994c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjectFifos.cpp @@ -48,6 +48,9 @@ void AMDAIESplitLogicalObjectFifosPass::runOnOperation() { SmallVector baseSourceOffsets = l2ToL1DmaOps[0].getSourceMixedOffsets(); + // We will now capture those dimensions where L2 memory was split. The way we + // do this is by checking all L2->L1 DmaOps' source offset and marking those + // dimensions which are not equal to at least one of the source offsets. DenseSet splitDimensionsSet; for (unsigned i = 1, n = l2ToL1DmaOps.size(); i < n; i++) { SmallVector sourceOffsets = @@ -102,6 +105,17 @@ void AMDAIESplitLogicalObjectFifosPass::runOnOperation() { cast( l3ToL2DmaOp.getTargetObjectFifo().getMemref().getType()) .getShape()); + // We traverse through the split dimensions we captured earlier and for each + // such dimension we perform the following updates :- + // 1. Maintain a map: DIM -> CONST_OFFSET_TO_ADD. This is done with the + // assumption that L3<->L2 is 4D and L2<->L1 is 6D. `DIM` here is split + // dimension + 2. `CONST_OFFSET_TO_ADD` is the constant we get by + // multiplying L2 as source's offset at split dimension with L2 as + // target's size at split dimension + 2. We are maintaining this to later + // update the extraction offset of L3 -> L2. + // 2. Update L2 as source/target offset => 0. + // 3. Update L2 as source/target size => 1. + // 4. Compute the shape of L2 buffer after split. DenseMap dimToOffsetMapForL3AsSource; for (unsigned dim : splitDimensionsSet) { std::optional constantOffset = @@ -114,7 +128,6 @@ void AMDAIESplitLogicalObjectFifosPass::runOnOperation() { {dim + 2, constantOffset.value() * (getConstantIntValue(staticL2AsTargetSizes[dim + 2]).value())}); - // splitDimensions.push_back(dim); staticL2AsSourceOffsets[dim] = zeroVal; staticL2AsSourceSizes[dim] = oneVal; staticL2AsTargetOffsets[dim] = zeroVal; @@ -144,6 +157,8 @@ void AMDAIESplitLogicalObjectFifosPass::runOnOperation() { SmallVector staticL3AsSourceOffsets = l3ToL2DmaOp.getSourceMixedOffsets(); + // We now traverse the map : DIM -> CONST_OFFSET_TO_ADD we created earlier + // to update extraction offsets while splitting L3->L2. for (auto [dim, offsetToAdd] : dimToOffsetMapForL3AsSource) { auto applyOp = cast( cast(staticL3AsSourceOffsets[dim]).getDefiningOp()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir index 8441c2683..435fc1d0c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logical_objectfifos.mlir @@ -196,6 +196,8 @@ module { // CHECK-DAG: %[[L3_ALLOC:.*]] = memref.alloc() : memref<128x128xi32> // CHECK-DAG: %[[L2_ALLOC_0:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> // CHECK-DAG: %[[L2_ALLOC_1:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_2:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[L2_ALLOC_3:.*]] = memref.alloc() : memref<1x1x32x32xi32, 1 : i32> // CHECK-NOT: memref.alloc() : memref<1x1x32x32xi32, 1 : i32> // CHECK-NOT: memref.alloc() : memref<1x1x32x32xi32, 1 : i32> // CHECK-DAG: %[[L1_ALLOC:.*]] = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> @@ -207,6 +209,10 @@ module { // CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: %[[L2_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_1]], {%[[TILE_0]]} : // CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_2]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[L2_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L2_ALLOC_3]], {%[[TILE_0]]} : +// CHECK-SAME: memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: %[[L3_OBJECTFIFO:.*]] = amdaie.logicalobjectfifo.from_memref %[[L3_ALLOC]], {%[[TILE_0]]} : // CHECK-SAME: memref<128x128xi32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 2) @@ -219,6 +225,12 @@ module { // CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0:.*]], %[[IV1_0:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] // CHECK: %[[DMA_CPY_ND_L3_TO_L2_1:.*]] = amdaie.dma_cpy_nd( // CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_0:.*]], %[[IV1_32:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_2:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_2]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] +// CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32:.*]], %[[IV1_0:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] +// CHECK: %[[DMA_CPY_ND_L3_TO_L2_3:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L2_OBJECTFIFO_3]][0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1] // CHECK-SAME: %[[L3_OBJECTFIFO]][0, 0, %[[IV0_32:.*]], %[[IV1_32:.*]]] [1, 1, 32, 32] [4096, 32, 128, 1] // CHECK: amdaie.logicalobjectfifo.from_memref // CHECK: amdaie.logicalobjectfifo.from_memref @@ -240,16 +252,34 @@ module { // CHECK: linalg.fill // CHECK: amdaie.end // CHECK: } -// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]} +// CHECK: %[[L1_OBJECTFIFO_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_0]]} // CHECK: %[[DMA_CPY_ND_L2_TO_L1_1:.*]] = amdaie.dma_cpy_nd( // CHECK-SAME: %[[L1_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] // CHECK-SAME: %[[L2_OBJECTFIFO_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] -// CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : +// CHECK: amdaie.core(%[[TILE_0]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_1]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_2]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_2:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_2]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_2]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_2]]], out : +// CHECK: linalg.generic +// CHECK: amdaie.end +// CHECK: } +// CHECK: %[[L1_OBJECTFIFO_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[L1_ALLOC]], {%[[TILE_3]]} +// CHECK: %[[DMA_CPY_ND_L2_TO_L1_3:.*]] = amdaie.dma_cpy_nd( +// CHECK-SAME: %[[L1_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1] +// CHECK-SAME: %[[L2_OBJECTFIFO_3]][0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1] +// CHECK: amdaie.core(%[[TILE_3]], in : [%{{.*}}, %{{.*}}, %[[DMA_CPY_ND_L2_TO_L1_3]]], out : // CHECK: linalg.generic // CHECK: amdaie.end // CHECK: } // CHECK-DAG: memref.dealloc %[[L2_ALLOC_0]] : memref<1x1x32x32xi32, 1 : i32> // CHECK-DAG: memref.dealloc %[[L2_ALLOC_1]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_2]] : memref<1x1x32x32xi32, 1 : i32> +// CHECK-DAG: memref.dealloc %[[L2_ALLOC_3]] : memref<1x1x32x32xi32, 1 : i32> // CHECK-NOT: memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32> // CHECK-NOT: memref.dealloc %{{.*}} : memref<1x1x32x32xi32, 1 : i32> #map = affine_map<(d0) -> (d0 * 64)> @@ -286,54 +316,96 @@ module { %8 = amdaie.dma_cpy_nd(%arg1[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 128, 32, 4, 1], %6[0, 1, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [2048, 1024, 4, 256, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %9 = amdaie.dma_cpy_nd(%arg3[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %arg2[0, 0, 0, 0] [8, 4, 8, 4] [16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %10 = amdaie.core(%tile_7, in : [%7], out : []) { - %11 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> - linalg.fill ins(%c0_i32 : i32) outs(%11 : memref<1x1x4x8x4x8xi32, 2 : i32>) + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%24 : memref<1x1x4x8x4x8xi32, 2 : i32>) amdaie.end } %11 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_4} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> %12 = amdaie.dma_cpy_nd(%11[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %13 = amdaie.core(%tile_4, in : [%7, %8, %12], out : [%9]) { - %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> - %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> - %20 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%18, %19 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { ^bb0(%in: i32, %in_8: i32, %out: i32): - %23 = arith.muli %in, %in_8 : i32 - %24 = arith.addi %out, %23 : i32 - linalg.yield %24 : i32 + %29 = arith.muli %in, %in_8 : i32 + %30 = arith.addi %out, %29 : i32 + linalg.yield %30 : i32 } - %21 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - %22 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%20, %21 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%22 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) { ^bb0(%in: i32, %in_8: i32, %out: i32): - %23 = arith.addi %in, %in_8 : i32 - linalg.yield %23 : i32 + %29 = arith.addi %in, %in_8 : i32 + linalg.yield %29 : i32 } amdaie.end } %14 = amdaie.core(%tile_5, in : [%7], out : []) { - %15 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> - linalg.fill ins(%c0_i32 : i32) outs(%15 : memref<1x1x4x8x4x8xi32, 2 : i32>) + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + linalg.fill ins(%c0_i32 : i32) outs(%24 : memref<1x1x4x8x4x8xi32, 2 : i32>) + amdaie.end + } + %15 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %16 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[0, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %17 = amdaie.core(%tile, in : [%7, %8, %16], out : [%9]) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.muli %in, %in_8 : i32 + %30 = arith.addi %out, %29 : i32 + linalg.yield %30 : i32 + } + %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.addi %in, %in_8 : i32 + linalg.yield %29 : i32 + } + amdaie.end + } + %18 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_5} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %19 = amdaie.dma_cpy_nd(%18[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %20 = amdaie.core(%tile_5, in : [%7, %8, %19], out : [%9]) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.muli %in, %in_8 : i32 + %30 = arith.addi %out, %29 : i32 + linalg.yield %30 : i32 + } + %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + ^bb0(%in: i32, %in_8: i32, %out: i32): + %29 = arith.addi %in, %in_8 : i32 + linalg.yield %29 : i32 + } amdaie.end } - %15 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %16 = amdaie.dma_cpy_nd(%15[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %17 = amdaie.core(%tile_6, in : [%7, %8, %16], out : [%9]) { - %18 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> - %19 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> - %20 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%18, %19 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%20 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + %21 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_6} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %22 = amdaie.dma_cpy_nd(%21[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [1024, 1024, 128, 16, 4, 1], %0[1, 1, 0, 0, 0, 0] [1, 1, 8, 8, 4, 4] [2048, 1024, 4, 128, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %23 = amdaie.core(%tile_6, in : [%7, %8, %22], out : [%9]) { + %24 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x4x8x4x8xi32, 2 : i32> + %25 = amdaie.logicalobjectfifo.access(%arg1, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x4x8x4xi32, 2 : i32> + %26 = amdaie.logicalobjectfifo.access(%arg2, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%24, %25 : memref<1x1x4x8x4x8xi32, 2 : i32>, memref<1x1x8x4x8x4xi32, 2 : i32>) outs(%26 : memref<1x1x8x8x4x4xi32, 2 : i32>) { ^bb0(%in: i32, %in_8: i32, %out: i32): - %23 = arith.muli %in, %in_8 : i32 - %24 = arith.addi %out, %23 : i32 - linalg.yield %24 : i32 + %29 = arith.muli %in, %in_8 : i32 + %30 = arith.addi %out, %29 : i32 + linalg.yield %30 : i32 } - %21 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - %22 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> - linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%20, %21 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%22 : memref<1x1x8x8x4x4xi32, 2 : i32>) { + %27 = amdaie.logicalobjectfifo.access(%arg2, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + %28 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x8x4x4xi32, 2 : i32> + linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%26, %27 : memref<1x1x8x8x4x4xi32, 2 : i32>, memref<1x1x8x8x4x4xi32, 2 : i32>) outs(%28 : memref<1x1x8x8x4x4xi32, 2 : i32>) { ^bb0(%in: i32, %in_8: i32, %out: i32): - %23 = arith.addi %in, %in_8 : i32 - linalg.yield %23 : i32 + %29 = arith.addi %in, %in_8 : i32 + linalg.yield %29 : i32 } amdaie.end }