From ac25bbbcfa24e025c09be53ed9838f9f6b7e30a2 Mon Sep 17 00:00:00 2001
From: Prashant Kumar
Date: Wed, 8 Jan 2025 23:55:32 +0530
Subject: [PATCH] Move tensor.extract_slice up in the block to respect bufferization

---
 .../OptimizeTensorInsertExtractSlices.cpp     | 19 +++++++++++++++++++
 .../ROCDL/pipeline_igemm_tile_and_fuse.mlir   |  7 -------
 .../test/ROCDL/pipeline_tile_and_fuse.mlir    |  4 ----
 .../pipeline_vector_distribute_gfx942.mlir    |  4 ++--
 third_party/benchmark                         |  2 +-
 third_party/llvm-project                      |  2 +-
 6 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp b/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp
index 9f4717582773f..c27572a32c8c8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp
@@ -260,6 +260,25 @@ void OptimizeTensorInsertExtractSlicesPass::runOnOperation() {
   funcOp.walk([&](scf::ForOp forOp) { moveLoopInvariantCode(forOp); });
   LDBG("after hoisting loop invariant code\n" << funcOp);
 
+  funcOp.walk([&](tensor::ExtractSliceOp extractSliceOp) {
+    // Check that all operands of the tensor.extract_slice op are block
+    // arguments of the block the op lives in. In that case we can move the
+    // extract_slice op to the beginning of the block.
+    // TODO: Use a more general dominance analysis to move the op to the
+    // earliest point in the same/another block.
+    for (Value operand : extractSliceOp.getOperands()) {
+      auto blockArg = dyn_cast<BlockArgument>(operand);
+      if (!blockArg || blockArg.getParentBlock() !=
+                           extractSliceOp.getOperation()->getBlock()) {
+        return;
+      }
+    }
+    Value sourceSlice = extractSliceOp.getSource();
+    auto blockArg = dyn_cast<BlockArgument>(sourceSlice);
+    auto &op = blockArg.getParentBlock()->getOperations().front();
+    extractSliceOp->moveBefore(&op);
+  });
+
   // TODO: walking in some reverse / inside-out order would be more efficient
   // and would capture more cases.
   funcOp.walk(
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
index 3d3504d87a089..64d22308ab408 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
@@ -142,13 +142,6 @@ hal.executable private @main {
 // CHECK: scf.forall ({{.*}}) in (17, 81) {
 // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C721]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
 // CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<1xf16>
-// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
-// Note that to simplify the test we are not showing the mapping of the RHS_RD
-// to its buffer as it goes through an scf.if/else control structure
-// involving allocas.
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.*}} vector<1xf16>
-// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
 // CHECK: gpu.barrier
 // CHECK-DAG: %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<4xf16>
 // CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x1x1xf16>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index f71add60f4b1a..764a1ee55bfa4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -1150,10 +1150,6 @@ hal.executable public @main {
 // CHECK: scf.forall ({{.*}}) in (12, 37, 10) {
 // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c145 step %c1 {{.*}} -> (vector<1x1x1x4x1xf32>)
 // CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.*}} vector<4xf32>
-// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.*}} vector<1xf32>
-// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
 // CHECK: gpu.barrier
 // CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
 // CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
index 454863e4b79dc..19410c58f23b0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
@@ -1092,7 +1092,7 @@ hal.executable private @attention_multiple_m_transpose {
 // needed because the intrinsic layout matches.
 // TODO: With forall distribution it's allocating memory for S.
 // MEMORY-LABEL: func.func @attention_multiple_m_transpose()
-// MEMORY-COUNT-4: memref.alloc
+// MEMORY-COUNT-3: memref.alloc
 // MEMORY-NOT: memref.alloc
 
 // -----
@@ -1160,7 +1160,7 @@ hal.executable private @attention_mfma_32x32x8 {
 // Check that we only use alloc for Q, K, and V. No shared memory for S is
 // needed because the intrinsic layout matches.
 // MEMORY-LABEL: func.func @attention_mfma_32x32x8()
-// MEMORY-COUNT-4: memref.alloc
+// MEMORY-COUNT-3: memref.alloc
 // MEMORY-NOT: memref.alloc
 
 // -----
diff --git a/third_party/benchmark b/third_party/benchmark
index 1e96bb0ab5e75..99bdb2127d1fa 160000
--- a/third_party/benchmark
+++ b/third_party/benchmark
@@ -1 +1 @@
-Subproject commit 1e96bb0ab5e758861f5bbbd4edbd0a8d9a2a7cae
+Subproject commit 99bdb2127d1fa1cff444bbefb814e105c7d20c45
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 38b454257e16b..6a722c20b272d 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 38b454257e16b614d20cc16d8dc3ea01e8eb4c03
+Subproject commit 6a722c20b272dac9a0e5a2a27a06587b15adb0a3
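
--
Illustrative note (not part of the patch): a minimal before/after sketch of
the rewrite the new walk performs, using hypothetical value names and shapes.
The walk only fires when every operand of the tensor.extract_slice is a block
argument of the op's own block; with static offsets, sizes, and strides the
source is the only operand, so the op can be hoisted to the front of the
block:

  // Before: the extract_slice sits below unrelated ops in the block.
  ^bb0(%idx: index, %init: tensor<128x128xf32>):
    %sum = arith.addi %idx, %idx : index
    %slice = tensor.extract_slice %init[0, 0] [64, 64] [1, 1]
        : tensor<128x128xf32> to tensor<64x64xf32>

  // After: the extract_slice is the first op in the block.
  ^bb0(%idx: index, %init: tensor<128x128xf32>):
    %slice = tensor.extract_slice %init[0, 0] [64, 64] [1, 1]
        : tensor<128x128xf32> to tensor<64x64xf32>
    %sum = arith.addi %idx, %idx : index

Placing the slice ahead of every other op in the block appears to let
bufferization materialize it in place rather than through a temporary copy,
which is presumably why the attention tests now expect one fewer
memref.alloc (MEMORY-COUNT-4 -> MEMORY-COUNT-3) and why the separate
staging transfer_read/transfer_write CHECK lines could be dropped.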