Move tensor_extract slice up in the block to respect bufferization
pashu123 committed Jan 9, 2025 · 1 parent 09d6cf4 · commit ac25bbb
Showing 6 changed files with 23 additions and 15 deletions.
@@ -260,6 +260,25 @@ void OptimizeTensorInsertExtractSlicesPass::runOnOperation() {
   funcOp.walk([&](scf::ForOp forOp) { moveLoopInvariantCode(forOp); });
   LDBG("after hoisting loop invariant code\n" << funcOp);
+
+  funcOp.walk([&](tensor::ExtractSliceOp extractSliceOp) {
+    // Check that all operands of the tensor.extract_slice op are arguments
+    // of the block that contains it; in that case the op can be moved to
+    // the beginning of that block.
+    // TODO: Use a more general dominance analysis to move the op to the
+    // earliest valid point in the same or another block.
+    for (Value operand : extractSliceOp.getOperands()) {
+      auto blockArg = dyn_cast<BlockArgument>(operand);
+      if (!blockArg || blockArg.getParentBlock() !=
+                           extractSliceOp.getOperation()->getBlock()) {
+        return;
+      }
+    }
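+    // The source is one of the operands, so the loop above guarantees that
+    // the dyn_cast below succeeds.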
+    Value sourceSlice = extractSliceOp.getSource();
+    auto blockArg = dyn_cast<BlockArgument>(sourceSlice);
+    auto &op = blockArg.getParentBlock()->getOperations().front();
+    extractSliceOp->moveBefore(&op);
+  });

   // TODO: walking in some reverse / inside-out order would be more efficient
   // and would capture more cases.
   funcOp.walk(
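For illustration, a minimal sketch of the new rewrite (hypothetical IR; the
names, shapes, and "some.op" below are invented, not taken from this commit's
tests): a tensor.extract_slice whose operands are all arguments of its
enclosing block gets hoisted to the front of that block.

  // Before: the slice sits after other ops in the block.
  ^bb0(%src: tensor<128xf32>, %off: index):
    %v = "some.op"() : () -> vector<4xf32>
    %s = tensor.extract_slice %src[%off] [4] [1]
           : tensor<128xf32> to tensor<4xf32>

  // After: %src and %off are block arguments, so the op is moved before the
  // first operation in the block.
  ^bb0(%src: tensor<128xf32>, %off: index):
    %s = tensor.extract_slice %src[%off] [4] [1]
           : tensor<128xf32> to tensor<4xf32>
    %v = "some.op"() : () -> vector<4xf32>

Hoisting the slice ahead of everything else gives bufferization a simpler
aliasing picture, which appears to be what lets the attention tests below get
by with one fewer memref.alloc (MEMORY-COUNT-4 dropping to MEMORY-COUNT-3).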
@@ -142,13 +142,6 @@ hal.executable private @main {
 // CHECK: scf.forall ({{.*}}) in (17, 81) {
 // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %[[C0]] to %[[C721]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
 // CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[B0]]{{.*}} vector<1xf16>
-// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
-// Note that to simplify the test we are not showing the mapping of the RHS_RD
-// to its buffer as it goes through an scf.if/else control structure
-// involving allocas.
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.*}} vector<1xf16>
-// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
 // CHECK: gpu.barrier
 // CHECK-DAG: %[[LHS_MM0:.+]] = vector.transfer_read {{.*}} vector<4xf16>
 // CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x1x1xf16>
@@ -1150,10 +1150,6 @@ hal.executable public @main {
 // CHECK: scf.forall ({{.*}}) in (12, 37, 10) {
 // CHECK: %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c145 step %c1 {{.*}} -> (vector<1x1x1x4x1xf32>)
 // CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.*}} vector<4xf32>
-// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.*}} vector<1xf32>
-// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
 // CHECK: gpu.barrier
 // CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
 // CHECK-DAG: vector.transfer_read {{.*}} #gpu.address_space<workgroup>>, vector<1xf32>
@@ -1092,7 +1092,7 @@ hal.executable private @attention_multiple_m_transpose {
 // needed because the intrinsic layout matches.
 // TODO: With forall distribution it's allocating memory for S.
 // MEMORY-LABEL: func.func @attention_multiple_m_transpose()
-// MEMORY-COUNT-4: memref.alloc
+// MEMORY-COUNT-3: memref.alloc
 // MEMORY-NOT: memref.alloc

 // -----
@@ -1160,7 +1160,7 @@ hal.executable private @attention_mfma_32x32x8 {
 // Check that we only use alloc for Q, K, and V. No shared memory for S is
 // needed because the intrinsic layout matches.
 // MEMORY-LABEL: func.func @attention_mfma_32x32x8()
-// MEMORY-COUNT-4: memref.alloc
+// MEMORY-COUNT-3: memref.alloc
 // MEMORY-NOT: memref.alloc

 // -----
third_party/llvm-project (2 changes: 1 addition & 1 deletion)
Submodule llvm-project updated 1564 files
