Commit 9a63a40

feat: support intermediate insertions/deletions while copying
test: missing results
feat: try more generalization (partial progress)
fix: uncomment
fix: add comment on how to handle remaining cases
fix: remove unwanted code
revert: unwanted changes
feat: handle transpose without explicit transpose op
chore: run fmt
chore: remove old comment
1 parent: bea6deb
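The headline item here, "handle transpose without explicit transpose op", lets the pattern fire even when the dynamic_slice and the dynamic_update_slice carry the loop induction variable on different dimensions: the rewrite now synthesizes the dimension mapping itself instead of requiring a stablehlo.transpose between the two ops. Below is a minimal standalone sketch of that mapping rule; it assumes the same invariants as the commit, and the helper name mapDsToDusDims plus the hard-coded driver values are illustrative only, not part of the patch.

    // Swapping the two induction dimensions is only a relabeling (an
    // "implicit transpose") when every dimension between them, endpoints
    // included, has slice size 1; otherwise data would actually have to move.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <numeric>
    #include <optional>
    #include <utility>
    #include <vector>

    std::optional<std::vector<int32_t>>
    mapDsToDusDims(int32_t dsIndVarDim, int32_t dusIndVarDim,
                   const std::vector<int64_t> &sliceSizes) {
      std::vector<int32_t> map(sliceSizes.size());
      std::iota(map.begin(), map.end(), 0); // identity (iota) mapping
      if (dsIndVarDim == dusIndVarDim)
        return map; // same dimension: plain slice-and-update
      int32_t lo = std::min(dsIndVarDim, dusIndVarDim);
      int32_t hi = std::max(dsIndVarDim, dusIndVarDim);
      for (int32_t i = lo; i <= hi; i++)
        if (sliceSizes[i] != 1)
          return std::nullopt; // a non-degenerate dim in between: bail out
      std::swap(map[dsIndVarDim], map[dusIndVarDim]);
      return map;
    }

    int main() {
      // DS induction dim 2, DUS induction dim 0, all slice sizes 1:
      // the mapping {2, 1, 0} encodes the implicit transpose.
      if (auto m = mapDsToDusDims(2, 0, {1, 1, 1}))
        for (int32_t d : *m)
          printf("%d ", d); // prints: 2 1 0
      printf("\n");
    }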

File tree

2 files changed: +292 -36 lines changed

src/enzyme_ad/jax/Passes/EnzymeHLOOpt.cpp

Lines changed: 165 additions & 35 deletions
@@ -24299,40 +24299,18 @@ struct WhileIsCopySimplify
           blockArg.getArgNumber() != idx)
         continue;
 
-      // check if update is a DS
-      stablehlo::DynamicSliceOp sliceOp;
-      bool iotaMapping = false;
-      Value sliceOperand;
-      SmallVector<int64_t> mapDStoDUSDim;
-
-      if (sliceOp =
-              dusOp.getUpdate().getDefiningOp<stablehlo::DynamicSliceOp>()) {
-        // simple case where we slice and update
-        iotaMapping = true;
-        mapDStoDUSDim = SmallVector<int64_t>(sliceOp.getStartIndices().size());
-        std::iota(mapDStoDUSDim.begin(), mapDStoDUSDim.end(), 0);
-        sliceOperand = sliceOp.getOperand();
-        if (!isValueAccessibleFromBlock(domInfo, sliceOperand, parentBlock))
-          continue;
-      } else if (auto transposeOp =
-                     dusOp.getUpdate()
-                         .getDefiningOp<stablehlo::TransposeOp>()) {
-        // slice => transpose => update
-        auto tperm = transposeOp.getPermutation();
-        mapDStoDUSDim = SmallVector<int64_t>(tperm.size());
-        for (int i = 0; i < tperm.size(); i++)
-          mapDStoDUSDim[tperm[i]] = i;
-
-        sliceOp =
-            transposeOp.getOperand().getDefiningOp<stablehlo::DynamicSliceOp>();
-        if (!sliceOp)
-          continue;
-        sliceOperand = sliceOp.getOperand();
-        if (!isValueAccessibleFromBlock(domInfo, sliceOperand, parentBlock))
-          continue;
-      } else {
+      int32_t dusInductionVarDim =
+          getInductionVariableDimension(dusOp, inductionVarOffsets, whileOp);
+      if (dusInductionVarDim == -1)
+        continue;
+
+      auto [success, iotaMapping, sliceOp, sliceInductionVarDim, sliceOperand,
+            mapDStoDUSDim] =
+          getIndexMappingInfo(dusOp.getUpdate().getDefiningOp(), domInfo,
+                              parentBlock, rewriter, whileOp,
+                              inductionVarOffsets, dusOp, dusInductionVarDim);
+      if (!success)
         continue;
-      }
 
       bool indicesMatch = true, foundInductionVar = false;
       auto dsShape = cast<ShapedType>(sliceOp.getType()).getShape();
@@ -24345,7 +24323,7 @@ struct WhileIsCopySimplify
       SmallVector<IndexInfo> dusStartIndices(dusOp.getStartIndices().size());
 
       for (size_t i = 0; i < sliceOp.getStartIndices().size(); i++) {
-        int j = mapDStoDUSDim[i];
+        int32_t j = mapDStoDUSDim[i];
 
         auto dsStartIndex = sliceOp.getStartIndices()[i];
         auto dusStartIndex = dusOp.getStartIndices()[j];
@@ -24409,6 +24387,7 @@ struct WhileIsCopySimplify
           rewriter.getDenseI64ArrayAttr(copyInfo.sliceSizes));
 
       auto dusUpdate = sliceOp.getResult();
+
       if (!copyInfo.iotaMapping) {
         SmallVector<int64_t> permutation(copyInfo.mapDStoDUSDim.size());
         for (int i = 0; i < copyInfo.mapDStoDUSDim.size(); i++)
@@ -24444,10 +24423,161 @@ struct WhileIsCopySimplify
     SmallVector<int64_t> sliceSizes;
     SmallVector<IndexInfo> dusStartIndices;
     bool iotaMapping;
-    SmallVector<int64_t> mapDStoDUSDim;
+    SmallVector<int32_t> mapDStoDUSDim;
     unsigned blockArgIdx;
   };
 
+  struct IndexMappingInfo {
+    bool success;
+    bool iotaMapping;
+    stablehlo::DynamicSliceOp sliceOp;
+    int32_t sliceInductionVarDim;
+    Value sliceOperand;
+    SmallVector<int32_t> mapDStoDUSDim;
+  };
+
+  IndexMappingInfo unsupportedIndexMappingInfo() const {
+    return IndexMappingInfo{false, false, nullptr, -1, nullptr, {}};
+  }
+
+  IndexMappingInfo
+  getIndexMappingInfo(Operation *op, DominanceInfo &domInfo, Block *parentBlock,
+                      PatternRewriter &rewriter, stablehlo::WhileOp whileOp,
+                      DenseMap<Value, APInt> &inductionVarOffsets,
+                      stablehlo::DynamicUpdateSliceOp dusOp,
+                      int32_t dusInductionVarDim) const {
+    if (auto sliceOp = dyn_cast<stablehlo::DynamicSliceOp>(op)) {
+      // base case: we have reached the dynamic slice
+      Value sliceOperand = sliceOp.getOperand();
+
+      if (!isValueAccessibleFromBlock(domInfo, sliceOperand, parentBlock))
+        return unsupportedIndexMappingInfo();
+
+      auto inductionVarDim =
+          getInductionVariableDimension(sliceOp, inductionVarOffsets, whileOp);
+      if (inductionVarDim == -1)
+        return unsupportedIndexMappingInfo();
+
+      auto sliceSizes = sliceOp.getSliceSizes();
+
+      SmallVector<int32_t> mapDStoDUSDim(sliceOp.getStartIndices().size(), -1);
+      bool isIotaMapping = false;
+      if (inductionVarDim == dusInductionVarDim) {
+        isIotaMapping = true;
+        std::iota(mapDStoDUSDim.begin(), mapDStoDUSDim.end(), 0);
+      } else {
+        auto minVal = std::min(dusInductionVarDim, inductionVarDim);
+        auto maxVal = std::max(dusInductionVarDim, inductionVarDim);
+
+        for (int32_t i = 0; i < minVal; i++)
+          mapDStoDUSDim[i] = i;
+
+        // every dimension between the two induction dims must have size 1
+        bool allOnes = true;
+        for (int32_t i = minVal; i <= maxVal; i++) {
+          if (sliceSizes[i] != 1) {
+            allOnes = false;
+            break;
+          }
+          mapDStoDUSDim[i] = i;
+        }
+
+        mapDStoDUSDim[dusInductionVarDim] = inductionVarDim;
+        mapDStoDUSDim[inductionVarDim] = dusInductionVarDim;
+
+        if (!allOnes)
+          return unsupportedIndexMappingInfo();
+
+        for (int32_t i = maxVal + 1; i < sliceOp.getStartIndices().size(); i++)
+          mapDStoDUSDim[i] = i;
+      }
+
+      return IndexMappingInfo{true,         isIotaMapping,
+                              sliceOp,      inductionVarDim,
+                              sliceOperand, mapDStoDUSDim};
+    }
+
+    if (auto transposeOp = dyn_cast<stablehlo::TransposeOp>(op)) {
+      // recursive case: apply transpose on the mapped indices
+      auto tperm = transposeOp.getPermutation();
+
+      int32_t mappedDusInductionVarDim = -1;
+      for (int32_t i = 0; i < tperm.size(); i++) {
+        if (tperm[i] == dusInductionVarDim) {
+          mappedDusInductionVarDim = i;
+          break;
+        }
+      }
+
+      auto prevInfo = getIndexMappingInfo(
+          transposeOp.getOperand().getDefiningOp(), domInfo, parentBlock,
+          rewriter, whileOp, inductionVarOffsets, dusOp,
+          mappedDusInductionVarDim);
+      if (!prevInfo.success)
+        return prevInfo;
+
+      // apply transpose on the mapped indices
+      SmallVector<int32_t> newMapping(tperm.size());
+      int32_t sliceInductionVarDim = -1;
+      for (int32_t i = 0; i < tperm.size(); i++) {
+        if (tperm[i] == prevInfo.sliceInductionVarDim) {
+          sliceInductionVarDim = i;
+        }
+        newMapping[tperm[i]] = prevInfo.mapDStoDUSDim[i];
+      }
+
+      return IndexMappingInfo{true,
+                              false,
+                              prevInfo.sliceOp,
+                              sliceInductionVarDim,
+                              prevInfo.sliceOperand,
+                              newMapping};
+    }
+
+    return unsupportedIndexMappingInfo();
+  }
+
+  int32_t
+  getInductionVariableDimension(stablehlo::DynamicSliceOp sliceOp,
+                                DenseMap<Value, APInt> &inductionVarOffsets,
+                                stablehlo::WhileOp whileOp) const {
+    int32_t inductionVarDimension = -1;
+
+    for (size_t i = 0; i < sliceOp.getStartIndices().size(); i++) {
+      auto dsStartIndex = sliceOp.getStartIndices()[i];
+
+      if (!isConstantAcrossLoopIterations(dsStartIndex, whileOp)) {
+        if (inductionVarDimension != -1 || // multiple indices with induction var
+            !inductionVarOffsets.contains(dsStartIndex))
+          return -1;
+
+        inductionVarDimension = i;
+      }
+    }
+
+    return inductionVarDimension;
+  }
+
+  int32_t
+  getInductionVariableDimension(stablehlo::DynamicUpdateSliceOp dusOp,
+                                DenseMap<Value, APInt> &inductionVarOffsets,
+                                stablehlo::WhileOp whileOp) const {
+    int32_t inductionVarDimension = -1;
+
+    for (size_t i = 0; i < dusOp.getStartIndices().size(); i++) {
+      auto dusStartIndex = dusOp.getStartIndices()[i];
+
+      if (!isConstantAcrossLoopIterations(dusStartIndex, whileOp)) {
+        if (inductionVarDimension != -1 || // multiple indices with induction var
+            !inductionVarOffsets.contains(dusStartIndex))
+          return -1;
+
+        inductionVarDimension = i;
+      }
+    }
+
+    return inductionVarDimension;
+  }
+
   SmallVector<Value> indexInfoToValues(Location loc,
                                        ArrayRef<IndexInfo> indices,
                                        PatternRewriter &rewriter) const {
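The recursive transpose case above composes an already-computed DS-to-DUS dimension mapping with a transpose permutation (newMapping[tperm[i]] = prev[i]). A small illustrative sketch of just that composition step follows; the helper name composeWithTranspose is assumed for illustration and is not from the patch.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Mirrors the loop in the new getIndexMappingInfo: the mapping entry
    // computed for position i of the transpose operand is re-filed under
    // tperm[i] in the composed mapping.
    std::vector<int32_t>
    composeWithTranspose(const std::vector<int64_t> &tperm,
                         const std::vector<int32_t> &prevMap) {
      std::vector<int32_t> newMap(tperm.size());
      for (std::size_t i = 0; i < tperm.size(); i++)
        newMap[tperm[i]] = prevMap[i];
      return newMap;
    }

For example, composeWithTranspose({1, 0, 2}, {0, 1, 2}) yields {1, 0, 2}: an identity mapping followed by a dim-0/dim-1 transpose swaps those two dimensions, matching what the deleted inline transpose handling computed with mapDStoDUSDim[tperm[i]] = i.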

test/lit_tests/while_is_copy.mlir

Lines changed: 127 additions & 1 deletion
@@ -1,4 +1,4 @@
-// RUN: enzymexlamlir-opt --enzyme-hlo-opt %s | FileCheck %s
+// RUN: enzymexlamlir-opt --enzyme-hlo-generate-td="patterns=reshape_dynamic_slice(1);reshape_licm(1);transpose_dynamic_slice;transpose_licm(1)" --transform-interpreter --enzyme-hlo-remove-transform --enzyme-hlo-opt %s | FileCheck %s
 
 module {
   func.func @main(%arg0: tensor<10xf32> {tf.aliasing_output = 0 : i32}, %arg1: tensor<10xf32>) -> tensor<10xf32> {
@@ -165,3 +165,129 @@ module {
 // CHECK-NEXT: %2 = stablehlo.transpose %1, dims = [2, 0, 1] : (tensor<5x10x4xf32>) -> tensor<4x5x10xf32>
 // CHECK-NEXT: return %2 : tensor<4x5x10xf32>
 // CHECK-NEXT: }
+
+module {
+  func.func @main(%arg0: tensor<10xf64>) -> tensor<10xf64> {
+    %c = stablehlo.constant dense<1> : tensor<i32>
+    %c_0 = stablehlo.constant dense<0> : tensor<i64>
+    %c_1 = stablehlo.constant dense<10> : tensor<i64>
+    %c_2 = stablehlo.constant dense<1> : tensor<i64>
+    %cst = stablehlo.constant dense<0.000000e+00> : tensor<10xf64>
+    %0 = stablehlo.reshape %arg0 : (tensor<10xf64>) -> tensor<10x1xf64>
+    %1:2 = stablehlo.while(%iterArg = %c_0, %iterArg_3 = %cst) : tensor<i64>, tensor<10xf64>
+    cond {
+      %2 = stablehlo.compare LT, %iterArg, %c_1 : (tensor<i64>, tensor<i64>) -> tensor<i1>
+      stablehlo.return %2 : tensor<i1>
+    } do {
+      %2 = stablehlo.add %c_2, %iterArg : tensor<i64>
+      %3 = stablehlo.convert %2 : (tensor<i64>) -> tensor<i32>
+      %4 = stablehlo.subtract %3, %c : tensor<i32>
+      %5 = stablehlo.dynamic_slice %0, %iterArg, %c_0, sizes = [1, 1] : (tensor<10x1xf64>, tensor<i64>, tensor<i64>) -> tensor<1x1xf64>
+      %6 = stablehlo.reshape %5 : (tensor<1x1xf64>) -> tensor<1xf64>
+      %7 = stablehlo.dynamic_update_slice %iterArg_3, %6, %4 : (tensor<10xf64>, tensor<1xf64>, tensor<i32>) -> tensor<10xf64>
+      stablehlo.return %2, %7 : tensor<i64>, tensor<10xf64>
+    }
+    return %1#1 : tensor<10xf64>
+  }
+}
+
+// CHECK: func.func @main(%arg0: tensor<10xf64>) -> tensor<10xf64> {
+// CHECK-NEXT: return %arg0 : tensor<10xf64>
+// CHECK-NEXT: }
+
+module {
+  func.func @main(%arg0: tensor<5x4x3xf32>) -> tensor<4x5x3xf32> {
+    %c = stablehlo.constant dense<0> : tensor<i32>
+    %cst = stablehlo.constant dense<0.000000e+00> : tensor<4x5x3xf32>
+    %c_0 = stablehlo.constant dense<1> : tensor<i32>
+    %c_1 = stablehlo.constant dense<0> : tensor<i64>
+    %c_2 = stablehlo.constant dense<4> : tensor<i64>
+    %c_3 = stablehlo.constant dense<1> : tensor<i64>
+    %0 = stablehlo.broadcast_in_dim %arg0, dims = [2, 0, 3] : (tensor<5x4x3xf32>) -> tensor<4x1x5x3xf32>
+    %1:2 = stablehlo.while(%iterArg = %c_1, %iterArg_4 = %cst) : tensor<i64>, tensor<4x5x3xf32>
+    cond {
+      %2 = stablehlo.compare LT, %iterArg, %c_2 : (tensor<i64>, tensor<i64>) -> tensor<i1>
+      stablehlo.return %2 : tensor<i1>
+    } do {
+      %2 = stablehlo.add %c_3, %iterArg : tensor<i64>
+      %3 = stablehlo.convert %2 : (tensor<i64>) -> tensor<i32>
+      %4 = stablehlo.subtract %3, %c_0 : tensor<i32>
+      %5 = stablehlo.dynamic_slice %0, %iterArg, %c_1, %c_1, %c_1, sizes = [1, 1, 5, 3] : (tensor<4x1x5x3xf32>, tensor<i64>, tensor<i64>, tensor<i64>, tensor<i64>) -> tensor<1x1x5x3xf32>
+      %6 = stablehlo.reshape %5 : (tensor<1x1x5x3xf32>) -> tensor<1x5x3xf32>
+      %7 = stablehlo.dynamic_update_slice %iterArg_4, %6, %4, %c, %c : (tensor<4x5x3xf32>, tensor<1x5x3xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<4x5x3xf32>
+      stablehlo.return %2, %7 : tensor<i64>, tensor<4x5x3xf32>
+    }
+    return %1#1 : tensor<4x5x3xf32>
+  }
+}
+
+// CHECK: func.func @main(%arg0: tensor<5x4x3xf32>) -> tensor<4x5x3xf32> {
+// CHECK-NEXT: %0 = stablehlo.transpose %arg0, dims = [1, 0, 2] : (tensor<5x4x3xf32>) -> tensor<4x5x3xf32>
+// CHECK-NEXT: return %0 : tensor<4x5x3xf32>
+// CHECK-NEXT: }
+
+module {
+  func.func @main(%arg0: tensor<5x4x3xf32>, %arg1: tensor<3x1x4x1x5xf32>) -> tensor<5x4x3xf32> {
+    %c = stablehlo.constant dense<0> : tensor<i32>
+    %c_0 = stablehlo.constant dense<1> : tensor<i32>
+    %c_1 = stablehlo.constant dense<0> : tensor<i64>
+    %c_2 = stablehlo.constant dense<1> : tensor<i64>
+    %c_3 = stablehlo.constant dense<4> : tensor<i64>
+    %0 = stablehlo.transpose %arg1, dims = [4, 1, 2, 3, 0] : (tensor<3x1x4x1x5xf32>) -> tensor<5x1x4x1x3xf32>
+    %1:2 = stablehlo.while(%iterArg = %c_1, %iterArg_3 = %arg0) : tensor<i64>, tensor<5x4x3xf32>
+    cond {
+      %2 = stablehlo.compare LT, %iterArg, %c_3 : (tensor<i64>, tensor<i64>) -> tensor<i1>
+      stablehlo.return %2 : tensor<i1>
+    } do {
+      %2 = stablehlo.add %c_2, %iterArg : tensor<i64>
+      %3 = stablehlo.convert %2 : (tensor<i64>) -> tensor<i32>
+      %4 = stablehlo.subtract %3, %c_0 : tensor<i32>
+      %5 = stablehlo.dynamic_slice %0, %c, %c, %4, %c, %c, sizes = [5, 1, 1, 1, 3] : (tensor<5x1x4x1x3xf32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x1x1x3xf32>
+      %6 = stablehlo.reshape %5 : (tensor<5x1x1x1x3xf32>) -> tensor<5x1x3xf32>
+      %7 = stablehlo.dynamic_update_slice %iterArg_3, %6, %c, %4, %c : (tensor<5x4x3xf32>, tensor<5x1x3xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x4x3xf32>
+      stablehlo.return %2, %7 : tensor<i64>, tensor<5x4x3xf32>
+    }
+    return %1#1 : tensor<5x4x3xf32>
+  }
+}
+
+// CHECK: func.func @main(%arg0: tensor<5x4x3xf32>, %arg1: tensor<3x1x4x1x5xf32>) -> tensor<5x4x3xf32> {
+// CHECK-NEXT: %0 = stablehlo.transpose %arg1, dims = [4, 1, 2, 3, 0] : (tensor<3x1x4x1x5xf32>) -> tensor<5x1x4x1x3xf32>
+// CHECK-NEXT: %1 = stablehlo.reshape %0 : (tensor<5x1x4x1x3xf32>) -> tensor<5x4x3xf32>
+// CHECK-NEXT: return %1 : tensor<5x4x3xf32>
+// CHECK-NEXT: }
+
+module {
+  func.func @main(%arg0: tensor<1x3x4x1x5xf32>, %arg1: tensor<5x4x3xf32>) -> tensor<1x3x4x1x5xf32> {
+    %c = stablehlo.constant dense<0> : tensor<i32>
+    %c_0 = stablehlo.constant dense<1> : tensor<i32>
+    %c_1 = stablehlo.constant dense<0> : tensor<i64>
+    %c_2 = stablehlo.constant dense<1> : tensor<i64>
+    %c_3 = stablehlo.constant dense<3> : tensor<i64>
+    %0:2 = stablehlo.while(%iterArg = %c_1, %iterArg_4 = %arg0) : tensor<i64>, tensor<1x3x4x1x5xf32>
+    cond {
+      %1 = stablehlo.compare LT, %iterArg, %c_3 : (tensor<i64>, tensor<i64>) -> tensor<i1>
+      stablehlo.return %1 : tensor<i1>
+    } do {
+      %1 = stablehlo.add %c_2, %iterArg : tensor<i64>
+      %2 = stablehlo.convert %1 : (tensor<i64>) -> tensor<i32>
+      %3 = stablehlo.subtract %2, %c_0 : tensor<i32>
+      %4 = stablehlo.dynamic_slice %arg1, %c, %3, %c, sizes = [5, 1, 3] : (tensor<5x4x3xf32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<5x1x3xf32>
+      %5 = stablehlo.reshape %4 : (tensor<5x1x3xf32>) -> tensor<5x1x1x3x1xf32>
+      %6 = stablehlo.transpose %5, dims = [4, 3, 2, 1, 0] : (tensor<5x1x1x3x1xf32>) -> tensor<1x3x1x1x5xf32>
+      %7 = stablehlo.dynamic_update_slice %iterArg_4, %6, %c, %c, %3, %c, %c : (tensor<1x3x4x1x5xf32>, tensor<1x3x1x1x5xf32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<1x3x4x1x5xf32>
+      stablehlo.return %1, %7 : tensor<i64>, tensor<1x3x4x1x5xf32>
+    }
+    return %0#1 : tensor<1x3x4x1x5xf32>
+  }
+}
+
+// CHECK: func.func @main(%arg0: tensor<1x3x4x1x5xf32>, %arg1: tensor<5x4x3xf32>) -> tensor<1x3x4x1x5xf32> {
+// CHECK-NEXT: %0 = stablehlo.slice %arg1 [0:5, 0:3, 0:3] : (tensor<5x4x3xf32>) -> tensor<5x3x3xf32>
+// CHECK-NEXT: %1 = stablehlo.reshape %0 : (tensor<5x3x3xf32>) -> tensor<5x3x1x3x1xf32>
+// CHECK-NEXT: %2 = stablehlo.transpose %1, dims = [4, 3, 2, 1, 0] : (tensor<5x3x1x3x1xf32>) -> tensor<1x3x1x3x5xf32>
+// CHECK-NEXT: %3 = stablehlo.reshape %2 : (tensor<1x3x1x3x5xf32>) -> tensor<1x3x3x1x5xf32>
+// CHECK-NEXT: %4 = stablehlo.slice %arg0 [0:1, 0:3, 3:4, 0:1, 0:5] : (tensor<1x3x4x1x5xf32>) -> tensor<1x3x1x1x5xf32>
+// CHECK-NEXT: %5 = stablehlo.concatenate %3, %4, dim = 2 : (tensor<1x3x3x1x5xf32>, tensor<1x3x1x1x5xf32>) -> tensor<1x3x4x1x5xf32>
+// CHECK-NEXT: return %5 : tensor<1x3x4x1x5xf32>
+// CHECK-NEXT: }
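Worth noting on the last test: the rewrite cannot collapse to a pure copy because the loop writes only 3 of the 4 entries along dimension 2, so the CHECK lines keep slice [3:4] of %arg0 and concatenate it back. A quick sanity check of that index arithmetic (illustrative only, not part of the test suite):

    #include <cstdio>
    #include <set>

    int main() {
      std::set<int> written;
      // The loop runs while iterArg < 3, and the DUS start index along dim 2
      // is (iterArg + 1) - 1 = iterArg, so indices 0..2 get overwritten.
      for (int iterArg = 0; iterArg < 3; iterArg++)
        written.insert((iterArg + 1) - 1);
      // Dimension 2 of %arg0 has extent 4; index 3 keeps its original value.
      for (int i = 0; i < 4; i++)
        printf("index %d: %s\n", i, written.count(i) ? "copied" : "original");
    }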
