@@ -25877,6 +25877,141 @@ struct BinaryNegatedOperandsSimplify
   }
 };
 
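+// Rewrites a sum-reduction over an elementwise product of two broadcasts into
+// a stablehlo.dot_general that contracts over the reduced dimension. A sketch
+// of the intended rewrite (shapes are illustrative, not required):
+//
+//   %l = stablehlo.broadcast_in_dim %a, dims = [0, 1]
+//          : (tensor<2x3xf32>) -> tensor<2x3x4xf32>
+//   %r = stablehlo.broadcast_in_dim %b, dims = [1, 2]
+//          : (tensor<3x4xf32>) -> tensor<2x3x4xf32>
+//   %m = stablehlo.multiply %l, %r : tensor<2x3x4xf32>
+//   %s = sum-reduce(%m) over dims = [1] : ... -> tensor<2x4xf32>
+// =>
+//   %s = stablehlo.dot_general %a, %b, contracting_dims = [1] x [0]
+//          : (tensor<2x3xf32>, tensor<3x4xf32>) -> tensor<2x4xf32>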
+struct ReduceMulBroadcastToDotGeneral
+    : public CheckedOpRewritePattern<stablehlo::ReduceOp,
+                                     ReduceMulBroadcastToDotGeneral> {
+  using CheckedOpRewritePattern<
+      stablehlo::ReduceOp,
+      ReduceMulBroadcastToDotGeneral>::CheckedOpRewritePattern;
+
+  LogicalResult matchAndRewriteImpl(stablehlo::ReduceOp op,
+                                    PatternRewriter &rewriter) const {
+    if (op.getInputs().size() != 1 || op.getInitValues().size() != 1) {
+      return rewriter.notifyMatchFailure(
+          op, "only single-operand single-init reduce is supported");
+    }
+
+    auto dims = op.getDimensions();
+
+    Value input = op.getInputs()[0];
+    auto OT = cast<TensorType>(op.getResultTypes()[0]);
+
+    if (OT.getRank() != 2 || dims.size() != 1)
+      return rewriter.notifyMatchFailure(
+          op, "expected a rank-2 result and a single reduction dimension");
+
+    auto checkCommonReduce = mlir::stablehlo::CheckCommonReduceOp(op);
+    if (!checkCommonReduce.isAddReduce ||
+        !matchPattern(op.getInitValues()[0], m_AnyZeroFloat()))
+      return rewriter.notifyMatchFailure(
+          op, "reduction is not an add with a zero init");
+
+    auto mul = input.getDefiningOp<stablehlo::MulOp>();
+    if (!mul)
+      return rewriter.notifyMatchFailure(op, "input source is not a mul op");
+
+    Value mulLhs = mul.getLhs(), mulRhs = mul.getRhs();
+    auto lhsBdim = mulLhs.getDefiningOp<stablehlo::BroadcastInDimOp>(),
+         rhsBdim = mulRhs.getDefiningOp<stablehlo::BroadcastInDimOp>();
+
+    if (!lhsBdim || !rhsBdim)
+      return rewriter.notifyMatchFailure(op, "mul operands are not broadcasts");
+
+    // The reduced dimension must carry operand data on both sides, i.e. it
+    // must be a broadcast dimension of both broadcasts, for the summed
+    // product to be a contraction.
+    if (!llvm::is_contained(lhsBdim.getBroadcastDimensions(), dims[0]) ||
+        !llvm::is_contained(rhsBdim.getBroadcastDimensions(), dims[0]))
+      return rewriter.notifyMatchFailure(
+          op, "reduce dimension is not a broadcast dimension of both operands");
+
+    // The non-reduced broadcast dims of the LHS, followed by those of the
+    // RHS, must map onto the reduce result dims in order; otherwise the
+    // dot_general result would be permuted relative to the reduce.
+    SmallVector<int64_t> freeDims;
+    for (auto bdim : {lhsBdim, rhsBdim}) {
+      auto bds = llvm::to_vector(bdim.getBroadcastDimensions());
+      llvm::sort(bds);
+      for (int64_t d : bds)
+        if (d != dims[0])
+          freeDims.push_back(d > dims[0] ? d - 1 : d);
+    }
+    if (!llvm::equal(freeDims, llvm::seq<int64_t>(0, OT.getRank())))
+      return rewriter.notifyMatchFailure(
+          op, "broadcast dims do not line up with the reduce result");
+
+    auto prepareInputForDotGeneral =
+        [&](stablehlo::BroadcastInDimOp bdim) -> Value {
+      // Transpose the broadcast operand so its dims are ordered by the result
+      // dim each one maps to, e.g. broadcast dims [0, 2] give the permutation
+      // [0, 1] and broadcast dims [1, 0] give [1, 0].
+      auto bcastTy = cast<TensorType>(bdim.getResult().getType());
+
+      auto bdims = bdim.getBroadcastDimensions();
+      SmallVector<int64_t> transposeDims(bdims.size(), -1);
+
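+      // ncdims counts result dims created by the broadcast (absent from the
+      // operand) seen so far; subtracting it turns a result dim index into a
+      // position in the compacted, operand-rank space.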
+      int64_t ncdims = 0;
+      for (int64_t i = 0; i < bcastTy.getRank(); i++) {
+        bool inBDims = false;
+        for (auto [j, dim] : llvm::enumerate(bdims)) {
+          if (dim == i) {
+            inBDims = true;
+            transposeDims[j] = i - ncdims;
+            break;
+          }
+        }
+        if (!inBDims)
+          ncdims++;
+      }
+
+      Value prepared = stablehlo::TransposeOp::create(
+          rewriter, bdim.getLoc(), bdim.getOperand(), transposeDims);
+
+      return prepared;
+    };
+
+    auto lhs = prepareInputForDotGeneral(lhsBdim);
+    auto rhs = prepareInputForDotGeneral(rhsBdim);
+
+    // Contract over the reduced dimension. Its index within each prepared
+    // (transposed, broadcast-free) operand is the number of that operand's
+    // broadcast dims preceding it, not the reduce dim index itself.
+    auto contractingDim = [&](stablehlo::BroadcastInDimOp bdim) -> int64_t {
+      return llvm::count_if(bdim.getBroadcastDimensions(),
+                            [&](int64_t d) { return d < dims[0]; });
+    };
+    auto ndim = stablehlo::DotDimensionNumbersAttr::get(
+        op.getContext(), /*lhsBatchingDimensions=*/{},
+        /*rhsBatchingDimensions=*/{}, {contractingDim(lhsBdim)},
+        {contractingDim(rhsBdim)});
+
+    auto dg = stablehlo::DotGeneralOp::create(
+        rewriter, op.getLoc(), OT, lhs, rhs, ndim,
+        /*precision_config=*/nullptr, /*algorithm=*/nullptr);
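+    // The reduce op is left without uses here; the rewrite driver's DCE
+    // removes it along with the now-dead mul/broadcast chain.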
+    rewriter.replaceAllOpUsesWith(op, dg.getResult());
+
+    return success();
+  }
+};
+
 // currently limited to non-batched dot_general
 struct DotGeneralToSyrk
     : public CheckedOpRewritePattern<stablehlo::DotGeneralOp,
@@ -26761,6 +26896,7 @@ struct EnzymeHLOOptPass
         ElementwiseWrap,
         ElementwiseExtend,
         SubtractMultiplyConstToAddMulConst,
+        ReduceMulBroadcastToDotGeneral,
         DotGeneralDistributiveSimplify<stablehlo::AddOp>,
         DotGeneralDistributiveSimplify<stablehlo::SubtractOp>,
         TrivialReduceWindowToReduceOp,