
Commit eb8c1ed

[DT] Add generic op materialization pattern for GPU

Signed-off-by: Jorn Tuyls <jorn.tuyls@gmail.com>

1 parent 2dd6e83

11 files changed, +468 -26 lines

compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingPatterns.cpp

+85 -10
@@ -234,12 +234,9 @@ static FailureOr<Operation *> lowerGenericOpWithEncoding(
       typeConverter.getEncodingInfo(
           cast<RankedTensorType>(outputOperand->get().getType()));
   if (IREE::Codegen::isIdentityLayout(outMaterializeEncodingInfo)) {
-    return rewriter.notifyMatchFailure(
-        genericOp, "MaterializeEncodingInfo failed for output");
-  }
-  if (outMaterializeEncodingInfo.swizzle) {
-    return rewriter.notifyMatchFailure(
-        genericOp, "generic op lowering does not support swizzle yet");
+    return dropEncodingAndCloneOp(rewriter, genericOp.getOperation(),
+                                  convertedInputOperands,
+                                  convertedOutputOperands);
   }
 
   auto convertedResultType =
@@ -248,8 +245,41 @@ static FailureOr<Operation *> lowerGenericOpWithEncoding(
       utils::IteratorType::parallel);
   // Compute the new indexing maps for the packed layout. This assumes that
   // the output map is identity, and that all iterator types are parallel.
-  SmallVector<int64_t> outInnerDimsPos =
-      outMaterializeEncodingInfo.innerDimsPos;
+  ArrayRef<int64_t> outInnerDimsPos = outMaterializeEncodingInfo.innerDimsPos;
+  SmallVector<int64_t> outResultDimsPerm =
+      llvm::to_vector(llvm::seq<int64_t>(0, convertedResultType.getRank()));
+  if (outMaterializeEncodingInfo.swizzle.has_value()) {
+    int outRank =
+        cast<RankedTensorType>(outputOperand->get().getType()).getRank();
+    SmallVector<int64_t> transposePerm =
+        llvm::to_vector(llvm::seq<int64_t>(0, outRank));
+    for (auto perm : outMaterializeEncodingInfo.swizzle->permutation) {
+      transposePerm.push_back(outRank + perm);
+    }
+    applyPermutationToVector(outResultDimsPerm, transposePerm);
+  }
+  SmallVector<int64_t> invOutResultDimsPerm =
+      invertPermutationVector(outResultDimsPerm);
+
+  // Calculate the running offset for every dimension position for easy lookup
+  // when calculating the packed result dimensions for every operand.
+  // Example:
+  //   expandShape == [[4, 8, 4], [4, 4]]
+  // In this case:
+  //   outOffsetForDimsPos == [0, 3]
+  // So that whenever we need the real dimension for an entry (`outerIndex`,
+  // `innerIndex`) in the 2D expanded shape vector, we can calculate it as:
+  //   dim(outerIndex, innerIndex) = outOffsetForDimsPos[outerIndex] +
+  //                                 innerIndex
+  SmallVector<int64_t> outOffsetForDimsPos(outInnerDimsPos.size(), 0);
+  if (outMaterializeEncodingInfo.swizzle.has_value()) {
+    int64_t runningSize = 0;
+    for (size_t i = 0; i < outInnerDimsPos.size(); i++) {
+      outOffsetForDimsPos[i] = runningSize;
+      runningSize += outMaterializeEncodingInfo.swizzle->expandShape[i].size();
+    }
+  }
+
   SmallVector<int64_t> outInverseOuterDimsPerm =
       invertPermutationVector(outMaterializeEncodingInfo.outerDimsPerm);
   SmallVector<AffineMap> packedIndexingMaps;
@@ -277,14 +307,59 @@ static FailureOr<Operation *> lowerGenericOpWithEncoding(
     for (auto [idx, pos] : llvm::enumerate(innerDimsPos)) {
       auto dimPos = cast<AffineDimExpr>(inputMap.getResult(pos)).getPosition();
       for (auto [tileIdx, outDim] : llvm::enumerate(outInnerDimsPos)) {
-        if (dimPos == outDim) {
+        if (dimPos != outDim) {
+          continue;
+        }
+        if (!materializeEncodingInfo.swizzle.has_value()) {
           packedResultDims.push_back(outputMap.getNumDims() + tileIdx);
+          continue;
+        }
+        // In case of a layout with swizzle, an expanded set of dimensions
+        // needs to be appended as specified by the swizzle's `expandShape`
+        // field. Note that the dimension index should be offset by the
+        // calculated output starting offset as every dimension is now
+        // transformed into an expanded sequence of indices and the correct
+        // dimension index is:
+        //   outOffsetForDimsPos[tileIdx] + innerIndex
+        assert(idx < materializeEncodingInfo.swizzle->expandShape.size() &&
               "`innerDimsPos` index should not exceed the swizzle's "
               "`expandShape` size");
+        const size_t dimSize =
+            materializeEncodingInfo.swizzle->expandShape[idx].size();
+        const int64_t outIdxOffset =
+            outputMap.getNumDims() + outOffsetForDimsPos[tileIdx];
+        for (size_t i = 0; i < dimSize; i++) {
+          packedResultDims.push_back(outIdxOffset + i);
         }
       }
     }
+    // In case of a layout with swizzle, the packed result dimensions need
+    // to be transposed according to the swizzle's permutation vector.
+    if (materializeEncodingInfo.swizzle.has_value()) {
+      int inRank =
+          cast<RankedTensorType>(inputOperand->get().getType()).getRank();
+      SmallVector<int64_t> transposePerm =
+          llvm::to_vector(llvm::seq<int64_t>(0, inRank));
+      for (auto perm : materializeEncodingInfo.swizzle->permutation) {
+        transposePerm.push_back(inRank + perm);
+      }
+      applyPermutationToVector(packedResultDims, transposePerm);
+    }
+    // Calculate the final packed result dimensions through the inverse result
+    // dimensions permutation map. This effectively linearizes the packed result
+    // dimensions with respect to the output dimensions. For example, if the
+    // permuted output dimensions are [D0, D2, D1], this will transform all
+    // packed operand result dimensions with the permutation map that would make
+    // the output dimensions the identity map [D0, D1, D2], i.e. {D0 -> D0, D1
+    // -> D2, D2 -> D1}. Suppose that the operand dimensions are [D0, D2]; this
+    // operation would transform them into [D0, D1] to align with the output
+    // identity map.
+    SmallVector<int64_t> finalPackedResultDims = llvm::map_to_vector(
+        packedResultDims, [&](int64_t r) { return invOutResultDimsPerm[r]; });
+
     // Create the packed indexing map.
     SmallVector<AffineExpr> packedResultExprs =
-        llvm::map_to_vector(packedResultDims, [&](int64_t dim) {
+        llvm::map_to_vector(finalPackedResultDims, [&](int64_t dim) {
           return rewriter.getAffineDimExpr(dim);
         });
     auto packedInputMap = AffineMap::get(
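
The running-offset computation above is small enough to check in isolation. The following is a minimal standalone C++ sketch of the same bookkeeping, using plain std:: containers in place of the LLVM/MLIR types (the names mirror the patch, but the program itself is illustrative only):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Mirrors the swizzle's `expandShape` from the example in the comment:
  // each inner tile dimension expands into a group of dimensions.
  std::vector<std::vector<int64_t>> expandShape = {{4, 8, 4}, {4, 4}};

  // Running offset of each group's first dimension within the flattened
  // expanded shape.
  std::vector<int64_t> outOffsetForDimsPos(expandShape.size(), 0);
  int64_t runningSize = 0;
  for (size_t i = 0; i < expandShape.size(); i++) {
    outOffsetForDimsPos[i] = runningSize;
    runningSize += static_cast<int64_t>(expandShape[i].size());
  }

  // Prints "0 3", so dim(outerIndex, innerIndex) =
  // outOffsetForDimsPos[outerIndex] + innerIndex.
  for (int64_t offset : outOffsetForDimsPos)
    std::cout << offset << " ";
  std::cout << "\n";
}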

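The linearization step through invOutResultDimsPerm works the same way in miniature: invert the output permutation once, then remap every packed operand dimension through it. A standalone sketch of the [D0, D2, D1] example from the comment (again with plain std:: containers; invert stands in for MLIR's invertPermutationVector):

#include <cstdint>
#include <iostream>
#include <vector>

// Inverse of a permutation: inv[perm[i]] = i.
static std::vector<int64_t> invert(const std::vector<int64_t> &perm) {
  std::vector<int64_t> inv(perm.size());
  for (size_t i = 0; i < perm.size(); i++)
    inv[perm[i]] = static_cast<int64_t>(i);
  return inv;
}

int main() {
  // Permuted output dimensions [D0, D2, D1].
  std::vector<int64_t> outResultDimsPerm = {0, 2, 1};
  std::vector<int64_t> invOutResultDimsPerm = invert(outResultDimsPerm);

  // Operand dimensions [D0, D2] align to [D0, D1] under the inverse map.
  std::vector<int64_t> packedResultDims = {0, 2};
  for (int64_t r : packedResultDims)
    std::cout << "D" << invOutResultDimsPerm[r] << " "; // prints "D0 D1"
  std::cout << "\n";
}
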
compiler/src/iree/compiler/Codegen/Common/test/gpu_materialize_encoding_gfx942.mlir

+54
@@ -1254,3 +1254,57 @@ func.func @missing_user_indexing_maps() {
 // CHECK-DAG: %[[STORE_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1)
 // CHECK-DAG: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[LOAD_BINDING]]{{.+}} -> tensor<255x513xf32>
 // CHECK-DAG: flow.dispatch.tensor.store %[[LOAD]], %[[STORE_BINDING]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>]>
+#encoding_bcast = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [[affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2) -> (d0, d2)>], affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>]>
+func.func @dequantization() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x128x64xi8, #encoding>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x64xf32, #encoding_bcast>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x64xf32, #encoding_bcast>>
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x128x64xf32, #encoding>>
+  %7 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 128, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x128x64xi8, #encoding>> -> tensor<2x128x64xi8, #encoding>
+  %8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64xf32, #encoding_bcast>> -> tensor<2x64xf32, #encoding_bcast>
+  %9 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64xf32, #encoding_bcast>> -> tensor<2x64xf32, #encoding_bcast>
+  %13 = tensor.empty() : tensor<2x128x64xf32, #encoding>
+  %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %8, %9 : tensor<2x128x64xi8, #encoding>, tensor<2x64xf32, #encoding_bcast>, tensor<2x64xf32, #encoding_bcast>) outs(%13 : tensor<2x128x64xf32, #encoding>) {
+  ^bb0(%in: i8, %in_0: f32, %in_1: f32, %out: f32):
+    %21 = arith.extui %in : i8 to i32
+    %22 = arith.uitofp %21 : i32 to f32
+    %23 = arith.subf %22, %in_1 : f32
+    %24 = arith.mulf %23, %in_0 : f32
+    linalg.yield %24 : f32
+  } -> tensor<2x128x64xf32, #encoding>
+  flow.dispatch.tensor.store %14, %6, offsets = [0, 0, 0], sizes = [2, 128, 64], strides = [1, 1, 1] : tensor<2x128x64xf32, #encoding> -> !flow.dispatch.tensor<writeonly:tensor<2x128x64xf32, #encoding>>
+  return
+}
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d4, d7)>
+// CHECK-LABEL: func.func @dequantization()
+// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.*}} binding(0) {{.*}} : !flow.dispatch.tensor<readonly:tensor<2x1x4x8x4x4x4x4xi8>>
+// CHECK-DAG: %[[LHS_SCALES_BINDING:.+]] = hal.interface.binding.subspan {{.*}} binding(1) {{.*}} : !flow.dispatch.tensor<readonly:tensor<2x4x4x4xf32>>
+// CHECK-DAG: %[[LHS_ZPS_BINDING:.+]] = hal.interface.binding.subspan {{.*}} binding(2) {{.*}} : !flow.dispatch.tensor<readonly:tensor<2x4x4x4xf32>>
+// CHECK-DAG: %[[RESULT_BINDING:.+]] = hal.interface.binding.subspan {{.*}} binding(3) {{.*}} : !flow.dispatch.tensor<writeonly:tensor<2x1x4x8x4x4x4x4xf32>>
+// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]], offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [2, 1, 4, 8, 4, 4, 4, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1x4x8x4x4x4x4xi8>> -> tensor<2x1x4x8x4x4x4x4xi8>
+// CHECK-DAG: %[[LHS_SCALES:.+]] = flow.dispatch.tensor.load %[[LHS_SCALES_BINDING]], offsets = [0, 0, 0, 0], sizes = [2, 4, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x4x4xf32>> -> tensor<2x4x4x4xf32>
+// CHECK-DAG: %[[LHS_ZPS:.+]] = flow.dispatch.tensor.load %[[LHS_ZPS_BINDING]], offsets = [0, 0, 0, 0], sizes = [2, 4, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x4x4xf32>> -> tensor<2x4x4x4xf32>
+// CHECK-DAG: %[[EMPTY_LHS:.+]] = tensor.empty() : tensor<2x1x4x8x4x4x4x4xf32>
+// CHECK-DAG: %[[LHS_DEQUANT:.+]] = linalg.generic
+// CHECK-SAME:  indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP1]], #[[$MAP]]]
+// CHECK-SAME:  iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]
+// CHECK-SAME:  ins(%[[LHS]], %[[LHS_SCALES]], %[[LHS_ZPS]] : tensor<2x1x4x8x4x4x4x4xi8>, tensor<2x4x4x4xf32>, tensor<2x4x4x4xf32>)
+// CHECK-SAME:  outs(%[[EMPTY_LHS]] : tensor<2x1x4x8x4x4x4x4xf32>)
+// CHECK:     arith.extui
+// CHECK:     arith.uitofp
+// CHECK:     arith.subf
+// CHECK:     arith.mulf
+// CHECK:     flow.dispatch.tensor.store %[[LHS_DEQUANT]], %[[RESULT_BINDING]], offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [2, 1, 4, 8, 4, 4, 4, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x4x4x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1x4x8x4x4x4x4xf32>>

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPUTileSwizzleUtils.cpp

+83 -1
@@ -4,8 +4,10 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/compiler/Codegen/Dialect/GPU/IR/GPUTileSwizzleUtils.h"
+#include <numeric>
+
 #include "iree/compiler/Codegen/Dialect/Codegen/Utils/Utils.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/GPUTileSwizzleUtils.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/ReshapeOpsUtils.h"
@@ -398,6 +400,86 @@ TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
   return crossThreadOuterSwizzle;
 }
 
+/// Remove the expanded dimensions for this index and update the permutation by
+/// erasing the removed dimensions' indices and adjusting existing larger
+/// indices accordingly.
+static void remove(TileSwizzle &swizzle, size_t idx) {
+  assert(idx < swizzle.expandShape.size() && "idx out of bounds");
+  const size_t startIdx = std::accumulate(
+      std::begin(swizzle.expandShape), std::begin(swizzle.expandShape) + idx, 0,
+      [](size_t idx, const TileSwizzle::ExpandShapeDimVectorType &dims)
+          -> size_t { return idx + dims.size(); });
+  const size_t endIdx = startIdx + swizzle.expandShape[idx].size();
+  swizzle.expandShape.erase(swizzle.expandShape.begin() + idx);
+  SmallVector<int64_t> newPermutation;
+  for (const int64_t &p : swizzle.permutation) {
+    if (p < startIdx) {
+      newPermutation.push_back(p);
+    } else if (p >= endIdx) {
+      newPermutation.push_back(p - (endIdx - startIdx));
+    }
+  }
+  swizzle.permutation = newPermutation;
+}
+
+FailureOr<TileSwizzle> getEncodingSwizzle(IREE::Encoding::EncodingAttr encoding,
+                                          IREE::GPU::DataTiledMMAAttr mma,
+                                          IREE::GPU::MMAFragment fragment) {
+  TileSwizzle swizzle = getSwizzle(mma, fragment);
+  FailureOr<linalg::ContractionDimensions> cDims =
+      getEncodingContractionDims(encoding);
+  if (failed(cDims)) {
+    return failure();
+  }
+  // The following expects M, N, K, and Batch sizes of at most 1 for now.
+  // TODO: Extend this to multiple M/N/K/Batch dims.
+  assert(cDims->m.size() <= 1 && cDims->n.size() <= 1 && cDims->k.size() == 1 &&
+         cDims->batch.size() <= 1 &&
+         "Expected at most one M, N, K, and Batch dimension");
+  std::optional<unsigned> mDim =
+      cDims->m.empty() ? std::nullopt
+                       : encoding.mapDimToOperandIndex(cDims->m[0]);
+  std::optional<unsigned> nDim =
+      cDims->n.empty() ? std::nullopt
+                       : encoding.mapDimToOperandIndex(cDims->n[0]);
+  std::optional<unsigned> kDim = encoding.mapDimToOperandIndex(cDims->k[0]);
+  switch (fragment) {
+  case IREE::GPU::MMAFragment::Lhs:
+    // A-matrix (LHS). Source dimensions are M (index 0) and K (index 1).
+    // Dimensions are removed from last to first to ensure correctness.
+    if (!kDim.has_value()) {
+      remove(swizzle, 1);
+    }
+    if (!cDims->m.empty() && !mDim.has_value()) {
+      remove(swizzle, 0);
+    }
+    break;
+  case IREE::GPU::MMAFragment::Rhs:
+    // B-matrix (RHS). Since the pack ops already took care of transposing B,
+    // source dimensions are N (index 0) and K (index 1).
+    // Dimensions are removed from last to first to ensure correctness.
+    if (!kDim.has_value()) {
+      remove(swizzle, 1);
+    }
+    if (!cDims->n.empty() && !nDim.has_value()) {
+      remove(swizzle, 0);
+    }
+    break;
+  case IREE::GPU::MMAFragment::Acc:
+    // C-matrix (accumulator). Source dimensions are M (index 0) and N (index
+    // 1).
+    // Dimensions are removed from last to first to ensure correctness.
+    if (!cDims->n.empty() && !nDim.has_value()) {
+      remove(swizzle, 1);
+    }
+    if (!cDims->m.empty() && !mDim.has_value()) {
+      remove(swizzle, 0);
+    }
+    break;
+  }
+  return swizzle;
+}
+
 TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
                                 IREE::GPU::MMAFragment fragment) {
   auto swizzle =
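
The index bookkeeping in remove is easy to validate with a small standalone example. Suppose a swizzle whose expandShape flattens to three dimensions in two groups; removing group 0 erases flattened indices 0 and 1 and shifts index 2 down by two. A C++ sketch with plain std:: containers standing in for TileSwizzle (the shape values are made up for illustration):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Group 0 covers flattened indices {0, 1}; group 1 covers {2}.
  std::vector<std::vector<int64_t>> expandShape = {{4, 8}, {16}};
  std::vector<int64_t> permutation = {2, 0, 1};

  // remove(swizzle, /*idx=*/0): erase group 0, i.e. the flattened range [0, 2).
  const int64_t startIdx = 0;
  const int64_t endIdx = startIdx + static_cast<int64_t>(expandShape[0].size());
  expandShape.erase(expandShape.begin());
  std::vector<int64_t> newPermutation;
  for (int64_t p : permutation) {
    if (p < startIdx)
      newPermutation.push_back(p);
    else if (p >= endIdx)
      newPermutation.push_back(p - (endIdx - startIdx));
  }

  // expandShape == {{16}}; newPermutation == {0}: entries 0 and 1 were
  // dropped, and entry 2 was shifted down by the removed range's size.
  for (int64_t p : newPermutation)
    std::cout << p << " ";
  std::cout << "\n";
}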

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/GPUTileSwizzleUtils.h

+7
@@ -33,6 +33,13 @@ Codegen::TileSwizzle getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic intrinsic,
 Codegen::TileSwizzle getSwizzle(IREE::GPU::DataTiledMMAAttr mma,
                                 IREE::GPU::MMAFragment fragment);
 
+/// Returns the swizzle for the data-tiled-mma tile, based on the `fragment`
+/// and contraction dimensions required from the `encoding`.
+FailureOr<Codegen::TileSwizzle>
+getEncodingSwizzle(IREE::Encoding::EncodingAttr encoding,
+                   IREE::GPU::DataTiledMMAAttr mma,
+                   IREE::GPU::MMAFragment fragment);
+
 } // namespace mlir::iree_compiler::IREE::GPU
 
 #endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IR_GPUTILESWIZZLEUTILS_H_

compiler/src/iree/compiler/Codegen/ExternalInterfaces/CPUEncodingExternalModels.cpp

-14
@@ -70,20 +70,6 @@ static void transposeInPlace(MaterializeEncodingInfo &info) {
   transpose(info.outerDimsPerm);
 }
 
-static Operation *dropEncodingAndCloneOp(OpBuilder &builder, Operation *op,
-                                         ValueRange convertedInputOperands,
-                                         ValueRange convertedOutputOperands) {
-  SmallVector<Value> operands;
-  operands.append(convertedInputOperands.begin(), convertedInputOperands.end());
-  operands.append(convertedOutputOperands.begin(),
-                  convertedOutputOperands.end());
-  return mlir::clone(
-      builder, op,
-      {cast<RankedTensorType>(convertedOutputOperands[0].getType())
-           .dropEncoding()},
-      operands);
-}
-
 static RankedTensorType
 getExpandedType(RankedTensorType type, bool isBatched, bool isTransposed,
                 SmallVectorImpl<ReassociationIndices> &ri) {

compiler/src/iree/compiler/Codegen/ExternalInterfaces/GPUEncodingExternalModels.cpp

+6 -1
@@ -342,7 +342,12 @@ struct GPUDeviceEncodingLayoutResolverAttrInterface
     info = std::move(maybeEncodingInfo.value());
     auto fragment = static_cast<IREE::GPU::MMAFragment>(
        encoding.getOperandIndex().getInt());
-    info.swizzle = getSwizzle(mma, fragment);
+    FailureOr<Codegen::TileSwizzle> maybeSwizzle =
+        getEncodingSwizzle(encoding, mma, fragment);
+    if (failed(maybeSwizzle)) {
+      return info;
+    }
+    info.swizzle = std::move(maybeSwizzle.value());
     return info;
   }

compiler/src/iree/compiler/Codegen/Utils/Utils.cpp

+14
@@ -1158,6 +1158,20 @@ OpFoldResult convertByteOffsetToElementOffset(RewriterBase &rewriter,
   }
 }
 
+Operation *dropEncodingAndCloneOp(OpBuilder &builder, Operation *op,
+                                  ValueRange convertedInputOperands,
+                                  ValueRange convertedOutputOperands) {
+  SmallVector<Value> operands;
+  operands.append(convertedInputOperands.begin(), convertedInputOperands.end());
+  operands.append(convertedOutputOperands.begin(),
+                  convertedOutputOperands.end());
+  return mlir::clone(
+      builder, op,
+      {cast<RankedTensorType>(convertedOutputOperands[0].getType())
+           .dropEncoding()},
+      operands);
+}
+
 LogicalResult isArgmaxOp(linalg::GenericOp genericOp) {
   // Check for 2 results(value, index), and 1 input
   if (genericOp.getNumDpsInits() != 2) {

compiler/src/iree/compiler/Codegen/Utils/Utils.h

+5
@@ -211,6 +211,11 @@ OpFoldResult convertByteOffsetToElementOffset(RewriterBase &rewriter,
                                               OpFoldResult byteOffset,
                                               Type elementType);
 
+/// Clone an operation and drop all encodings.
+Operation *dropEncodingAndCloneOp(OpBuilder &builder, Operation *op,
+                                  ValueRange convertedInputOperands,
+                                  ValueRange convertedOutputOperands);
+
 /// Check if a linalg.generic is representing an argmax operation.
 LogicalResult isArgmaxOp(linalg::GenericOp genericOp);
