Commit 43d5a50

[DT] Add generic op materialization pattern for GPU
Signed-off-by: Jorn Tuyls <jorn.tuyls@gmail.com>
1 parent 82ee6ad commit 43d5a50

File tree

11 files changed: +546 -31 lines changed


compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingPatterns.cpp

+163 -15
@@ -217,9 +217,10 @@ lowerOpWithEncoding(RewriterBase &rewriter, tensor::EmptyOp emptyOp,
   return newEmptyOp;
 }
 
-/// Converts a linalg::GenericOp with encoded inputs into the packed domain.
-/// The `genericOp` must have all parallel iterator types and a single output
-/// with an identity indexing map.
+/// Converts a linalg::GenericOp with encoded inputs into the packed domain,
+/// with an optional swizzle expansion and permutation if applicable. The
+/// `genericOp` must have all parallel iterator types and a single output with
+/// an identity indexing map.
 static FailureOr<Operation *> lowerGenericOpWithEncoding(
     RewriterBase &rewriter, linalg::GenericOp genericOp,
     ValueRange convertedInputOperands, ValueRange convertedOutputOperands,
@@ -230,30 +231,119 @@ static FailureOr<Operation *> lowerGenericOpWithEncoding(
     return rewriter.notifyMatchFailure(genericOp,
                                        "Output indexing map is not identity");
   }
+  // Step 1: Retrieve the output encoding materialization information and
+  // compute the new indexing maps for the packed and potentially swizzled
+  // layout. This consists of outer and inner dimension permutation vectors
+  // for the packing and an expanded result dimension permutation vector for
+  // the optional swizzling. This assumes that the output map is identity,
+  // and that all iterator types are parallel.
+  //
+  // Running example:
+  //
+  // Given the following output layout:
+  //
+  //   outputType: tensor<2x128x64xf32>
+  //   outputPackInfo: innerDimsPos = [1, 2],
+  //                   innerTileSizes = [128, 16]
+  //                   outerDimsPerm = [0, 1, 2]
+  //   outputSwizzle: expandShape = [[4, 8, 4], [4, 4]]
+  //                  permutation = [1, 4, 0, 2, 3]
+  //
+  // Retrieve and compute the permutation vectors for the packing outer and
+  // inner dimension permutation and for the expanded swizzle permutation.
+  // Then, calculate the permutation that would transform the swizzled output
+  // dimension map into the identity dimension map. This is the inverse
+  // swizzle permutation.
+  //
+  //   outInverseOuterDimsPerm: [0, 1, 2]
+  //   outInnerDimsPos: [1, 2]
+  //   outSwizzlePerm: [0, 1, 2, 4, 7, 3, 5, 6]
+  //   invOutSwizzlePerm: [0, 1, 2, 5, 3, 6, 7, 4]
   MaterializeEncodingInfo outMaterializeEncodingInfo =
       typeConverter.getEncodingInfo(
           cast<RankedTensorType>(outputOperand->get().getType()));
   if (IREE::Codegen::isIdentityLayout(outMaterializeEncodingInfo)) {
-    return rewriter.notifyMatchFailure(
-        genericOp, "MaterializeEncodingInfo failed for output");
-  }
-  if (outMaterializeEncodingInfo.swizzle) {
-    return rewriter.notifyMatchFailure(
-        genericOp, "generic op lowering does not support swizzle yet");
+    return dropEncodingAndCloneOp(rewriter, genericOp.getOperation(),
+                                  convertedInputOperands,
+                                  convertedOutputOperands);
   }
 
   auto convertedResultType =
       cast<RankedTensorType>(convertedOutputOperands[0].getType());
   SmallVector<utils::IteratorType> iteratorTypes(convertedResultType.getRank(),
                                                  utils::IteratorType::parallel);
-  // Compute the new indexing maps for the packed layout. This assumes that
-  // the output map is identity, and that all iterator types are parallel.
-  SmallVector<int64_t> outInnerDimsPos =
-      outMaterializeEncodingInfo.innerDimsPos;
+
   SmallVector<int64_t> outInverseOuterDimsPerm =
       invertPermutationVector(outMaterializeEncodingInfo.outerDimsPerm);
+  ArrayRef<int64_t> outInnerDimsPos = outMaterializeEncodingInfo.innerDimsPos;
+  SmallVector<int64_t> outSwizzlePerm =
+      llvm::to_vector(llvm::seq<int64_t>(0, convertedResultType.getRank()));
+  if (outMaterializeEncodingInfo.swizzle.has_value()) {
+    const int outRank =
+        cast<RankedTensorType>(outputOperand->get().getType()).getRank();
+    SmallVector<int64_t> transposePerm =
+        llvm::to_vector(llvm::seq<int64_t>(0, outRank));
+    for (auto perm : outMaterializeEncodingInfo.swizzle->permutation) {
+      transposePerm.push_back(outRank + perm);
+    }
+    applyPermutationToVector(outSwizzlePerm, transposePerm);
+  }
+  SmallVector<int64_t> invOutSwizzlePerm =
+      invertPermutationVector(outSwizzlePerm);
+
+  // Calculate the running offset for every dimension position for easy lookup
+  // when calculating the packed result dimensions for every operand.
+  // Example:
+  //   expandShape == [[4, 8, 4], [4, 4]]
+  // In this case:
+  //   outOffsetForDimsPos == [0, 3]
+  // So that whenever we need the real dimension for an entry (`outerIndex`,
+  // `innerIndex`) in the 2D expanded shape vector, we can calculate it as:
+  //   dim(outerIndex, innerIndex) = outOffsetForDimsPos[outerIndex] +
+  //                                 innerIndex
+  SmallVector<int64_t> outOffsetForDimsPos(outInnerDimsPos.size(), 0);
+  if (outMaterializeEncodingInfo.swizzle.has_value()) {
+    int64_t runningSize = 0;
+    for (size_t i = 0; i < outInnerDimsPos.size(); i++) {
+      outOffsetForDimsPos[i] = runningSize;
+      runningSize += outMaterializeEncodingInfo.swizzle->expandShape[i].size();
+    }
+  }
+
   SmallVector<AffineMap> packedIndexingMaps;
   for (OpOperand *inputOperand : genericOp.getDpsInputOperands()) {
+    // Step 2: Retrieve the encoding for every input operand and perform the
+    // outer dimension permutation, inner dimension expansion and permutation,
+    // swizzle expansion and swizzle permutation.
+    //
+    // Running example:
+    //
+    // Given the input layout and indexing maps:
+    //
+    //   inputType: tensor<2x64xf32>
+    //   innerPackInfo: innerDimsPos = [1]
+    //                  innerTileSizes = [16]
+    //                  outerDimsPerm = [0, 1]
+    //   innerSwizzle: expandShape = [[4, 4]]
+    //                 permutation = [1, 0]
+    //   inputMap: [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>,
+    //              affine_map<(d0, d1, d2) -> (d0, d2)>]
+    //
+    // 1. Calculate the result dimensions from the indexing maps and perform the
+    // outer dimension permutation:
+    //
+    //   packedResultDims: [0, 2]
+    //
+    // 2. Perform inner dimension expansion, permutation and optional swizzle
+    // expansion in one go. In this example, the inner dimension (64) would be
+    // expanded into 4x16 based on `innerDimsPos` and `innerTileSizes` above,
+    // and then expanded to 4x4x4 based on the swizzle.
+    //
+    //   packedResultDims: [0, 2, 6, 7]
+    //
+    // 3. Perform the swizzle permutation:
+    //
+    //   packedResultDims: [0, 2, 7, 6]
     MaterializeEncodingInfo materializeEncodingInfo =
         typeConverter.getEncodingInfo(
             cast<RankedTensorType>(inputOperand->get().getType()));
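
As a concrete check of the Step 1 bookkeeping in the hunk above, here is a minimal standalone sketch in plain C++ (std::vector instead of llvm::SmallVector; `applyPermutation` and `invertPermutation` are local stand-ins written for this example, not the MLIR utilities the patch calls) that reproduces the running-example values for outSwizzlePerm, invOutSwizzlePerm and outOffsetForDimsPos.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Local stand-in for mlir::applyPermutationToVector: result[i] = v[perm[i]].
static std::vector<int64_t> applyPermutation(const std::vector<int64_t> &v,
                                             const std::vector<int64_t> &perm) {
  std::vector<int64_t> result(perm.size());
  for (size_t i = 0; i < perm.size(); ++i)
    result[i] = v[perm[i]];
  return result;
}

// Local stand-in for mlir::invertPermutationVector: inv[perm[i]] = i.
static std::vector<int64_t> invertPermutation(const std::vector<int64_t> &perm) {
  std::vector<int64_t> inv(perm.size());
  for (size_t i = 0; i < perm.size(); ++i)
    inv[perm[i]] = static_cast<int64_t>(i);
  return inv;
}

int main() {
  // Output layout of the running example: tensor<2x128x64xf32>, packed to
  // rank 8 (3 outer dims + 5 swizzle-expanded inner dims).
  const int64_t outRank = 3;
  const int64_t packedRank = 8;
  const std::vector<std::vector<int64_t>> expandShape = {{4, 8, 4}, {4, 4}};
  const std::vector<int64_t> swizzlePermutation = {1, 4, 0, 2, 3};

  // outSwizzlePerm: identity over the outer dims, swizzle permutation over
  // the expanded inner dims (offset by outRank).
  std::vector<int64_t> outSwizzlePerm(packedRank);
  std::iota(outSwizzlePerm.begin(), outSwizzlePerm.end(), int64_t{0});
  std::vector<int64_t> transposePerm(outRank);
  std::iota(transposePerm.begin(), transposePerm.end(), int64_t{0});
  for (int64_t p : swizzlePermutation)
    transposePerm.push_back(outRank + p);
  outSwizzlePerm = applyPermutation(outSwizzlePerm, transposePerm);
  std::vector<int64_t> invOutSwizzlePerm = invertPermutation(outSwizzlePerm);

  // Running offset into the expanded shape, one entry per innerDimsPos entry.
  std::vector<int64_t> outOffsetForDimsPos;
  int64_t runningSize = 0;
  for (const auto &group : expandShape) {
    outOffsetForDimsPos.push_back(runningSize);
    runningSize += static_cast<int64_t>(group.size());
  }

  assert((outSwizzlePerm == std::vector<int64_t>{0, 1, 2, 4, 7, 3, 5, 6}));
  assert((invOutSwizzlePerm == std::vector<int64_t>{0, 1, 2, 5, 3, 6, 7, 4}));
  assert((outOffsetForDimsPos == std::vector<int64_t>{0, 3}));
  std::cout << "Step 1 running-example values reproduced\n";
  return 0;
}

Compiled with any C++17 compiler, the asserts reproduce exactly the vectors quoted in the Step 1 comment block.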
@@ -277,14 +367,72 @@ static FailureOr<Operation *> lowerGenericOpWithEncoding(
     for (auto [idx, pos] : llvm::enumerate(innerDimsPos)) {
       auto dimPos = cast<AffineDimExpr>(inputMap.getResult(pos)).getPosition();
       for (auto [tileIdx, outDim] : llvm::enumerate(outInnerDimsPos)) {
-        if (dimPos == outDim) {
+        if (dimPos != outDim) {
+          continue;
+        }
+        if (!materializeEncodingInfo.swizzle.has_value()) {
           packedResultDims.push_back(outputMap.getNumDims() + tileIdx);
+          continue;
         }
+        // In case of a layout with swizzle, an expanded set of dimensions
+        // needs to be appended as specified by the swizzle's `expandShape`
+        // field. Note that the dimension index should be offset by the
+        // calculated output starting offset as every dimension is now
+        // transformed into an expanded sequence of indices and the correct
+        // dimension index is:
+        //   outOffsetForDimsPos[tileIdx] + innerIndex
+        assert(idx < materializeEncodingInfo.swizzle->expandShape.size() &&
+               "`innerDimsPos` index should not exceed the swizzle's "
+               "`expandShape` size");
+        const size_t dimSize =
+            materializeEncodingInfo.swizzle->expandShape[idx].size();
+        const int64_t outIdxOffset =
+            outputMap.getNumDims() + outOffsetForDimsPos[tileIdx];
+        for (size_t i = 0; i < dimSize; i++) {
+          packedResultDims.push_back(outIdxOffset + i);
+        }
+      }
+    }
+    // In case of a layout with swizzle, the packed result dimensions need
+    // to be transposed according to the swizzle's permutation vector.
+    if (materializeEncodingInfo.swizzle.has_value()) {
+      int inRank =
+          cast<RankedTensorType>(inputOperand->get().getType()).getRank();
+      SmallVector<int64_t> transposePerm =
+          llvm::to_vector(llvm::seq<int64_t>(0, inRank));
+      for (auto perm : materializeEncodingInfo.swizzle->permutation) {
+        transposePerm.push_back(inRank + perm);
       }
+      applyPermutationToVector(packedResultDims, transposePerm);
     }
+
+    // Step 3: Calculate the final packed result dimensions through the inverse
+    // result dimensions permutation map. This effectively linearizes the packed
+    // result dimensions with respect to the output dimensions. For example, if
+    // the permuted output dimensions are [D0, D2, D1], this will transform all
+    // packed operand result dimensions with the permutation map that would make
+    // the output dimensions the identity map [D0, D1, D2], i.e. {D0 -> D0,
+    // D1 -> D2, D2 -> D1}. If the operand dimensions are [D0, D2], this
+    // operation would transform them into [D0, D1] to align with the output
+    // identity map.
+    //
+    // Running example:
+    //
+    // The packed and swizzled result dimensions for the input operand:
+    //
+    //   packedResultDims: [0, 2, 7, 6]
+    //
+    // Now we need to account for swizzled output result dimensions being
+    // linearized to the identity map. This can be achieved by applying
+    // `invOutSwizzlePerm` ([0, 1, 2, 5, 3, 6, 7, 4]):
+    //
+    //   finalPackedResultDims: [0, 2, 4, 7]
+    SmallVector<int64_t> finalPackedResultDims = llvm::map_to_vector(
+        packedResultDims, [&](int64_t r) { return invOutSwizzlePerm[r]; });
+
     // Create the packed indexing map.
     SmallVector<AffineExpr> packedResultExprs =
-        llvm::map_to_vector(packedResultDims, [&](int64_t dim) {
+        llvm::map_to_vector(finalPackedResultDims, [&](int64_t dim) {
          return rewriter.getAffineDimExpr(dim);
        });
     auto packedInputMap = AffineMap::get(
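
To follow Steps 2 and 3 end to end, the sketch below (same assumptions as the previous snippet; the Step 1 results are hard-coded constants and the loops are simplified stand-ins for the pattern code) traces the broadcast scales operand of the running example, tensor<2x64xf32> with input map (d0, d1, d2) -> (d0, d2), through the inner tile expansion, the input swizzle permutation, and the final mapping through invOutSwizzlePerm.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Step 1 results of the running example (see the previous snippet).
  const int64_t outputNumDims = 3;  // output map is identity over (d0, d1, d2)
  const std::vector<int64_t> outInnerDimsPos = {1, 2};
  const std::vector<int64_t> outOffsetForDimsPos = {0, 3};
  const std::vector<int64_t> invOutSwizzlePerm = {0, 1, 2, 5, 3, 6, 7, 4};

  // Broadcast scales operand: tensor<2x64xf32>, inputMap (d0, d1, d2) -> (d0, d2),
  // innerDimsPos = [1], swizzle expandShape = [[4, 4]], permutation = [1, 0].
  const std::vector<int64_t> inputResultDims = {0, 2};  // dims used by the input map
  const std::vector<int64_t> innerDimsPos = {1};
  const std::vector<std::vector<int64_t>> expandShape = {{4, 4}};
  const std::vector<int64_t> swizzlePermutation = {1, 0};
  const int64_t inRank = 2;

  // Step 2a: outer dims. outerDimsPerm is the identity here, so the packed
  // result dims start out as the input map's result dims.
  std::vector<int64_t> packedResultDims = inputResultDims;

  // Step 2b: append the swizzle-expanded inner tile dims. innerDimsPos[0] = 1
  // refers to d2 of the input map, which matches outInnerDimsPos[1], so the
  // appended dims start at outputNumDims + outOffsetForDimsPos[1] = 6.
  for (size_t idx = 0; idx < innerDimsPos.size(); ++idx) {
    const int64_t dimPos = inputResultDims[innerDimsPos[idx]];
    for (size_t tileIdx = 0; tileIdx < outInnerDimsPos.size(); ++tileIdx) {
      if (dimPos != outInnerDimsPos[tileIdx])
        continue;
      const int64_t outIdxOffset = outputNumDims + outOffsetForDimsPos[tileIdx];
      for (size_t i = 0; i < expandShape[idx].size(); ++i)
        packedResultDims.push_back(outIdxOffset + static_cast<int64_t>(i));
    }
  }
  assert((packedResultDims == std::vector<int64_t>{0, 2, 6, 7}));

  // Step 2c: apply the input swizzle permutation to the trailing dims.
  std::vector<int64_t> transposePerm = {0, 1};
  for (int64_t p : swizzlePermutation)
    transposePerm.push_back(inRank + p);  // -> [0, 1, 3, 2]
  std::vector<int64_t> permuted(packedResultDims.size());
  for (size_t i = 0; i < transposePerm.size(); ++i)
    permuted[i] = packedResultDims[transposePerm[i]];
  packedResultDims = permuted;
  assert((packedResultDims == std::vector<int64_t>{0, 2, 7, 6}));

  // Step 3: map every dim through the inverse output swizzle permutation.
  std::vector<int64_t> finalPackedResultDims;
  for (int64_t r : packedResultDims)
    finalPackedResultDims.push_back(invOutSwizzlePerm[r]);
  assert((finalPackedResultDims == std::vector<int64_t>{0, 2, 4, 7}));
  std::cout << "packed indexing map results: (d0, d2, d4, d7)\n";
  return 0;
}

The final dimensions [0, 2, 4, 7] correspond to the map #[[$MAP1]] = (d0, ..., d7) -> (d0, d2, d4, d7) checked in the new test case below.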

compiler/src/iree/compiler/Codegen/Common/test/gpu_materialize_encoding_gfx942.mlir

+54
@@ -1260,3 +1260,57 @@ func.func @missing_user_indexing_maps() {
 // CHECK-DAG: %[[STORE_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1)
 // CHECK-DAG: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[LOAD_BINDING]]{{.+}} -> tensor<255x513xf32>
 // CHECK-DAG: flow.dispatch.tensor.store %[[LOAD]], %[[STORE_BINDING]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>]>
+#encoding_bcast = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [[affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>, affine_map<(d0, d1, d2) -> (d0, d2)>], affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>]>
+func.func @dequantization() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x128x64xi8, #encoding>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x64xf32, #encoding_bcast>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x64xf32, #encoding_bcast>>
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x128x64xf32, #encoding>>
+  %7 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 128, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x128x64xi8, #encoding>> -> tensor<2x128x64xi8, #encoding>
+  %8 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64xf32, #encoding_bcast>> -> tensor<2x64xf32, #encoding_bcast>
+  %9 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64xf32, #encoding_bcast>> -> tensor<2x64xf32, #encoding_bcast>
+  %13 = tensor.empty() : tensor<2x128x64xf32, #encoding>
+  %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7, %8, %9 : tensor<2x128x64xi8, #encoding>, tensor<2x64xf32, #encoding_bcast>, tensor<2x64xf32, #encoding_bcast>) outs(%13 : tensor<2x128x64xf32, #encoding>) {
+  ^bb0(%in: i8, %in_0: f32, %in_1: f32, %out: f32):
+    %21 = arith.extui %in : i8 to i32
+    %22 = arith.uitofp %21 : i32 to f32
+    %23 = arith.subf %22, %in_1 : f32
+    %24 = arith.mulf %23, %in_0 : f32
+    linalg.yield %24 : f32
+  } -> tensor<2x128x64xf32, #encoding>
+  flow.dispatch.tensor.store %14, %6, offsets = [0, 0, 0], sizes = [2, 128, 64], strides = [1, 1, 1] : tensor<2x128x64xf32, #encoding> -> !flow.dispatch.tensor<writeonly:tensor<2x128x64xf32, #encoding>>
+  return
+}
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d2, d4, d7)>
+// CHECK-LABEL: func.func @dequantization()
+// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.*}} binding(0) {{.*}} : !flow.dispatch.tensor<readonly:tensor<2x1x4x8x4x4x4x4xi8>>
+// CHECK-DAG: %[[LHS_SCALES_BINDING:.+]] = hal.interface.binding.subspan {{.*}} binding(1) {{.*}} : !flow.dispatch.tensor<readonly:tensor<2x4x4x4xf32>>
+// CHECK-DAG: %[[LHS_ZPS_BINDING:.+]] = hal.interface.binding.subspan {{.*}} binding(2) {{.*}} : !flow.dispatch.tensor<readonly:tensor<2x4x4x4xf32>>
+// CHECK-DAG: %[[RESULT_BINDING:.+]] = hal.interface.binding.subspan {{.*}} binding(3) {{.*}} : !flow.dispatch.tensor<writeonly:tensor<2x1x4x8x4x4x4x4xf32>>
+// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]], offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [2, 1, 4, 8, 4, 4, 4, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1x4x8x4x4x4x4xi8>> -> tensor<2x1x4x8x4x4x4x4xi8>
+// CHECK-DAG: %[[LHS_SCALES:.+]] = flow.dispatch.tensor.load %[[LHS_SCALES_BINDING]], offsets = [0, 0, 0, 0], sizes = [2, 4, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x4x4xf32>> -> tensor<2x4x4x4xf32>
+// CHECK-DAG: %[[LHS_ZPS:.+]] = flow.dispatch.tensor.load %[[LHS_ZPS_BINDING]], offsets = [0, 0, 0, 0], sizes = [2, 4, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x4x4xf32>> -> tensor<2x4x4x4xf32>
+// CHECK-DAG: %[[EMPTY_LHS:.+]] = tensor.empty() : tensor<2x1x4x8x4x4x4x4xf32>
+// CHECK-DAG: %[[LHS_DEQUANT:.+]] = linalg.generic
+// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP1]], #[[$MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]
+// CHECK-SAME: ins(%[[LHS]], %[[LHS_SCALES]], %[[LHS_ZPS]] : tensor<2x1x4x8x4x4x4x4xi8>, tensor<2x4x4x4xf32>, tensor<2x4x4x4xf32>)
+// CHECK-SAME: outs(%[[EMPTY_LHS]] : tensor<2x1x4x8x4x4x4x4xf32>)
+// CHECK: arith.extui
+// CHECK: arith.uitofp
+// CHECK: arith.subf
+// CHECK: arith.mulf
+// CHECK: flow.dispatch.tensor.store %[[LHS_DEQUANT]], %[[RESULT_BINDING]], offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [2, 1, 4, 8, 4, 4, 4, 4], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<2x1x4x8x4x4x4x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x1x4x8x4x4x4x4xf32>>
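
For reference, the static shapes in the CHECK lines follow mechanically from the pack and swizzle parameters of the running example. The sketch below rederives them; `packShape` and `swizzleShape` are hypothetical helpers written only for this illustration (outerDimsPerm is identity here, so it is ignored), and the tile sizes and swizzle are taken from the example rather than queried from the GFX942 data-tiling layout logic.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical helper: split every innerDimsPos dim by its tile size and
// append the tile sizes as trailing dims (outerDimsPerm assumed identity).
static std::vector<int64_t> packShape(const std::vector<int64_t> &shape,
                                      const std::vector<int64_t> &innerDimsPos,
                                      const std::vector<int64_t> &innerTileSizes) {
  std::vector<int64_t> packed = shape;
  for (size_t i = 0; i < innerDimsPos.size(); ++i)
    packed[innerDimsPos[i]] = shape[innerDimsPos[i]] / innerTileSizes[i];
  for (int64_t tile : innerTileSizes)
    packed.push_back(tile);
  return packed;
}

// Hypothetical helper: replace the trailing `numTiles` tile dims by their
// swizzle expansion, then permute the expanded dims.
static std::vector<int64_t>
swizzleShape(const std::vector<int64_t> &packed, int64_t numTiles,
             const std::vector<std::vector<int64_t>> &expandShape,
             const std::vector<int64_t> &permutation) {
  std::vector<int64_t> result(packed.begin(), packed.end() - numTiles);
  std::vector<int64_t> expanded;
  for (const auto &group : expandShape)
    expanded.insert(expanded.end(), group.begin(), group.end());
  for (int64_t p : permutation)
    result.push_back(expanded[p]);
  return result;
}

int main() {
  // Output and quantized input operand: tensor<2x128x64>.
  auto out = packShape({2, 128, 64}, /*innerDimsPos=*/{1, 2},
                       /*innerTileSizes=*/{128, 16});  // -> 2x1x4x128x16
  out = swizzleShape(out, /*numTiles=*/2, {{4, 8, 4}, {4, 4}},
                     /*permutation=*/{1, 4, 0, 2, 3});
  assert((out == std::vector<int64_t>{2, 1, 4, 8, 4, 4, 4, 4}));

  // Broadcast scales / zero-point operands: tensor<2x64>.
  auto scales = packShape({2, 64}, /*innerDimsPos=*/{1}, /*innerTileSizes=*/{16});
  scales = swizzleShape(scales, /*numTiles=*/1, {{4, 4}}, /*permutation=*/{1, 0});
  assert((scales == std::vector<int64_t>{2, 4, 4, 4}));

  std::cout << "materialized shapes: 2x1x4x8x4x4x4x4 and 2x4x4x4\n";
  return 0;
}

Both asserts match the binding tensor types in the CHECK-DAG lines above.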
