add test of thread mapping and test of error
newling committed Aug 29, 2024
1 parent 1cb50e1 commit 38c7885
Showing 2 changed files with 54 additions and 28 deletions.
@@ -27,9 +27,14 @@ namespace mlir::iree_compiler::AMDAIE {

namespace {

enum class GPUMappingType { Block, Thread };
enum class GPUGroupType { Block, Thread };

/// Assign GPU dialect thread/block mapping attributes to tiled dimensions.
/// The returned vector's size is the number of non-zero values in
/// `tileSizesVal`. Failure is returned if it is not possible to assign
/// mapping attributes to the dimensions.
FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
ArrayRef<int64_t> tileSizesVal, GPUMappingType mappingType,
ArrayRef<int64_t> tileSizesVal, GPUGroupType groupType,
TilingInterface op) {
MLIRContext *context = op.getContext();

@@ -43,57 +48,62 @@ FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
std::count_if(tileSizesVal.begin(), tileSizesVal.end(),
[](int64_t t) { return t > 1; });

// See mlir::gpu::MappingId enum. Currently 13 dimensions are supported.
// clang-format off
// https://github.com/llvm/llvm-project/blob/e8063702cfbbf39f0b92283d0588dee264b5eb2b/mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td#L37.
// clang-format on
// The mlir::gpu::MappingId enum supports 13 dimensions, see:
// https://github.com/llvm/llvm-project/blob/main
// /mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td
if (nbIndVars > mlir::gpu::getMaxEnumValForMappingId()) {
return op->emitOpError("has too many dimensions to tile, ")
<< "there are only " << mlir::gpu::getMaxEnumValForMappingId()
<< " dimensions available in the mlir::gpu dialect (require "
<< nbIndVars << " here).";
<< " dimensions available in the mlir::gpu dialect, but "
<< nbIndVars << " are required here..";
}

// Currently we expect only 2 tiled dimensions to be >1 when mapping
// to thread dimensions. This is to target the 2-D AIE array.
//
// TODO(newling) if there are 3+ dimensions, we probably need to collapse
// them. I'm leaving this as follow-up pass. Instead of
// (i,j,k) in (2,3,5)
// for example, could be
// (i,l) in (2,15)
// them into just 2. I'm leaving this as a follow-up task. Basically, instead
// of
// ```(i,j,k) in (2,3,5)```
// we want
// ```(i,l) in (2,15)```
// where j = l/5 and k = l%5.
// Once the above is implemented, we can safely remove the following check
// entirely:
if (nbIndVarsAboveOne > 2 && mappingType == GPUMappingType::Thread) {
//
// Once the above is implemented, we can safely remove the following check:
if (nbIndVarsAboveOne > 2 && groupType == GPUGroupType::Thread) {
auto tileSizesStr = std::to_string(tileSizesVal[0]);
for (unsigned i = 1; i < tileSizesVal.size(); ++i) {
tileSizesStr += ", " + std::to_string(tileSizesVal[i]);
}
return op->emitOpError("has requested tile sizes (")
return op->emitOpError("has requested tile sizes [")
<< tileSizesStr
<< "). Currently we only support tiling thread dimensions "
<< "with at most 2 dimensions having a tile size greater than 1. ";
<< "]. Currently we only support tiling thread dimensions "
<< "with at most 2 dimensions having a tile size greater than 1, "
<< "there are " << nbIndVarsAboveOne << " here.";
}
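
// For illustration, a minimal standalone C++ sketch (hypothetical, not part of
// this commit or of the pass) of the collapse described in the TODO above:
// fold the trailing dimensions (j, k) with extents (3, 5) into a single
// dimension l with extent 15, then recover j = l/5 and k = l%5.
#include <cassert>
#include <cstdint>

int main() {
  constexpr int64_t extentJ = 3, extentK = 5;
  for (int64_t j = 0; j < extentJ; ++j) {
    for (int64_t k = 0; k < extentK; ++k) {
      int64_t l = j * extentK + k;  // linearize: (j, k) -> l in [0, 15)
      assert(l / extentK == j);     // delinearize: j = l / 5
      assert(l % extentK == k);     // delinearize: k = l % 5
    }
  }
  return 0;
}
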

auto getMappingAttributeForDimension = [&](uint32_t i) -> Attribute {
auto id = static_cast<gpu::MappingId>(i);
if (mappingType == GPUMappingType::Block)
if (groupType == GPUGroupType::Block)
return gpu::GPUBlockMappingAttr::get(context, id);
else if (mappingType == GPUMappingType::Thread)
else if (groupType == GPUGroupType::Thread)
return gpu::GPUThreadMappingAttr::get(context, id);
else {
assert(false && "unhandled mapping type");
assert(false && "Unhandled group type, must be thread or block.");
}
};

// Map an integer to an Attribute as follows:
// 0 -> DimY
// 1 -> DimX
// 2 -> DimZ
// 3 -> LinearDim0
// 4 -> LinearDim1
// etc.
//
// Note that 0 and 1 are effectively swapped, because for AIE we want to
// map the first dimension to AIE array columns (or something like that).
auto getAttribute = [&](uint32_t i) -> Attribute {
if (i == 0)
return getMappingAttributeForDimension(1);
Expand All @@ -103,9 +113,8 @@ FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
return getMappingAttributeForDimension(i);
};
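
// For reference, a minimal standalone sketch (plain C++, no MLIR dependencies;
// all names are illustrative only) of the tiled-dimension-to-mapping-id
// ordering described in the comment above: 0 -> DimY, 1 -> DimX, 2 -> DimZ,
// 3 -> LinearDim0, and so on. The block-level test further down, with
// mapping = [#gpu.block<y>, #gpu.block<x>, #gpu.block<z>], reflects the same
// swap of the first two dimensions.
#include <cstdint>
#include <iostream>
#include <string>

// Same order as mlir::gpu::MappingId: DimX, DimY, DimZ, LinearDim0, ...
static std::string mappingIdName(uint32_t i) {
  static const char *fixed[] = {"DimX", "DimY", "DimZ"};
  return i < 3 ? std::string(fixed[i]) : "LinearDim" + std::to_string(i - 3);
}

// The swap performed by getAttribute: tiled dimension 0 gets DimY, dimension 1
// gets DimX, and every later dimension keeps its own index.
static std::string attributeForTiledDim(uint32_t i) {
  if (i == 0) return mappingIdName(1);
  if (i == 1) return mappingIdName(0);
  return mappingIdName(i);
}

int main() {
  for (uint32_t i = 0; i < 5; ++i)
    std::cout << i << " -> " << attributeForTiledDim(i) << "\n";
  // Prints: 0 -> DimY, 1 -> DimX, 2 -> DimZ, 3 -> LinearDim0, 4 -> LinearDim1
  return 0;
}
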

// Map from tiling dimension to Attribute.
// We give priority to tiling dimensions of size > 1, so that they
// preferentially get DimY and DimX.
SmallVector<Attribute> mapping(tileSizesVal.size(), {});
uint32_t nAttributes = 0;
for (uint32_t i = 0; i < tileSizesVal.size(); ++i) {
@@ -269,7 +278,7 @@ void AMDAIETileAndFusePass::runOnOperation() {
options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
auto maybeMapping = getGPUMappingAttributes(
tileSizesVal,
tilingLevel == 0 ? GPUMappingType::Block : GPUMappingType::Thread,
tilingLevel == 0 ? GPUGroupType::Block : GPUGroupType::Thread,
consumerOp);
if (failed(maybeMapping)) {
return signalPassFailure();
@@ -302,8 +311,7 @@ void AMDAIETileAndFusePass::runOnOperation() {
// Fuse all Linalg ops (can be generalized later)
.Default([&](Operation *op) {
return op->getDialect() ==
rewriter.getContext()
->getLoadedDialect<linalg::LinalgDialect>();
context->getLoadedDialect<linalg::LinalgDialect>();
});
return {fusableOp, false};
});
@@ -1,5 +1,5 @@
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-1
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-LEVEL-0
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=1}))' --split-input-file --verify-diagnostics %s | FileCheck %s --check-prefix=TILE-LEVEL-1
// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-amdaie-tile-and-fuse{tiling-level=0 tile-elementwise=false}))' --split-input-file %s | FileCheck %s --check-prefix=TILE-MATMUL-ONLY

func.func @matmul_static(%arg0: tensor<8x16xi32>, %arg1 : tensor<16x8xi32>) -> tensor<8x8xi32> {
@@ -34,6 +34,24 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x3
// TILE-LEVEL-0: linalg.conv_2d_nhwc_hwcf
// TILE-LEVEL-0: } {mapping = [#gpu.block<y>, #gpu.block<x>, #gpu.block<z>]}

// TILE-LEVEL-1: @conv_2d_nhwc_hwcf
// TILE-LEVEL-1: scf.forall
// TILE-LEVEL-1-SAME: {
// TILE-LEVEL-1: linalg.fill
// TILE-LEVEL-1: linalg.conv_2d_nhwc_hwcf
// TILE-LEVEL-1: } {mapping = [#gpu.thread<z>, #gpu.thread<linear_dim_0>, #gpu.thread<y>, #gpu.thread<x>]}

// -----

func.func @conv_2d_nhwc_hwcf_unsupported_tiling(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x32x64xbf16>) -> tensor<2x12x12x64xf32> {
%cst = arith.constant 0.000000e+00 : f32
%0 = tensor.empty() : tensor<2x12x12x64xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
// expected-error @+1 {{'linalg.conv_2d_nhwc_hwcf' op has requested tile sizes [1, 4, 4, 4, 0, 0, 0]. Currently we only support tiling thread dimensions with at most 2 dimensions having a tile size greater than 1, there are 3 here.}}
%2 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 4, 4, 4, 0, 0, 0], [1, 4, 4, 4, 0, 0, 0], [0, 0, 0, 0, 1, 1, 8]]>, strides = dense<1> : vector<2xi64>} ins(%arg0, %arg1 : tensor<2x14x14x32xbf16>, tensor<3x3x32x64xbf16>) outs(%1 : tensor<2x12x12x64xf32>) -> tensor<2x12x12x64xf32>
return %2 : tensor<2x12x12x64xf32>
}

// -----

func.func @matmul_static(%arg0: tensor<8x16xi32>, %arg1 : tensor<16x8xi32>) -> tensor<8x8xi32> {
