Skip to content

Commit

Permalink
squash again
Browse files Browse the repository at this point in the history
  • Loading branch information
newling committed Aug 29, 2024
1 parent bfbd397 commit 1cb50e1
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "iree-amd-aie/Transforms/Passes.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "llvm/ADT/StringExtras.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
Expand All @@ -16,16 +17,119 @@
#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/IR/Iterators.h"
#include "mlir/IR/PatternMatch.h"

#define DEBUG_TYPE "iree-amdaie-tile-and-fuse"


namespace mlir::iree_compiler::AMDAIE {

namespace {

/// Selects which family of GPU mapping attribute is generated for a tiled
/// loop dimension.
enum class GPUMappingType {
  /// Generate gpu::GPUBlockMappingAttr attributes.
  Block,
  /// Generate gpu::GPUThreadMappingAttr attributes.
  Thread,
};
/// Build the list of GPU mapping attributes to attach to the scf.forall
/// created when tiling `op` with tile sizes `tileSizesVal`.
///
/// A tile size of 0 means 'do not tile' and produces no attribute; every
/// non-zero tile size produces exactly one attribute. The returned vector
/// therefore has one entry per non-zero tile size, in the order in which the
/// tiled dimensions appear in `tileSizesVal`.
///
/// Mapping ids are handed out in the order DimY, DimX, DimZ, LinearDim0, ...
/// Dimensions with a tile size > 1 are served first, so that they
/// preferentially receive DimY and DimX.
///
/// \param tileSizesVal one tile size per loop dimension of `op`.
/// \param mappingType  whether block or thread mapping attributes are created.
/// \param op           the op being tiled; used for its context and as the
///                     anchor for emitted diagnostics.
/// \returns the mapping attributes, or failure (after emitting an op error on
///          `op`) if there are more tiled dimensions than the comparison with
///          gpu::getMaxEnumValForMappingId() allows, or if `mappingType` is
///          Thread and more than 2 tile sizes are greater than 1.
FailureOr<SmallVector<Attribute>> getGPUMappingAttributes(
    ArrayRef<int64_t> tileSizesVal, GPUMappingType mappingType,
    TilingInterface op) {
  MLIRContext *context = op.getContext();

  // There is one induction variable in the scf.forall for each of the
  // non-zero tile sizes. Recall that a '0' tile size corresponds to 'do
  // not tile'.
  uint32_t nbIndVars = std::count_if(tileSizesVal.begin(), tileSizesVal.end(),
                                     [](int64_t t) { return t != 0; });

  uint32_t nbIndVarsAboveOne =
      std::count_if(tileSizesVal.begin(), tileSizesVal.end(),
                    [](int64_t t) { return t > 1; });

  // See mlir::gpu::MappingId enum. Currently 13 dimensions are supported.
  // clang-format off
  // https://github.com/llvm/llvm-project/blob/e8063702cfbbf39f0b92283d0588dee264b5eb2b/mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td#L37.
  // clang-format on
  // NOTE(review): getMaxEnumValForMappingId() presumably returns the largest
  // enum value rather than the number of enum values, in which case this check
  // is one stricter than necessary -- confirm before relaxing it.
  if (nbIndVars > mlir::gpu::getMaxEnumValForMappingId()) {
    return op->emitOpError("has too many dimensions to tile, ")
           << "there are only " << mlir::gpu::getMaxEnumValForMappingId()
           << " dimensions available in the mlir::gpu dialect (require "
           << nbIndVars << " here).";
  }

  // Currently we expect only 2 tiled dimensions to be >1 when mapping
  // to thread dimensions. This is to target the 2-D AIE array.
  //
  // TODO(newling) if there are 3+ dimensions, we probably need to collapse
  // them. I'm leaving this as follow-up pass. Instead of
  //   (i,j,k) in (2,3,5)
  // for example, could be
  //   (i,l) in (2,15)
  // with then
  //   j=l/5 and k=l%5.
  // Once the above is implemented, we can safely remove the following check
  // entirely:
  if (nbIndVarsAboveOne > 2 && mappingType == GPUMappingType::Thread) {
    // nbIndVarsAboveOne > 2 guarantees tileSizesVal is non-empty here, so
    // indexing element 0 is safe.
    auto tileSizesStr = std::to_string(tileSizesVal[0]);
    for (size_t i = 1; i < tileSizesVal.size(); ++i) {
      tileSizesStr += ", " + std::to_string(tileSizesVal[i]);
    }
    return op->emitOpError("has requested tile sizes (")
           << tileSizesStr
           << "). Currently we only support tiling thread dimensions "
           << "with at most 2 dimensions having a tile size greater than 1. ";
  }

  // Create a block or thread mapping attribute for the raw MappingId value
  // `i`. Every path returns a value: previously the fall-through branch only
  // contained an assert, so with NDEBUG control flowed off the end of a
  // non-void lambda (undefined behaviour).
  auto getMappingAttributeForDimension = [&](uint32_t i) -> Attribute {
    auto id = static_cast<gpu::MappingId>(i);
    if (mappingType == GPUMappingType::Block)
      return gpu::GPUBlockMappingAttr::get(context, id);
    assert(mappingType == GPUMappingType::Thread && "unhandled mapping type");
    return gpu::GPUThreadMappingAttr::get(context, id);
  };

  // Translate the i'th assigned attribute into a mapping id, swapping the
  // first two ids so that DimY is handed out before DimX:
  // 0 -> DimY
  // 1 -> DimX
  // 2 -> DimZ
  // 3 -> LinearDim0
  // 4 -> LinearDim1
  // etc.
  auto getAttribute = [&](uint32_t i) -> Attribute {
    if (i == 0) return getMappingAttributeForDimension(1);
    if (i == 1) return getMappingAttributeForDimension(0);
    return getMappingAttributeForDimension(i);
  };

  // Map from tiling dimension to Attribute.
  // We give priority to tiling dimensions of size > 1, so that they
  // preferentially get DimY and DimX.
  SmallVector<Attribute> mapping(tileSizesVal.size(), {});
  uint32_t nAttributes = 0;
  for (size_t i = 0; i < tileSizesVal.size(); ++i) {
    if (tileSizesVal[i] > 1) {
      mapping[i] = getAttribute(nAttributes);
      ++nAttributes;
    }
  }
  // Second pass: the remaining tiled dimensions that have not yet been given
  // an attribute (positive tile size, not handled by the first pass).
  for (size_t i = 0; i < tileSizesVal.size(); ++i) {
    if (!mapping[i] && tileSizesVal[i] > 0) {
      mapping[i] = getAttribute(nAttributes);
      ++nAttributes;
    }
  }

  // Squeeze out the empty attributes (corresponding to '0's in tileSizesVal).
  SmallVector<Attribute> finalMapping;
  finalMapping.reserve(nbIndVars);
  for (Attribute attr : mapping) {
    if (attr) finalMapping.push_back(attr);
  }
  return finalMapping;
}

/// Utility function to check if any of the reduction dimension is being tiled.
static bool isTilingReductionDimension(TilingInterface consumerOp,
SmallVector<int64_t> tileSizesVal) {
Expand Down Expand Up @@ -157,27 +261,20 @@ void AMDAIETileAndFusePass::runOnOperation() {

SmallVector<OpFoldResult> tileSizes =
getAsIndexOpFoldResult(context, tileSizesVal);

auto options = scf::SCFTilingOptions().setTileSizes(tileSizes);

// When tiling using scf.for we do not need to set any mapping.
if (!useSCFFor) {
options.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
// Here we assume there are always two levels of parallel (scf.forall)
// loops, and the first level of tiling is always using scf.forall and
// mapped to blocks. Currently we are not using mapping attributes for
// Conv2d ops, because there could be four parallel tiling dimensions.
// TODO (vivian): create AIE specific mapping attributes.
if (!isa<linalg::ConvolutionOpInterface>(consumerOp.getOperation())) {
if (tilingLevel == 0) {
options.setMapping(
{gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimY),
gpu::GPUBlockMappingAttr::get(context, gpu::MappingId::DimX)});
} else {
options.setMapping(
{gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimY),
gpu::GPUThreadMappingAttr::get(context, gpu::MappingId::DimX)});
}
auto maybeMapping = getGPUMappingAttributes(
tileSizesVal,
tilingLevel == 0 ? GPUMappingType::Block : GPUMappingType::Thread,
consumerOp);
if (failed(maybeMapping)) {
return signalPassFailure();
}
options.setMapping(maybeMapping.value());
}

IRRewriter rewriter(context);
Expand Down
18 changes: 18 additions & 0 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,24 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) {
passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass());
passManager.addPass(memref::createFoldMemRefAliasOpsPass());
passManager.addPass(createAMDAIEBridgeToAIRPass());

// Running canonicalization for all pipelines here results in failures.
// Example
// ```
// 'memref.cast' op is an unsupported operation. This pass currently only
// supports AllocOp and SubViewOp as inputs.
// ```
// It is currently required for the convolution pipeline though, to remove the
// extra (size-1) thread- and group- dimensions.
//
// TODO(newling) there are better solutions like:
// 1) make canonicalization work for scf.forall
// 2) pass to collapse rank-4 scf.foralls to rank-2 scf.foralls.
// 3) resolve above 'unsupported operation' error.
if (clUseTilePipeline == TilePassPipeline::ConvDecomposePipeline) {
passManager.addPass(createCanonicalizerPass());
}

// TODO (Erwei): Figure out a way to work with AMDAIEPackToDmaPass.
if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline)
passManager.addPass(createAMDAIEDecomposeLinalgExtPackUnPackToAIRPass());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func.func @conv_2d_nhwc_hwcf(%arg0: tensor<2x14x14x32xbf16>, %arg1: tensor<3x3x3
// TILE-LEVEL-0-SAME: {
// TILE-LEVEL-0: linalg.fill
// TILE-LEVEL-0: linalg.conv_2d_nhwc_hwcf
// TILE-LEVEL-0: }
// TILE-LEVEL-0: } {mapping = [#gpu.block<y>, #gpu.block<x>, #gpu.block<z>]}

// -----

Expand Down

0 comments on commit 1cb50e1

Please sign in to comment.