Skip to content

Commit

Permalink
Implement thread coarsening for GPU kernels (#352)
Browse files Browse the repository at this point in the history
* Add parallel loop unroll pass

* Canonicalizer for flattening alternatives op

* Add env variables for user-specified coarsening factors

* A zero-block gpu kernel launch is an error so avoid that

* Max shared memory depends on the target gpu

* Canonicalize AlternativeOp with a single region

* Output kernel information
  • Loading branch information
ivanradanov authored Jul 10, 2023
1 parent e288f7b commit 07ec6ca
Show file tree
Hide file tree
Showing 14 changed files with 1,916 additions and 250 deletions.
3 changes: 2 additions & 1 deletion include/polygeist/Passes/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@ std::unique_ptr<Pass> createParallelLowerPass(
std::unique_ptr<Pass> createConvertCudaRTtoCPUPass();
std::unique_ptr<Pass> createConvertCudaRTtoGPUPass();
std::unique_ptr<Pass> createConvertCudaRTtoHipRTPass();
std::unique_ptr<Pass> createSCFParallelLoopUnrollPass(int unrollFactor = 2);
std::unique_ptr<Pass>
createConvertPolygeistToLLVMPass(const LowerToLLVMOptions &options,
bool useCStyleMemRef, bool onlyGpuModules,
std::string gpuTarget);
std::unique_ptr<Pass> createConvertPolygeistToLLVMPass();
std::unique_ptr<Pass> createForBreakToWhilePass();
std::unique_ptr<Pass>
createConvertParallelToGPUPass1(bool useOriginalThreadNums = false);
createConvertParallelToGPUPass1(std::string arch = "sm_60");
std::unique_ptr<Pass>
createConvertParallelToGPUPass2(bool emitGPUKernelLaunchBounds = true);
std::unique_ptr<Pass> createMergeGPUModulesPass();
Expand Down
13 changes: 13 additions & 0 deletions include/polygeist/Passes/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@ def Mem2Reg : Pass<"mem2reg"> {
let constructor = "mlir::polygeist::createMem2RegPass()";
}

// Unrolls scf.parallel loops by `unrollFactor` and interleaves the unrolled
// bodies; used as a building block for GPU thread coarsening.
def SCFParallelLoopUnroll : Pass<"scf-parallel-loop-unroll"> {
let summary = "Unroll and interleave scf parallel loops";
let dependentDialects =
["::mlir::scf::SCFDialect"];
let constructor = "mlir::polygeist::createSCFParallelLoopUnrollPass()";
let options = [
Option<"unrollFactor", "unrollFactor", "int", /*default=*/"2", "Unroll factor">
];
}

def ConvertCudaRTtoCPU : Pass<"convert-cudart-to-cpu", "mlir::ModuleOp"> {
let summary = "Lower cudart functions to cpu versions";
let dependentDialects =
Expand Down Expand Up @@ -60,6 +70,9 @@ def ConvertParallelToGPU1 : Pass<"convert-parallel-to-gpu1"> {
let summary = "Convert parallel loops to gpu";
let constructor = "mlir::polygeist::createConvertParallelToGPUPass1()";
let dependentDialects = ["func::FuncDialect", "LLVM::LLVMDialect", "memref::MemRefDialect"];
let options = [
Option<"arch", "arch", "std::string", /*default=*/"\"sm_60\"", "Target GPU architecture">
];
}

def ConvertParallelToGPU2 : Pass<"convert-parallel-to-gpu2"> {
Expand Down
1 change: 1 addition & 0 deletions include/polygeist/PolygeistOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def AlternativesOp : Polygeist_Op<"alternatives", [
let regions = (region VariadicRegion<SizedRegion<1>>:$regions);
let skipDefaultBuilders = 1;
let builders = [OpBuilder<(ins "int":$regionNum)>];
let hasCanonicalizer = 1;
}

def GPUWrapperOp : Polygeist_Op<"gpu_wrapper", [
Expand Down
90 changes: 90 additions & 0 deletions lib/polygeist/Ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,96 @@ void AlternativesOp::build(OpBuilder &builder, OperationState &result,
}
}

/// Canonicalization: an alternatives op with exactly one region offers no
/// choice, so its body can simply be inlined in place of the op.
class HoistSingleAlternative final : public OpRewritePattern<AlternativesOp> {
public:
  using OpRewritePattern<AlternativesOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(AlternativesOp aop,
                                PatternRewriter &rewriter) const override {
    assert(aop->getNumRegions() > 0);
    if (aop->getNumRegions() != 1)
      return failure();
    Block *body = &*aop->getRegions()[0].begin();
    // Drop the polygeist.yield terminator, splice the remaining ops in
    // front of the alternatives op, then erase the now-empty op.
    rewriter.eraseOp(body->getTerminator());
    rewriter.mergeBlockBefore(body, aop);
    rewriter.eraseOp(aop);
    return success();
  }
};

/// Canonicalization: flattens an alternatives op nested inside another one.
/// The outer region hosting the nested op is expanded into one region per
/// nested alternative (the surrounding ops are cloned around each nested
/// body), while the outer op's remaining regions are copied over unchanged.
///
/// Fixes vs. original: removes the unused `srcBlock` local and renames the
/// inner-loop variables that shadowed the `aop` pattern argument and the
/// host-block iteration variable.
class FlattenAlternatives final : public OpRewritePattern<AlternativesOp> {
public:
  using OpRewritePattern<AlternativesOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(AlternativesOp aop,
                                PatternRewriter &rewriter) const override {
    // Only handle the outermost alternatives op; nested ones are reached
    // on later iterations once their parent has been flattened.
    if (aop->getParentOfType<AlternativesOp>())
      return failure();

    // Find the first alternatives op nested inside one of our regions.
    AlternativesOp innerAop = nullptr;
    for (auto &region : aop->getRegions()) {
      for (auto &op : region.getOps()) {
        if (auto nestedAop = dyn_cast<AlternativesOp>(&op)) {
          innerAop = nestedAop;
          break;
        }
      }
      if (innerAop)
        break;
    }
    if (!innerAop)
      return failure();

    // New op with one region per (inner alternative) plus the outer op's
    // other regions; the host region itself is consumed by the expansion.
    // TODO use block insertion etc for better performance
    auto newAop = rewriter.create<polygeist::AlternativesOp>(
        aop->getLoc(), innerAop->getNumRegions() + aop->getNumRegions() - 1);
    newAop->setAttrs(aop->getAttrs());

    // For each nested alternative, emit one region containing the host
    // block's ops with the nested op replaced by that alternative's body.
    unsigned curRegion = 0;
    for (; curRegion < innerAop->getNumRegions(); curRegion++) {
      BlockAndValueMapping mapping;
      auto block = &*newAop->getRegion(curRegion).begin();
      rewriter.setInsertionPointToStart(block);
      for (auto &hostOp : *innerAop->getBlock()) {
        if (&hostOp == innerAop.getOperation()) {
          // Splice in the body of this nested alternative.
          for (auto &innerOp : innerAop->getRegion(curRegion).getOps())
            if (!isa<PolygeistYieldOp>(&innerOp))
              rewriter.clone(innerOp, mapping);
        } else if (!isa<PolygeistYieldOp>(&hostOp)) {
          rewriter.clone(hostOp, mapping);
        }
      }
    }

    // Copy the outer op's remaining regions verbatim, skipping the region
    // that hosted the nested alternatives op.
    for (unsigned oldRegion = 0; oldRegion < aop->getNumRegions();
         oldRegion++) {
      auto &srcRegion = aop->getRegion(oldRegion);
      if (innerAop->getBlock()->getParent() == &srcRegion)
        continue;
      auto block = &*newAop->getRegion(curRegion).begin();
      rewriter.setInsertionPointToStart(block);
      BlockAndValueMapping mapping;
      for (auto &op : srcRegion.getOps())
        if (!isa<PolygeistYieldOp>(&op))
          rewriter.clone(op, mapping);
      curRegion++;
    }

    rewriter.eraseOp(aop);

    return success();
  }
};

/// Registers the alternatives-op canonicalizations: flattening nested
/// alternatives ops and hoisting the body of a single-region op.
void AlternativesOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                 MLIRContext *context) {
  results.insert<FlattenAlternatives, HoistSingleAlternative>(context);
}

//===----------------------------------------------------------------------===//
// GPUBlockOp
//===----------------------------------------------------------------------===//
Expand Down
25 changes: 24 additions & 1 deletion lib/polygeist/Passes/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ add_mlir_dialect_library(MLIRPolygeistTransforms
ConvertParallelToGPU.cpp
SerializeToCubin.cpp
SerializeToHsaco.cpp
ParallelLoopUnroll.cpp

ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine
Expand Down Expand Up @@ -88,9 +89,31 @@ if(POLYGEIST_ENABLE_CUDA)

endif()
if(POLYGEIST_ENABLE_ROCM)
# Enable the gpu-to-hsaco serialization path and target the AMD HIP platform.
target_compile_definitions(obj.MLIRPolygeistTransforms
PRIVATE
POLYGEIST_ENABLE_ROCM=1
__HIP_PLATFORM_AMD__
)

# Locate the ROCm installation: prefer an explicit -DROCM_PATH, then the
# ROCM_PATH environment variable, falling back to the default /opt/rocm.
if (NOT DEFINED ROCM_PATH)
if (NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
else()
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
endif()
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} "${ROCM_PATH}/hip")
find_package(hip REQUIRED)

# TODO(review): prefer the include directories exported by the `hip` package
# (e.g. via the imported target) instead of hard-coding ${ROCM_PATH}/include.
target_include_directories(obj.MLIRPolygeistTransforms
PRIVATE
${ROCM_PATH}/include
)

target_link_libraries(MLIRPolygeistTransforms
PRIVATE
hip::host
)

endif()
Loading

0 comments on commit 07ec6ca

Please sign in to comment.