Add an option to do coalescing-friendly thread coarsening
ivanradanov committed Jul 14, 2023
1 parent d476b55 commit 2c7c4da
Showing 3 changed files with 35 additions and 16 deletions.
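
The change is small but targeted: thread coarsening in the GPU lowering unrolls an scf.parallel loop so that each GPU thread executes unrollFactor iterations. With the existing mapping, iv(i) = iv * unrollFactor + i, each thread handles a contiguous block of iterations, so neighbouring threads access locations roughly unrollFactor elements apart. The new, off-by-default option switches the unrolled copies to iv(i) = iv + upperBoundUnrolled * i, where upperBoundUnrolled = upperBound / unrollFactor, so that within each unrolled copy consecutive threads touch consecutive iterations, which is the access pattern GPU memory coalescing favours. For example, with upperBound = 32 and unrollFactor = 4, the thread with iv = 0 covers iterations 0, 1, 2, 3 under the old mapping but 0, 8, 16, 24 under the coalescing-friendly one; a standalone sketch of the two mappings follows the ParallelLoopUnroll.cpp diff below.
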
16 changes: 12 additions & 4 deletions lib/polygeist/Passes/ConvertParallelToGPU.cpp
@@ -48,6 +48,10 @@ static llvm::cl::opt<bool> GPUKernelEnableBlockCoarsening(
     "gpu-kernel-enable-block-coarsening", llvm::cl::init(true),
     llvm::cl::desc("When emitting coarsened kernels, enable block coarsening"));
 
+static llvm::cl::opt<bool> GPUKernelEnableCoalescingFriendlyUnroll(
+    "gpu-kernel-enable-coalescing-friendly-unroll", llvm::cl::init(false),
+    llvm::cl::desc("When thread coarsening, do coalescing-friendly unrolling"));
+
 // TODO when we add other backends, we would need to to add an argument to the
 // pass which one we are compiling to to provide the appropriate error id
 #if POLYGEIST_ENABLE_CUDA
@@ -1750,7 +1754,8 @@ struct ConvertParallelToGPU1Pass
       auto blockUnrollFactors = getUnrollFactors(coarsenBlocks);
       if (polygeist::scfParallelUnrollByFactors(
               gridPop, ArrayRef<uint64_t>(blockUnrollFactors),
-              /* generateEpilogueLoop */ true, nullptr)
+              /* generateEpilogueLoop */ true,
+              /* coalescingFriendlyIndexing */ false, nullptr)
               .failed())
         wrapper->emitRemark("Failed to coarsen blocks");
     }
@@ -1766,7 +1771,8 @@ struct ConvertParallelToGPU1Pass
       auto threadUnrollFactors = getUnrollFactors(coarsenThreads);
       if (polygeist::scfParallelUnrollByFactors(
               blockPop, ArrayRef<uint64_t>(threadUnrollFactors),
-              /* generateEpilogueLoop */ false, nullptr)
+              /* generateEpilogueLoop */ false,
+              GPUKernelEnableCoalescingFriendlyUnroll, nullptr)
               .failed())
         wrapper->emitRemark("Failed to coarsen threads");
     }
@@ -1934,7 +1940,8 @@ struct ConvertParallelToGPU1Pass
           auto unrollFactors = UNROLL_FACTORS[gridDims][iBlock];
           if (polygeist::scfParallelUnrollByFactors(
                   gridPop, ArrayRef<uint64_t>(unrollFactors),
-                  /* generateEpilogueLoop */ true, nullptr)
+                  /* generateEpilogueLoop */ true,
+                  /* coalescingFriendlyIndexing */ false, nullptr)
                   .failed()) {
             wrapper->emitRemark("Failed to coarsen blocks");
             succeeded = false;
@@ -1946,7 +1953,8 @@ struct ConvertParallelToGPU1Pass
           unrollFactors = UNROLL_FACTORS[blockDims][iThread];
           if (polygeist::scfParallelUnrollByFactors(
                   blockPop, ArrayRef<uint64_t>(unrollFactors),
-                  /* generateEpilogueLoop */ false, nullptr)
+                  /* generateEpilogueLoop */ false,
+                  GPUKernelEnableCoalescingFriendlyUnroll, nullptr)
                   .failed()) {
             wrapper->emitRemark("Failed to coarsen threads");
             succeeded = false;
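
Note how the new flag is wired in ConvertParallelToGPU.cpp: the option defaults to false, the block-coarsening call sites pass /* coalescingFriendlyIndexing */ false explicitly, and only the thread-coarsening call sites forward GPUKernelEnableCoalescingFriendlyUnroll, so default behaviour is unchanged. Since this is a plain llvm::cl::opt, enabling it presumably happens on the command line of whichever Polygeist tool hosts the pass (e.g. --gpu-kernel-enable-coalescing-friendly-unroll); the commit itself does not show an invocation.
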
22 changes: 16 additions & 6 deletions lib/polygeist/Passes/ParallelLoopUnroll.cpp
@@ -270,7 +270,7 @@ static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
 /// Unrolls 'pop' by 'unrollFactor', returns success if the loop is unrolled.
 LogicalResult mlir::polygeist::scfParallelUnrollByFactor(
     scf::ParallelOp &pop, uint64_t unrollFactor, unsigned dim,
-    bool generateEpilogueLoop,
+    bool generateEpilogueLoop, bool coalescingFriendlyIndexing,
     function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
   assert(unrollFactor > 0 && "expected positive unroll factor");
   assert(dim >= 0 && dim < pop.getUpperBound().size());
@@ -364,10 +364,19 @@ LogicalResult mlir::polygeist::scfParallelUnrollByFactor(
   auto res = generateUnrolledInterleavedLoop(
       pop.getBody(), dstPop.getBody(), dim, unrollFactor,
       [&](unsigned i, Value iv, OpBuilder b) {
-        // iv' = iv * unrollFactor + i
-        auto base = b.create<arith::MulIOp>(loc, iv, unrollFactorCst);
-        return b.create<arith::AddIOp>(
-            loc, base, b.create<arith::ConstantIndexOp>(loc, i));
+        if (coalescingFriendlyIndexing) {
+          // upperBoundUnrolled = upperBound / unrollFactor;
+          // iv(i) = iv + upperBoundUnrolled * i
+          auto base =
+              b.create<arith::MulIOp>(loc, upperBoundUnrolled,
+                                      b.create<arith::ConstantIndexOp>(loc, i));
+          return b.create<arith::AddIOp>(loc, base, iv);
+        } else {
+          // iv(i) = iv * unrollFactor + i
+          auto base = b.create<arith::MulIOp>(loc, iv, unrollFactorCst);
+          return b.create<arith::AddIOp>(
+              loc, base, b.create<arith::ConstantIndexOp>(loc, i));
+        }
       });
   if (res.succeeded()) {
     pop->erase();
@@ -392,7 +401,8 @@ struct SCFParallelLoopUnroll
         pops.push_back(pop);
       });
       for (auto pop : pops) {
-        (void)scfParallelUnrollByFactor(pop, unrollFactor, 0, true, nullptr)
+        (void)scfParallelUnrollByFactor(pop, unrollFactor, 0, true, false,
+                                        nullptr)
             .succeeded();
       }
     }
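
To make the arithmetic in the lambda above concrete, here is a standalone C++ sketch (not Polygeist code) of the two index mappings, assuming the upper bound divides evenly by the unroll factor so that upperBoundUnrolled = upperBound / unrollFactor; the names mirror the variables in the diff.

#include <cstdio>

int main() {
  const unsigned upperBound = 32, unrollFactor = 4;
  const unsigned upperBoundUnrolled = upperBound / unrollFactor;
  // iv is the induction variable of the coarsened loop; i indexes the
  // unrolled copy of the loop body.
  for (unsigned iv = 0; iv < upperBoundUnrolled; ++iv) {
    for (unsigned i = 0; i < unrollFactor; ++i) {
      unsigned blocked = iv * unrollFactor + i;       // default mapping
      unsigned strided = iv + upperBoundUnrolled * i; // coalescing-friendly
      std::printf("iv=%2u i=%u  default=%2u  coalescing-friendly=%2u\n", iv, i,
                  blocked, strided);
    }
  }
  return 0;
}

With the default mapping, the thread with iv = 0 touches iterations 0..3 while its neighbour iv = 1 touches 4..7, so simultaneous accesses from adjacent threads land unrollFactor elements apart. With the coalescing-friendly mapping, in unrolled copy i the threads iv = 0, 1, 2, ... touch the consecutive iterations i * upperBoundUnrolled, i * upperBoundUnrolled + 1, ..., which is what coalesced global-memory access wants.
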
13 changes: 7 additions & 6 deletions lib/polygeist/Passes/ParallelLoopUnroll.h
@@ -4,20 +4,21 @@
 namespace mlir::polygeist {
 LogicalResult scfParallelUnrollByFactor(
     scf::ParallelOp &pop, uint64_t unrollFactor, unsigned dim,
-    bool generateEpilogueLoop,
+    bool generateEpilogueLoop, bool coalescingFriendlyIndexing,
     function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn);
 static LogicalResult scfParallelUnrollByFactors(
     scf::ParallelOp &pop, ArrayRef<uint64_t> unrollFactors,
-    bool generateEpilogueLoop,
+    bool generateEpilogueLoop, bool coalescingFriendlyIndexing,
     function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn) {
   unsigned dims = pop.getUpperBound().size();
   assert(dims == unrollFactors.size());
   bool succeeded = true;
   for (unsigned dim = 0; dim < dims; dim++) {
-    succeeded = succeeded && polygeist::scfParallelUnrollByFactor(
-                                 pop, unrollFactors[dim], dim,
-                                 generateEpilogueLoop, annotateFn)
-                    .succeeded();
+    succeeded =
+        succeeded && polygeist::scfParallelUnrollByFactor(
+                         pop, unrollFactors[dim], dim, generateEpilogueLoop,
+                         coalescingFriendlyIndexing, annotateFn)
+                         .succeeded();
   }
   return success(succeeded);
 }
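
For completeness, a hypothetical caller of the updated interface might look like the fragment below. It is a sketch only: the include paths, the walk over an operation tree, and the chosen factors are assumptions; only the scfParallelUnrollByFactors signature itself comes from the header above.

#include "ParallelLoopUnroll.h"        // path within lib/polygeist/Passes assumed
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/Operation.h"

using namespace mlir;

// Coarsen every scf.parallel under `root` by a factor of 4 in dimension 0,
// using the new coalescing-friendly indexing.
static void coarsenWithCoalescingFriendlyIndexing(Operation *root) {
  SmallVector<scf::ParallelOp> pops;
  root->walk([&](scf::ParallelOp pop) { pops.push_back(pop); });
  for (scf::ParallelOp pop : pops) {
    // One factor per parallel dimension, as the assert in
    // scfParallelUnrollByFactors requires.
    SmallVector<uint64_t> factors(pop.getUpperBound().size(), 1);
    factors[0] = 4; // illustrative choice
    if (polygeist::scfParallelUnrollByFactors(
            pop, factors, /*generateEpilogueLoop=*/true,
            /*coalescingFriendlyIndexing=*/true, /*annotateFn=*/nullptr)
            .failed())
      pop->emitRemark("coarsening failed");
  }
}
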
