diff --git a/lib/polygeist/Passes/ConvertParallelToGPU.cpp b/lib/polygeist/Passes/ConvertParallelToGPU.cpp index 2dc4d4b84da2..447ddb62d9fc 100644 --- a/lib/polygeist/Passes/ConvertParallelToGPU.cpp +++ b/lib/polygeist/Passes/ConvertParallelToGPU.cpp @@ -48,6 +48,10 @@ static llvm::cl::opt GPUKernelEnableBlockCoarsening( "gpu-kernel-enable-block-coarsening", llvm::cl::init(true), llvm::cl::desc("When emitting coarsened kernels, enable block coarsening")); +static llvm::cl::opt GPUKernelEnableCoalescingFriendlyUnroll( + "gpu-kernel-enable-coalescing-friendly-unroll", llvm::cl::init(false), + llvm::cl::desc("When thread coarsening, do coalescing-friendly unrolling")); + // TODO when we add other backends, we would need to to add an argument to the // pass which one we are compiling to to provide the appropriate error id #if POLYGEIST_ENABLE_CUDA @@ -1750,7 +1754,8 @@ struct ConvertParallelToGPU1Pass auto blockUnrollFactors = getUnrollFactors(coarsenBlocks); if (polygeist::scfParallelUnrollByFactors( gridPop, ArrayRef(blockUnrollFactors), - /* generateEpilogueLoop */ true, nullptr) + /* generateEpilogueLoop */ true, + /* coalescingFriendlyIndexing */ false, nullptr) .failed()) wrapper->emitRemark("Failed to coarsen blocks"); } @@ -1766,7 +1771,8 @@ struct ConvertParallelToGPU1Pass auto threadUnrollFactors = getUnrollFactors(coarsenThreads); if (polygeist::scfParallelUnrollByFactors( blockPop, ArrayRef(threadUnrollFactors), - /* generateEpilogueLoop */ false, nullptr) + /* generateEpilogueLoop */ false, + GPUKernelEnableCoalescingFriendlyUnroll, nullptr) .failed()) wrapper->emitRemark("Failed to coarsen threads"); } @@ -1934,7 +1940,8 @@ struct ConvertParallelToGPU1Pass auto unrollFactors = UNROLL_FACTORS[gridDims][iBlock]; if (polygeist::scfParallelUnrollByFactors( gridPop, ArrayRef(unrollFactors), - /* generateEpilogueLoop */ true, nullptr) + /* generateEpilogueLoop */ true, + /* coalescingFriendlyIndexing */ false, nullptr) .failed()) { wrapper->emitRemark("Failed to coarsen blocks"); succeeded = false; @@ -1946,7 +1953,8 @@ struct ConvertParallelToGPU1Pass unrollFactors = UNROLL_FACTORS[blockDims][iThread]; if (polygeist::scfParallelUnrollByFactors( blockPop, ArrayRef(unrollFactors), - /* generateEpilogueLoop */ false, nullptr) + /* generateEpilogueLoop */ false, + GPUKernelEnableCoalescingFriendlyUnroll, nullptr) .failed()) { wrapper->emitRemark("Failed to coarsen threads"); succeeded = false; diff --git a/lib/polygeist/Passes/ParallelLoopUnroll.cpp b/lib/polygeist/Passes/ParallelLoopUnroll.cpp index a412a8dd696b..17f0fddf9928 100644 --- a/lib/polygeist/Passes/ParallelLoopUnroll.cpp +++ b/lib/polygeist/Passes/ParallelLoopUnroll.cpp @@ -270,7 +270,7 @@ static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, /// Unrolls 'pop' by 'unrollFactor', returns success if the loop is unrolled. LogicalResult mlir::polygeist::scfParallelUnrollByFactor( scf::ParallelOp &pop, uint64_t unrollFactor, unsigned dim, - bool generateEpilogueLoop, + bool generateEpilogueLoop, bool coalescingFriendlyIndexing, function_ref annotateFn) { assert(unrollFactor > 0 && "expected positive unroll factor"); assert(dim >= 0 && dim < pop.getUpperBound().size()); @@ -364,10 +364,19 @@ LogicalResult mlir::polygeist::scfParallelUnrollByFactor( auto res = generateUnrolledInterleavedLoop( pop.getBody(), dstPop.getBody(), dim, unrollFactor, [&](unsigned i, Value iv, OpBuilder b) { - // iv' = iv * unrollFactor + i - auto base = b.create(loc, iv, unrollFactorCst); - return b.create( - loc, base, b.create(loc, i)); + if (coalescingFriendlyIndexing) { + // upperBoundUnrolled = upperBound / unrollFactor; + // iv(i) = iv + upperBoundUnrolled * i + auto base = + b.create(loc, upperBoundUnrolled, + b.create(loc, i)); + return b.create(loc, base, iv); + } else { + // iv(i) = iv * unrollFactor + i + auto base = b.create(loc, iv, unrollFactorCst); + return b.create( + loc, base, b.create(loc, i)); + } }); if (res.succeeded()) { pop->erase(); @@ -392,7 +401,8 @@ struct SCFParallelLoopUnroll pops.push_back(pop); }); for (auto pop : pops) { - (void)scfParallelUnrollByFactor(pop, unrollFactor, 0, true, nullptr) + (void)scfParallelUnrollByFactor(pop, unrollFactor, 0, true, false, + nullptr) .succeeded(); } } diff --git a/lib/polygeist/Passes/ParallelLoopUnroll.h b/lib/polygeist/Passes/ParallelLoopUnroll.h index 33fa9d2462ca..15ad89f717ba 100644 --- a/lib/polygeist/Passes/ParallelLoopUnroll.h +++ b/lib/polygeist/Passes/ParallelLoopUnroll.h @@ -4,20 +4,21 @@ namespace mlir::polygeist { LogicalResult scfParallelUnrollByFactor( scf::ParallelOp &pop, uint64_t unrollFactor, unsigned dim, - bool generateEpilogueLoop, + bool generateEpilogueLoop, bool coalescingFriendlyIndexing, function_ref annotateFn); static LogicalResult scfParallelUnrollByFactors( scf::ParallelOp &pop, ArrayRef unrollFactors, - bool generateEpilogueLoop, + bool generateEpilogueLoop, bool coalescingFriendlyIndexing, function_ref annotateFn) { unsigned dims = pop.getUpperBound().size(); assert(dims == unrollFactors.size()); bool succeeded = true; for (unsigned dim = 0; dim < dims; dim++) { - succeeded = succeeded && polygeist::scfParallelUnrollByFactor( - pop, unrollFactors[dim], dim, - generateEpilogueLoop, annotateFn) - .succeeded(); + succeeded = + succeeded && polygeist::scfParallelUnrollByFactor( + pop, unrollFactors[dim], dim, generateEpilogueLoop, + coalescingFriendlyIndexing, annotateFn) + .succeeded(); } return success(succeeded); }