From c1b50ad3f9a487125e26b7c331715bb74f715f21 Mon Sep 17 00:00:00 2001 From: TB Schardl Date: Tue, 12 Nov 2024 10:53:47 -0500 Subject: [PATCH] [TapirUtils] Perform basic updates to TaskInfo analysis when serializing detaches, to support serializing nested tasks. --- .../llvm/Transforms/Utils/TapirUtils.h | 8 +- .../llvm/Transforms/Utils/TaskSimplify.h | 2 +- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 2 +- llvm/lib/Transforms/Utils/TapirUtils.cpp | 51 ++++++---- llvm/lib/Transforms/Utils/TaskSimplify.cpp | 18 ++-- .../Tapir/nested-serialize-detach.ll | 98 +++++++++++++++++++ 6 files changed, 151 insertions(+), 28 deletions(-) create mode 100644 llvm/test/Transforms/Tapir/nested-serialize-detach.ll diff --git a/llvm/include/llvm/Transforms/Utils/TapirUtils.h b/llvm/include/llvm/Transforms/Utils/TapirUtils.h index cb7585b4043b..d40a63da486c 100644 --- a/llvm/include/llvm/Transforms/Utils/TapirUtils.h +++ b/llvm/include/llvm/Transforms/Utils/TapirUtils.h @@ -103,7 +103,8 @@ bool MoveStaticAllocasInBlock(BasicBlock *Entry, BasicBlock *Block, /// Inline any taskframe.resume markers associated with the given taskframe. If /// \p DT is provided, then it will be updated to reflect the CFG changes. -void InlineTaskFrameResumes(Value *TaskFrame, DominatorTree *DT = nullptr); +void InlineTaskFrameResumes(Value *TaskFrame, DominatorTree *DT = nullptr, + TaskInfo *TI = nullptr); /// Clone exception-handling blocks EHBlocksToClone, with predecessors /// EHBlockPreds in a given task. Updates EHBlockPreds to point at the cloned @@ -131,7 +132,8 @@ void SerializeDetach(DetachInst *DI, BasicBlock *ParentEntry, SmallPtrSetImpl *InlinedLPads, SmallVectorImpl *DetachedRethrows, bool ReplaceWithTaskFrame = false, - DominatorTree *DT = nullptr, LoopInfo *LI = nullptr); + DominatorTree *DT = nullptr, TaskInfo *TI = nullptr, + LoopInfo *LI = nullptr); /// Analyze a task T for serialization. Gets the reattaches, landing pads, and /// detached rethrows that need special handling during serialization. @@ -145,7 +147,7 @@ void AnalyzeTaskForSerialization( /// Serialize the detach DI that spawns task T. If \p DT is provided, then it /// will be updated to reflect the CFG changes. void SerializeDetach(DetachInst *DI, Task *T, bool ReplaceWithTaskFrame = false, - DominatorTree *DT = nullptr); + DominatorTree *DT = nullptr, TaskInfo *TI = nullptr); /// Get the entry basic block to the detached context that contains /// the specified block. diff --git a/llvm/include/llvm/Transforms/Utils/TaskSimplify.h b/llvm/include/llvm/Transforms/Utils/TaskSimplify.h index 681af4e07ea4..4607cfeb4570 100644 --- a/llvm/include/llvm/Transforms/Utils/TaskSimplify.h +++ b/llvm/include/llvm/Transforms/Utils/TaskSimplify.h @@ -31,7 +31,7 @@ class TaskSimplifyPass : public PassInfoMixin { bool simplifySyncs(Task *T, MaybeParallelTasks &MPTasks); /// Simplify the specified task T. -bool simplifyTask(Task *T); +bool simplifyTask(Task *T, TaskInfo &TI, DominatorTree &DT); /// Simplify the taskframes analyzed by TapirTaskInfo TI. bool simplifyTaskFrames(TaskInfo &TI, DominatorTree &DT); diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 8218f1f37f07..a032fc5b1e6e 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1099,7 +1099,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, SerializeDetach(ClonedDI, ParentEntry, EHCont, EHContLPadVal, ClonedReattaches, &ClonedEHBlocks, &ClonedEHBlockPreds, &ClonedInlinedLPads, &ClonedDetachedRethrows, - NeedToInsertTaskFrame, DT, LI); + NeedToInsertTaskFrame, DT, nullptr, LI); } // Detach the stripmined loop. diff --git a/llvm/lib/Transforms/Utils/TapirUtils.cpp b/llvm/lib/Transforms/Utils/TapirUtils.cpp index 0acf1b5c7951..4b1fcc3ba496 100644 --- a/llvm/lib/Transforms/Utils/TapirUtils.cpp +++ b/llvm/lib/Transforms/Utils/TapirUtils.cpp @@ -387,11 +387,16 @@ class LandingPadInliningInfo { /// Dominator tree to update. DominatorTree *DT = nullptr; + + /// TaskInfo to update. + TaskInfo *TI = nullptr; + public: LandingPadInliningInfo(DetachInst *DI, BasicBlock *EHContinue, Value *LPadValInEHContinue, - DominatorTree *DT = nullptr) - : OuterResumeDest(EHContinue), SpawnerLPad(LPadValInEHContinue), DT(DT) { + DominatorTree *DT = nullptr, TaskInfo *TI = nullptr) + : OuterResumeDest(EHContinue), SpawnerLPad(LPadValInEHContinue), DT(DT), + TI(TI) { // Find the predecessor block of OuterResumeDest. BasicBlock *DetachBB = DI->getParent(); BasicBlock *DetachUnwind = DI->getUnwindDest(); @@ -414,9 +419,9 @@ class LandingPadInliningInfo { } LandingPadInliningInfo(InvokeInst *TaskFrameResume, - DominatorTree *DT = nullptr) + DominatorTree *DT = nullptr, TaskInfo *TI = nullptr) : OuterResumeDest(TaskFrameResume->getUnwindDest()), - SpawnerLPad(TaskFrameResume->getLandingPadInst()), DT(DT) { + SpawnerLPad(TaskFrameResume->getLandingPadInst()), DT(DT), TI(TI) { // If there are PHI nodes in the unwind destination block, we need to keep // track of which values came into them from the detach before removing the // edge from this block. @@ -484,6 +489,8 @@ BasicBlock *LandingPadInliningInfo::getInnerResumeDest() { for (DomTreeNode *I : Children) DT->changeImmediateDominator(I, NewNode); } + if (TI) + TI->addBlockToSpindle(*InnerResumeDest, TI->getSpindleFor(OuterResumeDest)); // The number of incoming edges we expect to the inner landing pad. const unsigned PHICapacity = 2; @@ -571,11 +578,15 @@ void LandingPadInliningInfo::forwardTaskResume(InvokeInst *TR) { if (NormalDest) { for (BasicBlock *Succ : successors(NormalDest)) maybeRemovePredecessor(Succ, NormalDest); + if (TI) + TI->removeBlock(*NormalDest); NormalDest->eraseFromParent(); } if (UnwindDest) { for (BasicBlock *Succ : successors(UnwindDest)) maybeRemovePredecessor(Succ, UnwindDest); + if (TI) + TI->removeBlock(*UnwindDest); UnwindDest->eraseFromParent(); } } @@ -584,8 +595,8 @@ static void handleDetachedLandingPads( DetachInst *DI, BasicBlock *EHContinue, Value *LPadValInEHContinue, SmallPtrSetImpl &InlinedLPads, SmallVectorImpl &DetachedRethrows, - DominatorTree *DT = nullptr) { - LandingPadInliningInfo DetUnwind(DI, EHContinue, LPadValInEHContinue, DT); + DominatorTree *DT = nullptr, TaskInfo *TI = nullptr) { + LandingPadInliningInfo DetUnwind(DI, EHContinue, LPadValInEHContinue, DT, TI); // Append the clauses from the outer landing pad instruction into the inlined // landing pad instructions. @@ -815,13 +826,14 @@ static void getTaskFrameLandingPads( // Helper method to handle a given taskframe.resume. static void handleTaskFrameResume(Value *TaskFrame, Instruction *TaskFrameResume, - DominatorTree *DT = nullptr) { + DominatorTree *DT = nullptr, + TaskInfo *TI = nullptr) { // Get landingpads to inline. SmallPtrSet InlinedLPads; getTaskFrameLandingPads(TaskFrame, TaskFrameResume, InlinedLPads); InvokeInst *TFR = cast(TaskFrameResume); - LandingPadInliningInfo TFResumeDest(TFR, DT); + LandingPadInliningInfo TFResumeDest(TFR, DT, TI); // Append the clauses from the outer landing pad instruction into the inlined // landing pad instructions. @@ -839,7 +851,8 @@ static void handleTaskFrameResume(Value *TaskFrame, TFResumeDest.forwardTaskResume(TFR); } -void llvm::InlineTaskFrameResumes(Value *TaskFrame, DominatorTree *DT) { +void llvm::InlineTaskFrameResumes(Value *TaskFrame, DominatorTree *DT, + TaskInfo *TI) { SmallVector TaskFrameResumes; // Record all taskframe.resume markers that use TaskFrame. for (User *U : TaskFrame->users()) @@ -849,12 +862,12 @@ void llvm::InlineTaskFrameResumes(Value *TaskFrame, DominatorTree *DT) { // Handle all taskframe.resume markers. for (Instruction *TFR : TaskFrameResumes) - handleTaskFrameResume(TaskFrame, TFR, DT); + handleTaskFrameResume(TaskFrame, TFR, DT, TI); } static void startSerializingTaskFrame(Value *TaskFrame, SmallVectorImpl &ToErase, - DominatorTree *DT, + DominatorTree *DT, TaskInfo *TI, bool PreserveTaskFrame) { for (User *U : TaskFrame->users()) if (Instruction *UI = dyn_cast(U)) @@ -862,7 +875,7 @@ static void startSerializingTaskFrame(Value *TaskFrame, ToErase.push_back(UI); if (!PreserveTaskFrame) - InlineTaskFrameResumes(TaskFrame, DT); + InlineTaskFrameResumes(TaskFrame, DT, TI); } void llvm::SerializeDetach(DetachInst *DI, BasicBlock *ParentEntry, @@ -873,7 +886,9 @@ void llvm::SerializeDetach(DetachInst *DI, BasicBlock *ParentEntry, SmallPtrSetImpl *InlinedLPads, SmallVectorImpl *DetachedRethrows, bool ReplaceWithTaskFrame, DominatorTree *DT, - LoopInfo *LI) { + TaskInfo *TI, LoopInfo *LI) { + LLVM_DEBUG(dbgs() << "Serializing detach " << *DI << "\n"); + BasicBlock *Spawner = DI->getParent(); BasicBlock *TaskEntry = DI->getDetached(); BasicBlock *Continue = DI->getContinue(); @@ -885,7 +900,7 @@ void llvm::SerializeDetach(DetachInst *DI, BasicBlock *ParentEntry, SmallVector ToErase; Value *TaskFrame = getTaskFrameUsed(TaskEntry); if (TaskFrame) - startSerializingTaskFrame(TaskFrame, ToErase, DT, ReplaceWithTaskFrame); + startSerializingTaskFrame(TaskFrame, ToErase, DT, TI, ReplaceWithTaskFrame); // Clone any EH blocks that need cloning. if (EHBlocksToClone) { @@ -952,7 +967,7 @@ void llvm::SerializeDetach(DetachInst *DI, BasicBlock *ParentEntry, } else { // Otherwise, "inline" the detached landingpads. handleDetachedLandingPads(DI, EHContinue, LPadValInEHContinue, - *InlinedLPads, *DetachedRethrows, DT); + *InlinedLPads, *DetachedRethrows, DT, TI); } } @@ -1059,7 +1074,7 @@ void llvm::AnalyzeTaskForSerialization( /// Serialize the detach DI that spawns task T. If provided, the dominator tree /// DT will be updated to reflect the serialization. void llvm::SerializeDetach(DetachInst *DI, Task *T, bool ReplaceWithTaskFrame, - DominatorTree *DT) { + DominatorTree *DT, TaskInfo *TI) { assert(DI && "SerializeDetach given nullptr for detach."); assert(DI == T->getDetach() && "Task and detach arguments do not match."); SmallVector EHBlocksToClone; @@ -1078,7 +1093,9 @@ void llvm::SerializeDetach(DetachInst *DI, Task *T, bool ReplaceWithTaskFrame, } SerializeDetach(DI, T->getParentTask()->getEntry(), EHContinue, LPadVal, Reattaches, &EHBlocksToClone, &EHBlockPreds, &InlinedLPads, - &DetachedRethrows, ReplaceWithTaskFrame, DT); + &DetachedRethrows, ReplaceWithTaskFrame, DT, TI); + if (TI) + TI->moveSpindlesToParent(T); } static bool isCanonicalTaskFrameEnd(const Instruction *TFEnd) { diff --git a/llvm/lib/Transforms/Utils/TaskSimplify.cpp b/llvm/lib/Transforms/Utils/TaskSimplify.cpp index 481cb8ee2947..f16749bc10d1 100644 --- a/llvm/lib/Transforms/Utils/TaskSimplify.cpp +++ b/llvm/lib/Transforms/Utils/TaskSimplify.cpp @@ -238,10 +238,11 @@ static bool detachImmediatelySyncs(DetachInst *DI) { return isa(I); } -bool llvm::simplifyTask(Task *T) { +bool llvm::simplifyTask(Task *T, TaskInfo &TI, DominatorTree &DT) { if (T->isRootTask()) return false; + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); LLVM_DEBUG(dbgs() << "Simplifying task @ " << T->getEntry()->getName() << "\n"); @@ -254,7 +255,8 @@ bool llvm::simplifyTask(Task *T) { // destination from T's detach. if (DI->hasUnwindDest()) { if (!taskCanThrow(T)) { - removeUnwindEdge(DI->getParent()); + LLVM_DEBUG(dbgs() << "Removing unwind edge of " << *DI << "\n"); + removeUnwindEdge(DI->getParent(), &DTU); // removeUnwindEdge will invalidate the DI pointer. Get the new DI // pointer. DI = T->getDetach(); @@ -263,13 +265,17 @@ bool llvm::simplifyTask(Task *T) { } if (!taskCanReachContinuation(T)) { + LLVM_DEBUG(dbgs() << "Task cannot reach continuation. Serializing " << *DI + << "\n"); // This optimization assumes that if a task cannot reach its continuation // then we shouldn't bother spawning it. The task might perform code that // can reach the unwind destination, however. - SerializeDetach(DI, T, NestedSync); + SerializeDetach(DI, T, NestedSync, &DT, &TI); Changed = true; } else if (!PreserveAllSpawns && detachImmediatelySyncs(DI)) { - SerializeDetach(DI, T, NestedSync); + LLVM_DEBUG(dbgs() << "Detach immediately syncs. Serializing " << *DI + << "\n"); + SerializeDetach(DI, T, NestedSync, &DT, &TI); Changed = true; } @@ -651,7 +657,7 @@ bool TaskSimplify::runOnFunction(Function &F) { // Simplify each task in the function. for (Task *T : post_order(TI.getRootTask())) - Changed |= simplifyTask(T); + Changed |= simplifyTask(T, TI, DT); if (PostCleanupCFG && (Changed | SplitBlocks)) Changed |= simplifyFunctionCFG(F, TTI, nullptr, Options); @@ -729,7 +735,7 @@ PreservedAnalyses TaskSimplifyPass::run(Function &F, // Simplify each task in the function. for (Task *T : post_order(TI.getRootTask())) - Changed |= simplifyTask(T); + Changed |= simplifyTask(T, TI, DT); if (PostCleanupCFG && (Changed | SplitBlocks)) Changed |= simplifyFunctionCFG(F, TTI, nullptr, Options); diff --git a/llvm/test/Transforms/Tapir/nested-serialize-detach.ll b/llvm/test/Transforms/Tapir/nested-serialize-detach.ll new file mode 100644 index 000000000000..f83422740dd2 --- /dev/null +++ b/llvm/test/Transforms/Tapir/nested-serialize-detach.ll @@ -0,0 +1,98 @@ +; Check that nested detaches can be serialized. +; +; RUN: opt < %s -passes="function(task-simplify)" -S | FileCheck %s +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-macosx15.0.0" + +; Function Attrs: nounwind willreturn memory(argmem: readwrite) +declare token @llvm.syncregion.start() #0 + +; Function Attrs: willreturn memory(argmem: readwrite) +declare void @llvm.sync.unwind(token) #1 + +; Function Attrs: willreturn memory(argmem: readwrite) +declare void @llvm.detached.rethrow.sl_p0i32s(token, { ptr, i32 }) #1 + +; Function Attrs: nounwind willreturn memory(argmem: readwrite) +declare token @llvm.taskframe.create() #0 + +; CHECK: define void @_ZNK5Graph17pbfs_walk_PennantEP7PennantIiERH3BagIiEjPj() +; CHECK-NEXT: entry: +; CHECK-NOT: detach within +; CHECK: unreachable + +define void @_ZNK5Graph17pbfs_walk_PennantEP7PennantIiERH3BagIiEjPj() personality ptr null { +entry: + %syncreg = tail call token @llvm.syncregion.start() + %syncreg45 = tail call token @llvm.syncregion.start() + %0 = tail call token @llvm.tapir.runtime.start() + detach within %syncreg45, label %pfor.body.entry.tf, label %pfor.inc unwind label %lpad59.loopexit + +pfor.body.entry.tf: ; preds = %entry + %tf.i = tail call token @llvm.taskframe.create() + %syncreg.i = tail call token @llvm.syncregion.start() + detach within %syncreg.i, label %pfor.cond.i.strpm.detachloop.entry, label %pfor.cond.cleanup.i unwind label %lpad4924.loopexit.split-lp + +pfor.cond.i.strpm.detachloop.entry: ; preds = %pfor.body.entry.tf + %syncreg.i.strpm.detachloop = tail call token @llvm.syncregion.start() + detach within none, label %pfor.body.entry.i.strpm.outer.1, label %pfor.inc.i.strpm.outer.1 unwind label %lpad4924.loopexit.strpm + +pfor.body.entry.i.strpm.outer.1: ; preds = %pfor.cond.i.strpm.detachloop.entry + invoke void @llvm.detached.rethrow.sl_p0i32s(token none, { ptr, i32 } zeroinitializer) + to label %lpad4924.unreachable unwind label %lpad4924.loopexit.strpm + +pfor.inc.i.strpm.outer.1: ; preds = %pfor.cond.i.strpm.detachloop.entry + sync within none, label %pfor.cond.i.strpm.detachloop.reattach.split + +pfor.cond.i.strpm.detachloop.reattach.split: ; preds = %pfor.inc.i.strpm.outer.1 + reattach within %syncreg.i, label %pfor.cond.cleanup.i + +pfor.cond.cleanup.i: ; preds = %pfor.cond.i.strpm.detachloop.reattach.split, %pfor.body.entry.tf + sync within %syncreg.i, label %sync.continue.i + +sync.continue.i: ; preds = %pfor.cond.cleanup.i + invoke void @llvm.sync.unwind(token none) + to label %pfor.preattach unwind label %lpad4924.loopexit.split-lp + +lpad4924.loopexit.strpm: ; preds = %pfor.body.entry.i.strpm.outer.1, %pfor.cond.i.strpm.detachloop.entry + %lpad.strpm = landingpad { ptr, i32 } + cleanup + invoke void @llvm.detached.rethrow.sl_p0i32s(token %syncreg.i, { ptr, i32 } zeroinitializer) + to label %lpad4924.loopexit.strpm.unreachable unwind label %lpad4924.loopexit.split-lp + +lpad4924.loopexit.strpm.unreachable: ; preds = %lpad4924.loopexit.strpm + unreachable + +lpad4924.loopexit.split-lp: ; preds = %lpad4924.loopexit.strpm, %sync.continue.i, %pfor.body.entry.tf + %lpad.loopexit.split-lp = landingpad { ptr, i32 } + cleanup + call void @llvm.detached.rethrow.sl_p0i32s(token none, { ptr, i32 } zeroinitializer) + unreachable + +lpad4924.unreachable: ; preds = %pfor.body.entry.i.strpm.outer.1 + unreachable + +pfor.preattach: ; preds = %sync.continue.i + reattach within %syncreg45, label %pfor.inc + +pfor.inc: ; preds = %pfor.preattach, %entry + ret void + +lpad59.loopexit: ; preds = %entry + %lpad.loopexit28 = landingpad { ptr, i32 } + cleanup + tail call void @llvm.tapir.runtime.end(token %0) + resume { ptr, i32 } zeroinitializer +} + +; Function Attrs: nounwind willreturn memory(argmem: readwrite) +declare token @llvm.tapir.runtime.start() #0 + +; Function Attrs: nounwind willreturn memory(argmem: readwrite) +declare void @llvm.tapir.runtime.end(token) #0 + +; uselistorder directives +uselistorder ptr null, { 1, 2, 0 } + +attributes #0 = { nounwind willreturn memory(argmem: readwrite) } +attributes #1 = { willreturn memory(argmem: readwrite) }