From 079fc97b51fe541ba5a4ba271bcc4dd6f43df21a Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Mon, 27 Nov 2023 18:24:11 +0000 Subject: [PATCH] [SYCL][Graph] Fixes enqueue barrier slowdown (#11933) The implementation of ext_oneapi_submit_barrier involved exponential slowdown due to unnecessary extra dependencies to barrier nodes. This PR solves this issue by: 1) improving the function that searches for graph leaves (exit nodes) 2) removing unnecessary dependencies to previous barriers when adding new nodes. Addresses Issue: #11915 --- sycl/source/detail/graph_impl.cpp | 8 +++----- sycl/source/detail/graph_impl.hpp | 20 +++++++++++++++++++- sycl/source/handler.cpp | 8 ++++++++ sycl/unittests/Extensions/CommandGraph.cpp | 6 +++--- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 168d1bc83f253..6be686434d4b6 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -416,15 +416,13 @@ void graph_impl::makeEdge(std::shared_ptr Src, std::vector graph_impl::getExitNodesEvents() { std::vector Events; - auto EnqueueExitNodesEvents = [&](std::shared_ptr &Node, - std::deque> &) { + + for (auto Node : MNodeStorage) { if (Node->MSuccessors.empty()) { Events.push_back(getEventForNode(Node)); } - return false; - }; + } - searchDepthFirst(EnqueueExitNodesEvents); return Events; } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 949b08d480e75..7069beb28b500 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -784,6 +784,24 @@ class graph_impl { /// @return vector of events associated to exit nodes. std::vector getExitNodesEvents(); + /// Removes all Barrier nodes from the list of extra dependencies + /// MExtraDependencies. + /// @return vector of events associated to previous barrier nodes. + std::vector + removeBarriersFromExtraDependencies() { + std::vector Events; + for (auto It = MExtraDependencies.begin(); + It != MExtraDependencies.end();) { + if ((*It)->MCGType == sycl::detail::CG::Barrier) { + Events.push_back(getEventForNode(*It)); + It = MExtraDependencies.erase(It); + } else { + ++It; + } + } + return Events; + } + private: /// Iterate over the graph depth-first and run \p NodeFunc on each node. /// @param NodeFunc A function which receives as input a node in the graph to @@ -861,7 +879,7 @@ class graph_impl { /// added to this graph. /// This list is mainly used by barrier nodes which must be considered /// as predecessors for all nodes subsequently added to the graph. - std::vector> MExtraDependencies; + std::list> MExtraDependencies; }; /// Class representing the implementation of command_graph. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 1072719be9653..a8aee207c585b 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -389,6 +389,14 @@ event handler::finalize() { // nodes/events of the graph if (MEventsWaitWithBarrier.size() == 0) { MEventsWaitWithBarrier = GraphImpl->getExitNodesEvents(); + // Graph-wide barriers take precedence over previous one. + // We therefore remove the previous ones from ExtraDependencies list. + // The current barrier is then added to this list in the graph_impl. + std::vector EventsBarriers = + GraphImpl->removeBarriersFromExtraDependencies(); + MEventsWaitWithBarrier.insert(std::end(MEventsWaitWithBarrier), + std::begin(EventsBarriers), + std::end(EventsBarriers)); } CGData.MEvents.insert(std::end(CGData.MEvents), std::begin(MEventsWaitWithBarrier), diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index d2b21d52a6e8c..f7b559d772686 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -1458,7 +1458,7 @@ TEST_F(CommandGraphTest, EnqueueMultipleBarrier) { // (B2) // /|\ // / | \ - // (6) (7) (8) (those nodes also have B1 as a predecessor) + // (6) (7) (8) ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); for (auto Root : GraphImpl->MRoots) { auto Node = Root.lock(); @@ -1468,7 +1468,7 @@ TEST_F(CommandGraphTest, EnqueueMultipleBarrier) { ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), sycl::detail::getSyclObjImpl(Barrier1)); ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); - ASSERT_EQ(SuccNode->MSuccessors.size(), 6lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 3lu); for (auto Succ1 : SuccNode->MSuccessors) { auto SuccBarrier1 = Succ1.lock(); if (SuccBarrier1->MCGType == sycl::detail::CG::Barrier) { @@ -1479,7 +1479,7 @@ TEST_F(CommandGraphTest, EnqueueMultipleBarrier) { for (auto Succ2 : SuccBarrier1->MSuccessors) { auto SuccBarrier2 = Succ2.lock(); // Nodes 6, 7, 8 - ASSERT_EQ(SuccBarrier2->MPredecessors.size(), 2lu); + ASSERT_EQ(SuccBarrier2->MPredecessors.size(), 1lu); ASSERT_EQ(SuccBarrier2->MSuccessors.size(), 0lu); } } else {