From 322384284cd8b1ad71c8cbb93d6ba0f79ed65e9a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 3 Jul 2025 16:17:28 +0000 Subject: [PATCH 01/45] [SYCL] Handler-less kernel submit API --- sycl/include/sycl/queue.hpp | 89 +++++++++++++++++++++++++++++++ sycl/source/detail/queue_impl.hpp | 18 +++++++ sycl/source/queue.cpp | 21 ++++++++ 3 files changed, 128 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a0dbdf5c540e8..2898025ae40c8 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -149,6 +149,31 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; +using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t(*)(int); + +class __SYCL_EXPORT ExtendedSubmissionInfo : public SubmissionInfo { +public: + ExtendedSubmissionInfo() {} + + std::string_view &KernelName() { return MKernelName; } + std::unique_ptr &HostKernel() { return MHostKernel; } + const std::unique_ptr &HostKernel() const { return MHostKernel; } + int &KernelNumArgs() { return MKernelNumArgs; } + KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; } + bool &KernelIsESIMD() { return MKernelIsESIMD; } + bool &KernelHasSpecialCaptures() {return MKernelHasSpecialCaptures; } + detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; } + +private: + std::string_view MKernelName; + std::unique_ptr MHostKernel; + int MKernelNumArgs = 0; + KernelParamDescGetterFuncPtr MKernelParamDescGetter = nullptr; + bool MKernelIsESIMD = false; + bool MKernelHasSpecialCaptures = true; + detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; +}; + } // namespace v1 } // namespace detail @@ -3609,6 +3634,37 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } + template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; + }; + + template + void ProcessExtendedSubmitProperties(PropertiesT Props, + const KernelType &KernelFunc, + detail::v1::ExtendedSubmissionInfo &SI) const { + ProcessSubmitProperties(Props, SI); + + using NameT = + typename detail::get_kernel_name_t::name; + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename TransformUserItemType::type>; + + SI.HostKernel().reset(new detail::HostKernel( + std::forward(KernelFunc))); + SI.KernelName() = detail::getKernelName(); + SI.KernelNumArgs() = detail::getKernelNumParams(); + SI.KernelParamDescGetter() = &(detail::getKernelParamDesc); + SI.KernelIsESIMD() = detail::isKernelESIMD(); + SI.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); + SI.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); + } + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. /// Not using `type_erased_cgfo_ty` on purpose. @@ -3680,6 +3736,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + event submit_with_event_impl(const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) const; + + event submit_with_event_impl(const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) const; + + event submit_with_event_impl(const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) const; + /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, @@ -3763,6 +3834,24 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } + template + event submit_with_event(PropertiesT Props, + const nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::v1::ExtendedSubmissionInfo SI{}; + ProcessExtendedSubmitProperties(Props, + KernelFunc, SI); + + // TODO UseFallbackAssert + + return submit_with_event_impl(Range, SI, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index d6892011b991e..1034137916aa5 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -370,6 +370,24 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } + event submit_with_event(const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + return event(); + } + + event submit_with_event(const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + return event(); + } + + event submit_with_event(const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + return event(); + } + void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, const detail::code_location &Loc, diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index ed0b0e42e6e6f..bd899d871f0d8 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -312,6 +312,27 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } +event queue::submit_with_event_impl( + const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); +} + +event queue::submit_with_event_impl( + const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); +} + +event queue::submit_with_event_impl( + const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); +} + void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, From fde19cab99ddc783c7a2f75328b1649f33b272ea Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 3 Jul 2025 16:22:06 +0000 Subject: [PATCH 02/45] Fix formatting --- sycl/include/sycl/queue.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 2898025ae40c8..f6ac18d1b4392 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3634,12 +3634,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } - template struct TransformUserItemType { - using type = std::conditional_t< - std::is_convertible_v, LambdaArgType>, nd_item, - std::conditional_t, LambdaArgType>, - item, LambdaArgType>>; - }; + template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; + }; template From 13424de9a98bc5e88da518415c63d0719fc0129f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 4 Jul 2025 09:17:58 +0000 Subject: [PATCH 03/45] Fix formatting --- sycl/include/sycl/queue.hpp | 74 +++++++++++++++++-------------- sycl/source/detail/queue_impl.hpp | 33 ++++++++++---- 2 files changed, 64 insertions(+), 43 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index f6ac18d1b4392..a09947c552ec8 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -149,7 +149,7 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t(*)(int); +using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); class __SYCL_EXPORT ExtendedSubmissionInfo : public SubmissionInfo { public: @@ -157,12 +157,18 @@ class __SYCL_EXPORT ExtendedSubmissionInfo : public SubmissionInfo { std::string_view &KernelName() { return MKernelName; } std::unique_ptr &HostKernel() { return MHostKernel; } - const std::unique_ptr &HostKernel() const { return MHostKernel; } + const std::unique_ptr &HostKernel() const { + return MHostKernel; + } int &KernelNumArgs() { return MKernelNumArgs; } - KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; } + KernelParamDescGetterFuncPtr &KernelParamDescGetter() { + return MKernelParamDescGetter; + } bool &KernelIsESIMD() { return MKernelIsESIMD; } - bool &KernelHasSpecialCaptures() {return MKernelHasSpecialCaptures; } - detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; } + bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } + detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { + return MKernelNameBasedCachePtr; + } private: std::string_view MKernelName; @@ -3635,17 +3641,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } template struct TransformUserItemType { - using type = std::conditional_t< - std::is_convertible_v, LambdaArgType>, nd_item, - std::conditional_t, LambdaArgType>, - item, LambdaArgType>>; + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; }; - template - void ProcessExtendedSubmitProperties(PropertiesT Props, - const KernelType &KernelFunc, - detail::v1::ExtendedSubmissionInfo &SI) const { + template + void ProcessExtendedSubmitProperties( + PropertiesT Props, const KernelType &KernelFunc, + detail::v1::ExtendedSubmissionInfo &SI) const { ProcessSubmitProperties(Props, SI); using NameT = @@ -3655,8 +3661,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { std::is_integral::value && Dims == 1, item, typename TransformUserItemType::type>; - SI.HostKernel().reset(new detail::HostKernel( - std::forward(KernelFunc))); + SI.HostKernel().reset( + new detail::HostKernel( + std::forward(KernelFunc))); SI.KernelName() = detail::getKernelName(); SI.KernelNumArgs() = detail::getKernelNumParams(); SI.KernelParamDescGetter() = &(detail::getKernelParamDesc); @@ -3736,20 +3743,20 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl(const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) const; + event submit_with_event_impl( + const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl(const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) const; + event submit_with_event_impl( + const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl(const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) const; + event submit_with_event_impl( + const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, @@ -3834,17 +3841,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template - event submit_with_event(PropertiesT Props, - const nd_range Range, + template + event submit_with_event(PropertiesT Props, const nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::ExtendedSubmissionInfo SI{}; - ProcessExtendedSubmitProperties(Props, - KernelFunc, SI); + ProcessExtendedSubmitProperties(Props, KernelFunc, + SI); // TODO UseFallbackAssert diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 1034137916aa5..e041ebb9d30f4 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -370,21 +370,36 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } - event submit_with_event(const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + event + submit_with_event(const nd_range<1> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + (void)Range; + (void)ExtSubmitInfo; + (void)CodeLoc; + (void)IsTopCodeLoc; return event(); } - event submit_with_event(const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + event + submit_with_event(const nd_range<2> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + (void)Range; + (void)ExtSubmitInfo; + (void)CodeLoc; + (void)IsTopCodeLoc; return event(); } - event submit_with_event(const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + event + submit_with_event(const nd_range<3> Range, + const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + (void)Range; + (void)ExtSubmitInfo; + (void)CodeLoc; + (void)IsTopCodeLoc; return event(); } From fbc789d6f3bd591b1655faa7fd6357a4e74a337a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 7 Jul 2025 12:48:33 +0000 Subject: [PATCH 04/45] Change the ExtendedSubmissionInfo to KernelRuntimeInfo, expose the new APIs as public under a new define --- .../oneapi/experimental/enqueue_functions.hpp | 23 ++++++ sycl/include/sycl/queue.hpp | 81 ++++++++++++------- sycl/source/detail/queue_impl.hpp | 24 +++--- sycl/source/queue.cpp | 24 +++--- 4 files changed, 105 insertions(+), 47 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index f599078a6769e..5e2a6ec78feb4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -110,6 +110,16 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT>( Props, detail::type_erased_cgfo_ty{CGF}, nullptr, CodeLoc); } + +template +event submit_with_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc) { + return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, + KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); +} } // namespace detail template @@ -144,6 +154,19 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, std::forward(CGF), CodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +event submit_with_event(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + return sycl::ext::oneapi::experimental::detail::submit_with_event_impl + (Q, Props, Range, KernelFunc, CodeLoc); +} +#endif + template void single_task(handler &CGH, const KernelType &KernelObj) { CGH.single_task(KernelObj); diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a09947c552ec8..b2a1106aa051b 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -151,15 +151,14 @@ class __SYCL_EXPORT SubmissionInfo { using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); -class __SYCL_EXPORT ExtendedSubmissionInfo : public SubmissionInfo { +// This class is intended to store the kernel runtime information, +// extracted from the compile time kernel structures. +class __SYCL_EXPORT KernelRuntimeInfo { public: - ExtendedSubmissionInfo() {} + KernelRuntimeInfo() {} std::string_view &KernelName() { return MKernelName; } std::unique_ptr &HostKernel() { return MHostKernel; } - const std::unique_ptr &HostKernel() const { - return MHostKernel; - } int &KernelNumArgs() { return MKernelNumArgs; } KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; @@ -198,6 +197,13 @@ template event submit_with_event_impl(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); + +template +event submit_with_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); } // namespace detail } // namespace ext::oneapi::experimental @@ -3246,11 +3252,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + return submit_with_event(sycl::ext::oneapi::experimental::empty_properties_t{}, + Range, Rest..., CodeLoc); +#else return submit( [&](handler &CGH) { CGH.template parallel_for(Range, Rest...); }, TlsCodeLocCapture.query()); +#endif } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3627,6 +3639,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); + template + friend event ext::oneapi::experimental::detail::submit_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); + template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { @@ -3649,11 +3668,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - void ProcessExtendedSubmitProperties( + void ProcessKernelRuntimeInfo( PropertiesT Props, const KernelType &KernelFunc, - detail::v1::ExtendedSubmissionInfo &SI) const { - ProcessSubmitProperties(Props, SI); - + detail::v1::KernelRuntimeInfo &KRInfo) const { using NameT = typename detail::get_kernel_name_t::name; using LambdaArgType = sycl::detail::lambda_arg_type>; @@ -3661,15 +3678,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { std::is_integral::value && Dims == 1, item, typename TransformUserItemType::type>; - SI.HostKernel().reset( + KRInfo.HostKernel().reset( new detail::HostKernel( - std::forward(KernelFunc))); - SI.KernelName() = detail::getKernelName(); - SI.KernelNumArgs() = detail::getKernelNumParams(); - SI.KernelParamDescGetter() = &(detail::getKernelParamDesc); - SI.KernelIsESIMD() = detail::isKernelESIMD(); - SI.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); - SI.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); + KernelFunc)); + KRInfo.KernelName() = detail::getKernelName(); + KRInfo.KernelNumArgs() = detail::getKernelNumParams(); + KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); + KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); + KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); + KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); } #ifndef __INTEL_PREVIEW_BREAKING_CHANGES @@ -3744,18 +3761,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { bool IsTopCodeLoc) const; event submit_with_event_impl( - const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<1> Range, + const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_with_event_impl( - const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<2> Range, + const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_with_event_impl( - const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<3> Range, + const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; /// A template-free version of submit_without_event as const member function. @@ -3843,19 +3863,22 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - event submit_with_event(PropertiesT Props, const nd_range Range, + event submit_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::ExtendedSubmissionInfo SI{}; - ProcessExtendedSubmitProperties(Props, KernelFunc, - SI); + detail::v1::SubmissionInfo SI{}; + detail::v1::KernelRuntimeInfo KRInfo{}; + + ProcessSubmitProperties(Props, SI); + ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); // TODO UseFallbackAssert - return submit_with_event_impl(Range, SI, TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); + return submit_with_event_impl(Range, SI, KRInfo, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } /// Submits a command group function object to the queue, in order to be diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e041ebb9d30f4..7bff56d1b8e3a 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -371,33 +371,39 @@ class queue_impl : public std::enable_shared_from_this { } event - submit_with_event(const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + submit_with_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { (void)Range; - (void)ExtSubmitInfo; + (void)SubmitInfo; + (void)KRInfo; (void)CodeLoc; (void)IsTopCodeLoc; return event(); } event - submit_with_event(const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + submit_with_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { (void)Range; - (void)ExtSubmitInfo; + (void)SubmitInfo; + (void)KRInfo; (void)CodeLoc; (void)IsTopCodeLoc; return event(); } event - submit_with_event(const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + submit_with_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { (void)Range; - (void)ExtSubmitInfo; + (void)SubmitInfo; + (void)KRInfo; (void)CodeLoc; (void)IsTopCodeLoc; return event(); diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index bd899d871f0d8..422148c0ec8f9 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -313,24 +313,30 @@ event queue::submit_with_event_impl( } event queue::submit_with_event_impl( - const nd_range<1> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); + return impl->submit_with_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); } event queue::submit_with_event_impl( - const nd_range<2> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); + return impl->submit_with_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); } event queue::submit_with_event_impl( - const nd_range<3> Range, - const detail::v1::ExtendedSubmissionInfo &ExtSubmitInfo, + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, ExtSubmitInfo, CodeLoc, IsTopCodeLoc); + return impl->submit_with_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); } void queue::submit_without_event_impl( From 591b3ec4d6c955089137e672639a36ee3a72d205 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 8 Jul 2025 13:41:07 +0000 Subject: [PATCH 05/45] Added copy/move constructor and assignment operator --- sycl/include/sycl/queue.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b2a1106aa051b..b78a1e6770eca 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -157,14 +157,30 @@ class __SYCL_EXPORT KernelRuntimeInfo { public: KernelRuntimeInfo() {} + KernelRuntimeInfo(const KernelRuntimeInfo &rhs) = delete; + + KernelRuntimeInfo(KernelRuntimeInfo &&rhs) = delete; + + KernelRuntimeInfo &operator=(const KernelRuntimeInfo &rhs) = delete; + + KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; + std::string_view &KernelName() { return MKernelName; } + std::unique_ptr &HostKernel() { return MHostKernel; } + + detail::HostKernelBase *GetHostKernelPtr() { return MHostKernel.get(); } + int &KernelNumArgs() { return MKernelNumArgs; } + KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; } + bool &KernelIsESIMD() { return MKernelIsESIMD; } + bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } + detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; } From 664160133196383646de49dd4d3b6675acf70fc1 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 11 Jul 2025 14:23:44 +0000 Subject: [PATCH 06/45] Add a no event submit and no handler compile flag --- sycl/CMakeLists.txt | 1 + .../oneapi/experimental/enqueue_functions.hpp | 25 +++- sycl/include/sycl/queue.hpp | 131 ++++++++++++++---- sycl/source/CMakeLists.txt | 8 ++ sycl/source/detail/queue_impl.cpp | 97 +++++++++++++ sycl/source/detail/queue_impl.hpp | 70 +++++++--- sycl/source/queue.cpp | 30 ++++ 7 files changed, 318 insertions(+), 44 deletions(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 9b98bef1a849c..fdba7b5da02bb 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -11,6 +11,7 @@ option(SYCL_ADD_DEV_VERSION_POSTFIX "Adds -V postfix to version string" ON) option(SYCL_ENABLE_COVERAGE "Enables code coverage for runtime and unit tests" OFF) option(SYCL_ENABLE_STACK_PRINTING "Enables stack printing on crashes of SYCL applications" OFF) option(SYCL_LIB_WITH_DEBUG_SYMBOLS "Builds SYCL runtime libraries with debug symbols" OFF) +option(SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT "Builds SYCL runtime libraries with the support for Command Group Handler bypass kernel submit" OFF) if (NOT SYCL_COVERAGE_PATH) set(SYCL_COVERAGE_PATH "${CMAKE_CURRENT_BINARY_DIR}/profiles") diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 5e2a6ec78feb4..25c7cdcbf2e6e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -111,6 +111,7 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, Props, detail::type_erased_cgfo_ty{CGF}, nullptr, CodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template event submit_with_event_impl(const queue &Q, PropertiesT Props, @@ -120,6 +121,17 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); } + +template +void submit_without_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc) { + //Q.submit_without_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, + // KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); +} +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail template @@ -165,7 +177,18 @@ event submit_with_event(const queue &Q, PropertiesT Props, return sycl::ext::oneapi::experimental::detail::submit_with_event_impl (Q, Props, Range, KernelFunc, CodeLoc); } -#endif + +template +void submit_without_event(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + sycl::ext::oneapi::experimental::detail::submit_without_event_impl + (Q, Props, Range, KernelFunc, CodeLoc); +} +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void single_task(handler &CGH, const KernelType &KernelObj) { diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b78a1e6770eca..a3f698c553e99 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -149,6 +149,7 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, @@ -165,35 +166,48 @@ class __SYCL_EXPORT KernelRuntimeInfo { KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; - std::string_view &KernelName() { return MKernelName; } + detail::ABINeutralKernelNameStrT &KernelName() { return MKernelName; } + const detail::ABINeutralKernelNameStrT &KernelName() const { return MKernelName; } - std::unique_ptr &HostKernel() { return MHostKernel; } + std::shared_ptr &HostKernel() { return MHostKernel; } + const std::shared_ptr &HostKernel() const { return MHostKernel; } - detail::HostKernelBase *GetHostKernelPtr() { return MHostKernel.get(); } + char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } + char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } int &KernelNumArgs() { return MKernelNumArgs; } + const int &KernelNumArgs() const { return MKernelNumArgs; } KernelParamDescGetterFuncPtr &KernelParamDescGetter() { return MKernelParamDescGetter; } + const KernelParamDescGetterFuncPtr &KernelParamDescGetter() const { + return MKernelParamDescGetter; + } bool &KernelIsESIMD() { return MKernelIsESIMD; } + const bool &KernelIsESIMD() const { return MKernelIsESIMD; } bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } + const bool &KernelHasSpecialCaptures() const { return MKernelHasSpecialCaptures; } detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; } + detail::KernelNameBasedCacheT *KernelNameBasedCachePtr() const { + return MKernelNameBasedCachePtr; + } private: - std::string_view MKernelName; - std::unique_ptr MHostKernel; + detail::ABINeutralKernelNameStrT MKernelName; + std::shared_ptr MHostKernel; int MKernelNumArgs = 0; KernelParamDescGetterFuncPtr MKernelParamDescGetter = nullptr; bool MKernelIsESIMD = false; bool MKernelHasSpecialCaptures = true; detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; }; +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace v1 } // namespace detail @@ -214,12 +228,21 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template event submit_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); + +template +void submit_without_event_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail } // namespace ext::oneapi::experimental @@ -3268,17 +3291,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - return submit_with_event(sycl::ext::oneapi::experimental::empty_properties_t{}, - Range, Rest..., CodeLoc); -#else return submit( [&](handler &CGH) { CGH.template parallel_for(Range, Rest...); }, TlsCodeLocCapture.query()); -#endif } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3655,6 +3672,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template friend event ext::oneapi::experimental::detail::submit_with_event_impl( @@ -3662,6 +3680,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); + template + friend void ext::oneapi::experimental::detail::submit_without_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { @@ -3682,13 +3708,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { item, LambdaArgType>>; }; +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void ProcessKernelRuntimeInfo( PropertiesT Props, const KernelType &KernelFunc, detail::v1::KernelRuntimeInfo &KRInfo) const { - using NameT = - typename detail::get_kernel_name_t::name; + using LambdaArgType = sycl::detail::lambda_arg_type>; using TransformedArgType = std::conditional_t< std::is_integral::value && Dims == 1, item, @@ -3697,13 +3723,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KRInfo.HostKernel().reset( new detail::HostKernel( KernelFunc)); - KRInfo.KernelName() = detail::getKernelName(); - KRInfo.KernelNumArgs() = detail::getKernelNumParams(); - KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); - KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); - KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); - KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); + KRInfo.KernelName() = detail::getKernelName(); + KRInfo.KernelNumArgs() = detail::getKernelNumParams(); + KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); + KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); + KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); + KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); } +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. @@ -3776,24 +3803,44 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT event submit_with_event_impl( nd_range<1> Range, - const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_with_event_impl( nd_range<2> Range, - const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_with_event_impl( nd_range<3> Range, - const detail::v1::SubmissionInfo &ExtSubmitInfo, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_without_event_impl( + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + void submit_without_event_impl( + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_without_event_impl( + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, @@ -3877,7 +3924,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template event submit_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3887,9 +3936,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; + using NameT = + typename detail::get_kernel_name_t::name; + ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(Props, + KernelFunc, KRInfo); + + // TODO call the kernel_parallel_for wrapper // TODO UseFallbackAssert @@ -3897,6 +3951,33 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } + template + void submit_without_event(PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::v1::SubmissionInfo SI{}; + detail::v1::KernelRuntimeInfo KRInfo{}; + + using NameT = + typename detail::get_kernel_name_t::name; + + ProcessSubmitProperties(Props, SI); + ProcessKernelRuntimeInfo(Props, + KernelFunc, KRInfo); + + // TODO call the kernel_parallel_for wrapper + + // TODO UseFallbackAssert + + submit_without_event_impl(Range, SI, KRInfo, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + } +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index dd1b833383055..0af3cf995b600 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -56,6 +56,14 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME) $<$:__SYCL_BUILD_SYCL_DLL> ) + if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) + target_compile_definitions( + ${LIB_OBJ_NAME} + PRIVATE + __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + ) + endif() + target_include_directories( ${LIB_OBJ_NAME} PRIVATE diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index eaa4955848535..9195c73915e85 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -415,6 +415,103 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, } #endif +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + +std::vector queue_impl::extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams) { + + size_t IndexShift = 0; + std::vector Args; + + Args.reserve(NumKernelParams); + + for (size_t I = 0; I < NumKernelParams; ++I) { + detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I); + void *Ptr = LambdaPtr + ParamDesc.offset; + const detail::kernel_param_kind_t &Kind = ParamDesc.kind; + const int &Size = ParamDesc.info; + + Args.emplace_back(Kind, Ptr, Size, I + IndexShift); + } + + return Args; +} + +detail::EventImplPtr +queue_impl::submit_impl(const NDRDescT &NDRDesc, + const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + + std::unique_ptr CommandGroup; + detail::CG::StorageInitHelper CGData; + std::vector Args; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + std::unique_lock Lock(MMutex); + + // Graphs are not supported yet for the no-handler path + assert(!hasCommandGraph()); + + // Set the No Last Event Mode to false, since the no-handler path + // does not support it yet. + MNoLastEventMode.store(false, std::memory_order_relaxed); + + // Used by queue_empty() and getLastEvent() + MEmpty.store(false, std::memory_order_release); + + // Sync with an external event + std::optional ExternalEvent = popExternalEvent(); + if (ExternalEvent) { + CGData.MEvents.push_back(getSyclObjImpl(*ExternalEvent)); + } + + // Sync with the last event for in order queue + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + if (isInOrder() && LastEvent) { + CGData.MEvents.push_back(LastEvent); + } + + // TODO UnenqueuedCmdEvents for out of order queue + + Args = extractArgsAndReqsFromLambda(KRInfo.GetKernelFuncPtr(), + KRInfo.KernelParamDescGetter(), KRInfo.KernelNumArgs()); + + CommandGroup.reset(new detail::CGExecKernel( + std::move(NDRDesc), + KRInfo.HostKernel(), + nullptr, // MKernel + nullptr, // MKernelBundle + std::move(CGData), + std::move(Args), + toKernelNameStrT(KRInfo.KernelName()), + KRInfo.KernelNameBasedCachePtr(), + std::move(StreamStorage), + std::move(AuxiliaryResources), + detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + CodeLoc)); + + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), *this, CallerNeedsEvent); + + // Sync with the last event for in order queue + if (isInOrder() && !EventImpl->isDiscarded()) { + LastEvent = EventImpl; + } + + return CallerNeedsEvent ? EventImpl : nullptr; +} + +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template event queue_impl::submitWithHandler(const std::vector &DepEvents, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e9a6edbe0a04a..cb3f3e938d8d8 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -370,17 +370,16 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT event submit_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - (void)Range; - (void)SubmitInfo; - (void)KRInfo; - (void)CodeLoc; - (void)IsTopCodeLoc; - return event(); + detail::EventImplPtr EventImpl = + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); } event @@ -388,12 +387,10 @@ class queue_impl : public std::enable_shared_from_this { const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - (void)Range; - (void)SubmitInfo; - (void)KRInfo; - (void)CodeLoc; - (void)IsTopCodeLoc; - return event(); + detail::EventImplPtr EventImpl = + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); } event @@ -401,14 +398,40 @@ class queue_impl : public std::enable_shared_from_this { const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - (void)Range; - (void)SubmitInfo; - (void)KRInfo; - (void)CodeLoc; - (void)IsTopCodeLoc; - return event(); + detail::EventImplPtr EventImpl = + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); } + void + submit_without_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + IsTopCodeLoc); + } + + void + submit_without_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + IsTopCodeLoc); + } + + void + submit_without_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + IsTopCodeLoc); + } +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, const detail::code_location &Loc, @@ -944,6 +967,10 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const SubmissionInfo &SubmitInfo); #endif + std::vector extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams); + /// Performs command group submission to the queue. /// /// \param CGF is a function object containing command group. @@ -960,6 +987,13 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); + detail::EventImplPtr submit_impl(const NDRDescT &NDRDesc, + const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc); + /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 422148c0ec8f9..e854579b74cb2 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -312,6 +312,7 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT event queue::submit_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, @@ -339,6 +340,35 @@ event queue::submit_with_event_impl( CodeLoc, IsTopCodeLoc); } +void queue::submit_without_event_impl( + nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_without_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +void queue::submit_without_event_impl( + nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_without_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +void queue::submit_without_event_impl( + nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_without_event(Range, SubmitInfo, KRInfo, + CodeLoc, IsTopCodeLoc); +} + +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, From a6e711e8ab4dca986a61a52966f55b93adcdc347 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 14 Jul 2025 13:01:14 +0000 Subject: [PATCH 07/45] Added a new configure option to build no handler submit path, changed the no handler private submit function names to submit "direct", addressed review comments. --- .../oneapi/experimental/enqueue_functions.hpp | 39 ++++++----- sycl/include/sycl/queue.hpp | 70 ++++++++++++++----- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/queue_impl.hpp | 29 ++++---- sycl/source/queue.cpp | 24 +++---- 5 files changed, 102 insertions(+), 62 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 25c7cdcbf2e6e..c20112fde560f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -114,22 +114,21 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template -event submit_with_event_impl(const queue &Q, PropertiesT Props, +void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, + Q.submit_direct_without_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); } - template -void submit_without_event_impl(const queue &Q, PropertiesT Props, +event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - //Q.submit_without_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, - // KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); + return Q.submit_direct_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, + KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail @@ -149,11 +148,24 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, submit(Q, empty_properties_t{}, std::forward(CGF), CodeLoc); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void submit(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + sycl::ext::oneapi::experimental::detail::submit_direct_impl + (Q, Props, Range, KernelFunc, CodeLoc); +} +#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + template event submit_with_event(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { + sycl::detail::code_location::current()) { return sycl::ext::oneapi::experimental::detail::submit_with_event_impl( Q, Props, std::forward(CGF), CodeLoc); } @@ -174,18 +186,7 @@ event submit_with_event(const queue &Q, PropertiesT Props, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - return sycl::ext::oneapi::experimental::detail::submit_with_event_impl - (Q, Props, Range, KernelFunc, CodeLoc); -} - -template -void submit_without_event(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - sycl::ext::oneapi::experimental::detail::submit_without_event_impl + return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl (Q, Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a3f698c553e99..a7ed6aed9e6d7 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -231,14 +231,14 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template -event submit_with_event_impl(const queue &Q, PropertiesT Props, +void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); template -void submit_without_event_impl(const queue &Q, PropertiesT Props, +event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); @@ -3675,17 +3675,18 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template - friend event ext::oneapi::experimental::detail::submit_with_event_impl( + friend void ext::oneapi::experimental::detail::submit_direct_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); template - friend void ext::oneapi::experimental::detail::submit_without_event_impl( + friend event ext::oneapi::experimental::detail::submit_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); + #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template @@ -3804,37 +3805,37 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { bool IsTopCodeLoc) const; #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - event submit_with_event_impl( + event submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl( + event submit_direct_with_event_impl( nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_with_event_impl( + event submit_direct_with_event_impl( nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_without_event_impl( + void submit_direct_without_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_without_event_impl( + void submit_direct_without_event_impl( nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_without_event_impl( + void submit_direct_without_event_impl( nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -3925,10 +3926,45 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + +#ifdef SYCL_LANGUAGE_VERSION +#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS +#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel_entry_point(KernelName)]] +#else +#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]] +#endif // __INTEL_SYCL_USE_INTEGRATION_HEADERS +#else +#define __SYCL_KERNEL_ATTR__ +#endif // SYCL_LANGUAGE_VERSION + + // TODO The kernel wrapper functions have to be moved from the handler class + // to a place where they are accessible by both the handler and the queue class. + // For test purposes, this wrapper is a copy of the handler wrapper, + // but an aplication which would use both handler and no-handler APIs, + // won't compile. + // + // NOTE: the name of these functions - "kernel_parallel_for" - are used by the + // Front End to determine kernel invocation kind. + template +#ifdef __SYCL_DEVICE_ONLY__ + [[__sycl_detail__::add_ir_attributes_function( + ext::oneapi::experimental::detail::PropertyMetaInfo::name..., + ext::oneapi::experimental::detail::PropertyMetaInfo::value...)]] +#endif + __SYCL_KERNEL_ATTR__ static void + kernel_parallel_for(const KernelType &KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + KernelFunc(detail::Builder::getElement(detail::declptr())); +#else + (void)KernelFunc; +#endif + } + template - event submit_with_event(PropertiesT Props, nd_range Range, + event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { @@ -3943,18 +3979,19 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); - // TODO call the kernel_parallel_for wrapper + kernel_parallel_for, KernelType, + ext::oneapi::experimental::empty_properties_t>(KernelFunc); // TODO UseFallbackAssert - return submit_with_event_impl(Range, SI, KRInfo, + return submit_direct_with_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } template - void submit_without_event(PropertiesT Props, nd_range Range, + void submit_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { @@ -3969,11 +4006,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); - // TODO call the kernel_parallel_for wrapper + kernel_parallel_for, KernelType, + ext::oneapi::experimental::empty_properties_t>(KernelFunc); // TODO UseFallbackAssert - submit_without_event_impl(Range, SI, KRInfo, + submit_direct_without_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 7d5db8cc22254..50ad60689a45e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -439,7 +439,7 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( } detail::EventImplPtr -queue_impl::submit_impl(const NDRDescT &NDRDesc, +queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 7c4da21778a4a..269ac27d8843b 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -372,62 +372,62 @@ class queue_impl : public std::enable_shared_from_this { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT event - submit_with_event(nd_range<1> Range, + submit_direct_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event - submit_with_event(nd_range<2> Range, + submit_direct_with_event(nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event - submit_with_event(nd_range<3> Range, + submit_direct_with_event(nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } void - submit_without_event(nd_range<1> Range, + submit_direct_without_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } void - submit_without_event(nd_range<2> Range, + submit_direct_without_event(nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } void - submit_without_event(nd_range<3> Range, + submit_direct_without_event(nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -987,13 +987,14 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); - detail::EventImplPtr submit_impl(const NDRDescT &NDRDesc, +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); - +#endif /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index e854579b74cb2..be599169b0d98 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -313,57 +313,57 @@ event queue::submit_with_event_impl( } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -event queue::submit_with_event_impl( +event queue::submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, SubmitInfo, KRInfo, + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_with_event_impl( +event queue::submit_direct_with_event_impl( nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, SubmitInfo, KRInfo, + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_with_event_impl( +event queue::submit_direct_with_event_impl( nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_with_event(Range, SubmitInfo, KRInfo, + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_without_event_impl( +void queue::submit_direct_without_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_without_event(Range, SubmitInfo, KRInfo, + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_without_event_impl( +void queue::submit_direct_without_event_impl( nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_without_event(Range, SubmitInfo, KRInfo, + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_without_event_impl( +void queue::submit_direct_without_event_impl( nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_without_event(Range, SubmitInfo, KRInfo, + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, IsTopCodeLoc); } From 9c8040ec47959c35ee4fabb928e349ed69748380 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 17 Jul 2025 12:35:04 +0000 Subject: [PATCH 08/45] Host task dependency test --- sycl/cmake/modules/AddSYCLUnitTest.cmake | 10 +++- .../oneapi/experimental/enqueue_functions.hpp | 56 +++++++++++++++---- .../sycl/khr/free_function_commands.hpp | 15 +++++ sycl/include/sycl/queue.hpp | 55 ++++++++++++------ .../scheduler/InOrderQueueHostTaskDeps.cpp | 44 ++++++++++++++- 5 files changed, 152 insertions(+), 28 deletions(-) diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index 3fdc6fde119bc..39a4798696429 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -115,6 +115,14 @@ macro(add_sycl_unittest test_dirname link_variant) -Wno-inconsistent-missing-override ) endif() - + target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING) + + if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) + target_compile_definitions( + ${test_dirname} + PRIVATE + __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + ) + endif() endmacro() diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index c20112fde560f..123e883ccd10f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -112,23 +112,23 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, - KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); + Q.submit_direct_without_event(Props, Range, KernelFunc, CodeLoc); } -template event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event<__SYCL_USE_FALLBACK_ASSERT, PropertiesT, - KernelName, KernelType, Dims>(Props, Range, KernelFunc, CodeLoc); + return Q.submit_direct_with_event(Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail @@ -149,7 +149,7 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template void submit(const queue &Q, PropertiesT Props, nd_range Range, @@ -157,7 +157,7 @@ void submit(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { sycl::ext::oneapi::experimental::detail::submit_direct_impl - (Q, Props, Range, KernelFunc, CodeLoc); + (Q, Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -179,7 +179,7 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, @@ -187,7 +187,7 @@ event submit_with_event(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl - (Q, Props, Range, KernelFunc, CodeLoc); + (Q, Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -303,6 +303,21 @@ void nd_launch(handler &CGH, nd_range Range, KernelObj); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, + ReductionsT &&...Reductions) { + if constexpr (sizeof...(ReductionsT) == 0) { + submit(std::move(Q), empty_properties_t{}, Range, KernelObj); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Range, KernelObj, + std::forward(Reductions)...); + }); + } +} +#else template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, @@ -312,6 +327,7 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, std::forward(Reductions)...); }); } +#endif template @@ -327,6 +343,25 @@ void nd_launch(handler &CGH, std::forward(Reductions)..., KernelObj); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +template +void nd_launch(queue Q, launch_config, Properties> Config, + const KernelType &KernelObj, ReductionsT &&...Reductions) { + if constexpr (sizeof...(ReductionsT) == 0) { + ext::oneapi::experimental::detail::LaunchConfigAccess, + Properties> + ConfigAccess(Config); + submit(std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), + KernelObj); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); + } +} +#else template void nd_launch(queue Q, launch_config, Properties> Config, @@ -336,6 +371,7 @@ void nd_launch(queue Q, launch_config, Properties> Config, std::forward(Reductions)...); }); } +#endif template void nd_launch(handler &CGH, nd_range Range, diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 4138edd5821e6..e45bcdeca7d4e 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -153,27 +153,42 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + nd_range<1>(r, size), k); +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template void launch_grouped(const queue &q, range<2> r, range<2> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + nd_range<2>(r, size), k); +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template void launch_grouped(const queue &q, range<3> r, range<3> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + nd_range<3>(r, size), k); +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a7ed6aed9e6d7..cb0227e721418 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -229,14 +229,14 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); -template event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, @@ -3279,6 +3279,31 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query()); } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + /// parallel_for version with a kernel represented as a lambda + nd_range that + /// specifies global, local sizes and offset. + /// + /// \param Range specifies the global and local work spaces of the kernel + /// \param Rest acts as-if: "ReductionTypes&&... Reductions, + /// const KernelType &KernelFunc". + template + std::enable_if_t::value, event> + parallel_for(nd_range Range, RestT &&...Rest) { + constexpr detail::code_location CodeLoc = getCodeLocation(); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + if constexpr (sizeof...(RestT) == 1) { + return submit_direct_with_event(ext::oneapi::experimental::empty_properties_t{}, + Range, Rest...); + } else { + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); + } + } +#else /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -3297,7 +3322,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { }, TlsCodeLocCapture.query()); } - +#endif /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -3673,14 +3698,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const sycl::detail::code_location &CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template friend void ext::oneapi::experimental::detail::submit_direct_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); - template friend event ext::oneapi::experimental::detail::submit_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, @@ -3710,7 +3735,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { }; #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template void ProcessKernelRuntimeInfo( PropertiesT Props, const KernelType &KernelFunc, @@ -3961,9 +3986,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #endif } - template + template event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = @@ -3976,11 +4000,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, + ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); kernel_parallel_for, KernelType, - ext::oneapi::experimental::empty_properties_t>(KernelFunc); + PropertiesT>(KernelFunc); // TODO UseFallbackAssert @@ -3988,9 +4012,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } - template + template void submit_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = @@ -4003,11 +4026,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, + ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); kernel_parallel_for, KernelType, - ext::oneapi::experimental::empty_properties_t>(KernelFunc); + PropertiesT>(KernelFunc); // TODO UseFallbackAssert diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 31a2914e2c803..b10ec4173e364 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -48,7 +48,7 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { EXPECT_EQ(GEventsWaitCounter, expectedCount); } -enum class CommandType { KERNEL = 1, MEMSET = 2 }; +enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 }; std::vector> ExecutedCommands; inline ur_result_t customEnqueueKernelLaunch(void *pParams) { @@ -162,3 +162,45 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); } + +TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { + ExecutedCommands.clear(); + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", + &customEnqueueKernelLaunch); + + sycl::platform Plt = sycl::platform(); + + context Ctx{Plt}; + queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + InOrderQueue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + ExecutedCommands.push_back( + {CommandType::HOST_TASK, 0}); + }); + }); + + event Ev2 = InOrderQueue.parallel_for(nd_range<1>{range{32}, range{32}}, + [](nd_item<1>) {}); + + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + InOrderQueue.wait(); + + ASSERT_EQ(ExecutedCommands.size(), 2u); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::HOST_TASK); + EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); + EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); +} From 31cbdb9fa3e772b07a60ed3799f33488a4f59cf5 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 18 Jul 2025 09:21:16 +0000 Subject: [PATCH 09/45] Add a check for special captures --- sycl/source/detail/queue_impl.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 50ad60689a45e..019103c391362 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -457,6 +457,9 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, // Graphs are not supported yet for the no-handler path assert(!hasCommandGraph()); + // No special captures supported yet for the no-handler path + assert(!KRInfo.KernelHasSpecialCaptures); + // Set the No Last Event Mode to false, since the no-handler path // does not support it yet. MNoLastEventMode.store(false, std::memory_order_relaxed); From 998d592ea3349ac29902c51939c98d72ba2e21ef Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 11 Aug 2025 15:32:42 +0000 Subject: [PATCH 10/45] Switch to the common kernel wrappers, fix the KRInfo function call --- sycl/include/sycl/queue.hpp | 42 +++---------------------------- sycl/source/detail/queue_impl.cpp | 2 +- 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index cb0227e721418..7a94323929633 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3952,40 +3952,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -#ifdef SYCL_LANGUAGE_VERSION -#ifndef __INTEL_SYCL_USE_INTEGRATION_HEADERS -#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel_entry_point(KernelName)]] -#else -#define __SYCL_KERNEL_ATTR__ [[clang::sycl_kernel]] -#endif // __INTEL_SYCL_USE_INTEGRATION_HEADERS -#else -#define __SYCL_KERNEL_ATTR__ -#endif // SYCL_LANGUAGE_VERSION - - // TODO The kernel wrapper functions have to be moved from the handler class - // to a place where they are accessible by both the handler and the queue class. - // For test purposes, this wrapper is a copy of the handler wrapper, - // but an aplication which would use both handler and no-handler APIs, - // won't compile. - // - // NOTE: the name of these functions - "kernel_parallel_for" - are used by the - // Front End to determine kernel invocation kind. - template -#ifdef __SYCL_DEVICE_ONLY__ - [[__sycl_detail__::add_ir_attributes_function( - ext::oneapi::experimental::detail::PropertyMetaInfo::name..., - ext::oneapi::experimental::detail::PropertyMetaInfo::value...)]] -#endif - __SYCL_KERNEL_ATTR__ static void - kernel_parallel_for(const KernelType &KernelFunc) { -#ifdef __SYCL_DEVICE_ONLY__ - KernelFunc(detail::Builder::getElement(detail::declptr())); -#else - (void)KernelFunc; -#endif - } - template event submit_direct_with_event(PropertiesT Props, nd_range Range, @@ -4003,8 +3969,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); - kernel_parallel_for, KernelType, - PropertiesT>(KernelFunc); + detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); // TODO UseFallbackAssert @@ -4029,8 +3995,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { ProcessKernelRuntimeInfo(Props, KernelFunc, KRInfo); - kernel_parallel_for, KernelType, - PropertiesT>(KernelFunc); + detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); // TODO UseFallbackAssert diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c77aadc9bc59d..2330b46130a87 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -458,7 +458,7 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, assert(!hasCommandGraph()); // No special captures supported yet for the no-handler path - assert(!KRInfo.KernelHasSpecialCaptures); + assert(!KRInfo.KernelHasSpecialCaptures()); // Set the No Last Event Mode to false, since the no-handler path // does not support it yet. From 4000c07367ec3ccd21a91c150f5fb4a541e3258a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 12 Aug 2025 09:36:17 +0000 Subject: [PATCH 11/45] Enable no handler in the preview lib build, add no handler unit tests target --- sycl/CMakeLists.txt | 1 - sycl/cmake/modules/AddSYCLUnitTest.cmake | 37 +++++++++++++++--------- sycl/include/sycl/queue.hpp | 8 ++--- sycl/source/CMakeLists.txt | 8 ----- sycl/source/detail/queue_impl.cpp | 4 +-- sycl/source/detail/queue_impl.hpp | 8 ++--- sycl/source/queue.cpp | 4 +-- sycl/unittests/xpti_trace/CMakeLists.txt | 1 + 8 files changed, 36 insertions(+), 35 deletions(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 7a8a60fba2ce8..20e940fce9f91 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -11,7 +11,6 @@ option(SYCL_ADD_DEV_VERSION_POSTFIX "Adds -V postfix to version string" ON) option(SYCL_ENABLE_COVERAGE "Enables code coverage for runtime and unit tests" OFF) option(SYCL_ENABLE_STACK_PRINTING "Enables stack printing on crashes of SYCL applications" OFF) option(SYCL_LIB_WITH_DEBUG_SYMBOLS "Builds SYCL runtime libraries with debug symbols" OFF) -option(SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT "Builds SYCL runtime libraries with the support for Command Group Handler bypass kernel submit" OFF) if (NOT SYCL_COVERAGE_PATH) set(SYCL_COVERAGE_PATH "${CMAKE_CURRENT_BINARY_DIR}/profiles") diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index 8791dec59e0e1..e581dfaa61ba0 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -1,6 +1,6 @@ # Internal function to create SYCL unit tests with code reuse -# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview file1.cpp, file2.cpp ...) -function(add_sycl_unittest_internal test_dirname link_variant is_preview) +# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview is_no_cgh file1.cpp, file2.cpp ...) +function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_cgh) # Enable exception handling for these unit tests set(LLVM_REQUIRES_EH ON) set(LLVM_REQUIRES_RTTI ON) @@ -34,7 +34,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview) # Chaning CMAKE_CURRENT_BINARY_DIR should not affect this variable in its # parent scope. if (${is_preview}) - set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview") + if (${is_no_cgh}) + set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/PreviewNoCGH") + else() + set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview") + endif() endif() if ("${link_variant}" MATCHES "SHARED") @@ -62,7 +66,19 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview) if (${is_preview}) target_compile_definitions(${test_dirname} PRIVATE __INTEL_PREVIEW_BREAKING_CHANGES) - set(sycl_cache_suffix "_preview") + if (${is_no_cgh}) + set(sycl_cache_suffix "_preview_no_cgh") + else() + set(sycl_cache_suffix "_preview") + endif() + endif() + + if (${is_no_cgh}) + target_compile_definitions( + ${test_dirname} + PRIVATE + __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + ) endif() if (SYCL_ENABLE_XPTI_TRACING) @@ -151,14 +167,6 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview) endif() target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING) - - if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) - target_compile_definitions( - ${test_dirname} - PRIVATE - __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - ) - endif() endfunction() # add_sycl_unittest(test_name_prefix SHARED|OBJECT file1.cpp, file2.cpp ...) @@ -167,6 +175,7 @@ endfunction() # the SYCL preview features enabled. # Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)` macro(add_sycl_unittest test_name_prefix link_variant) - add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE ${ARGN}) - add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_preview_no_cgh ${link_variant} TRUE TRUE ${ARGN}) endmacro() diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 7a94323929633..1f4a9b3d9df00 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -149,7 +149,7 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, @@ -207,7 +207,7 @@ class __SYCL_EXPORT KernelRuntimeInfo { bool MKernelHasSpecialCaptures = true; detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; }; -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES } // namespace v1 } // namespace detail @@ -3829,7 +3829,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, @@ -3865,7 +3865,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index ff8bce4de2b1f..24a471eacb0f6 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -56,14 +56,6 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME) $<$:__SYCL_BUILD_SYCL_DLL> ) - if (SYCL_ENABLE_UNFINISHED_NO_CGH_SUBMIT) - target_compile_definitions( - ${LIB_OBJ_NAME} - PRIVATE - __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - ) - endif() - target_include_directories( ${LIB_OBJ_NAME} PRIVATE diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2330b46130a87..80de9d7d9dd5f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -415,7 +415,7 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, } #endif -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector queue_impl::extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), @@ -513,7 +513,7 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, return CallerNeedsEvent ? EventImpl : nullptr; } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES template event queue_impl::submitWithHandler(const std::vector &DepEvents, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 484b9e855b53f..18ed4025e20d2 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -370,7 +370,7 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, @@ -430,7 +430,7 @@ class queue_impl : public std::enable_shared_from_this { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, @@ -988,14 +988,14 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -#endif +#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index be599169b0d98..6817386e6c22d 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -312,7 +312,7 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event queue::submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, @@ -367,7 +367,7 @@ void queue::submit_direct_without_event_impl( CodeLoc, IsTopCodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif //__INTEL_PREVIEW_BREAKING_CHANGES void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, diff --git a/sycl/unittests/xpti_trace/CMakeLists.txt b/sycl/unittests/xpti_trace/CMakeLists.txt index 44574d36a9aa4..42c5505d86107 100644 --- a/sycl/unittests/xpti_trace/CMakeLists.txt +++ b/sycl/unittests/xpti_trace/CMakeLists.txt @@ -8,3 +8,4 @@ add_sycl_unittest(XptiTraceTests OBJECT ) target_link_libraries(XptiTraceTests_non_preview PRIVATE xpti xptitest_subscriber) target_link_libraries(XptiTraceTests_preview PRIVATE xpti xptitest_subscriber) +target_link_libraries(XptiTraceTests_preview_no_cgh PRIVATE xpti xptitest_subscriber) From 01af8bbf312af89a12a20ce5cf481d55c48bc579 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 12 Aug 2025 11:13:32 +0000 Subject: [PATCH 12/45] Unused argument fix and IsTopCodeLoc assignment --- sycl/source/detail/queue_impl.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 80de9d7d9dd5f..0ffb66bb74c7b 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -445,6 +445,7 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + (void)SubmitInfo; std::unique_ptr CommandGroup; detail::CG::StorageInitHelper CGData; @@ -502,6 +503,8 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, 0, // MKernelWorkGroupMemorySize CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( std::move(CommandGroup), *this, CallerNeedsEvent); From 4469e59fd5afeca91fa4cefdd39ba271f96fb141 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 13 Aug 2025 09:56:39 +0000 Subject: [PATCH 13/45] Implemented the barrier and un-enqueued commands synchronization for out of order queues. Fixed an #ifdef. --- sycl/include/sycl/queue.hpp | 2 +- sycl/source/detail/queue_impl.cpp | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 1f4a9b3d9df00..3eb592ad77343 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3727,6 +3727,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template struct TransformUserItemType { using type = std::conditional_t< std::is_convertible_v, LambdaArgType>, nd_item, @@ -3734,7 +3735,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { item, LambdaArgType>>; }; -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void ProcessKernelRuntimeInfo( diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 0ffb66bb74c7b..f058f596f7c70 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -480,7 +480,19 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, CGData.MEvents.push_back(LastEvent); } - // TODO UnenqueuedCmdEvents for out of order queue + // Barrier and un-enqueued commands synchronization for out or order queue + if (!isInOrder()) { + MMissedCleanupRequests.unset( + [&](MissedCleanupRequestsType &MissedCleanupRequests) { + for (auto &UpdatedGraph : MissedCleanupRequests) + doUnenqueuedCommandCleanup(UpdatedGraph); + MissedCleanupRequests.clear(); + }); + + if (MDefaultGraphDeps.LastBarrier && !MDefaultGraphDeps.LastBarrier->isEnqueued()) { + CGData.MEvents.push_back(MDefaultGraphDeps.LastBarrier); + } + } Args = extractArgsAndReqsFromLambda(KRInfo.GetKernelFuncPtr(), KRInfo.KernelParamDescGetter(), KRInfo.KernelNumArgs()); @@ -513,6 +525,11 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, LastEvent = EventImpl; } + // Barrier and un-enqueued commands synchronization for out or order queue + if (!isInOrder() && !EventImpl->isEnqueued()) { + MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl); + } + return CallerNeedsEvent ? EventImpl : nullptr; } From ac1a5cf44db11577ec751db9a15ebe8d150a1e67 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 13 Aug 2025 10:18:28 +0000 Subject: [PATCH 14/45] Fix formatting --- .../oneapi/experimental/enqueue_functions.hpp | 58 ++++----- .../sycl/khr/free_function_commands.hpp | 6 +- sycl/include/sycl/queue.hpp | 110 +++++++++--------- sycl/source/detail/queue_impl.cpp | 42 +++---- sycl/source/detail/queue_impl.hpp | 95 ++++++++------- sycl/source/queue.cpp | 42 +++---- .../scheduler/InOrderQueueHostTaskDeps.cpp | 7 +- 7 files changed, 172 insertions(+), 188 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 123e883ccd10f..d4a6cac2c560f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -112,23 +112,23 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, } #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template -void submit_direct_impl(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event(Props, Range, KernelFunc, CodeLoc); -} -template -event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, - nd_range Range, +template +void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event(Props, Range, KernelFunc, CodeLoc); + Q.submit_direct_without_event( + Props, Range, KernelFunc, CodeLoc); +} +template +event submit_direct_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { + return Q.submit_direct_with_event( + Props, Range, KernelFunc, CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail @@ -151,13 +151,13 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template -void submit(const queue &Q, PropertiesT Props, - nd_range Range, +void submit(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - sycl::ext::oneapi::experimental::detail::submit_direct_impl - (Q, Props, Range, KernelFunc, CodeLoc); + sycl::detail::code_location::current()) { + sycl::ext::oneapi::experimental::detail::submit_direct_impl< + KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, + CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -181,13 +181,13 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template -event submit_with_event(const queue &Q, PropertiesT Props, - nd_range Range, +event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { - return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl - (Q, Props, Range, KernelFunc, CodeLoc); + sycl::detail::code_location::current()) { + return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl< + KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, + CodeLoc); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -350,10 +350,10 @@ void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { if constexpr (sizeof...(ReductionsT) == 0) { ext::oneapi::experimental::detail::LaunchConfigAccess, - Properties> - ConfigAccess(Config); - submit(std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(), - KernelObj); + Properties> + ConfigAccess(Config); + submit(std::move(Q), ConfigAccess.getProperties(), + ConfigAccess.getRange(), KernelObj); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Config, KernelObj, diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index e45bcdeca7d4e..13ba0c8f56454 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -155,7 +155,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, - nd_range<1>(r, size), k); + nd_range<1>(r, size), k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -169,7 +169,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, - nd_range<2>(r, size), k); + nd_range<2>(r, size), k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -183,7 +183,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, - nd_range<3>(r, size), k); + nd_range<3>(r, size), k); #else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 3eb592ad77343..ffb40895d43bb 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -167,10 +167,14 @@ class __SYCL_EXPORT KernelRuntimeInfo { KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; detail::ABINeutralKernelNameStrT &KernelName() { return MKernelName; } - const detail::ABINeutralKernelNameStrT &KernelName() const { return MKernelName; } + const detail::ABINeutralKernelNameStrT &KernelName() const { + return MKernelName; + } std::shared_ptr &HostKernel() { return MHostKernel; } - const std::shared_ptr &HostKernel() const { return MHostKernel; } + const std::shared_ptr &HostKernel() const { + return MHostKernel; + } char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } @@ -189,7 +193,9 @@ class __SYCL_EXPORT KernelRuntimeInfo { const bool &KernelIsESIMD() const { return MKernelIsESIMD; } bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } - const bool &KernelHasSpecialCaptures() const { return MKernelHasSpecialCaptures; } + const bool &KernelHasSpecialCaptures() const { + return MKernelHasSpecialCaptures; + } detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { return MKernelNameBasedCachePtr; @@ -229,19 +235,18 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT -template -void submit_direct_impl(const queue &Q, PropertiesT Props, - nd_range Range, +template +void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); -template +template event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail } // namespace ext::oneapi::experimental @@ -3293,8 +3298,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event(ext::oneapi::experimental::empty_properties_t{}, - Range, Rest...); + return submit_direct_with_event( + ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( [&](handler &CGH) { @@ -3698,19 +3703,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const sycl::detail::code_location &CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template + template friend void ext::oneapi::experimental::detail::submit_direct_impl( const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); - template + template friend event ext::oneapi::experimental::detail::submit_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -3737,9 +3740,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - void ProcessKernelRuntimeInfo( - PropertiesT Props, const KernelType &KernelFunc, - detail::v1::KernelRuntimeInfo &KRInfo) const { + void ProcessKernelRuntimeInfo(PropertiesT Props, const KernelType &KernelFunc, + detail::v1::KernelRuntimeInfo &KRInfo) const { using LambdaArgType = sycl::detail::lambda_arg_type>; using TransformedArgType = std::conditional_t< @@ -3753,8 +3755,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KRInfo.KernelNumArgs() = detail::getKernelNumParams(); KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); - KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); - KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); + KRInfo.KernelHasSpecialCaptures() = + detail::hasSpecialCaptures(); + KRInfo.KernelNameBasedCachePtr() = + detail::getKernelNameBasedCache(); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT @@ -3831,38 +3835,32 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event_impl( - nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_direct_with_event_impl( - nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_direct_with_event_impl( - nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_direct_without_event_impl( - nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_direct_without_event_impl( - nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_direct_without_event_impl( - nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; #endif //__INTEL_PREVIEW_BREAKING_CHANGES @@ -3955,9 +3953,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template event submit_direct_with_event(PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3966,24 +3964,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, - KernelFunc, KRInfo); + ProcessKernelRuntimeInfo( + Props, KernelFunc, KRInfo); - detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); + detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); // TODO UseFallbackAssert return submit_direct_with_event_impl(Range, SI, KRInfo, - TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } template void submit_direct_without_event(PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3992,16 +3991,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, - KernelFunc, KRInfo); + ProcessKernelRuntimeInfo( + Props, KernelFunc, KRInfo); - detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); + detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); // TODO UseFallbackAssert submit_direct_without_event_impl(Range, SI, KRInfo, - TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } #endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index f058f596f7c70..3273be7c2349e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -438,13 +438,10 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( return Args; } -detail::EventImplPtr -queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, - const v1::SubmissionInfo &SubmitInfo, - const v1::KernelRuntimeInfo &KRInfo, - bool CallerNeedsEvent, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { +detail::EventImplPtr queue_impl::submit_direct_impl( + const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { (void)SubmitInfo; std::unique_ptr CommandGroup; @@ -483,36 +480,33 @@ queue_impl::submit_direct_impl(const NDRDescT &NDRDesc, // Barrier and un-enqueued commands synchronization for out or order queue if (!isInOrder()) { MMissedCleanupRequests.unset( - [&](MissedCleanupRequestsType &MissedCleanupRequests) { - for (auto &UpdatedGraph : MissedCleanupRequests) - doUnenqueuedCommandCleanup(UpdatedGraph); - MissedCleanupRequests.clear(); - }); + [&](MissedCleanupRequestsType &MissedCleanupRequests) { + for (auto &UpdatedGraph : MissedCleanupRequests) + doUnenqueuedCommandCleanup(UpdatedGraph); + MissedCleanupRequests.clear(); + }); - if (MDefaultGraphDeps.LastBarrier && !MDefaultGraphDeps.LastBarrier->isEnqueued()) { + if (MDefaultGraphDeps.LastBarrier && + !MDefaultGraphDeps.LastBarrier->isEnqueued()) { CGData.MEvents.push_back(MDefaultGraphDeps.LastBarrier); } } Args = extractArgsAndReqsFromLambda(KRInfo.GetKernelFuncPtr(), - KRInfo.KernelParamDescGetter(), KRInfo.KernelNumArgs()); + KRInfo.KernelParamDescGetter(), + KRInfo.KernelNumArgs()); CommandGroup.reset(new detail::CGExecKernel( - std::move(NDRDesc), - KRInfo.HostKernel(), + std::move(NDRDesc), KRInfo.HostKernel(), nullptr, // MKernel nullptr, // MKernelBundle - std::move(CGData), - std::move(Args), - toKernelNameStrT(KRInfo.KernelName()), - KRInfo.KernelNameBasedCachePtr(), - std::move(StreamStorage), - std::move(AuxiliaryResources), - detail::CGType::Kernel, + std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), + KRInfo.KernelNameBasedCachePtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, false, // MKernelIsCooperative false, // MKernelUsesClusterLaunch - 0, // MKernelWorkGroupMemorySize + 0, // MKernelWorkGroupMemorySize CodeLoc)); CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 18ed4025e20d2..a8bba21cefc76 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -371,64 +371,61 @@ class queue_impl : public std::enable_shared_from_this { } #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - event - submit_direct_with_event(nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, - IsTopCodeLoc); + event submit_direct_with_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = submit_direct_impl( + NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event - submit_direct_with_event(nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, - IsTopCodeLoc); + event submit_direct_with_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = submit_direct_impl( + NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event - submit_direct_with_event(nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, - IsTopCodeLoc); + event submit_direct_with_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = submit_direct_impl( + NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - void - submit_direct_without_event(nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + void submit_direct_without_event(nd_range<1> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } - void - submit_direct_without_event(nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + void submit_direct_without_event(nd_range<2> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } - void - submit_direct_without_event(nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + void submit_direct_without_event(nd_range<3> Range, + const detail::v1::SubmissionInfo &SubmitInfo, + const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } #endif //__INTEL_PREVIEW_BREAKING_CHANGES @@ -969,8 +966,8 @@ class queue_impl : public std::enable_shared_from_this { #endif std::vector extractArgsAndReqsFromLambda( - char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), - size_t NumKernelParams); + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams); /// Performs command group submission to the queue. /// @@ -990,11 +987,11 @@ class queue_impl : public std::enable_shared_from_this { #ifdef __INTEL_PREVIEW_BREAKING_CHANGES detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, - const v1::SubmissionInfo &SubmitInfo, - const v1::KernelRuntimeInfo &KRInfo, - bool CallerNeedsEvent, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc); + const v1::SubmissionInfo &SubmitInfo, + const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, + const detail::code_location &CodeLoc, + bool IsTopCodeLoc); #endif //__INTEL_PREVIEW_BREAKING_CHANGES /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 6817386e6c22d..e3de12950162c 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -314,57 +314,51 @@ event queue::submit_with_event_impl( #ifdef __INTEL_PREVIEW_BREAKING_CHANGES event queue::submit_direct_with_event_impl( - nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } event queue::submit_direct_with_event_impl( - nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } event queue::submit_direct_with_event_impl( - nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } void queue::submit_direct_without_event_impl( - nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } void queue::submit_direct_without_event_impl( - nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } void queue::submit_direct_without_event_impl( - nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, - CodeLoc, IsTopCodeLoc); + impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + IsTopCodeLoc); } #endif //__INTEL_PREVIEW_BREAKING_CHANGES diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index b10ec4173e364..7dae968013948 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -182,13 +182,12 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { CGH.host_task([&] { std::unique_lock lk(CvMutex); Cv.wait(lk, [&ready] { return ready; }); - ExecutedCommands.push_back( - {CommandType::HOST_TASK, 0}); + ExecutedCommands.push_back({CommandType::HOST_TASK, 0}); }); }); - event Ev2 = InOrderQueue.parallel_for(nd_range<1>{range{32}, range{32}}, - [](nd_item<1>) {}); + event Ev2 = InOrderQueue.parallel_for( + nd_range<1>{range{32}, range{32}}, [](nd_item<1>) {}); { std::unique_lock lk(CvMutex); From 5865f3a1ae7c55ba26ec2543185eccb2f142005b Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 13 Aug 2025 10:33:07 +0000 Subject: [PATCH 15/45] Fixed #ifdef, added comment to a new function. --- sycl/source/detail/queue_impl.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index a8bba21cefc76..ce4b4f48a2db4 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -965,9 +965,11 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const SubmissionInfo &SubmitInfo); #endif +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), size_t NumKernelParams); +#endif /// Performs command group submission to the queue. /// @@ -986,6 +988,18 @@ class queue_impl : public std::enable_shared_from_this { const v1::SubmissionInfo &SubmitInfo); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES + /// Performs kernel submission to the queue. + /// + /// \param NDRDesc is an NDRange descriptor + /// \param SubmitInfo is additional optional information for the submission. + /// \param KRInfo is a descriptor of the kernel + /// \param CallerNeedsEvent is a boolean indicating whether the event is + /// required by the user after the call. + /// \param CodeLoc is the code location of the submit call + /// \param IsTopCodeLoc Used to determine if the object is in a local + /// scope or in the top level scope. + /// + /// \return a SYCL event representing submitted command group or nullptr. detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, const v1::KernelRuntimeInfo &KRInfo, From 27b3110ae28a2fda7f6d080fb700c36de5f27178 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 20 Aug 2025 15:13:49 +0000 Subject: [PATCH 16/45] Address review comments --- sycl/cmake/modules/AddSYCLUnitTest.cmake | 5 +- .../oneapi/experimental/enqueue_functions.hpp | 22 +++------ sycl/include/sycl/queue.hpp | 46 ++++++------------- sycl/source/detail/queue_impl.cpp | 4 +- sycl/source/detail/queue_impl.hpp | 8 ++-- sycl/source/queue.cpp | 4 +- sycl/unittests/xpti_trace/CMakeLists.txt | 2 +- 7 files changed, 32 insertions(+), 59 deletions(-) diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index e581dfaa61ba0..f7102b393c671 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -67,7 +67,7 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_c target_compile_definitions(${test_dirname} PRIVATE __INTEL_PREVIEW_BREAKING_CHANGES) if (${is_no_cgh}) - set(sycl_cache_suffix "_preview_no_cgh") + set(sycl_cache_suffix "_non_preview_no_cgh") else() set(sycl_cache_suffix "_preview") endif() @@ -165,7 +165,6 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_c -Wno-inconsistent-missing-override ) endif() - target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING) endfunction() @@ -176,6 +175,6 @@ endfunction() # Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)` macro(add_sycl_unittest test_name_prefix link_variant) add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_non_preview_no_cgh ${link_variant} FALSE TRUE ${ARGN}) add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN}) - add_sycl_unittest_internal(${test_name_prefix}_preview_no_cgh ${link_variant} TRUE TRUE ${ARGN}) endmacro() diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 95330db434e89..3ddb983393826 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -118,7 +118,7 @@ template Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event( Props, Range, KernelFunc, CodeLoc); } @@ -127,7 +127,7 @@ template Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event( Props, Range, KernelFunc, CodeLoc); } @@ -304,11 +304,11 @@ void nd_launch(handler &CGH, nd_range Range, KernelObj); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(ReductionsT) == 0) { submit(std::move(Q), empty_properties_t{}, Range, KernelObj); } else { @@ -317,18 +317,13 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, std::forward(Reductions)...); }); } -} #else -template -void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, - ReductionsT &&...Reductions) { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, std::forward(Reductions)...); }); -} #endif +} template @@ -344,11 +339,11 @@ void nd_launch(handler &CGH, std::forward(Reductions)..., KernelObj); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(ReductionsT) == 0) { ext::oneapi::experimental::detail::LaunchConfigAccess, Properties> @@ -361,18 +356,13 @@ void nd_launch(queue Q, launch_config, Properties> Config, std::forward(Reductions)...); }); } -} #else -template -void nd_launch(queue Q, launch_config, Properties> Config, - const KernelType &KernelObj, ReductionsT &&...Reductions) { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Config, KernelObj, std::forward(Reductions)...); }); -} #endif +} template void nd_launch(handler &CGH, nd_range Range, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index bd67e01834180..b1d1ce453909c 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -151,7 +151,7 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, @@ -215,7 +215,7 @@ class __SYCL_EXPORT KernelRuntimeInfo { bool MKernelHasSpecialCaptures = true; detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; }; -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES } // namespace v1 } // namespace detail @@ -3283,7 +3283,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query()); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -3296,8 +3295,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event( + return submit_direct_with_event( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( @@ -3306,27 +3306,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { }, TlsCodeLocCapture.query()); } - } #else - /// parallel_for version with a kernel represented as a lambda + nd_range that - /// specifies global, local sizes and offset. - /// - /// \param Range specifies the global and local work spaces of the kernel - /// \param Rest acts as-if: "ReductionTypes&&... Reductions, - /// const KernelType &KernelFunc". - template - std::enable_if_t::value, event> - parallel_for(nd_range Range, RestT &&...Rest) { - constexpr detail::code_location CodeLoc = getCodeLocation(); - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Rest...); - }, - TlsCodeLocCapture.query()); - } + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); #endif + } + /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -3832,7 +3820,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -3862,7 +3850,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, @@ -3908,7 +3896,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3928,14 +3916,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); - // TODO UseFallbackAssert - return submit_direct_with_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } - template void submit_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3955,8 +3941,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); - // TODO UseFallbackAssert - submit_direct_without_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 222ca53910003..d8fbf85137e2e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -403,7 +403,7 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, return EventImpl; } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector queue_impl::extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), @@ -515,7 +515,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( return CallerNeedsEvent ? EventImpl : nullptr; } -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES template event queue_impl::submitWithHandler(const std::vector &DepEvents, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 59024d44dd633..738a78d6213ce 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -365,7 +365,7 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -422,7 +422,7 @@ class queue_impl : public std::enable_shared_from_this { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, @@ -952,7 +952,7 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), size_t NumKernelParams); @@ -975,7 +975,7 @@ class queue_impl : public std::enable_shared_from_this { bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index bd25d703aa51d..94df8a91d7a48 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -315,7 +315,7 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event queue::submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -364,7 +364,7 @@ void queue::submit_direct_without_event_impl( IsTopCodeLoc); } -#endif //__INTEL_PREVIEW_BREAKING_CHANGES +//#endif //__INTEL_PREVIEW_BREAKING_CHANGES void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, diff --git a/sycl/unittests/xpti_trace/CMakeLists.txt b/sycl/unittests/xpti_trace/CMakeLists.txt index 42c5505d86107..e3553d840e4c6 100644 --- a/sycl/unittests/xpti_trace/CMakeLists.txt +++ b/sycl/unittests/xpti_trace/CMakeLists.txt @@ -8,4 +8,4 @@ add_sycl_unittest(XptiTraceTests OBJECT ) target_link_libraries(XptiTraceTests_non_preview PRIVATE xpti xptitest_subscriber) target_link_libraries(XptiTraceTests_preview PRIVATE xpti xptitest_subscriber) -target_link_libraries(XptiTraceTests_preview_no_cgh PRIVATE xpti xptitest_subscriber) +target_link_libraries(XptiTraceTests_non_preview_no_cgh PRIVATE xpti xptitest_subscriber) From 9041e94a878e8ff5ccc50baf76d1edf12c16219b Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 08:25:13 +0000 Subject: [PATCH 17/45] Updated Linux symbols --- sycl/include/sycl/queue.hpp | 4 ---- sycl/source/detail/queue_impl.cpp | 4 ---- sycl/source/detail/queue_impl.hpp | 5 +---- sycl/source/queue.cpp | 3 --- sycl/test/abi/sycl_symbols_linux.dump | 8 +++++++- 5 files changed, 8 insertions(+), 16 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b1d1ce453909c..2d727b7496468 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -151,7 +151,6 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, @@ -215,7 +214,6 @@ class __SYCL_EXPORT KernelRuntimeInfo { bool MKernelHasSpecialCaptures = true; detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; }; -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES } // namespace v1 } // namespace detail @@ -3820,7 +3818,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -3850,7 +3847,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index d8fbf85137e2e..99d64d118b6a5 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -403,8 +403,6 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, return EventImpl; } -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - std::vector queue_impl::extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), size_t NumKernelParams) { @@ -515,8 +513,6 @@ detail::EventImplPtr queue_impl::submit_direct_impl( return CallerNeedsEvent ? EventImpl : nullptr; } -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES - template event queue_impl::submitWithHandler(const std::vector &DepEvents, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 738a78d6213ce..f032e709da0a0 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -365,7 +365,6 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event submit_direct_with_event(nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -422,7 +421,6 @@ class queue_impl : public std::enable_shared_from_this { submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, IsTopCodeLoc); } -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, @@ -952,7 +950,6 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES std::vector extractArgsAndReqsFromLambda( char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), size_t NumKernelParams); @@ -975,7 +972,7 @@ class queue_impl : public std::enable_shared_from_this { bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES + /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 94df8a91d7a48..ec2c5602816f9 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -315,7 +315,6 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } -//#ifdef __INTEL_PREVIEW_BREAKING_CHANGES event queue::submit_direct_with_event_impl( nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, const detail::v1::KernelRuntimeInfo &KRInfo, @@ -364,8 +363,6 @@ void queue::submit_direct_without_event_impl( IsTopCodeLoc); } -//#endif //__INTEL_PREVIEW_BREAKING_CHANGES - void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index f41c07ee394b7..76d849af87c61 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3334,6 +3334,7 @@ _ZN4sycl3_V16detail21LocalAccessorBaseHost7getSizeEv _ZN4sycl3_V16detail21LocalAccessorBaseHostC1ENS0_5rangeILi3EEEiiRKNS0_13property_listE _ZN4sycl3_V16detail21LocalAccessorBaseHostC2ENS0_5rangeILi3EEEiiRKNS0_13property_listE _ZN4sycl3_V16detail22addHostAccessorAndWaitEPNS1_16AccessorImplHostE +_ZN4sycl3_V16detail22free_function_info_map3addEPKPKcPKjj _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateERKSt8functionIFbRKSt10shared_ptrINS1_17device_image_implEEEE _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKNS0_4spanIcLm18446744073709551615EEENS0_12bundle_stateE @@ -3747,7 +3748,13 @@ _ZNK4sycl3_V15queue20ext_oneapi_get_stateEv _ZNK4sycl3_V15queue22submit_with_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue25submit_without_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue28ext_codeplay_supports_fusionEv +_ZNK4sycl3_V15queue29submit_direct_with_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue29submit_direct_with_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue29submit_direct_with_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb _ZNK4sycl3_V15queue30ext_oneapi_get_last_event_implEv +_ZNK4sycl3_V15queue32submit_direct_without_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue32submit_direct_without_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue32submit_direct_without_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v114SubmissionInfoERKNS5_17KernelRuntimeInfoERKNS4_13code_locationEb _ZNK4sycl3_V15queue3getEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue15reference_countEEENS0_6detail18is_queue_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue6deviceEEENS0_6detail18is_queue_info_descIT_E11return_typeEv @@ -3850,7 +3857,6 @@ _ZNK4sycl3_V16detail19kernel_bundle_plain5emptyEv _ZNK4sycl3_V16detail21LocalAccessorBaseHost11getPropListEv _ZNK4sycl3_V16detail21LocalAccessorBaseHost6getPtrEv _ZNK4sycl3_V16detail21LocalAccessorBaseHost7getSizeEv -_ZN4sycl3_V16detail22free_function_info_map3addEPKPKcPKjj _ZNK4sycl3_V16detail28SampledImageAccessorBaseHost10getSamplerEv _ZNK4sycl3_V16detail28SampledImageAccessorBaseHost11getPropListEv _ZNK4sycl3_V16detail28SampledImageAccessorBaseHost12getNumOfDimsEv From ac2c5bb05206f0dc3b50a8b63c4b0466aaa6ca80 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 11:23:04 +0000 Subject: [PATCH 18/45] Addressed more review comments --- .../ext/oneapi/experimental/enqueue_functions.hpp | 12 ++---------- sycl/include/sycl/khr/free_function_commands.hpp | 6 +++--- sycl/include/sycl/queue.hpp | 10 ---------- 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 3ddb983393826..fa3d51da5ec1c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -112,14 +112,12 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, Props, detail::type_erased_cgfo_ty{CGF}, CodeLoc); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event( + Q.submit_direct_without_event( Props, Range, KernelFunc, CodeLoc); } template Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event( + return Q.submit_direct_with_event( Props, Range, KernelFunc, CodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail template @@ -149,7 +145,6 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, submit(Q, empty_properties_t{}, std::forward(CGF), CodeLoc); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void submit(const queue &Q, PropertiesT Props, nd_range Range, @@ -160,7 +155,6 @@ void submit(const queue &Q, PropertiesT Props, nd_range Range, KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, CodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template event submit_with_event(const queue &Q, PropertiesT Props, @@ -179,7 +173,6 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, std::forward(CGF), CodeLoc); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, @@ -190,7 +183,6 @@ event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, CodeLoc); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void single_task(handler &CGH, const KernelType &KernelObj) { diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 13ba0c8f56454..4dd7c067126a1 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -154,7 +154,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + submit(q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size), k); #else submit( @@ -168,7 +168,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + submit(q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size), k); #else submit( @@ -182,7 +182,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - submit(std::move(q), ext::oneapi::experimental::empty_properties_t{}, + submit(q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size), k); #else submit( diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 2d727b7496468..7d9716a6d9d4a 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -234,7 +234,6 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, @@ -247,7 +246,6 @@ event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT } // namespace detail } // namespace ext::oneapi::experimental @@ -3687,7 +3685,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template friend void ext::oneapi::experimental::detail::submit_direct_impl( @@ -3700,8 +3697,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { @@ -3715,7 +3710,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT template struct TransformUserItemType { using type = std::conditional_t< std::is_convertible_v, LambdaArgType>, nd_item, @@ -3745,7 +3739,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KRInfo.KernelNameBasedCachePtr() = detail::getKernelNameBasedCache(); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. @@ -3890,8 +3883,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - template event submit_direct_with_event(PropertiesT Props, nd_range Range, @@ -3941,7 +3932,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } -#endif //__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. From 8e155fb23a40d70c54ac066f1c64fe67f2cae68e Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 11:24:40 +0000 Subject: [PATCH 19/45] Fix formatting --- sycl/include/sycl/queue.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 7d9716a6d9d4a..53d7c6a24f0e6 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3303,11 +3303,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.query()); } #else - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Rest...); - }, - TlsCodeLocCapture.query()); + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); #endif } @@ -3883,8 +3883,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template + template event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = @@ -3908,8 +3908,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template + template void submit_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = From 502f63733a684972dc8444ccd259400bb57e7521 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 12:20:44 +0000 Subject: [PATCH 20/45] Fix formatting, remove unused properties argument --- .../sycl/ext/oneapi/experimental/enqueue_functions.hpp | 2 +- sycl/include/sycl/queue.hpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index fa3d51da5ec1c..4b47c3da87a41 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -160,7 +160,7 @@ template event submit_with_event(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc = - sycl::detail::code_location::current()) { + sycl::detail::code_location::current()) { return sycl::ext::oneapi::experimental::detail::submit_with_event_impl( Q, Props, std::forward(CGF), CodeLoc); } diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 53d7c6a24f0e6..171a05549a70c 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3717,9 +3717,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { item, LambdaArgType>>; }; - template - void ProcessKernelRuntimeInfo(PropertiesT Props, const KernelType &KernelFunc, + template + void ProcessKernelRuntimeInfo(const KernelType &KernelFunc, detail::v1::KernelRuntimeInfo &KRInfo) const { using LambdaArgType = sycl::detail::lambda_arg_type>; From d708c9353355e6b11ac3f904585d75bdedec16b2 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 13:18:49 +0000 Subject: [PATCH 21/45] Fix ProcessKernelRuntimeInfo call --- sycl/include/sycl/queue.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 171a05549a70c..36101e2380362 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3896,8 +3896,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo( - Props, KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(Props, KernelFunc, + KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); @@ -3921,8 +3921,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo( - Props, KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(Props, KernelFunc, + KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); From e9f6e4e347158d621911c2dea663ec54837b30c6 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 13:51:55 +0000 Subject: [PATCH 22/45] Fix unit test build and ProcessKernelRuntimeInfo calls --- sycl/cmake/modules/AddSYCLUnitTest.cmake | 20 ++++++++++---------- sycl/include/sycl/queue.hpp | 6 ++---- sycl/unittests/xpti_trace/CMakeLists.txt | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index f7102b393c671..aeede07acdead 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -34,11 +34,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_c # Chaning CMAKE_CURRENT_BINARY_DIR should not affect this variable in its # parent scope. if (${is_preview}) - if (${is_no_cgh}) - set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/PreviewNoCGH") - else() set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview") - endif() + endif() + + if (${is_no_cgh}) + set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/NoCGH") endif() if ("${link_variant}" MATCHES "SHARED") @@ -66,11 +66,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_c if (${is_preview}) target_compile_definitions(${test_dirname} PRIVATE __INTEL_PREVIEW_BREAKING_CHANGES) - if (${is_no_cgh}) - set(sycl_cache_suffix "_non_preview_no_cgh") - else() - set(sycl_cache_suffix "_preview") - endif() + set(sycl_cache_suffix "_preview") + endif() + + if (${is_no_cgh}) + set(sycl_cache_suffix "_no_cgh") endif() if (${is_no_cgh}) @@ -175,6 +175,6 @@ endfunction() # Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)` macro(add_sycl_unittest test_name_prefix link_variant) add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN}) - add_sycl_unittest_internal(${test_name_prefix}_non_preview_no_cgh ${link_variant} FALSE TRUE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_no_cgh ${link_variant} FALSE TRUE ${ARGN}) add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN}) endmacro() diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 36101e2380362..c04d28af0631e 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3896,8 +3896,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, KernelFunc, - KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); @@ -3921,8 +3920,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(Props, KernelFunc, - KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); diff --git a/sycl/unittests/xpti_trace/CMakeLists.txt b/sycl/unittests/xpti_trace/CMakeLists.txt index e3553d840e4c6..0fbb3be4046fc 100644 --- a/sycl/unittests/xpti_trace/CMakeLists.txt +++ b/sycl/unittests/xpti_trace/CMakeLists.txt @@ -8,4 +8,4 @@ add_sycl_unittest(XptiTraceTests OBJECT ) target_link_libraries(XptiTraceTests_non_preview PRIVATE xpti xptitest_subscriber) target_link_libraries(XptiTraceTests_preview PRIVATE xpti xptitest_subscriber) -target_link_libraries(XptiTraceTests_non_preview_no_cgh PRIVATE xpti xptitest_subscriber) +target_link_libraries(XptiTraceTests_no_cgh PRIVATE xpti xptitest_subscriber) From 057a7a58bea78f8ff156e8ff9530ccfdf4959bcb Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 21 Aug 2025 16:18:02 +0000 Subject: [PATCH 23/45] Fix formatting --- sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 4b47c3da87a41..300cb003978de 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -120,6 +120,7 @@ void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, Q.submit_direct_without_event( Props, Range, KernelFunc, CodeLoc); } + template event submit_direct_with_event_impl( From 77d92ca0a18d05d1820ee9788544b667b9a02db6 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 22 Aug 2025 14:55:57 +0000 Subject: [PATCH 24/45] Added single_task shortcut function support for no-handler --- sycl/include/sycl/queue.hpp | 37 +++++++++++++------ .../Extensions/CommandGraph/CommandGraph.cpp | 3 ++ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index c04d28af0631e..8ebf40b56bfce 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2738,12 +2738,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "Use queue.submit() instead"); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + return submit_direct_with_event( + ext::oneapi::experimental::empty_properties_t{}, nd_range<1>{1, 1}, + KernelFunc); + +#else return submit( [&](handler &CGH) { CGH.template single_task( Properties, KernelFunc); }, TlsCodeLocCapture.query()); +#endif + } /// single_task version with a kernel represented as a lambda. @@ -3293,7 +3302,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event( + return submit_direct_with_event>( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( @@ -3717,18 +3727,22 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { item, LambdaArgType>>; }; - template + template void ProcessKernelRuntimeInfo(const KernelType &KernelFunc, detail::v1::KernelRuntimeInfo &KRInfo) const { using LambdaArgType = sycl::detail::lambda_arg_type>; - using TransformedArgType = std::conditional_t< + using TransformedArgType = std::conditional_t::value && Dims == 1, item, - typename TransformUserItemType::type>; + typename TransformUserItemType::type>, + void>; KRInfo.HostKernel().reset( - new detail::HostKernel( - KernelFunc)); + new detail::HostKernel( + KernelFunc)); + KRInfo.KernelName() = detail::getKernelName(); KRInfo.KernelNumArgs() = detail::getKernelNumParams(); KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); @@ -3882,7 +3896,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } - template event submit_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, @@ -3896,10 +3911,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); - detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); + detail::KernelWrapper::wrap(KernelFunc); return submit_direct_with_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), @@ -3920,7 +3935,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); diff --git a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp index 82fc0ed56c09d..662eef6186683 100644 --- a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp @@ -619,6 +619,8 @@ TEST_F(CommandGraphTest, AccessorModeEdges) { // Tests the transitive queue recording behaviour with queue shortcuts. TEST_F(CommandGraphTest, TransitiveRecordingShortcuts) { +// Graphs not supported yet for the no-handler submit path +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT device Dev; context Ctx{{Dev}}; queue Q1{Ctx, Dev}; @@ -662,6 +664,7 @@ TEST_F(CommandGraphTest, TransitiveRecordingShortcuts) { ext::oneapi::experimental::queue_state::executing); ASSERT_EQ(Q3.ext_oneapi_get_state(), ext::oneapi::experimental::queue_state::executing); +#endif } // Tests that dynamic_work_group_memory.get() will throw on the host side. From 85aaa5c3fefca6aba36ecb3dcdaf9fc862e486dc Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 25 Aug 2025 07:47:49 +0000 Subject: [PATCH 25/45] Fix formatting --- sycl/include/sycl/queue.hpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 8ebf40b56bfce..b5a28f2f2de6f 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2752,7 +2752,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { }, TlsCodeLocCapture.query()); #endif - } /// single_task version with a kernel represented as a lambda. @@ -3302,8 +3301,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event>( + return submit_direct_with_event>( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( @@ -3733,15 +3732,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::v1::KernelRuntimeInfo &KRInfo) const { using LambdaArgType = sycl::detail::lambda_arg_type>; - using TransformedArgType = std::conditional_t::value && Dims == 1, item, - typename TransformUserItemType::type>, + std::is_integral::value && Dims == 1, item, + typename TransformUserItemType::type>, void>; KRInfo.HostKernel().reset( - new detail::HostKernel( - KernelFunc)); + new detail::HostKernel( + KernelFunc)); KRInfo.KernelName() = detail::getKernelName(); KRInfo.KernelNumArgs() = detail::getKernelNumParams(); @@ -3911,10 +3911,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, + KRInfo); - detail::KernelWrapper::wrap(KernelFunc); + detail::KernelWrapper::wrap(KernelFunc); return submit_direct_with_event_impl(Range, SI, KRInfo, TlsCodeLocCapture.query(), @@ -3935,7 +3936,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename detail::get_kernel_name_t::name; ProcessSubmitProperties(Props, SI); - ProcessKernelRuntimeInfo(KernelFunc, KRInfo); + ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); From 967d35e7ea1a815f874f2d2cbf0eea2d85947c5f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 29 Aug 2025 08:47:41 +0000 Subject: [PATCH 26/45] Update KernelRuntimeInfo - change kernel name based cache pointer to DeviceKernelInfo pointer --- sycl/include/sycl/queue.hpp | 13 ++++++------- sycl/source/detail/queue_impl.cpp | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index b5a28f2f2de6f..96f5902cc2108 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -198,11 +198,11 @@ class __SYCL_EXPORT KernelRuntimeInfo { return MKernelHasSpecialCaptures; } - detail::KernelNameBasedCacheT *&KernelNameBasedCachePtr() { - return MKernelNameBasedCachePtr; + detail::DeviceKernelInfo *&DeviceKernelInfoPtr() { + return MDeviceKernelInfoPtr; } - detail::KernelNameBasedCacheT *KernelNameBasedCachePtr() const { - return MKernelNameBasedCachePtr; + detail::DeviceKernelInfo *DeviceKernelInfoPtr() const { + return MDeviceKernelInfoPtr; } private: @@ -212,7 +212,7 @@ class __SYCL_EXPORT KernelRuntimeInfo { KernelParamDescGetterFuncPtr MKernelParamDescGetter = nullptr; bool MKernelIsESIMD = false; bool MKernelHasSpecialCaptures = true; - detail::KernelNameBasedCacheT *MKernelNameBasedCachePtr = nullptr; + detail::DeviceKernelInfo *MDeviceKernelInfoPtr = nullptr; }; } // namespace v1 @@ -3749,8 +3749,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); KRInfo.KernelHasSpecialCaptures() = detail::hasSpecialCaptures(); - KRInfo.KernelNameBasedCachePtr() = - detail::getKernelNameBasedCache(); + KRInfo.DeviceKernelInfoPtr() = detail::getDeviceKernelInfo(); } #ifndef __INTEL_PREVIEW_BREAKING_CHANGES diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index e763ecdfc5ddc..6c2454b6ddea8 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -510,7 +510,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( nullptr, // MKernel nullptr, // MKernelBundle std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), - KRInfo.KernelNameBasedCachePtr(), std::move(StreamStorage), + *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, false, // MKernelIsCooperative From ec1ef896502ed3def2cda123be90b1a48933dbba Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 29 Aug 2025 09:36:09 +0000 Subject: [PATCH 27/45] Switch to DeviceKernelInfo use on the no-handler path --- sycl/include/sycl/queue.hpp | 29 +---------------------------- sycl/source/detail/queue_impl.cpp | 11 ++++++----- 2 files changed, 7 insertions(+), 33 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 96f5902cc2108..84ba701878628 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -180,24 +180,6 @@ class __SYCL_EXPORT KernelRuntimeInfo { char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } - int &KernelNumArgs() { return MKernelNumArgs; } - const int &KernelNumArgs() const { return MKernelNumArgs; } - - KernelParamDescGetterFuncPtr &KernelParamDescGetter() { - return MKernelParamDescGetter; - } - const KernelParamDescGetterFuncPtr &KernelParamDescGetter() const { - return MKernelParamDescGetter; - } - - bool &KernelIsESIMD() { return MKernelIsESIMD; } - const bool &KernelIsESIMD() const { return MKernelIsESIMD; } - - bool &KernelHasSpecialCaptures() { return MKernelHasSpecialCaptures; } - const bool &KernelHasSpecialCaptures() const { - return MKernelHasSpecialCaptures; - } - detail::DeviceKernelInfo *&DeviceKernelInfoPtr() { return MDeviceKernelInfoPtr; } @@ -208,10 +190,6 @@ class __SYCL_EXPORT KernelRuntimeInfo { private: detail::ABINeutralKernelNameStrT MKernelName; std::shared_ptr MHostKernel; - int MKernelNumArgs = 0; - KernelParamDescGetterFuncPtr MKernelParamDescGetter = nullptr; - bool MKernelIsESIMD = false; - bool MKernelHasSpecialCaptures = true; detail::DeviceKernelInfo *MDeviceKernelInfoPtr = nullptr; }; @@ -3744,12 +3722,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { KernelFunc)); KRInfo.KernelName() = detail::getKernelName(); - KRInfo.KernelNumArgs() = detail::getKernelNumParams(); - KRInfo.KernelParamDescGetter() = &(detail::getKernelParamDesc); - KRInfo.KernelIsESIMD() = detail::isKernelESIMD(); - KRInfo.KernelHasSpecialCaptures() = - detail::hasSpecialCaptures(); - KRInfo.DeviceKernelInfoPtr() = detail::getDeviceKernelInfo(); + KRInfo.DeviceKernelInfoPtr() = &detail::getDeviceKernelInfo(); } #ifndef __INTEL_PREVIEW_BREAKING_CHANGES diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6c2454b6ddea8..521366fcb9ad2 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -465,7 +465,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( assert(!hasCommandGraph()); // No special captures supported yet for the no-handler path - assert(!KRInfo.KernelHasSpecialCaptures()); + assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); // Set the No Last Event Mode to false, since the no-handler path // does not support it yet. @@ -501,15 +501,16 @@ detail::EventImplPtr queue_impl::submit_direct_impl( } } - Args = extractArgsAndReqsFromLambda(KRInfo.GetKernelFuncPtr(), - KRInfo.KernelParamDescGetter(), - KRInfo.KernelNumArgs()); + Args = extractArgsAndReqsFromLambda( + KRInfo.GetKernelFuncPtr(), KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, + KRInfo.DeviceKernelInfoPtr()->NumParams); CommandGroup.reset(new detail::CGExecKernel( std::move(NDRDesc), KRInfo.HostKernel(), nullptr, // MKernel nullptr, // MKernelBundle - std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), + std::move(CGData), std::move(Args), + toKernelNameStrT(KRInfo.DeviceKernelInfoPtr()->Name), *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, From 1f95b9b5fd1f0bce2476f20b8678092b07db0d84 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 29 Aug 2025 12:21:44 +0000 Subject: [PATCH 28/45] KernelName fix --- sycl/source/detail/queue_impl.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 521366fcb9ad2..54f0dbe2907b9 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -509,8 +509,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( std::move(NDRDesc), KRInfo.HostKernel(), nullptr, // MKernel nullptr, // MKernelBundle - std::move(CGData), std::move(Args), - toKernelNameStrT(KRInfo.DeviceKernelInfoPtr()->Name), + std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, From 12ef6dadc5339381f80e21433ea43c822c7b5ea3 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 29 Aug 2025 15:09:42 +0000 Subject: [PATCH 29/45] Update Windows symbols --- sycl/test/abi/sycl_symbols_windows.dump | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index eae017c88eac8..bf5343a2c0dc7 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -298,6 +298,7 @@ ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@V?$id@$02@23@V?$range@$02@23@1W4mode@access@23@PEAXHH_K_NAEBVproperty_list@23@@Z ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@V?$id@$02@23@V?$range@$02@23@1W4mode@access@23@PEAXHH_N_K4AEBVproperty_list@23@@Z ??0HostProfilingInfo@detail@_V1@sycl@@QEAA@XZ +??0KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAA@XZ ??0LocalAccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VLocalAccessorImplHost@detail@_V1@sycl@@@std@@@Z ??0LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z @@ -507,6 +508,7 @@ ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@AEBUcode_location@123@@Z ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@XZ ??1AccessorBaseHost@detail@_V1@sycl@@QEAA@XZ +??1KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAA@XZ ??1LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ ??1SYCLCategory@detail@_V1@sycl@@UEAA@XZ ??1SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ @@ -710,6 +712,8 @@ ??_Fqueue@_V1@sycl@@QEAAXXZ ?AccessTargetMask@handler@_V1@sycl@@0HB ?Clear@exception_list@_V1@sycl@@AEAAXXZ +?DeviceKernelInfoPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAPEAVDeviceKernelInfo@345@XZ +?DeviceKernelInfoPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAPEAVDeviceKernelInfo@345@XZ ?DirSep@OSUtil@detail@_V1@sycl@@2QEBDEB ?DisableRangeRounding@handler@_V1@sycl@@AEAA_NXZ ?EventMode@SubmissionInfo@detail@_V1@sycl@@QEAAAEAW4event_mode_enum@experimental@oneapi@ext@34@XZ @@ -718,8 +722,14 @@ ?EventMode@SubmissionInfo@v1@detail@_V1@sycl@@QEBAAEBW4event_mode_enum@experimental@oneapi@ext@45@XZ ?GDBMethodsAnchor@SampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ ?GDBMethodsAnchor@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ +?GetKernelFuncPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAPEADXZ +?GetKernelFuncPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAPEADXZ ?GetRangeRoundingSettings@handler@_V1@sycl@@AEAAXAEA_K00@Z ?HasAssociatedAccessor@handler@_V1@sycl@@AEBA_NPEAVAccessorImplHost@detail@23@W4target@access@23@@Z +?HostKernel@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@XZ +?HostKernel@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAAEBV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@XZ +?KernelName@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAVstring@345@XZ +?KernelName@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAAEBVstring@345@XZ ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEBAAEBV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PostProcessorFunc@SubmissionInfo@v1@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@345@XZ @@ -4464,6 +4474,12 @@ ?storeRawArg@handler@_V1@sycl@@AEAAPEAXAEBVraw_kernel_arg@experimental@oneapi@ext@23@@Z ?storeRawArg@handler@_V1@sycl@@AEAAPEAXPEBX_K@Z ?stringifyErrorCode@detail@_V1@sycl@@YAPEBDH@Z +?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$00@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z +?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$01@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z +?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$02@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z +?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$00@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z +?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$01@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z +?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$02@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@_N@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@@Z From 29805311b105a95c23bbd151f0559ad34e4fc814 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 1 Sep 2025 15:39:45 +0000 Subject: [PATCH 30/45] Split the kernel submit code into a command submission lambda and generic part. Added "kernel" to the direct submission function names. --- .../oneapi/experimental/enqueue_functions.hpp | 12 ++-- sycl/include/sycl/queue.hpp | 48 ++++++------- sycl/source/detail/queue_impl.cpp | 72 ++++++++++--------- sycl/source/detail/queue_impl.hpp | 44 ++++++------ sycl/source/queue.cpp | 36 +++++----- 5 files changed, 107 insertions(+), 105 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 300cb003978de..f59315b6795fc 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -114,19 +114,19 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, template -void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - Q.submit_direct_without_event( + Q.submit_kernel_direct_without_event( Props, Range, KernelFunc, CodeLoc); } template -event submit_direct_with_event_impl( +event submit_kernel_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_direct_with_event( + return Q.submit_kernel_direct_with_event( Props, Range, KernelFunc, CodeLoc); } } // namespace detail @@ -152,7 +152,7 @@ void submit(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - sycl::ext::oneapi::experimental::detail::submit_direct_impl< + sycl::ext::oneapi::experimental::detail::submit_kernel_direct_impl< KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, CodeLoc); } @@ -180,7 +180,7 @@ event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - return sycl::ext::oneapi::experimental::detail::submit_direct_with_event_impl< + return sycl::ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl< KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, CodeLoc); } diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 84ba701878628..d273e53058a9f 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -214,13 +214,13 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, template -void submit_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); template -event submit_direct_with_event_impl(const queue &Q, PropertiesT Props, +event submit_kernel_direct_with_event_impl(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); @@ -2718,7 +2718,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - return submit_direct_with_event( + return submit_kernel_direct_with_event( ext::oneapi::experimental::empty_properties_t{}, nd_range<1>{1, 1}, KernelFunc); @@ -3279,7 +3279,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_direct_with_event>( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { @@ -3674,13 +3674,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - friend void ext::oneapi::experimental::detail::submit_direct_impl( + friend void ext::oneapi::experimental::detail::submit_kernel_direct_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); template - friend event ext::oneapi::experimental::detail::submit_direct_with_event_impl( + friend event ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); @@ -3796,33 +3796,33 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_direct_with_event_impl( - nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event_impl( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_direct_with_event_impl( - nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event_impl( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - event submit_direct_with_event_impl( - nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event_impl( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_direct_without_event_impl( - nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event_impl( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_direct_without_event_impl( - nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event_impl( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; - void submit_direct_without_event_impl( - nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event_impl( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; @@ -3871,50 +3871,46 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - event submit_direct_with_event(PropertiesT Props, nd_range Range, + event submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; using NameT = typename detail::get_kernel_name_t::name; - ProcessSubmitProperties(Props, SI); ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper::wrap(KernelFunc); - return submit_direct_with_event_impl(Range, SI, KRInfo, + return submit_kernel_direct_with_event_impl(Range, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } template - void submit_direct_without_event(PropertiesT Props, nd_range Range, + void submit_kernel_direct_without_event(PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - detail::v1::SubmissionInfo SI{}; detail::v1::KernelRuntimeInfo KRInfo{}; using NameT = typename detail::get_kernel_name_t::name; - ProcessSubmitProperties(Props, SI); ProcessKernelRuntimeInfo(KernelFunc, KRInfo); detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); - submit_direct_without_event_impl(Range, SI, KRInfo, + submit_kernel_direct_without_event_impl(Range, KRInfo, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 54f0dbe2907b9..78a31465cfaf3 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -447,26 +447,54 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( return Args; } -detail::EventImplPtr queue_impl::submit_direct_impl( - const NDRDescT &NDRDesc, const v1::SubmissionInfo &SubmitInfo, +detail::EventImplPtr queue_impl::submit_kernel_direct_impl( + const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - (void)SubmitInfo; - std::unique_ptr CommandGroup; - detail::CG::StorageInitHelper CGData; - std::vector Args; - std::vector> StreamStorage; - std::vector> AuxiliaryResources; + // No special captures supported yet for the no-handler path + assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); + + SubmitCommandFuncType SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { + std::unique_ptr CommandGroup; + std::vector Args; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + Args = extractArgsAndReqsFromLambda( + KRInfo.GetKernelFuncPtr(), KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, + KRInfo.DeviceKernelInfoPtr()->NumParams); + + CommandGroup.reset(new detail::CGExecKernel( + std::move(NDRDesc), KRInfo.HostKernel(), + nullptr, // MKernel + nullptr, // MKernelBundle + std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), + *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), *this, CallerNeedsEvent); + return EventImpl; + }; + return submit_generic_direct(CallerNeedsEvent, SubmitKernelFunc); +} + +detail::EventImplPtr queue_impl::submit_generic_direct( + bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc) { + detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); // Graphs are not supported yet for the no-handler path assert(!hasCommandGraph()); - // No special captures supported yet for the no-handler path - assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); - // Set the No Last Event Mode to false, since the no-handler path // does not support it yet. MNoLastEventMode.store(false, std::memory_order_relaxed); @@ -501,27 +529,7 @@ detail::EventImplPtr queue_impl::submit_direct_impl( } } - Args = extractArgsAndReqsFromLambda( - KRInfo.GetKernelFuncPtr(), KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, - KRInfo.DeviceKernelInfoPtr()->NumParams); - - CommandGroup.reset(new detail::CGExecKernel( - std::move(NDRDesc), KRInfo.HostKernel(), - nullptr, // MKernel - nullptr, // MKernelBundle - std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), - *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), - std::move(AuxiliaryResources), detail::CGType::Kernel, - UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // MKernelIsCooperative - false, // MKernelUsesClusterLaunch - 0, // MKernelWorkGroupMemorySize - CodeLoc)); - - CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; - - EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), *this, CallerNeedsEvent); + EventImplPtr EventImpl = SubmitCommandFunc(CGData); // Sync with the last event for in order queue if (isInOrder() && !EventImpl->isDiscarded()) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index d50ce85a15f81..aa59300009842 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -366,60 +366,54 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } - event submit_direct_with_event(nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event(nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_direct_impl( - NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); + detail::EventImplPtr EventImpl = submit_kernel_direct_impl( + NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event submit_direct_with_event(nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event(nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_direct_impl( - NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); + detail::EventImplPtr EventImpl = submit_kernel_direct_impl( + NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event submit_direct_with_event(nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + event submit_kernel_direct_with_event(nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_direct_impl( - NDRDescT{Range}, SubmitInfo, KRInfo, true, CodeLoc, IsTopCodeLoc); + detail::EventImplPtr EventImpl = submit_kernel_direct_impl( + NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - void submit_direct_without_event(nd_range<1> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event(nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, IsTopCodeLoc); } - void submit_direct_without_event(nd_range<2> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event(nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, IsTopCodeLoc); } - void submit_direct_without_event(nd_range<3> Range, - const detail::v1::SubmissionInfo &SubmitInfo, + void submit_kernel_direct_without_event(nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_direct_impl(NDRDescT{Range}, SubmitInfo, KRInfo, false, CodeLoc, + submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, IsTopCodeLoc); } @@ -965,13 +959,17 @@ class queue_impl : public std::enable_shared_from_this { /// scope or in the top level scope. /// /// \return a SYCL event representing submitted command group or nullptr. - detail::EventImplPtr submit_direct_impl(const NDRDescT &NDRDesc, - const v1::SubmissionInfo &SubmitInfo, + detail::EventImplPtr submit_kernel_direct_impl(const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); +using SubmitCommandFuncType = std::function; + + detail::EventImplPtr submit_generic_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc); + /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index ec2c5602816f9..273731e4c477b 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -315,51 +315,51 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_direct_with_event_impl( - nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, +event queue::submit_kernel_direct_with_event_impl( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_direct_with_event_impl( - nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, +event queue::submit_kernel_direct_with_event_impl( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -event queue::submit_direct_with_event_impl( - nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, +event queue::submit_kernel_direct_with_event_impl( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - return impl->submit_direct_with_event(Range, SubmitInfo, KRInfo, CodeLoc, + return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_direct_without_event_impl( - nd_range<1> Range, const detail::v1::SubmissionInfo &SubmitInfo, +void queue::submit_kernel_direct_without_event_impl( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_direct_without_event_impl( - nd_range<2> Range, const detail::v1::SubmissionInfo &SubmitInfo, +void queue::submit_kernel_direct_without_event_impl( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } -void queue::submit_direct_without_event_impl( - nd_range<3> Range, const detail::v1::SubmissionInfo &SubmitInfo, +void queue::submit_kernel_direct_without_event_impl( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { - impl->submit_direct_without_event(Range, SubmitInfo, KRInfo, CodeLoc, + impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, IsTopCodeLoc); } From 01e0f9f7f5510a66990831c8bad8e4ddedbf3a28 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 2 Sep 2025 10:13:31 +0000 Subject: [PATCH 31/45] Fix formatting --- .../oneapi/experimental/enqueue_functions.hpp | 21 +++--- sycl/include/sycl/queue.hpp | 67 +++++++++---------- sycl/source/detail/queue_impl.cpp | 33 +++++---- sycl/source/detail/queue_impl.hpp | 65 +++++++++--------- sycl/source/queue.cpp | 30 ++++----- 5 files changed, 106 insertions(+), 110 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index f59315b6795fc..096e2d772dade 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -114,11 +114,12 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, template -void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc) { - Q.submit_kernel_direct_without_event( - Props, Range, KernelFunc, CodeLoc); +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc) { + Q.submit_kernel_direct_without_event(Props, Range, KernelFunc, CodeLoc); } template Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { - return Q.submit_kernel_direct_with_event( - Props, Range, KernelFunc, CodeLoc); + return Q.submit_kernel_direct_with_event(Props, Range, KernelFunc, + CodeLoc); } } // namespace detail @@ -180,8 +182,9 @@ event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - return sycl::ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl< - KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, + return sycl::ext::oneapi::experimental::detail:: + submit_kernel_direct_with_event_impl(Q, Props, Range, KernelFunc, CodeLoc); } diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index d273e53058a9f..8e1803be9bbe7 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -214,16 +214,16 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, template -void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); template -event submit_kernel_direct_with_event_impl(const queue &Q, PropertiesT Props, - nd_range Range, - const KernelType &KernelFunc, - const sycl::detail::code_location &CodeLoc); +event submit_kernel_direct_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); } // namespace detail } // namespace ext::oneapi::experimental @@ -2718,7 +2718,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - return submit_kernel_direct_with_event( + return submit_kernel_direct_with_event( ext::oneapi::experimental::empty_properties_t{}, nd_range<1>{1, 1}, KernelFunc); @@ -3279,8 +3280,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT if constexpr (sizeof...(RestT) == 1) { - return submit_kernel_direct_with_event>( + return submit_kernel_direct_with_event>( ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); } else { return submit( @@ -3680,7 +3681,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - friend event ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl( + friend event + ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl( const queue &Q, PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); @@ -3797,33 +3799,27 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; event submit_kernel_direct_with_event_impl( - nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; void submit_kernel_direct_without_event_impl( - nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; /// A template-free version of submit_without_event as const member function. @@ -3871,10 +3867,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template - event submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { + event + submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3888,16 +3885,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT>::wrap(KernelFunc); return submit_kernel_direct_with_event_impl(Range, KRInfo, - TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } template - void submit_kernel_direct_without_event(PropertiesT Props, nd_range Range, - const KernelType &KernelFunc, - const detail::code_location &CodeLoc = - detail::code_location::current()) const { + void submit_kernel_direct_without_event( + PropertiesT Props, nd_range Range, const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3911,8 +3908,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { sycl::nd_item, PropertiesT>::wrap(KernelFunc); submit_kernel_direct_without_event_impl(Range, KRInfo, - TlsCodeLocCapture.query(), - TlsCodeLocCapture.isToplevel()); + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); } /// Submits a command group function object to the queue, in order to be diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 78a31465cfaf3..f725bded7484c 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -448,47 +448,54 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( } detail::EventImplPtr queue_impl::submit_kernel_direct_impl( - const NDRDescT &NDRDesc, - const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, - const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { // No special captures supported yet for the no-handler path assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); - SubmitCommandFuncType SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { + SubmitCommandFuncType SubmitKernelFunc = + [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { std::unique_ptr CommandGroup; std::vector Args; std::vector> StreamStorage; std::vector> AuxiliaryResources; Args = extractArgsAndReqsFromLambda( - KRInfo.GetKernelFuncPtr(), KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, - KRInfo.DeviceKernelInfoPtr()->NumParams); + KRInfo.GetKernelFuncPtr(), + KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, + KRInfo.DeviceKernelInfoPtr()->NumParams); CommandGroup.reset(new detail::CGExecKernel( std::move(NDRDesc), KRInfo.HostKernel(), nullptr, // MKernel nullptr, // MKernelBundle - std::move(CGData), std::move(Args), toKernelNameStrT(KRInfo.KernelName()), - *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), - std::move(AuxiliaryResources), detail::CGType::Kernel, - UR_KERNEL_CACHE_CONFIG_DEFAULT, + std::move(CGData), std::move(Args), + toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), + std::move(StreamStorage), std::move(AuxiliaryResources), + detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, false, // MKernelIsCooperative false, // MKernelUsesClusterLaunch 0, // MKernelWorkGroupMemorySize CodeLoc)); CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + // TODO DiscardEvent should include a check for requirements list + // once accessors are implemented + bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), *this, CallerNeedsEvent); + std::move(CommandGroup), *this, !DiscardEvent); return EventImpl; }; return submit_generic_direct(CallerNeedsEvent, SubmitKernelFunc); } -detail::EventImplPtr queue_impl::submit_generic_direct( - bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc) { +detail::EventImplPtr +queue_impl::submit_generic_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc) { detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index aa59300009842..7a3bfc55fde6c 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -366,55 +366,49 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } - event submit_kernel_direct_with_event(nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + event submit_kernel_direct_with_event( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event submit_kernel_direct_with_event(nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + event submit_kernel_direct_with_event( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - event submit_kernel_direct_with_event(nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + event submit_kernel_direct_with_event( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl( NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } - void submit_kernel_direct_without_event(nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + void submit_kernel_direct_without_event( + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } - void submit_kernel_direct_without_event(nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + void submit_kernel_direct_without_event( + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } - void submit_kernel_direct_without_event(nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc) { + void submit_kernel_direct_without_event( + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, @@ -959,16 +953,17 @@ class queue_impl : public std::enable_shared_from_this { /// scope or in the top level scope. /// /// \return a SYCL event representing submitted command group or nullptr. - detail::EventImplPtr submit_kernel_direct_impl(const NDRDescT &NDRDesc, - const v1::KernelRuntimeInfo &KRInfo, - bool CallerNeedsEvent, - const detail::code_location &CodeLoc, - bool IsTopCodeLoc); + detail::EventImplPtr submit_kernel_direct_impl( + const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); -using SubmitCommandFuncType = std::function; + using SubmitCommandFuncType = + std::function; - detail::EventImplPtr submit_generic_direct(bool CallerNeedsEvent, - SubmitCommandFuncType &SubmitCommandFunc); + detail::EventImplPtr + submit_generic_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc); /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 273731e4c477b..26fca0c4ec44c 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -316,51 +316,45 @@ event queue::submit_with_event_impl( } event queue::submit_kernel_direct_with_event_impl( - nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } event queue::submit_kernel_direct_with_event_impl( - nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } event queue::submit_kernel_direct_with_event_impl( - nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { return impl->submit_kernel_direct_with_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<1> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<2> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void queue::submit_kernel_direct_without_event_impl( - nd_range<3> Range, - const detail::v1::KernelRuntimeInfo &KRInfo, + nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { impl->submit_kernel_direct_without_event(Range, KRInfo, CodeLoc, - IsTopCodeLoc); + IsTopCodeLoc); } void queue::submit_without_event_impl( From 63d1345842b55b27fcc878c7897193e0a0c83972 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 2 Sep 2025 10:17:49 +0000 Subject: [PATCH 32/45] Rename submit_generic_direct to submit_direct --- sycl/source/detail/queue_impl.cpp | 6 +++--- sycl/source/detail/queue_impl.hpp | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index f725bded7484c..6838d43d108b2 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -490,12 +490,12 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( return EventImpl; }; - return submit_generic_direct(CallerNeedsEvent, SubmitKernelFunc); + return submit_direct(CallerNeedsEvent, SubmitKernelFunc); } detail::EventImplPtr -queue_impl::submit_generic_direct(bool CallerNeedsEvent, - SubmitCommandFuncType &SubmitCommandFunc) { +queue_impl::submit_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc) { detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 7a3bfc55fde6c..85656daf63d26 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -961,9 +961,8 @@ class queue_impl : public std::enable_shared_from_this { using SubmitCommandFuncType = std::function; - detail::EventImplPtr - submit_generic_direct(bool CallerNeedsEvent, - SubmitCommandFuncType &SubmitCommandFunc); + detail::EventImplPtr submit_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc); /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. From 7b95fb1103dcfba5998edf07f8cc7fdf62430f98 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 2 Sep 2025 11:27:47 +0000 Subject: [PATCH 33/45] Added scheduler-bypass flow to no-handler --- sycl/source/detail/queue_impl.cpp | 83 +++++++++++++++++++------------ sycl/source/detail/queue_impl.hpp | 41 ++++++++++----- 2 files changed, 80 insertions(+), 44 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6838d43d108b2..934f246f8c190 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -448,7 +448,7 @@ std::vector queue_impl::extractArgsAndReqsFromLambda( } detail::EventImplPtr queue_impl::submit_kernel_direct_impl( - const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, + NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { @@ -457,37 +457,58 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( SubmitCommandFuncType SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { - std::unique_ptr CommandGroup; std::vector Args; - std::vector> StreamStorage; - std::vector> AuxiliaryResources; - - Args = extractArgsAndReqsFromLambda( - KRInfo.GetKernelFuncPtr(), - KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, - KRInfo.DeviceKernelInfoPtr()->NumParams); - - CommandGroup.reset(new detail::CGExecKernel( - std::move(NDRDesc), KRInfo.HostKernel(), - nullptr, // MKernel - nullptr, // MKernelBundle - std::move(CGData), std::move(Args), - toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), - std::move(StreamStorage), std::move(AuxiliaryResources), - detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // MKernelIsCooperative - false, // MKernelUsesClusterLaunch - 0, // MKernelWorkGroupMemorySize - CodeLoc)); - CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; - - // TODO DiscardEvent should include a check for requirements list - // once accessors are implemented bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); - EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), *this, !DiscardEvent); - return EventImpl; + bool SchedulerBypass = detail::Scheduler::areEventsSafeForSchedulerBypass( + CGData.MEvents, getContextImpl()); + + if (SchedulerBypass) { + std::vector RawEvents = + detail::Command::getUrEvents(CGData.MEvents, this, false); + + std::shared_ptr ResultEvent = + DiscardEvent ? nullptr + : detail::event_impl::create_device_event(*this); + + enqueueImpKernel( + *this, NDRDesc, Args, nullptr, nullptr, + toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), + RawEvents, ResultEvent.get(), nullptr, UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, false, 0, nullptr); + + return ResultEvent; + } else { + std::unique_ptr CommandGroup; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + Args = extractArgsAndReqsFromLambda( + KRInfo.GetKernelFuncPtr(), + KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, + KRInfo.DeviceKernelInfoPtr()->NumParams); + + CommandGroup.reset(new detail::CGExecKernel( + std::move(NDRDesc), KRInfo.HostKernel(), + nullptr, // MKernel + nullptr, // MKernelBundle + std::move(CGData), std::move(Args), + toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), + std::move(StreamStorage), std::move(AuxiliaryResources), + detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // MKernelIsCooperative + false, // MKernelUsesClusterLaunch + 0, // MKernelWorkGroupMemorySize + CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + + // TODO DiscardEvent should include a check for requirements list + // once accessors are implemented + + EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), *this, !DiscardEvent); + return EventImpl; + } }; return submit_direct(CallerNeedsEvent, SubmitKernelFunc); @@ -539,12 +560,12 @@ queue_impl::submit_direct(bool CallerNeedsEvent, EventImplPtr EventImpl = SubmitCommandFunc(CGData); // Sync with the last event for in order queue - if (isInOrder() && !EventImpl->isDiscarded()) { + if (isInOrder() && EventImpl && !EventImpl->isDiscarded()) { LastEvent = EventImpl; } // Barrier and un-enqueued commands synchronization for out or order queue - if (!isInOrder() && !EventImpl->isEnqueued()) { + if (!isInOrder() && EventImpl && !EventImpl->isEnqueued()) { MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl); } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 85656daf63d26..bd87c245d5e04 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -369,46 +369,61 @@ class queue_impl : public std::enable_shared_from_this { event submit_kernel_direct_with_event( nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); + + NDRDescT NDRDesc{Range}; + + detail::EventImplPtr EventImpl = + submit_kernel_direct_impl(NDRDesc, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event submit_kernel_direct_with_event( nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); + + NDRDescT NDRDesc{Range}; + + detail::EventImplPtr EventImpl = + submit_kernel_direct_impl(NDRDesc, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } event submit_kernel_direct_with_event( nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - detail::EventImplPtr EventImpl = submit_kernel_direct_impl( - NDRDescT{Range}, KRInfo, true, CodeLoc, IsTopCodeLoc); + + NDRDescT NDRDesc{Range}; + + detail::EventImplPtr EventImpl = + submit_kernel_direct_impl(NDRDesc, KRInfo, true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } void submit_kernel_direct_without_event( nd_range<1> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, - IsTopCodeLoc); + + NDRDescT NDRDesc{Range}; + + submit_kernel_direct_impl(NDRDesc, KRInfo, false, CodeLoc, IsTopCodeLoc); } void submit_kernel_direct_without_event( nd_range<2> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, - IsTopCodeLoc); + + NDRDescT NDRDesc{Range}; + + submit_kernel_direct_impl(NDRDesc, KRInfo, false, CodeLoc, IsTopCodeLoc); } void submit_kernel_direct_without_event( nd_range<3> Range, const detail::v1::KernelRuntimeInfo &KRInfo, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_kernel_direct_impl(NDRDescT{Range}, KRInfo, false, CodeLoc, - IsTopCodeLoc); + + NDRDescT NDRDesc{Range}; + + submit_kernel_direct_impl(NDRDesc, KRInfo, false, CodeLoc, IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, @@ -954,7 +969,7 @@ class queue_impl : public std::enable_shared_from_this { /// /// \return a SYCL event representing submitted command group or nullptr. detail::EventImplPtr submit_kernel_direct_impl( - const NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, + NDRDescT &NDRDesc, const v1::KernelRuntimeInfo &KRInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); From ebc8e6484f9d8dc5c3d5850863206373c60f972a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 3 Sep 2025 14:27:11 +0000 Subject: [PATCH 34/45] Added basic "no last event mode" and event handling --- sycl/source/detail/queue_impl.cpp | 38 +++++++++++++++++++++++++------ sycl/source/detail/queue_impl.hpp | 4 ++-- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 934f246f8c190..337739f49d379 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -456,7 +456,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); SubmitCommandFuncType SubmitKernelFunc = - [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { + [&](detail::CG::StorageInitHelper &CGData) + -> std::pair { std::vector Args; bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); @@ -471,13 +472,32 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this); + if (!DiscardEvent) { + ResultEvent->setWorkerQueue(weak_from_this()); + ResultEvent->setStateIncomplete(); + ResultEvent->setSubmissionTime(); + } + enqueueImpKernel( *this, NDRDesc, Args, nullptr, nullptr, toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), RawEvents, ResultEvent.get(), nullptr, UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, false, 0, nullptr); + false, false, 0, nullptr, KRInfo.GetKernelFuncPtr(), + KRInfo.DeviceKernelInfoPtr()->NumParams, + KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, false); + + if (!DiscardEvent) { + ResultEvent->setEnqueued(); + // connect returned event with dependent events + if (!isInOrder()) { + // MEvents is not used anymore, so can move. + ResultEvent->getPreparedDepsEvents() = std::move(CGData.MEvents); + // ResultEvent is local for current thread, no need to lock. + ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); + } + } - return ResultEvent; + return {ResultEvent, true}; } else { std::unique_ptr CommandGroup; std::vector> StreamStorage; @@ -507,7 +527,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( EventImplPtr EventImpl = detail::Scheduler::getInstance().addCG( std::move(CommandGroup), *this, !DiscardEvent); - return EventImpl; + return {DiscardEvent ? nullptr : EventImpl, false}; } }; @@ -557,11 +577,15 @@ queue_impl::submit_direct(bool CallerNeedsEvent, } } - EventImplPtr EventImpl = SubmitCommandFunc(CGData); + auto [EventImpl, SchedulerBypass] = SubmitCommandFunc(CGData); // Sync with the last event for in order queue - if (isInOrder() && EventImpl && !EventImpl->isDiscarded()) { - LastEvent = EventImpl; + if (isInOrder()) { + if (SchedulerBypass) { + LastEvent = nullptr; + } else if (EventImpl) { + LastEvent = EventImpl; + } } // Barrier and un-enqueued commands synchronization for out or order queue diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index bd87c245d5e04..e6836ab7ac106 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -973,8 +973,8 @@ class queue_impl : public std::enable_shared_from_this { bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); - using SubmitCommandFuncType = - std::function; + using SubmitCommandFuncType = std::function( + detail::CG::StorageInitHelper &CGData)>; detail::EventImplPtr submit_direct(bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc); From 5470f4d068cf58689c4588b8956f578239a58c74 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 4 Sep 2025 13:50:03 +0000 Subject: [PATCH 35/45] Make events related logic inline --- sycl/source/detail/queue_impl.cpp | 40 +++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 337739f49d379..7370b9060ea6d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -461,12 +461,44 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( std::vector Args; bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); - bool SchedulerBypass = detail::Scheduler::areEventsSafeForSchedulerBypass( - CGData.MEvents, getContextImpl()); + bool SchedulerBypass = std::all_of(CGData.MEvents.begin(), CGData.MEvents.end(), + [&](EventImplPtr &Event) { + // Events that don't have an initialized context are throwaway events that + // don't represent actual dependencies. Calling getContextImpl() would set + // their context, which we wish to avoid as it is expensive. + // NOP events also don't represent actual dependencies. + if (Event->isDefaultConstructed() || Event->isNOP()) + return true; + + if (Event->isHost()) + return Event->isCompleted(); + + // Cross-context dependencies can't be passed to the backend directly. + if (&Event->getContextImpl() != &getContextImpl()) + return false; + + // A nullptr here means that the commmand does not produce a UR event or it + // hasn't been enqueued yet. + return Event->getHandle() != nullptr; + }); if (SchedulerBypass) { - std::vector RawEvents = - detail::Command::getUrEvents(CGData.MEvents, this, false); + std::vector RawEvents; + + for (EventImplPtr &Event : CGData.MEvents) { + auto Handle = Event->getHandle(); + if (Handle == nullptr) + continue; + + // Do not add redundant event dependencies for in-order queues. + // At this stage dependency is definitely ur task and need to check if + // current one is a host task. In this case we should not skip ur event due + // to different sync mechanisms for different task types on in-order queue. + if (Event->getWorkerQueue().get() == this && isInOrder()) + continue; + + RawEvents.push_back(Handle); + } std::shared_ptr ResultEvent = DiscardEvent ? nullptr From 3df98016530fb871f04d7ea416b2d26849d8ff11 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 5 Sep 2025 10:30:15 +0000 Subject: [PATCH 36/45] Fix event create logic --- sycl/source/detail/queue_impl.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 7370b9060ea6d..8090bf6873230 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -459,7 +459,6 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( [&](detail::CG::StorageInitHelper &CGData) -> std::pair { std::vector Args; - bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); bool SchedulerBypass = std::all_of(CGData.MEvents.begin(), CGData.MEvents.end(), [&](EventImplPtr &Event) { @@ -484,6 +483,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( if (SchedulerBypass) { std::vector RawEvents; + bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); for (EventImplPtr &Event : CGData.MEvents) { auto Handle = Event->getHandle(); @@ -534,6 +534,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( std::unique_ptr CommandGroup; std::vector> StreamStorage; std::vector> AuxiliaryResources; + bool DiscardEvent = false; Args = extractArgsAndReqsFromLambda( KRInfo.GetKernelFuncPtr(), @@ -615,7 +616,7 @@ queue_impl::submit_direct(bool CallerNeedsEvent, if (isInOrder()) { if (SchedulerBypass) { LastEvent = nullptr; - } else if (EventImpl) { + } else { LastEvent = EventImpl; } } From 469db076f677de717a7acac9fa858397460c6933 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 8 Sep 2025 14:20:10 +0000 Subject: [PATCH 37/45] Enable the no-handler path for tests --- sycl/include/sycl/sycl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index 56d8fceb34dc2..499d2d23d7b33 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -32,6 +32,8 @@ can be disabled by setting SYCL_DISABLE_FSYCL_SYCLHPP_WARNING macro.") #undef __SYCL_TOSTRING #undef __SYCL_STRINGIFY +#define __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + #include #include From 80e053831006beae951f1c0b2348bd97176bea10 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 8 Sep 2025 14:43:16 +0000 Subject: [PATCH 38/45] Fix unused Props parameter --- sycl/include/sycl/queue.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 8e1803be9bbe7..5cad1ff21d1b5 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3872,6 +3872,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { + (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::KernelRuntimeInfo KRInfo{}; @@ -3895,6 +3896,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { PropertiesT Props, nd_range Range, const KernelType &KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) const { + (void)Props; detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); detail::v1::KernelRuntimeInfo KRInfo{}; From 0e081d2d02f2486d645dbb6fe762e56791560901 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 8 Sep 2025 14:52:10 +0000 Subject: [PATCH 39/45] Avoid redefinition of no-handler macro --- sycl/include/sycl/sycl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index 499d2d23d7b33..d1a16664d4faa 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -32,7 +32,9 @@ can be disabled by setting SYCL_DISABLE_FSYCL_SYCLHPP_WARNING macro.") #undef __SYCL_TOSTRING #undef __SYCL_STRINGIFY +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT #define __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT +#endif #include From eb415c432419448280386dd2da622e456f4ebf4a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 8 Sep 2025 15:28:26 +0000 Subject: [PATCH 40/45] Fixed unused argument --- sycl/include/sycl/queue.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 5cad1ff21d1b5..ace2cc2ce3412 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2718,6 +2718,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + (void)Properties; return submit_kernel_direct_with_event( ext::oneapi::experimental::empty_properties_t{}, nd_range<1>{1, 1}, From 634da82ba2a3575db925a5dead727d2b76939d5d Mon Sep 17 00:00:00 2001 From: Alexandr Konovalov Date: Thu, 11 Sep 2025 15:07:12 +0200 Subject: [PATCH 41/45] [SYCL] Postpone creation of HostKernel copy Do not create copy of HostKernel till it became used out of submit stack. --- sycl/include/sycl/queue.hpp | 38 +++++++++++++++++++++++-------- sycl/source/detail/queue_impl.cpp | 9 ++++++-- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index ace2cc2ce3412..3f51e4a52859e 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -151,6 +151,17 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; +template +std::shared_ptr CopyHostKernel(const void *KernelFunc) { + const KernelType &KernelFuncRef = *static_cast(KernelFunc); + std::shared_ptr HostKernel; + HostKernel.reset(new detail::HostKernel( + KernelFuncRef)); + return HostKernel; +} + +using HostKernelFactory = std::shared_ptr(*)(const void*); + using KernelParamDescGetterFuncPtr = detail::kernel_param_desc_t (*)(int); // This class is intended to store the kernel runtime information, @@ -172,13 +183,20 @@ class __SYCL_EXPORT KernelRuntimeInfo { return MKernelName; } - std::shared_ptr &HostKernel() { return MHostKernel; } - const std::shared_ptr &HostKernel() const { - return MHostKernel; + char *GetKernelFuncPtr() const { + return static_cast(const_cast(MHostKernelPtr)); } - char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } - char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } + void SaveHostKernelRef(const void *KernelFuncPtr, HostKernelFactory Factory) { + MHostKernelFactory = Factory; + MHostKernelPtr = KernelFuncPtr; + } + + std::shared_ptr CopyHostKernel() const { + if (MHostKernelFactory && MHostKernelPtr) + return MHostKernelFactory(MHostKernelPtr); + return nullptr; + } detail::DeviceKernelInfo *&DeviceKernelInfoPtr() { return MDeviceKernelInfoPtr; @@ -189,7 +207,10 @@ class __SYCL_EXPORT KernelRuntimeInfo { private: detail::ABINeutralKernelNameStrT MKernelName; - std::shared_ptr MHostKernel; + HostKernelFactory MHostKernelFactory = nullptr; + // points to the kernel function object allocated on stack, it's a lambda + // function, so have to use void* here + const void *MHostKernelPtr = nullptr; detail::DeviceKernelInfo *MDeviceKernelInfoPtr = nullptr; }; @@ -3720,9 +3741,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename TransformUserItemType::type>, void>; - KRInfo.HostKernel().reset( - new detail::HostKernel( - KernelFunc)); + KRInfo.SaveHostKernelRef(&KernelFunc, + detail::v1::CopyHostKernel); KRInfo.KernelName() = detail::getKernelName(); KRInfo.DeviceKernelInfoPtr() = &detail::getDeviceKernelInfo(); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 8090bf6873230..ee4248c1e01aa 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -536,13 +536,18 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( std::vector> AuxiliaryResources; bool DiscardEvent = false; + // At this point, HostKernel points to the lambda function allocated on + // stack. To have pointer valid after submission, we need to put it in + // dynamic memory. + std::shared_ptr HostKernel = KRInfo.CopyHostKernel(); + Args = extractArgsAndReqsFromLambda( - KRInfo.GetKernelFuncPtr(), + HostKernel->getPtr(), KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, KRInfo.DeviceKernelInfoPtr()->NumParams); CommandGroup.reset(new detail::CGExecKernel( - std::move(NDRDesc), KRInfo.HostKernel(), + std::move(NDRDesc), std::move(HostKernel), nullptr, // MKernel nullptr, // MKernelBundle std::move(CGData), std::move(Args), From 45d0d24a8e9a3b8c2b11519a76c6fae52c872c11 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Thu, 11 Sep 2025 17:11:48 +0000 Subject: [PATCH 42/45] Kernel name fix --- sycl/include/sycl/queue.hpp | 6 ------ sycl/source/detail/queue_impl.cpp | 8 +++----- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 12878b3f5639e..04c379cb17922 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -158,11 +158,6 @@ class __SYCL_EXPORT KernelRuntimeInfo { KernelRuntimeInfo &operator=(KernelRuntimeInfo &&rhs) = delete; - detail::ABINeutralKernelNameStrT &KernelName() { return MKernelName; } - const detail::ABINeutralKernelNameStrT &KernelName() const { - return MKernelName; - } - std::shared_ptr &HostKernel() { return MHostKernel; } const std::shared_ptr &HostKernel() const { return MHostKernel; @@ -3716,7 +3711,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { new detail::HostKernel( KernelFunc)); - KRInfo.KernelName() = detail::getKernelName(); KRInfo.DeviceKernelInfoPtr() = &detail::getDeviceKernelInfo(); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 5d285c3fe1c96..dc85d12cad12e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -503,11 +503,9 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( enqueueImpKernel( *this, NDRDesc, Args, nullptr, nullptr, - toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), + *KRInfo.DeviceKernelInfoPtr(), RawEvents, ResultEvent.get(), nullptr, UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, false, 0, nullptr, KRInfo.GetKernelFuncPtr(), - KRInfo.DeviceKernelInfoPtr()->NumParams, - KRInfo.DeviceKernelInfoPtr()->ParamDescGetter, false); + false, false, 0, nullptr, KRInfo.GetKernelFuncPtr()); if (!DiscardEvent) { ResultEvent->setEnqueued(); @@ -537,7 +535,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( nullptr, // MKernel nullptr, // MKernelBundle std::move(CGData), std::move(Args), - toKernelNameStrT(KRInfo.KernelName()), *KRInfo.DeviceKernelInfoPtr(), + *KRInfo.DeviceKernelInfoPtr(), std::move(StreamStorage), std::move(AuxiliaryResources), detail::CGType::Kernel, UR_KERNEL_CACHE_CONFIG_DEFAULT, false, // MKernelIsCooperative From d56ed5299f1d7c0e7fee6e3d9e7aff28cbd8d74f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 12 Sep 2025 12:14:27 +0000 Subject: [PATCH 43/45] Lambda type template instead of std::function --- sycl/source/detail/queue_impl.cpp | 3 ++- sycl/source/detail/queue_impl.hpp | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index dc85d12cad12e..bf2026bff0c4a 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -446,7 +446,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( // No special captures supported yet for the no-handler path assert(!KRInfo.DeviceKernelInfoPtr()->HasSpecialCaptures); - SubmitCommandFuncType SubmitKernelFunc = + auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) -> std::pair { std::vector Args; @@ -556,6 +556,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( return submit_direct(CallerNeedsEvent, SubmitKernelFunc); } +template detail::EventImplPtr queue_impl::submit_direct(bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 56f3187bacf35..c78efd3a5e65d 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -951,9 +951,7 @@ class queue_impl : public std::enable_shared_from_this { bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); - using SubmitCommandFuncType = std::function( - detail::CG::StorageInitHelper &CGData)>; - + template detail::EventImplPtr submit_direct(bool CallerNeedsEvent, SubmitCommandFuncType &SubmitCommandFunc); From 140fbde31c8b20ec29a98562c821d9b8b98228c3 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 16 Sep 2025 09:53:06 +0000 Subject: [PATCH 44/45] Add checks for event list size --- sycl/source/detail/queue_impl.cpp | 40 +++++-------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index bf2026bff0c4a..e67d447261c87 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -451,44 +451,16 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( -> std::pair { std::vector Args; - bool SchedulerBypass = std::all_of(CGData.MEvents.begin(), CGData.MEvents.end(), - [&](EventImplPtr &Event) { - // Events that don't have an initialized context are throwaway events that - // don't represent actual dependencies. Calling getContextImpl() would set - // their context, which we wish to avoid as it is expensive. - // NOP events also don't represent actual dependencies. - if (Event->isDefaultConstructed() || Event->isNOP()) - return true; - - if (Event->isHost()) - return Event->isCompleted(); - - // Cross-context dependencies can't be passed to the backend directly. - if (&Event->getContextImpl() != &getContextImpl()) - return false; - - // A nullptr here means that the commmand does not produce a UR event or it - // hasn't been enqueued yet. - return Event->getHandle() != nullptr; - }); + bool SchedulerBypass = CGData.MEvents.size() > 0 ? + detail::Scheduler::areEventsSafeForSchedulerBypass( + CGData.MEvents, getContextImpl()) : true; if (SchedulerBypass) { - std::vector RawEvents; bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); + std::vector RawEvents; - for (EventImplPtr &Event : CGData.MEvents) { - auto Handle = Event->getHandle(); - if (Handle == nullptr) - continue; - - // Do not add redundant event dependencies for in-order queues. - // At this stage dependency is definitely ur task and need to check if - // current one is a host task. In this case we should not skip ur event due - // to different sync mechanisms for different task types on in-order queue. - if (Event->getWorkerQueue().get() == this && isInOrder()) - continue; - - RawEvents.push_back(Handle); + if (CGData.MEvents.size() > 0) { + detail::Command::getUrEvents(CGData.MEvents, this, false); } std::shared_ptr ResultEvent = From dc6ecd05b9bf5ec46e79fa692c2755120738477d Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 16 Sep 2025 10:40:03 +0000 Subject: [PATCH 45/45] RawEvents fix --- sycl/source/detail/queue_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index e67d447261c87..b88acea2fdd4b 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -460,7 +460,7 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( std::vector RawEvents; if (CGData.MEvents.size() > 0) { - detail::Command::getUrEvents(CGData.MEvents, this, false); + RawEvents = detail::Command::getUrEvents(CGData.MEvents, this, false); } std::shared_ptr ResultEvent =