diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index b4b01b4fb45ea..9f1fa77774361 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -1,6 +1,6 @@ # Internal function to create SYCL unit tests with code reuse -# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview file1.cpp, file2.cpp ...) -function(add_sycl_unittest_internal test_dirname link_variant is_preview) +# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview is_no_cgh file1.cpp, file2.cpp ...) +function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_cgh) # Enable exception handling for these unit tests set(LLVM_REQUIRES_EH ON) set(LLVM_REQUIRES_RTTI ON) @@ -34,7 +34,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview) # Chaning CMAKE_CURRENT_BINARY_DIR should not affect this variable in its # parent scope. if (${is_preview}) - set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview") + set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview") + endif() + + if (${is_no_cgh}) + set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/NoCGH") endif() if ("${link_variant}" MATCHES "SHARED") @@ -65,6 +69,18 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview) set(sycl_cache_suffix "_preview") endif() + if (${is_no_cgh}) + set(sycl_cache_suffix "_no_cgh") + endif() + + if (${is_no_cgh}) + target_compile_definitions( + ${test_dirname} + PRIVATE + __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + ) + endif() + if (SYCL_ENABLE_XPTI_TRACING) target_compile_definitions(${test_dirname} PRIVATE XPTI_ENABLE_INSTRUMENTATION XPTI_STATIC_LIBRARY) @@ -150,7 +166,6 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview) -Wno-inconsistent-missing-override ) endif() - target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING) endfunction() @@ -160,6 +175,7 @@ endfunction() # the SYCL preview features enabled. # Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)` macro(add_sycl_unittest test_name_prefix link_variant) - add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE ${ARGN}) - add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_no_cgh ${link_variant} FALSE TRUE ${ARGN}) + add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN}) endmacro() diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index adc48ef08a674..3c0b4606154d7 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -109,6 +109,26 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props, const sycl::detail::code_location &CodeLoc) { return Q.submit_with_event(Props, detail::type_erased_cgfo_ty{CGF}, CodeLoc); } + +template +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc) { + Q.submit_kernel_direct_without_event(Props, Range, KernelFunc, CodeLoc); +} + +template +event submit_kernel_direct_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc) { + return Q.submit_kernel_direct_with_event(Props, Range, KernelFunc, + CodeLoc); +} } // namespace detail template @@ -126,6 +146,17 @@ void submit(const queue &Q, CommandGroupFunc &&CGF, submit(Q, empty_properties_t{}, std::forward(CGF), CodeLoc); } +template +void submit(const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + sycl::ext::oneapi::experimental::detail::submit_kernel_direct_impl< + KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc, + CodeLoc); +} + template event submit_with_event(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, @@ -143,6 +174,18 @@ event submit_with_event(const queue &Q, CommandGroupFunc &&CGF, std::forward(CGF), CodeLoc); } +template +event submit_with_event(const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc = + sycl::detail::code_location::current()) { + return sycl::ext::oneapi::experimental::detail:: + submit_kernel_direct_with_event_impl(Q, Props, Range, KernelFunc, + CodeLoc); +} + template void single_task(handler &CGH, const KernelType &KernelObj) { CGH.single_task(KernelObj); @@ -259,10 +302,21 @@ template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + if constexpr (sizeof...(ReductionsT) == 0) { + submit(std::move(Q), empty_properties_t{}, Range, KernelObj); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Range, KernelObj, + std::forward(Reductions)...); + }); + } +#else submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, std::forward(Reductions)...); }); +#endif } template void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + if constexpr (sizeof...(ReductionsT) == 0) { + ext::oneapi::experimental::detail::LaunchConfigAccess, + Properties> + ConfigAccess(Config); + submit(std::move(Q), ConfigAccess.getProperties(), + ConfigAccess.getRange(), KernelObj); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); + } +#else submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Config, KernelObj, std::forward(Reductions)...); }); +#endif } template diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 4138edd5821e6..4dd7c067126a1 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -153,27 +153,42 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<1>(r, size), k); +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template void launch_grouped(const queue &q, range<2> r, range<2> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<2>(r, size), k); +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template void launch_grouped(const queue &q, range<3> r, range<3> size, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + submit(q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<3>(r, size), k); +#else submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); +#endif } template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 3eb710f32ab60..1d9b490e8f9c5 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -140,6 +140,40 @@ class __SYCL_EXPORT SubmissionInfo { ext::oneapi::experimental::event_mode_enum::none; }; +// This class is intended to store the kernel runtime information, +// extracted from the compile time kernel structures. +class __SYCL_EXPORT KernelDataDesc { +public: + KernelDataDesc() {} + + KernelDataDesc(const KernelDataDesc &rhs) = delete; + + KernelDataDesc(KernelDataDesc &&rhs) = delete; + + KernelDataDesc &operator=(const KernelDataDesc &rhs) = delete; + + KernelDataDesc &operator=(KernelDataDesc &&rhs) = delete; + + std::shared_ptr &HostKernel() { return MHostKernel; } + const std::shared_ptr &HostKernel() const { + return MHostKernel; + } + + char *GetKernelFuncPtr() { return (*MHostKernel).getPtr(); } + char *GetKernelFuncPtr() const { return (*MHostKernel).getPtr(); } + + detail::DeviceKernelInfo *&DeviceKernelInfoPtr() { + return MDeviceKernelInfoPtr; + } + detail::DeviceKernelInfo *DeviceKernelInfoPtr() const { + return MDeviceKernelInfoPtr; + } + +private: + std::shared_ptr MHostKernel; + detail::DeviceKernelInfo *MDeviceKernelInfoPtr = nullptr; +}; + } // namespace v1 } // namespace detail @@ -158,6 +192,19 @@ template event submit_with_event_impl(const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); + +template +void submit_kernel_direct_impl(const queue &Q, PropertiesT Props, + nd_range Range, + const KernelType &KernelFunc, + const sycl::detail::code_location &CodeLoc); + +template +event submit_kernel_direct_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); } // namespace detail } // namespace ext::oneapi::experimental @@ -2650,12 +2697,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "Use queue.submit() instead"); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + return submit_kernel_direct_with_event(Properties, nd_range<1>{1, 1}, + KernelFunc); + +#else return submit( [&](handler &CGH) { CGH.template single_task( Properties, KernelFunc); }, TlsCodeLocCapture.query()); +#endif } /// single_task version with a kernel represented as a lambda. @@ -3203,11 +3259,24 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); +#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + if constexpr (sizeof...(RestT) == 1) { + return submit_kernel_direct_with_event( + ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); + } else { + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Rest...); + }, + TlsCodeLocCapture.query()); + } +#else return submit( [&](handler &CGH) { CGH.template parallel_for(Range, Rest...); }, TlsCodeLocCapture.query()); +#endif } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3586,6 +3655,19 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF, const sycl::detail::code_location &CodeLoc); + template + friend void ext::oneapi::experimental::detail::submit_kernel_direct_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); + + template + friend event + ext::oneapi::experimental::detail::submit_kernel_direct_with_event_impl( + const queue &Q, PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc); + template void ProcessSubmitProperties(PropertiesT Props, detail::v1::SubmissionInfo &SI) const { @@ -3599,6 +3681,33 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } } + template struct TransformUserItemType { + using type = std::conditional_t< + std::is_convertible_v, LambdaArgType>, nd_item, + std::conditional_t, LambdaArgType>, + item, LambdaArgType>>; + }; + + template + void ProcessKernelDataDesc(const KernelType &KernelFunc, + detail::v1::KernelDataDesc &KDDesc) const { + + using LambdaArgType = sycl::detail::lambda_arg_type>; + using TransformedArgType = std::conditional_t< + WrapAsVal == detail::WrapAs::parallel_for, + std::conditional_t< + std::is_integral::value && Dims == 1, item, + typename TransformUserItemType::type>, + void>; + + KDDesc.HostKernel().reset( + new detail::HostKernel( + KernelFunc)); + + KDDesc.DeviceKernelInfoPtr() = &detail::getDeviceKernelInfo(); + } + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES /// TODO: Unused. Remove these when ABI-break window is open. /// Not using `type_erased_cgfo_ty` on purpose. @@ -3670,6 +3779,30 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + event submit_kernel_direct_with_event_impl( + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + event submit_kernel_direct_with_event_impl( + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + event submit_kernel_direct_with_event_impl( + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_kernel_direct_without_event_impl( + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_kernel_direct_without_event_impl( + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + + void submit_kernel_direct_without_event_impl( + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const; + /// A template-free version of submit_without_event as const member function. void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, @@ -3699,6 +3832,57 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { TlsCodeLocCapture.isToplevel()); } + template > + event + submit_kernel_direct_with_event(PropertiesT Props, nd_range Range, + const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { + (void)Props; + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::v1::KernelDataDesc KDDesc{}; + + using NameT = + typename detail::get_kernel_name_t::name; + + ProcessKernelDataDesc(KernelFunc, + KDDesc); + + detail::KernelWrapper::wrap(KernelFunc); + + return submit_kernel_direct_with_event_impl(Range, KDDesc, + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } + + template + void submit_kernel_direct_without_event( + PropertiesT Props, nd_range Range, const KernelType &KernelFunc, + const detail::code_location &CodeLoc = + detail::code_location::current()) const { + (void)Props; + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + detail::v1::KernelDataDesc KDDesc{}; + + using NameT = + typename detail::get_kernel_name_t::name; + + ProcessKernelDataDesc(KernelFunc, KDDesc); + + detail::KernelWrapper, PropertiesT>::wrap(KernelFunc); + + submit_kernel_direct_without_event_impl(Range, KDDesc, + TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); + } + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c59b5eaa20387..6b0f98fd91b4e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -417,6 +417,135 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, return EventImpl; } +std::vector queue_impl::extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams) { + + size_t IndexShift = 0; + std::vector Args; + + Args.reserve(NumKernelParams); + + for (size_t I = 0; I < NumKernelParams; ++I) { + detail::kernel_param_desc_t ParamDesc = ParamDescGetter(I); + void *Ptr = LambdaPtr + ParamDesc.offset; + const detail::kernel_param_kind_t &Kind = ParamDesc.kind; + const int &Size = ParamDesc.info; + + Args.emplace_back(Kind, Ptr, Size, I + IndexShift); + } + + return Args; +} + +detail::EventImplPtr queue_impl::submit_kernel_direct_impl( + const NDRDescT &NDRDesc, const v1::KernelDataDesc &KDDesc, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + + KernelData KData; + + KData.setDeviceKernelInfoPtr(KDDesc.DeviceKernelInfoPtr()); + KData.setKernelFunc(KDDesc.GetKernelFuncPtr()); + KData.setNDRDesc(NDRDesc); + + return submit_kernel_direct_impl(KData, KDDesc.HostKernel(), CallerNeedsEvent, + CodeLoc, IsTopCodeLoc); +} + +detail::EventImplPtr queue_impl::submit_kernel_direct_impl( + KernelData &KData, std::shared_ptr HostKernel, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + + auto SubmitKernelFunc = + [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { + std::unique_ptr CommandGroup; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + KData.extractArgsAndReqsFromLambda(); + + CommandGroup.reset(new detail::CGExecKernel( + KData.getNDRDesc(), HostKernel, + nullptr, // Kernel + nullptr, // KernelBundle + std::move(CGData), std::move(KData).getArgs(), + *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // KernelIsCooperative + false, // KernelUsesClusterLaunch + 0, // KernelWorkGroupMemorySize + CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + + return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), + *this, true); + }; + + return submit_direct(CallerNeedsEvent, SubmitKernelFunc); +} + +template +detail::EventImplPtr +queue_impl::submit_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc) { + detail::CG::StorageInitHelper CGData; + std::unique_lock Lock(MMutex); + + // Graphs are not supported yet for the no-handler path + assert(!hasCommandGraph()); + + // Set the No Last Event Mode to false, since the no-handler path + // does not support it yet. + MNoLastEventMode.store(false, std::memory_order_relaxed); + + // Used by queue_empty() and getLastEvent() + MEmpty.store(false, std::memory_order_release); + + // Sync with an external event + std::optional ExternalEvent = popExternalEvent(); + if (ExternalEvent) { + CGData.MEvents.push_back(getSyclObjImpl(*ExternalEvent)); + } + + // Sync with the last event for in order queue + EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr; + if (isInOrder() && LastEvent) { + CGData.MEvents.push_back(LastEvent); + } + + // Barrier and un-enqueued commands synchronization for out or order queue + if (!isInOrder()) { + MMissedCleanupRequests.unset( + [&](MissedCleanupRequestsType &MissedCleanupRequests) { + for (auto &UpdatedGraph : MissedCleanupRequests) + doUnenqueuedCommandCleanup(UpdatedGraph); + MissedCleanupRequests.clear(); + }); + + if (MDefaultGraphDeps.LastBarrier && + !MDefaultGraphDeps.LastBarrier->isEnqueued()) { + CGData.MEvents.push_back(MDefaultGraphDeps.LastBarrier); + } + } + + EventImplPtr EventImpl = SubmitCommandFunc(CGData); + + // Sync with the last event for in order queue + if (isInOrder() && !EventImpl->isDiscarded()) { + LastEvent = EventImpl; + } + + // Barrier and un-enqueued commands synchronization for out or order queue + if (!isInOrder() && !EventImpl->isEnqueued()) { + MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl); + } + + return CallerNeedsEvent ? EventImpl : nullptr; +} + template event queue_impl::submitWithHandler(const std::vector &DepEvents, bool CallerNeedsEvent, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 2b1d8f637b05d..dc5ea2e72f35e 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -359,6 +359,51 @@ class queue_impl : public std::enable_shared_from_this { return createSyclObjFromImpl(ResEvent); } + event submit_kernel_direct_with_event( + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = submit_kernel_direct_impl( + NDRDescT{Range}, KDDesc, true, CodeLoc, IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); + } + + event submit_kernel_direct_with_event( + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = submit_kernel_direct_impl( + NDRDescT{Range}, KDDesc, true, CodeLoc, IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); + } + + event submit_kernel_direct_with_event( + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + detail::EventImplPtr EventImpl = submit_kernel_direct_impl( + NDRDescT{Range}, KDDesc, true, CodeLoc, IsTopCodeLoc); + return createSyclObjFromImpl(EventImpl); + } + + void submit_kernel_direct_without_event( + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_kernel_direct_impl(NDRDescT{Range}, KDDesc, false, CodeLoc, + IsTopCodeLoc); + } + + void submit_kernel_direct_without_event( + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_kernel_direct_impl(NDRDescT{Range}, KDDesc, false, CodeLoc, + IsTopCodeLoc); + } + + void submit_kernel_direct_without_event( + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + submit_kernel_direct_impl(NDRDescT{Range}, KDDesc, false, CodeLoc, + IsTopCodeLoc); + } + void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, const detail::code_location &Loc, @@ -870,6 +915,46 @@ class queue_impl : public std::enable_shared_from_this { bool IsTopCodeLoc, const v1::SubmissionInfo &SubmitInfo); + std::vector extractArgsAndReqsFromLambda( + char *LambdaPtr, detail::kernel_param_desc_t (*ParamDescGetter)(int), + size_t NumKernelParams); + + /// Performs kernel submission to the queue. + /// + /// \param NDRDesc is an NDRange descriptor + /// \param KDDesc is a descriptor of the kernel + /// \param CallerNeedsEvent is a boolean indicating whether the event is + /// required by the user after the call. + /// \param CodeLoc is the code location of the submit call + /// \param IsTopCodeLoc Used to determine if the object is in a local + /// scope or in the top level scope. + /// + /// \return a SYCL event representing submitted command group or nullptr. + detail::EventImplPtr submit_kernel_direct_impl( + const NDRDescT &NDRDesc, const v1::KernelDataDesc &KDDesc, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); + + /// Performs kernel submission to the queue. + /// + /// \param KData contains aggregated data related to the kernel + /// \param HostKernel stores the kernel lambda instance + /// \param CallerNeedsEvent is a boolean indicating whether the event is + /// required by the user after the call. + /// \param CodeLoc is the code location of the submit call + /// \param IsTopCodeLoc Used to determine if the object is in a local + /// scope or in the top level scope. + /// + /// \return a SYCL event representing submitted command group or nullptr. + detail::EventImplPtr submit_kernel_direct_impl( + KernelData &KData, std::shared_ptr HostKernel, + bool CallerNeedsEvent, const detail::code_location &CodeLoc, + bool IsTopCodeLoc); + + template + detail::EventImplPtr submit_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc); + /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. /// \param HandlerFunc is a function that submits the operation with a diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index b73c24091d4ed..cb7da34228098 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -319,6 +319,48 @@ event queue::submit_with_event_impl( return impl->submit_with_event(CGH, SubmitInfo, CodeLoc, IsTopCodeLoc); } +event queue::submit_kernel_direct_with_event_impl( + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_kernel_direct_with_event(Range, KDDesc, CodeLoc, + IsTopCodeLoc); +} + +event queue::submit_kernel_direct_with_event_impl( + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_kernel_direct_with_event(Range, KDDesc, CodeLoc, + IsTopCodeLoc); +} + +event queue::submit_kernel_direct_with_event_impl( + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + return impl->submit_kernel_direct_with_event(Range, KDDesc, CodeLoc, + IsTopCodeLoc); +} + +void queue::submit_kernel_direct_without_event_impl( + nd_range<1> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_kernel_direct_without_event(Range, KDDesc, CodeLoc, + IsTopCodeLoc); +} + +void queue::submit_kernel_direct_without_event_impl( + nd_range<2> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_kernel_direct_without_event(Range, KDDesc, CodeLoc, + IsTopCodeLoc); +} + +void queue::submit_kernel_direct_without_event_impl( + nd_range<3> Range, const detail::v1::KernelDataDesc &KDDesc, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) const { + impl->submit_kernel_direct_without_event(Range, KDDesc, CodeLoc, + IsTopCodeLoc); +} + void queue::submit_without_event_impl( const detail::type_erased_cgfo_ty &CGH, const detail::v1::SubmissionInfo &SubmitInfo, diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 0481c5b393302..98515b118ef88 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3755,6 +3755,12 @@ _ZNK4sycl3_V15queue22submit_with_event_implERKNS0_6detail19type_erased_cgfo_tyER _ZNK4sycl3_V15queue25submit_without_event_implERKNS0_6detail19type_erased_cgfo_tyERKNS2_2v114SubmissionInfoERKNS2_13code_locationEb _ZNK4sycl3_V15queue28ext_codeplay_supports_fusionEv _ZNK4sycl3_V15queue30ext_oneapi_get_last_event_implEv +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue36submit_kernel_direct_with_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi1EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi2EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb +_ZNK4sycl3_V15queue39submit_kernel_direct_without_event_implENS0_8nd_rangeILi3EEERKNS0_6detail2v117KernelRuntimeInfoERKNS4_13code_locationEb _ZNK4sycl3_V15queue3getEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue15reference_countEEENS0_6detail18is_queue_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue8get_infoINS0_4info5queue6deviceEEENS0_6detail18is_queue_info_descIT_E11return_typeEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 2c6be6c1684bb..bce53d9f078b7 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -298,6 +298,7 @@ ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@V?$id@$02@23@V?$range@$02@23@1W4mode@access@23@PEAXHH_K_NAEBVproperty_list@23@@Z ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@V?$id@$02@23@V?$range@$02@23@1W4mode@access@23@PEAXHH_N_K4AEBVproperty_list@23@@Z ??0HostProfilingInfo@detail@_V1@sycl@@QEAA@XZ +??0KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAA@XZ ??0LocalAccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VLocalAccessorImplHost@detail@_V1@sycl@@@std@@@Z ??0LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z @@ -508,6 +509,7 @@ ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@AEBUcode_location@123@@Z ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@XZ ??1AccessorBaseHost@detail@_V1@sycl@@QEAA@XZ +??1KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAA@XZ ??1LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ ??1SYCLCategory@detail@_V1@sycl@@UEAA@XZ ??1SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ @@ -711,6 +713,8 @@ ??_Fqueue@_V1@sycl@@QEAAXXZ ?AccessTargetMask@handler@_V1@sycl@@0HB ?Clear@exception_list@_V1@sycl@@AEAAXXZ +?DeviceKernelInfoPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAPEAVDeviceKernelInfo@345@XZ +?DeviceKernelInfoPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAPEAVDeviceKernelInfo@345@XZ ?DirSep@OSUtil@detail@_V1@sycl@@2QEBDEB ?DisableRangeRounding@handler@_V1@sycl@@AEAA_NXZ ?EventMode@SubmissionInfo@detail@_V1@sycl@@QEAAAEAW4event_mode_enum@experimental@oneapi@ext@34@XZ @@ -719,8 +723,14 @@ ?EventMode@SubmissionInfo@v1@detail@_V1@sycl@@QEBAAEBW4event_mode_enum@experimental@oneapi@ext@45@XZ ?GDBMethodsAnchor@SampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ ?GDBMethodsAnchor@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ +?GetKernelFuncPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAPEADXZ +?GetKernelFuncPtr@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAPEADXZ ?GetRangeRoundingSettings@handler@_V1@sycl@@AEAAXAEA_K00@Z ?HasAssociatedAccessor@handler@_V1@sycl@@AEBA_NPEAVAccessorImplHost@detail@23@W4target@access@23@@Z +?HostKernel@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@XZ +?HostKernel@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAAEBV?$shared_ptr@VHostKernelBase@detail@_V1@sycl@@@std@@XZ +?KernelName@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEAAAEAVstring@345@XZ +?KernelName@KernelRuntimeInfo@v1@detail@_V1@sycl@@QEBAAEBVstring@345@XZ ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEBAAEBV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PostProcessorFunc@SubmissionInfo@v1@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@345@XZ @@ -4467,6 +4477,12 @@ ?storeRawArg@handler@_V1@sycl@@AEAAPEAXAEBVraw_kernel_arg@experimental@oneapi@ext@23@@Z ?storeRawArg@handler@_V1@sycl@@AEAAPEAXPEBX_K@Z ?stringifyErrorCode@detail@_V1@sycl@@YAPEBDH@Z +?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$00@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z +?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$01@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z +?submit_direct_with_event_impl@queue@_V1@sycl@@AEBA?AVevent@23@V?$nd_range@$02@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@7823@AEBUcode_location@823@_N@Z +?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$00@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z +?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$01@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z +?submit_direct_without_event_impl@queue@_V1@sycl@@AEBAXV?$nd_range@$02@23@AEBVSubmissionInfo@v1@detail@23@AEBVKernelRuntimeInfo@6723@AEBUcode_location@723@_N@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@_N@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@@Z diff --git a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp index dedd4ebbcb407..f8cb84900d4f8 100644 --- a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp @@ -626,6 +626,8 @@ TEST_F(CommandGraphTest, AccessorModeEdges) { // Tests the transitive queue recording behaviour with queue shortcuts. TEST_F(CommandGraphTest, TransitiveRecordingShortcuts) { +// Graphs not supported yet for the no-handler submit path +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT device Dev; context Ctx{{Dev}}; queue Q1{Ctx, Dev}; @@ -669,6 +671,7 @@ TEST_F(CommandGraphTest, TransitiveRecordingShortcuts) { ext::oneapi::experimental::queue_state::executing); ASSERT_EQ(Q3.ext_oneapi_get_state(), ext::oneapi::experimental::queue_state::executing); +#endif } // Tests that dynamic_work_group_memory.get() will throw on the host side. diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp index c45d72ea4c343..6272b3d7d7a8a 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp @@ -26,40 +26,52 @@ inline ur_result_t after_urKernelGetInfo(void *pParams) { static thread_local size_t counter_urEnqueueKernelLaunch = 0; inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) { ++counter_urEnqueueKernelLaunch; +// The no-handler scheduler submission includes a fix for the event return, +// which is not yet applied to the handler-based path +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueMemcpy = 0; inline ur_result_t redefined_urUSMEnqueueMemcpy(void *pParams) { ++counter_urUSMEnqueueMemcpy; +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueFill = 0; inline ur_result_t redefined_urUSMEnqueueFill(void *pParams) { ++counter_urUSMEnqueueFill; +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueuePrefetch = 0; inline ur_result_t redefined_urUSMEnqueuePrefetch(void *pParams) { ++counter_urUSMEnqueuePrefetch; +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueMemAdvise = 0; inline ur_result_t redefined_urUSMEnqueueMemAdvise(void *pParams) { ++counter_urUSMEnqueueMemAdvise; +#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT auto params = *static_cast(pParams); EXPECT_EQ(*params.pphEvent, nullptr); +#endif return UR_RESULT_SUCCESS; } diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 31a2914e2c803..7dae968013948 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -48,7 +48,7 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { EXPECT_EQ(GEventsWaitCounter, expectedCount); } -enum class CommandType { KERNEL = 1, MEMSET = 2 }; +enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 }; std::vector> ExecutedCommands; inline ur_result_t customEnqueueKernelLaunch(void *pParams) { @@ -162,3 +162,44 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); } + +TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { + ExecutedCommands.clear(); + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", + &customEnqueueKernelLaunch); + + sycl::platform Plt = sycl::platform(); + + context Ctx{Plt}; + queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + InOrderQueue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + ExecutedCommands.push_back({CommandType::HOST_TASK, 0}); + }); + }); + + event Ev2 = InOrderQueue.parallel_for( + nd_range<1>{range{32}, range{32}}, [](nd_item<1>) {}); + + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + InOrderQueue.wait(); + + ASSERT_EQ(ExecutedCommands.size(), 2u); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::HOST_TASK); + EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); + EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); +} diff --git a/sycl/unittests/xpti_trace/CMakeLists.txt b/sycl/unittests/xpti_trace/CMakeLists.txt index 44574d36a9aa4..0fbb3be4046fc 100644 --- a/sycl/unittests/xpti_trace/CMakeLists.txt +++ b/sycl/unittests/xpti_trace/CMakeLists.txt @@ -8,3 +8,4 @@ add_sycl_unittest(XptiTraceTests OBJECT ) target_link_libraries(XptiTraceTests_non_preview PRIVATE xpti xptitest_subscriber) target_link_libraries(XptiTraceTests_preview PRIVATE xpti xptitest_subscriber) +target_link_libraries(XptiTraceTests_no_cgh PRIVATE xpti xptitest_subscriber)