From d40e0d13851f76e533195b586ec6658ee491734b Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 24 Jan 2025 11:13:31 +0000 Subject: [PATCH] [SYCL][Graph] Support for native-command WIP Prototype [sycl_ext_codeplay_enqueue_native_command](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc) support for SYCL-Graph. TODO: * buffer support * HIP/Level-Zero/OpenCL support * spec wording --- sycl/cmake/modules/FetchUnifiedRuntime.cmake | 2 +- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 2 +- ...t_codeplay_enqueue_native_command.asciidoc | 46 ++++++-- .../sycl_ext_oneapi_graph.asciidoc | 8 +- .../sycl/detail/backend_traits_cuda.hpp | 11 ++ .../sycl/detail/backend_traits_hip.hpp | 11 ++ .../sycl/detail/backend_traits_level_zero.hpp | 10 ++ .../sycl/detail/backend_traits_opencl.hpp | 10 ++ .../sycl/ext/oneapi/experimental/graph.hpp | 3 +- sycl/include/sycl/handler.hpp | 3 - sycl/include/sycl/interop_handle.hpp | 30 ++++- sycl/source/detail/graph_impl.cpp | 2 + sycl/source/detail/graph_impl.hpp | 2 + sycl/source/detail/scheduler/commands.cpp | 66 ++++++++++- sycl/source/interop_handle.cpp | 16 +++ .../Graph/NativeCommand/lit.local.cfg | 1 + .../native_cuda_explicit_usm.cpp | 81 ++++++++++++++ .../native_cuda_record_buffer.cpp | 98 +++++++++++++++++ .../NativeCommand/native_cuda_record_usm.cpp | 91 +++++++++++++++ .../NativeCommand/native_hip_explicit_usm.cpp | 75 +++++++++++++ .../native_hip_record_buffer.cpp | 104 ++++++++++++++++++ .../NativeCommand/native_hip_record_usm.cpp | 95 ++++++++++++++++ .../NativeCommand/native_level-zero_usm.cpp | 68 ++++++++++++ .../NativeCommand/native_opencl_buffer.cpp | 85 ++++++++++++++ .../Extensions/CommandGraph/Exceptions.cpp | 13 --- 25 files changed, 894 insertions(+), 39 deletions(-) create mode 100644 sycl/test-e2e/Graph/NativeCommand/lit.local.cfg create mode 100644 sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp create mode 100644 sycl/test-e2e/Graph/NativeCommand/native_cuda_record_buffer.cpp create mode 100644 sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp create mode 100644 sycl/test-e2e/Graph/NativeCommand/native_hip_explicit_usm.cpp create mode 100644 sycl/test-e2e/Graph/NativeCommand/native_hip_record_buffer.cpp create mode 100644 sycl/test-e2e/Graph/NativeCommand/native_hip_record_usm.cpp create mode 100644 sycl/test-e2e/Graph/NativeCommand/native_level-zero_usm.cpp create mode 100644 sycl/test-e2e/Graph/NativeCommand/native_opencl_buffer.cpp diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..202bfdcdc19e3 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,7 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/Bensuo/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 58a9a6ed5f458..6914f5612f5b4 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -4,4 +4,4 @@ # Date: Tue Feb 4 13:14:19 2025 +0000 # Merge pull request #2614 from kurapov-peter/spills # Add UR_KERNEL_INFO_SPILL_MEM_SIZE kernel info prop -set(UNIFIED_RUNTIME_TAG 08d36b76a5b1c4f080e3301507a39525ab5ab365) +set(UNIFIED_RUNTIME_TAG "ewan/native_command") diff --git a/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc index 143d8f3fd4521..523c7c2b454cb 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc @@ -173,12 +173,47 @@ dependencies are satisfied. The SYCL command described above completes once all of the native asynchronous tasks it contains have completed. -The call to `interopCallable` must not submit any synchronous tasks to the +The call to `interopCallable` should not submit any synchronous tasks to the native backend object, and it must not block waiting for any tasks to complete. The call also must not add tasks to backend objects that underly any other queue, aside from the queue that is associated with this handler. If it does any of these things, the behavior is undefined. +=== sycl_ext_oneapi_graph Interaction + +`ext_codeplay_enqueue_native_command` can be used in the +link:../experimental/sycl_ext_oneapi_graph.asciidoc[sycl_ext_oneapi_graph] +extension as a graph node. The `interopCallable` object will be invoked +during `command_graph::finalize()` when the backend object for the graph +is available to give to the user as a handle. The user then may +add nodes to this backend graph objects using native APIs. Note that this +involves a synchronous API call to a native backend object, which is an +exception to earlier advice about submitting synchronous task to native +backend objects inside `interopCallable`. + +The runtime will schedule the dependencies of the user added nodes such +that they respect the graph node edges. + +=== Additions to the interop_handler class + +```c++ +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; + +class interop_handle { + template + backend_return_t ext_codeplay_get_native_graph<>() const; + + bool ext_codeplay_has_graph<>() const; +}; +``` + +* CUGraph +* hipGraph_t +* ze_command_list_handle_t +* cl_command_buffer_khr + + == Example This example demonstrates how to use this extension to enqueue asynchronous @@ -206,12 +241,3 @@ q.submit([&](sycl::handler &cgh) { }); q.wait(); ``` - -== Issues - -=== sycl_ext_oneapi_graph - -`ext_codeplay_enqueue_native_command` -cannot be used in graph nodes. A synchronous exception will be thrown with error -code `invalid` if a user tries to add them to a graph. - diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 649c00fb474b5..2171eda82d96e 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -2077,13 +2077,9 @@ extensions. ==== sycl_ext_codeplay_enqueue_native_command -`ext_codeplay_enqueue_native_command`, defined in +`ext_codeplay_enqueue_native_command` commands, defined in link:../experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc[sycl_ext_codeplay_enqueue_native_command] -cannot be used in graph nodes. A synchronous exception will be thrown with error -code `invalid` if a user tries to add them to a graph. - -Removing this restriction is something we may look at for future revisions of -`sycl_ext_oneapi_graph`. +can be used in graph nodes. See the section on `sycl_ext_oneapi_graph`. ==== sycl_ext_intel_queue_index diff --git a/sycl/include/sycl/detail/backend_traits_cuda.hpp b/sycl/include/sycl/detail/backend_traits_cuda.hpp index 89bef47d01a4b..9a4df94693329 100644 --- a/sycl/include/sycl/detail/backend_traits_cuda.hpp +++ b/sycl/include/sycl/detail/backend_traits_cuda.hpp @@ -24,6 +24,7 @@ typedef struct CUctx_st *CUcontext; typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; typedef struct CUmod_st *CUmodule; +typedef struct CUgraph_st *CUgraph; // As defined in the CUDA 10.1 header file. This requires CUDA version > 3.2 #if defined(_WIN64) || defined(__LP64__) @@ -102,6 +103,16 @@ template <> struct BackendReturn { using type = CUstream; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; +template <> struct BackendInput { + using type = CUgraph; +}; + +template <> struct BackendReturn { + using type = CUgraph; +}; + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/detail/backend_traits_hip.hpp b/sycl/include/sycl/detail/backend_traits_hip.hpp index 9f7cbb2bfdd91..a867c4e3d62b8 100644 --- a/sycl/include/sycl/detail/backend_traits_hip.hpp +++ b/sycl/include/sycl/detail/backend_traits_hip.hpp @@ -25,6 +25,7 @@ typedef struct ihipStream_t *HIPstream; typedef struct ihipEvent_t *HIPevent; typedef struct ihipModule_t *HIPmodule; typedef void *HIPdeviceptr; +typedef struct ihipGraph *HIPGraph; namespace sycl { inline namespace _V1 { @@ -96,6 +97,16 @@ template <> struct BackendReturn { using type = HIPstream; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; +template <> struct BackendInput { + using type = HIPGraph; +}; + +template <> struct BackendReturn { + using type = HIPGraph; +}; + template <> struct InteropFeatureSupportMap { static constexpr bool MakePlatform = false; static constexpr bool MakeDevice = true; diff --git a/sycl/include/sycl/detail/backend_traits_level_zero.hpp b/sycl/include/sycl/detail/backend_traits_level_zero.hpp index f2220c863c123..00e954de9dde7 100644 --- a/sycl/include/sycl/detail/backend_traits_level_zero.hpp +++ b/sycl/include/sycl/detail/backend_traits_level_zero.hpp @@ -207,6 +207,16 @@ template <> struct BackendReturn { using type = ze_kernel_handle_t; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; +template <> struct BackendInput { + using type = ze_command_list_handle_t; +}; + +template <> struct BackendReturn { + using type = ze_command_list_handle_t; +}; + template <> struct InteropFeatureSupportMap { static constexpr bool MakePlatform = true; static constexpr bool MakeDevice = true; diff --git a/sycl/include/sycl/detail/backend_traits_opencl.hpp b/sycl/include/sycl/detail/backend_traits_opencl.hpp index b203013c0e903..ff5cd9364fb11 100644 --- a/sycl/include/sycl/detail/backend_traits_opencl.hpp +++ b/sycl/include/sycl/detail/backend_traits_opencl.hpp @@ -132,6 +132,16 @@ template <> struct BackendReturn { using type = cl_kernel; }; +using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; +template <> struct BackendInput { + using type = cl_command_buffer_khr; +}; + +template <> struct BackendReturn { + using type = cl_command_buffer_khr; +}; + template <> struct InteropFeatureSupportMap { static constexpr bool MakePlatform = true; static constexpr bool MakeDevice = true; diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index e2e87c30ea945..6401bdbbab463 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -114,7 +114,8 @@ enum class node_type { prefetch = 6, memadvise = 7, ext_oneapi_barrier = 8, - host_task = 9 + host_task = 9, + native_command = 10 }; /// Class representing a node in the graph, returned by command_graph::add(). diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index a76d6002d9d87..705c9c520dec4 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1934,9 +1934,6 @@ class __SYCL_EXPORT handler { void(interop_handle)>::value> ext_codeplay_enqueue_native_command([[maybe_unused]] FuncT &&Func) { #ifndef __SYCL_DEVICE_ONLY__ - throwIfGraphAssociated< - ext::oneapi::experimental::detail::UnsupportedGraphFeatures:: - sycl_ext_codeplay_enqueue_native_command>(); ext_codeplay_enqueue_native_command_impl(Func); #endif } diff --git a/sycl/include/sycl/interop_handle.hpp b/sycl/include/sycl/interop_handle.hpp index 2e7408cf5c0f9..3669ca0ecd003 100644 --- a/sycl/include/sycl/interop_handle.hpp +++ b/sycl/include/sycl/interop_handle.hpp @@ -49,6 +49,9 @@ class interop_handle { /// interop_handle. __SYCL_EXPORT backend get_backend() const noexcept; + /// Returns true if command-group is being added to a graph as a node + __SYCL_EXPORT bool ext_codeplay_has_graph() const noexcept; + /// Receives a SYCL accessor that has been defined as a requirement for the /// command group, and returns the underlying OpenCL memory object that is /// used by the SYCL runtime. If the accessor passed as parameter is not part @@ -134,6 +137,26 @@ class interop_handle { #endif } + using graph = ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>; + template + backend_return_t ext_codeplay_get_native_graph() const { +#ifndef __SYCL_DEVICE_ONLY__ + // TODO: replace the exception thrown below with the SYCL 2020 exception + // with the error code 'errc::backend_mismatch' when those new exceptions + // are ready to be used. + if (Backend != get_backend()) + throw exception(make_error_code(errc::invalid), + "Incorrect backend argument was passed"); + + // C-style cast required to allow various native types + return (backend_return_t)getNativeGraph(); +#else + // we believe this won't be ever called on device side + return 0; +#endif + } + /// Returns the SYCL application interoperability native backend object /// associated with the device associated with the SYCL queue that the host /// task was submitted to. The native backend object returned must be in @@ -186,8 +209,9 @@ class interop_handle { interop_handle(std::vector MemObjs, const std::shared_ptr &Queue, const std::shared_ptr &Device, - const std::shared_ptr &Context) - : MQueue(Queue), MDevice(Device), MContext(Context), + const std::shared_ptr &Context, + const ur_exp_command_buffer_handle_t &Graph) + : MQueue(Queue), MDevice(Device), MContext(Context), MGraph(Graph), MMemObjs(std::move(MemObjs)) {} template @@ -211,10 +235,12 @@ class interop_handle { getNativeQueue(int32_t &NativeHandleDesc) const; __SYCL_EXPORT ur_native_handle_t getNativeDevice() const; __SYCL_EXPORT ur_native_handle_t getNativeContext() const; + __SYCL_EXPORT ur_native_handle_t getNativeGraph() const; std::shared_ptr MQueue; std::shared_ptr MDevice; std::shared_ptr MContext; + ur_exp_command_buffer_handle_t MGraph; std::vector MMemObjs; }; diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 5f9f79d878d03..c940b35c81448 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -824,6 +824,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNode( std::shared_ptr Node) { // Queue which will be used for allocation operations for accessors. + // Will also be used in native commands to return to the user in + // `interop_handler::get_native_queue()` calls auto AllocaQueue = std::make_shared( DeviceImpl, sycl::detail::getSyclObjImpl(Ctx), sycl::async_handler{}, sycl::property_list{}); diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index e609123b4f285..f5d960b2d15e6 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -71,6 +71,8 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) { return node_type::host_task; case sycl::detail::CGType::ExecCommandBuffer: return node_type::subgraph; + case sycl::detail::CGType::EnqueueNativeCommand: + return node_type::native_command; default: assert(false && "Invalid Graph Node Type"); return node_type::empty; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 005008a74ebd0..d92077084fc2b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -454,7 +454,7 @@ class DispatchHostTask { "Host task submissions should have an associated queue"); interop_handle IH{MReqToMem, HostTask.MQueue, HostTask.MQueue->getDeviceImplPtr(), - HostTask.MQueue->getContextImplPtr()}; + HostTask.MQueue->getContextImplPtr(), nullptr}; // TODO: should all the backends that support this entry point use this // for host task? auto &Queue = HostTask.MQueue; @@ -2879,6 +2879,19 @@ ur_result_t enqueueReadWriteHostPipe(const QueueImplPtr &Queue, return Error; } +namespace { + +struct CommandBufferNativeCommandData { + sycl::interop_handle ih; + std::function func; +}; + +void CommandBufferInteropFreeFunc(void *InteropData) { + auto *Data = reinterpret_cast(InteropData); + return Data->func(Data->ih); +} +} // namespace + ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { assert(MQueue && "Command buffer enqueue should have an associated queue"); // Wait on host command dependencies @@ -3045,6 +3058,55 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { MEvent->setSyncPoint(OutSyncPoint); return UR_RESULT_SUCCESS; } + case CGType::EnqueueNativeCommand: { + // Queue is created by graph_impl before creating command to submit to + // scheduler. + const AdapterPtr &Adapter = MQueue->getAdapter(); + const auto Backend = MQueue->get_device().get_backend(); + CGHostTask *HostTask = (CGHostTask *)MCommandGroup.get(); + + // TODO - Doc this + ur_exp_command_buffer_handle_t ChildCommandBuffer = nullptr; + if (Backend == sycl::backend::ext_oneapi_cuda || + Backend == sycl::backend::ext_oneapi_hip) { + + ur_exp_command_buffer_desc_t Desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC /*stype*/, + nullptr /*pnext*/, false /* updatable */, false /* in-order */, + false /* profilable*/ + }; + auto ContextImpl = sycl::detail::getSyclObjImpl(MQueue->get_context()); + auto DeviceImpl = sycl::detail::getSyclObjImpl(MQueue->get_device()); + Adapter->call( + ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc, + &ChildCommandBuffer); + } + + std::vector ReqToMem; // TODO work with buffers + interop_handle IH{ReqToMem, HostTask->MQueue, + HostTask->MQueue->getDeviceImplPtr(), + HostTask->MQueue->getContextImplPtr(), + ChildCommandBuffer ? ChildCommandBuffer : MCommandBuffer}; + CommandBufferNativeCommandData CustomOpData{ + IH, HostTask->MHostTask->MInteropTask}; + + Adapter->call( + MCommandBuffer, CommandBufferInteropFreeFunc, &CustomOpData, + ChildCommandBuffer, MSyncPointDeps.size(), + MSyncPointDeps.empty() ? nullptr : MSyncPointDeps.data(), + &OutSyncPoint); + + if (ChildCommandBuffer) { + ur_result_t Res = Adapter->call_nocheck< + sycl::detail::UrApiKind::urCommandBufferReleaseExp>( + ChildCommandBuffer); + (void)Res; + assert(Res == UR_RESULT_SUCCESS); + } + + MEvent->setSyncPoint(OutSyncPoint); + return UR_RESULT_SUCCESS; + } default: throw exception(make_error_code(errc::runtime), @@ -3416,7 +3478,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { EnqueueNativeCommandData CustomOpData{ interop_handle{ReqToMem, HostTask->MQueue, HostTask->MQueue->getDeviceImplPtr(), - HostTask->MQueue->getContextImplPtr()}, + HostTask->MQueue->getContextImplPtr(), nullptr}, HostTask->MHostTask->MInteropTask}; ur_bool_t NativeCommandSupport = false; diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index aabf22702ef5f..d0b178dd1e9b3 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -23,6 +23,10 @@ backend interop_handle::get_backend() const noexcept { return detail::getImplBackend(MQueue); } +bool interop_handle::ext_codeplay_has_graph() const noexcept { + return MGraph != nullptr; +} + ur_native_handle_t interop_handle::getNativeMem(detail::Requirement *Req) const { auto Iter = std::find_if(std::begin(MMemObjs), std::end(MMemObjs), @@ -53,5 +57,17 @@ interop_handle::getNativeQueue(int32_t &NativeHandleDesc) const { return MQueue->getNative(NativeHandleDesc); } +ur_native_handle_t interop_handle::getNativeGraph() const { + if (!MGraph) { + throw exception(make_error_code(errc::invalid), + "Command-Group is not being added as a graph node"); + } + + auto Adapter = MQueue->getAdapter(); + ur_native_handle_t Handle; + Adapter->call(MGraph, + &Handle); + return Handle; +} } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/NativeCommand/lit.local.cfg b/sycl/test-e2e/Graph/NativeCommand/lit.local.cfg new file mode 100644 index 0000000000000..f01e2216db41b --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/lit.local.cfg @@ -0,0 +1 @@ +config.required_features += ['aspect-ext_oneapi_limited_graph'] diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp new file mode 100644 index 0000000000000..a51f498146d2e --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_explicit_usm.cpp @@ -0,0 +1,81 @@ +// RUN: %{build} -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_codeplay_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + CUgraph NativeGraph = + IH.ext_codeplay_get_native_graph(); + + CUDA_MEMCPY3D Params; + std::memset(&Params, 0, sizeof(CUDA_MEMCPY3D)); + Params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + Params.srcDevice = (CUdeviceptr)PtrX; + Params.srcHost = nullptr; + Params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + Params.dstDevice = (CUdeviceptr)PtrY, Params.dstHost = nullptr; + Params.WidthInBytes = Size * sizeof(int); + Params.Height = 1; + Params.Depth = 1; + + CUgraphNode Node; + CUcontext Context = IH.get_native_context(); + auto Res = cuGraphAddMemcpyNode(&Node, NativeGraph, nullptr, 0, &Params, + Context); + assert(Res == CUDA_SUCCESS); + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_buffer.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_buffer.cpp new file mode 100644 index 0000000000000..c94f4356676b8 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_buffer.cpp @@ -0,0 +1,98 @@ +// RUN: %{build} -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + buffer BufX{Size}; + BufX.set_write_back(false); + buffer BufY{Size}; + BufY.set_write_back(false); + + { + + exp_ext::command_graph Graph{ + Queue, {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + AccX[i] = i; + AccY[i] = 0; + } + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_codeplay_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created with cuGraphCreate + CUgraph NativeGraph = + IH.ext_codeplay_get_native_graph(); + + auto PtrX = IH.get_native_mem(AccX); + auto PtrY = IH.get_native_mem(AccY); + + // Start stream capture + auto Res = + cuStreamBeginCapture(NativeStream, CU_STREAM_CAPTURE_MODE_GLOBAL); + assert(Res == CUDA_SUCCESS); + + // Add memcopy node + Res = cuMemcpyAsync(PtrY, PtrX, Size * sizeof(int), NativeStream); + assert(Res == CUDA_SUCCESS); + + // cuStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + CUgraph RecordedGraph; + Res = cuStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == CUDA_SUCCESS); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. + CUgraphNode Node; + cuGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, RecordedGraph); + assert(Res == CUDA_SUCCESS); + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccY = BufY.get_access(); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { AccY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + } + + auto HostAcc = BufY.get_host_access(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostAcc[i], + std::string("HostAcc at index ") + std::to_string(i))); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp new file mode 100644 index 0000000000000..1aa85f760e22a --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_cuda_record_usm.cpp @@ -0,0 +1,91 @@ +// RUN: %{build} -o %t.out -lcuda +// RUN: %{run} %t.out +// REQUIRES: cuda + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_codeplay_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created with cuGraphCreate + CUgraph NativeGraph = + IH.ext_codeplay_get_native_graph(); + + // Start stream capture + auto Res = + cuStreamBeginCapture(NativeStream, CU_STREAM_CAPTURE_MODE_GLOBAL); + assert(Res == CUDA_SUCCESS); + + // Add memcopy node + Res = cuMemcpyAsync((CUdeviceptr)PtrY, (CUdeviceptr)PtrX, + Size * sizeof(int), NativeStream); + assert(Res == CUDA_SUCCESS); + + // cuStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + CUgraph RecordedGraph; + Res = cuStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == CUDA_SUCCESS); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. + CUgraphNode Node; + cuGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, RecordedGraph); + assert(Res == CUDA_SUCCESS); + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_hip_explicit_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_hip_explicit_usm.cpp new file mode 100644 index 0000000000000..caad5b86224c1 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_hip_explicit_usm.cpp @@ -0,0 +1,75 @@ +// FIXME: the rocm include path and link path are highly platform dependent, +// we should set this with some variable instead. +// RUN: %{build} -Wno-error=deprecated-pragma -o %t.out -I%rocm_path/include -L%rocm_path/lib -lamdhip64 +// RUN: %{run} %t.out +// REQUIRES: target-amd + +#include "../graph_common.hpp" +#include +#include +#include + +#define __HIP_PLATFORM_AMD__ + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_codeplay_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + // Graph already created with hipGraphCreate + hipGraph_t NativeGraph = + IH.ext_codeplay_get_native_graph(); + + hipGraphNode_t Node; + auto Res = hipGraphAddMemcpyNode1D(&Node, NativeGraph, nullptr, 0 + PtrY, PtrX, Size * sizeof(int), hipMemcpyDefault)); + + assert(Res == hipSuccess); + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_hip_record_buffer.cpp b/sycl/test-e2e/Graph/NativeCommand/native_hip_record_buffer.cpp new file mode 100644 index 0000000000000..d7c24d030ecd6 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_hip_record_buffer.cpp @@ -0,0 +1,104 @@ +// FIXME: the rocm include path and link path are highly platform dependent, +// we should set this with some variable instead. +// RUN: %{build} -Wno-error=deprecated-pragma -o %t.out -I%rocm_path/include -L%rocm_path/lib -lamdhip64 +// RUN: %{run} %t.out +// REQUIRES: target-amd + +#include "../graph_common.hpp" +#include +#include +#include + +#define __HIP_PLATFORM_AMD__ + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + buffer BufX{Size}; + BufX.set_write_back(false); + buffer BufY{Size}; + BufY.set_write_back(false); + + { + + exp_ext::command_graph Graph{ + Queue, {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + AccX[i] = i; + AccY[i] = 0; + } + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_codeplay_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created with hipGraphCreate + hipGraph_t NativeGraph = + IH.ext_codeplay_get_native_graph(); + + auto PtrX = IH.get_native_mem(AccX); + auto PtrY = IH.get_native_mem(AccY); + + // Start stream capture + auto Res = + hipStreamBeginCapture(NativeStream, hipStreamCaptureModeGlobal); + assert(Res == hipSuccess); + + // Add memcopy node + Res = hipMemcpyWithStream(PtrY, PtrX, sizeof(int) * Size, + hipMemcpyDefault, NativeStream); + assert(Res == hipSuccess); + + // hipStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + hipGraph_t RecordedGraph; + Res = hipStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == hipSuccess); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. + hipGraphNode_t Node; + hipGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, + RecordedGraph); + assert(Res == hipSuccess); + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccY = BufY.get_access(); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { AccY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + } + + auto HostAcc = BufY.get_host_access(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostAcc[i], + std::string("HostAcc at index ") + std::to_string(i))); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_hip_record_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_hip_record_usm.cpp new file mode 100644 index 0000000000000..043b56e952173 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_hip_record_usm.cpp @@ -0,0 +1,95 @@ +// FIXME: the rocm include path and link path are highly platform dependent, +// we should set this with some variable instead. +// RUN: %{build} -Wno-error=deprecated-pragma -o %t.out -I%rocm_path/include -L%rocm_path/lib -lamdhip64 +// RUN: %{run} %t.out +// REQUIRES: target-amd + +#include "../graph_common.hpp" +#include +#include +#include + +#define __HIP_PLATFORM_AMD__ + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_codeplay_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + // Newly created stream for this node + auto NativeStream = IH.get_native_queue(); + // Graph already created with hipGraphCreate + hipGraph_t NativeGraph = + IH.ext_codeplay_get_native_graph(); + + // Start stream capture + auto Res = + hipStreamBeginCapture(NativeStream, hipStreamCaptureModeGlobal); + assert(Res == hipSuccess); + + // Add memcopy node + Res = hipMemcpyWithStream(PtrY, PtrX, sizeof(int) * Size, + hipMemcpyDefault, NativeStream); + assert(Res == hipSuccess); + + // hipStreamEndCapture returns a new graph, if we overwrite + // "NativeGraph" it won't be picked up by the UR runtime, as it's + // a passed-by-value pointer + hipGraph_t RecordedGraph; + Res = hipStreamEndCapture(NativeStream, &RecordedGraph); + assert(Res == hipSuccess); + + // Add graph to native graph as a child node + // Need to return a node object for the node to be created, + // can't be nullptr. + hipGraphNode_t Node; + hipGraphAddChildGraphNode(&Node, NativeGraph, nullptr, 0, RecordedGraph); + assert(Res == hipSuccess); + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_level-zero_usm.cpp b/sycl/test-e2e/Graph/NativeCommand/native_level-zero_usm.cpp new file mode 100644 index 0000000000000..9b6c05c9160e0 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_level-zero_usm.cpp @@ -0,0 +1,68 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// REQUIRES: level_zero, level_zero_dev_kit + +#include "../graph_common.hpp" +#include +#include +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + int *PtrY = malloc_device(Size, Queue); + + exp_ext::command_graph Graph{Queue}; + + Graph.begin_recording(Queue); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrX[i] = i; + PtrY[i] = 0; + } + }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_codeplay_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + ze_command_list_handle_t NativeGraph = + IH.ext_codeplay_get_native_graph(); + + auto Res = zeCommandListAppendMemoryCopy( + NativeGraph, PtrY, PtrX, Size * sizeof(int), nullptr, 0, nullptr); + assert(Res == ZE_RESULT_SUCCESS); + }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(EventB); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + std::vector HostData(Size); + + Queue.copy(PtrY, HostData.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostData[i], + std::string("HostData at index ") + std::to_string(i))); + } + + free(PtrX, Queue); + free(PtrY, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/NativeCommand/native_opencl_buffer.cpp b/sycl/test-e2e/Graph/NativeCommand/native_opencl_buffer.cpp new file mode 100644 index 0000000000000..1de6710b2a2e3 --- /dev/null +++ b/sycl/test-e2e/Graph/NativeCommand/native_opencl_buffer.cpp @@ -0,0 +1,85 @@ +// RUN: %{build} -Wno-error=deprecated-declarations -o %t.out %threads_lib %opencl_lib +// RUN: %{run} %t.out +// REQUIRES: opencl + +#include "../graph_common.hpp" +#include +#include + +#include + +int main() { + queue Queue; + + int *PtrX = malloc_device(Size, Queue); + buffer BufX{Size}; + BufX.set_write_back(false); + buffer BufY{Size}; + BufY.set_write_back(false); + + { + + exp_ext::command_graph Graph{ + Queue, {exp_ext::property::graph::assume_buffer_outlives_graph{}}}; + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + CGH.single_task([=]() { + for (size_t i = 0; i < Size; i++) { + AccX[i] = i; + AccY[i] = 0; + } + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccX = BufX.get_access(); + auto AccY = BufY.get_access(); + + CGH.ext_codeplay_enqueue_native_command([=](interop_handle IH) { + if (!IH.ext_codeplay_has_graph()) { + assert(false && "Native Handle should have a graph"); + } + auto Platform = + get_native(Queue.get_context().get_platform()); + clCommandCopyBufferKHR_fn clCommandCopyBufferKHR = + reinterpret_cast( + clGetExtensionFunctionAddressForPlatform( + Platform, "clCommandCopyBufferKHR")); + assert(clCommandCopyBufferKHR != nullptr); + + cl_command_buffer_khr NativeGraph = + IH.ext_codeplay_get_native_graph(); + auto SrcBuffer = IH.get_native_mem(AccX); + auto DstBuffer = IH.get_native_mem(AccY); + + auto Res = clCommandCopyBufferKHR( + NativeGraph, nullptr, nullptr, SrcBuffer[0], DstBuffer[0], 0, 0, + Size * sizeof(int), 0, nullptr, nullptr, nullptr); + assert(Res == CL_SUCCESS); + }); + }); + + Queue.submit([&](handler &CGH) { + auto AccY = BufY.get_access(); + CGH.parallel_for(range<1>{Size}, [=](id<1> it) { AccY[it] *= 2; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + } + + auto HostAcc = BufY.get_host_access(); + for (size_t i = 0; i < Size; i++) { + const int Ref = i * 2; + assert(check_value(Ref, HostAcc[i], + std::string("HostAcc at index ") + std::to_string(i))); + } + + return 0; +} diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 452738b8fca86..ddb9236e5047e 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -380,19 +380,6 @@ TEST_F(CommandGraphTest, BindlessExceptionCheck) { sycl::free(ImgMemUSM, Ctxt); } -// ext_codeplay_enqueue_native_command isn't supported with SYCL graphs -TEST_F(CommandGraphTest, EnqueueCustomCommandCheck) { - std::error_code ExceptionCode = make_error_code(sycl::errc::success); - try { - Graph.add([&](sycl::handler &CGH) { - CGH.ext_codeplay_enqueue_native_command([=](sycl::interop_handle IH) {}); - }); - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); -} - // sycl_ext_oneapi_work_group_scratch_memory isn't supported with SYCL graphs TEST_F(CommandGraphTest, WorkGroupScratchMemoryCheck) { ASSERT_THROW(