From 9e3024c3a15ea6ba148b7653cbe12418429e806b Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Tue, 27 Feb 2024 20:03:03 -0800 Subject: [PATCH] [UR] Draft for adding support for counter-based events Draft for counter-based events implementation. As of now, only the creation of events, command lists, and command queues/pools is implemented. Signed-off-by: Zhang, Winston --- source/adapters/level_zero/command_buffer.cpp | 7 +- source/adapters/level_zero/context.cpp | 13 +++- source/adapters/level_zero/context.hpp | 8 +-- source/adapters/level_zero/event.cpp | 15 +++-- source/adapters/level_zero/event.hpp | 7 +- source/adapters/level_zero/kernel.cpp | 3 +- source/adapters/level_zero/queue.cpp | 64 ++++++++++++++----- source/adapters/level_zero/queue.hpp | 5 +- 8 files changed, 89 insertions(+), 33 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 7dc2a42fd6..4cc3da086a 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -942,9 +942,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( (SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); if (Event) { - UR_CALL(createEventAndAssociateQueue( - Queue, &RetEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, - SignalCommandList, false, false, true)); + UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent, + UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + SignalCommandList, false, false, true, + Queue->usingCounterBasedEvents())); if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) { // Multiple submissions of a command buffer implies that we need to save diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index c4b5423adb..ffe63d7965 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -468,7 +468,8 @@ static const uint32_t MaxNumEventsPerPool = [] { ur_result_t 
ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible, - bool ProfilingEnabled, ur_device_handle_t Device) { + bool ProfilingEnabled, ur_device_handle_t Device, + std::optional CounterBasedEventEnabled) { // Lock while updating event pool machinery. std::scoped_lock Lock(ZeEventPoolCacheMutex); @@ -510,6 +511,16 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; if (ProfilingEnabled) ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + if (CounterBasedEventEnabled.has_value() && + CounterBasedEventEnabled.value()) { + ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + ze_event_pool_counter_based_exp_desc_t counterBasedExt = { + ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC}; + counterBasedExt.flags |= + ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_IMMEDIATE | + ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_NON_IMMEDIATE; + ZeEventPoolDesc.pNext = &counterBasedExt; + } urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags); std::vector ZeDevices; diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index 2c80ff0e33..dfea4d5133 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -192,10 +192,10 @@ struct ur_context_handle_t_ : _ur_object { // pool then create new one. The HostVisible parameter tells if we need a // slot for a host-visible event. The ProfilingEnabled tells is we need a // slot for an event with profiling capabilities. 
- ur_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &, - bool HostVisible, - bool ProfilingEnabled, - ur_device_handle_t Device); + ur_result_t getFreeSlotInExistingOrNewPool( + ze_event_pool_handle_t &, size_t &, bool HostVisible, + bool ProfilingEnabled, ur_device_handle_t Device, + std::optional CounterBasedEventEnabled = std::nullopt); // Get ur_event_handle_t from cache. ur_event_handle_t getEventFromContextCache(bool HostVisible, diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 1f3b62ce8f..c58a3a021e 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -606,7 +606,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( UR_CALL(createEventAndAssociateQueue( UrQueue, &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, /* IsInternal */ false, /* IsMultiDevice */ false, - /* HostVisible */ true)); + /* HostVisible */ true, UrQueue->usingCounterBasedEvents())); ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (CommandList->first, 1, &ZeEvent)); @@ -1049,7 +1049,8 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, // ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, bool IsMultiDevice, bool HostVisible, - ur_event_handle_t *RetEvent) { + ur_event_handle_t *RetEvent, + std::optional CounterBasedEventEnabled) { bool ProfilingEnabled = !Queue || Queue->isProfilingEnabled(); @@ -1071,14 +1072,18 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, size_t Index = 0; if (auto Res = Context->getFreeSlotInExistingOrNewPool( - ZeEventPool, Index, HostVisible, ProfilingEnabled, Device)) + ZeEventPool, Index, HostVisible, ProfilingEnabled, Device, + CounterBasedEventEnabled.has_value() + ? 
CounterBasedEventEnabled.value() + : false)) return Res; ZeStruct ZeEventDesc; ZeEventDesc.index = Index; ZeEventDesc.wait = 0; - if (HostVisible) { + if (HostVisible || (CounterBasedEventEnabled.has_value() && + CounterBasedEventEnabled.value())) { ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; } else { // @@ -1287,7 +1292,7 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( UR_CALL(createEventAndAssociateQueue( Queue, &MultiDeviceEvent, EventList[I]->CommandType, CommandList, - IsInternal, IsMultiDevice)); + IsInternal, IsMultiDevice, Queue->usingCounterBasedEvents())); MultiDeviceZeEvent = MultiDeviceEvent->ZeEvent; const auto &ZeCommandList = CommandList->first; EventList[I]->RefCount.increment(); diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index c266de8c0d..c0cba0091b 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -29,9 +29,10 @@ extern "C" { ur_result_t urEventReleaseInternal(ur_event_handle_t Event); -ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, - bool IsMultiDevice, bool HostVisible, - ur_event_handle_t *RetEvent); +ur_result_t +EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, + bool IsMultiDevice, bool HostVisible, ur_event_handle_t *RetEvent, + std::optional CounterBasedEventEnabled = std::nullopt); } // extern "C" // This is an experimental option that allows to disable caching of events in diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 0e5ce3215a..f9e573f383 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -206,7 +206,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, - CommandList, IsInternal, false)); + CommandList, IsInternal, false, + Queue->usingCounterBasedEvents())); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 241e3a23a2..3be5221664 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1127,7 +1127,9 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, reinterpret_cast(this), &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, /* IsInternal */ false, /* IsMultiDevice */ true, - /* HostVisible */ true); + /* HostVisible */ true, + (reinterpret_cast(this)) + ->usingCounterBasedEvents()); if (Res) return Res; @@ -1364,6 +1366,28 @@ bool ur_queue_handle_t_::isInOrderQueue() const { 0); } +bool ur_queue_handle_t_::usingCounterBasedEvents() const { + if (!this->isInOrderQueue()) + return false; + + static const bool UseDriverCounterBasedEvents = [this] { + const char *UrRet = std::getenv("UR_L0_USE_DRIVER_COUNTER_BASED_EVENTS"); + if (!UrRet) + return false; + return std::atoi(UrRet) != 0; + }(); + + bool usingInOrderList = true; + for (auto &&It = this->CommandListMap.begin(); + It != this->CommandListMap.end(); ++It) { + if (It->second.ZeQueueDesc.flags != ZE_COMMAND_QUEUE_FLAG_IN_ORDER) { + usingInOrderList = false; + break; + } + } + return UseDriverCounterBasedEvents && usingInOrderList; +} + // Helper function to perform the necessary cleanup of the events from reset cmd // list. ur_result_t CleanupEventListFromResetCmdList( @@ -1498,12 +1522,11 @@ ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool IsMultiDevice, // visible pool. // \param HostVisible tells if the event must be created in the // host-visible pool. If not set then this function will decide. 
-ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, - ur_event_handle_t *Event, - ur_command_t CommandType, - ur_command_list_ptr_t CommandList, - bool IsInternal, bool IsMultiDevice, - std::optional HostVisible) { +ur_result_t createEventAndAssociateQueue( + ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, + ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, + std::optional HostVisible, + std::optional usingCounterBasedEvents) { if (!HostVisible.has_value()) { // Internal/discarded events do not need host-scope visibility. @@ -1516,8 +1539,10 @@ ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, : nullptr; if (*Event == nullptr) - UR_CALL(EventCreate(Queue->Context, Queue, IsMultiDevice, - HostVisible.value(), Event)); + UR_CALL(EventCreate( + Queue->Context, Queue, IsMultiDevice, HostVisible.value(), Event, + usingCounterBasedEvents.has_value() ? usingCounterBasedEvents.value() + : false)); (*Event)->UrQueue = Queue; (*Event)->CommandType = CommandType; @@ -1805,6 +1830,9 @@ ur_queue_handle_t_::ur_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) { ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal; ZeCommandQueueDesc.index = QueueIndex; ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + if (Queue->usingCounterBasedEvents()) { + ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + } const char *Priority = "Normal"; if (Queue->isPriorityLow()) { ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; @@ -1859,27 +1887,33 @@ ur_result_t ur_queue_handle_t_::createCommandList( ze_command_list_handle_t ZeCommandList; uint32_t QueueGroupOrdinal; + ZeStruct ZeCommandListDesc; + if (usingCounterBasedEvents()) { + ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER; + ZeCommandListDesc.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC; + } auto &QGroup = getQueueGroup(UseCopyEngine); auto &ZeCommandQueue = ForcedCmdQueue ? 
*ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal); if (ForcedCmdQueue) QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); - ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; - ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, &ZeCommandListDesc, &ZeCommandList)); - ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + if (!usingCounterBasedEvents()) { + ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + } ZeStruct ZeQueueDesc; ZeQueueDesc.ordinal = QueueGroupOrdinal; std::tie(CommandList, std::ignore) = CommandListMap.insert( std::pair( ZeCommandList, {ZeFence, false, false, ZeCommandQueue, ZeQueueDesc})); - - UR_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); - UR_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); + if (!usingCounterBasedEvents()) { + UR_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); + UR_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); + } return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index b255e5963e..d4f99be1c3 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -399,6 +399,8 @@ struct ur_queue_handle_t_ : _ur_object { // Returns true if the queue is a in-order queue. bool isInOrderQueue() const; + bool usingCounterBasedEvents() const; + // Returns true if the queue has discard events property. 
bool isDiscardEvents() const; @@ -543,7 +545,8 @@ struct ur_queue_handle_t_ : _ur_object { ur_result_t createEventAndAssociateQueue( ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, - std::optional HostVisible = std::nullopt); + std::optional HostVisible = std::nullopt, + std::optional usingCounterBasedEvents = std::nullopt); // Helper function to perform the necessary cleanup of the events from reset cmd // list.