diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 2166aa0101..56ffbe0145 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -293,7 +293,8 @@ static ur_result_t enqueueCommandBufferMemCopyHelper( SyncPointWaitList, ZeEventList)); ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, &LaunchEvent)); + UR_CALL( + EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent)); LaunchEvent->CommandType = CommandType; // Get sync point and register the event with it. @@ -358,7 +359,8 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( SyncPointWaitList, ZeEventList)); ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, &LaunchEvent)); + UR_CALL( + EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent)); LaunchEvent->CommandType = CommandType; // Get sync point and register the event with it. @@ -401,7 +403,8 @@ static ur_result_t enqueueCommandBufferFillHelper( SyncPointWaitList, ZeEventList)); ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent)); + UR_CALL( + EventCreate(CommandBuffer->Context, nullptr, false, true, &LaunchEvent)); LaunchEvent->CommandType = CommandType; // Get sync point and register the event with it. @@ -453,8 +456,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, // Create signal & wait events to be used in the command-list for sync // on command-buffer enqueue. 
auto RetCommandBuffer = *CommandBuffer; - UR_CALL(EventCreate(Context, nullptr, false, &RetCommandBuffer->SignalEvent)); - UR_CALL(EventCreate(Context, nullptr, false, &RetCommandBuffer->WaitEvent)); + UR_CALL(EventCreate(Context, nullptr, false, false, + &RetCommandBuffer->SignalEvent)); + UR_CALL(EventCreate(Context, nullptr, false, false, + &RetCommandBuffer->WaitEvent)); // Add prefix commands ZE2UR_CALL(zeCommandListAppendEventReset, @@ -550,7 +555,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, SyncPointWaitList, ZeEventList)); ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, &LaunchEvent)); + UR_CALL( + EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent)); LaunchEvent->CommandType = UR_COMMAND_KERNEL_LAUNCH; // Get sync point and register the event with it. @@ -732,7 +738,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( } ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent)); + UR_CALL( + EventCreate(CommandBuffer->Context, nullptr, false, true, &LaunchEvent)); LaunchEvent->CommandType = UR_COMMAND_USM_PREFETCH; // Get sync point and register the event with it. @@ -795,7 +802,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( } ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent)); + UR_CALL( + EventCreate(CommandBuffer->Context, nullptr, false, true, &LaunchEvent)); LaunchEvent->CommandType = UR_COMMAND_USM_ADVISE; // Get sync point and register the event with it. 
@@ -933,9 +941,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( (SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); if (Event) { - UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent, - UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, - SignalCommandList, false, true)); + UR_CALL(createEventAndAssociateQueue( + Queue, &RetEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + SignalCommandList, false, false, true)); if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) { // Multiple submissions of a command buffer implies that we need to save diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 2bd893b043..f36442b491 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -471,12 +471,17 @@ static const uint32_t MaxNumEventsPerPool = [] { ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible, - bool ProfilingEnabled) { + bool ProfilingEnabled, ur_device_handle_t Device) { // Lock while updating event pool machinery. 
std::scoped_lock Lock(ZeEventPoolCacheMutex); + ze_device_handle_t ZeDevice = nullptr; + + if (Device) { + ZeDevice = Device->ZeDevice; + } std::list *ZePoolCache = - getZeEventPoolCache(HostVisible, ProfilingEnabled); + getZeEventPoolCache(HostVisible, ProfilingEnabled, ZeDevice); if (!ZePoolCache->empty()) { if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) { @@ -511,9 +516,14 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags); std::vector ZeDevices; - std::for_each( - Devices.begin(), Devices.end(), - [&](const ur_device_handle_t &D) { ZeDevices.push_back(D->ZeDevice); }); + if (ZeDevice) { + ZeDevices.push_back(ZeDevice); + } else { + std::for_each(Devices.begin(), Devices.end(), + [&](const ur_device_handle_t &D) { + ZeDevices.push_back(D->ZeDevice); + }); + } ZE2UR_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(), &ZeDevices[0], ZePool)); @@ -528,11 +538,10 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( return UR_RESULT_SUCCESS; } -ur_event_handle_t -ur_context_handle_t_::getEventFromContextCache(bool HostVisible, - bool WithProfiling) { +ur_event_handle_t ur_context_handle_t_::getEventFromContextCache( + bool HostVisible, bool WithProfiling, ur_device_handle_t Device) { std::scoped_lock Lock(EventCacheMutex); - auto Cache = getEventCache(HostVisible, WithProfiling); + auto Cache = getEventCache(HostVisible, WithProfiling, Device); if (Cache->empty()) return nullptr; @@ -546,8 +555,14 @@ ur_context_handle_t_::getEventFromContextCache(bool HostVisible, void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) { std::scoped_lock Lock(EventCacheMutex); - auto Cache = - getEventCache(Event->isHostVisible(), Event->isProfilingEnabled()); + ur_device_handle_t Device = nullptr; + + if (!Event->IsMultiDevice && Event->UrQueue) { + Device = Event->UrQueue->Device; + } + + auto Cache = 
getEventCache(Event->isHostVisible(), + Event->isProfilingEnabled(), Device); Cache->emplace_back(Event); } @@ -562,8 +577,14 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { return UR_RESULT_SUCCESS; } - std::list *ZePoolCache = - getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled()); + ze_device_handle_t ZeDevice = nullptr; + + if (!Event->IsMultiDevice && Event->UrQueue) { + ZeDevice = Event->UrQueue->Device->ZeDevice; + } + + std::list *ZePoolCache = getZeEventPoolCache( + Event->isHostVisible(), Event->isProfilingEnabled(), ZeDevice); // Put the empty pool to the cache of the pools. if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index 96935d470e..2c80ff0e33 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -142,6 +142,9 @@ struct ur_context_handle_t_ : _ur_object { // // Cache of event pools to which host-visible events are added to. std::vector> ZeEventPoolCache{4}; + std::vector *>> + ZeEventPoolCacheDeviceMap{4}; // This map will be used to determine if a pool is full or not // by storing number of empty slots available in the pool. @@ -163,6 +166,9 @@ struct ur_context_handle_t_ : _ur_object { // Caches for events. std::vector> EventCaches{4}; + std::vector< + std::unordered_map *>> + EventCachesDeviceMap{4}; // Initialize the PI context. ur_result_t initialize(); @@ -188,20 +194,46 @@ struct ur_context_handle_t_ : _ur_object { // slot for an event with profiling capabilities. ur_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &, bool HostVisible, - bool ProfilingEnabled); + bool ProfilingEnabled, + ur_device_handle_t Device); // Get ur_event_handle_t from cache. 
ur_event_handle_t getEventFromContextCache(bool HostVisible, - bool WithProfiling); + bool WithProfiling, + ur_device_handle_t Device); // Add ur_event_handle_t to cache. void addEventToContextCache(ur_event_handle_t); - auto getZeEventPoolCache(bool HostVisible, bool WithProfiling) { - if (HostVisible) - return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1]; - else - return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3]; + auto getZeEventPoolCache(bool HostVisible, bool WithProfiling, + ze_device_handle_t ZeDevice) { + if (HostVisible) { + if (ZeDevice) { + auto ZeEventPoolCacheMap = WithProfiling + ? &ZeEventPoolCacheDeviceMap[0] + : &ZeEventPoolCacheDeviceMap[1]; + if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) { + ZeEventPoolCache.emplace_back(); + (*ZeEventPoolCacheMap)[ZeDevice] = &ZeEventPoolCache.back(); + } + return (*ZeEventPoolCacheMap)[ZeDevice]; + } else { + return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1]; + } + } else { + if (ZeDevice) { + auto ZeEventPoolCacheMap = WithProfiling + ? &ZeEventPoolCacheDeviceMap[2] + : &ZeEventPoolCacheDeviceMap[3]; + if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) { + ZeEventPoolCache.emplace_back(); + (*ZeEventPoolCacheMap)[ZeDevice] = &ZeEventPoolCache.back(); + } + return (*ZeEventPoolCacheMap)[ZeDevice]; + } else { + return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3]; + } + } } // Decrement number of events living in the pool upon event destroy @@ -240,11 +272,33 @@ struct ur_context_handle_t_ : _ur_object { private: // Get the cache of events for a provided scope and profiling mode. - auto getEventCache(bool HostVisible, bool WithProfiling) { - if (HostVisible) - return WithProfiling ? &EventCaches[0] : &EventCaches[1]; - else - return WithProfiling ? 
&EventCaches[2] : &EventCaches[3]; + auto getEventCache(bool HostVisible, bool WithProfiling, + ur_device_handle_t Device) { + if (HostVisible) { + if (Device) { + auto EventCachesMap = + WithProfiling ? &EventCachesDeviceMap[0] : &EventCachesDeviceMap[1]; + if (EventCachesMap->find(Device) == EventCachesMap->end()) { + EventCaches.emplace_back(); + (*EventCachesMap)[Device] = &EventCaches.back(); + } + return (*EventCachesMap)[Device]; + } else { + return WithProfiling ? &EventCaches[0] : &EventCaches[1]; + } + } else { + if (Device) { + auto EventCachesMap = + WithProfiling ? &EventCachesDeviceMap[2] : &EventCachesDeviceMap[3]; + if (EventCachesMap->find(Device) == EventCachesMap->end()) { + EventCaches.emplace_back(); + (*EventCachesMap)[Device] = &EventCaches.back(); + } + return (*EventCachesMap)[Device]; + } else { + return WithProfiling ? &EventCaches[2] : &EventCaches[3]; + } + } } }; diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 3ceb9b4a22..1f3b62ce8f 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -76,7 +76,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_EVENTS_WAIT, - CommandList, IsInternal)); + CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -103,9 +103,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( std::scoped_lock lock(Queue->Mutex); if (OutEvent) { - UR_CALL(createEventAndAssociateQueue( - Queue, OutEvent, UR_COMMAND_EVENTS_WAIT, Queue->CommandListMap.end(), - /* IsInternal */ false)); + UR_CALL(createEventAndAssociateQueue(Queue, OutEvent, + UR_COMMAND_EVENTS_WAIT, + Queue->CommandListMap.end(), false, + /* IsInternal */ false)); } UR_CALL(Queue->synchronize()); @@ -157,7 +158,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ur_event_handle_t &Event, bool IsInternal) { UR_CALL(createEventAndAssociateQueue( Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, - IsInternal)); + IsInternal, false)); Event->WaitList = EventWaitList; @@ -604,7 +605,8 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( // Create a "proxy" host-visible event. UR_CALL(createEventAndAssociateQueue( UrQueue, &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ true)); + /* IsInternal */ false, /* IsMultiDevice */ false, + /* HostVisible */ true)); ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (CommandList->first, 1, &ZeEvent)); @@ -750,7 +752,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate( ur_event_handle_t *Event ///< [out] pointer to the handle of the event object created. 
) { - UR_CALL(EventCreate(Context, nullptr, true, Event)); + UR_CALL(EventCreate(Context, nullptr, false, true, Event)); (*Event)->RefCountExternal++; ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); @@ -768,7 +770,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( // we dont have urEventCreate, so use this check for now to know that // the call comes from urEventCreate() if (NativeEvent == nullptr) { - UR_CALL(EventCreate(Context, nullptr, true, Event)); + UR_CALL(EventCreate(Context, nullptr, false, true, Event)); (*Event)->RefCountExternal++; ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); @@ -1046,12 +1048,19 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, // a host-visible pool. // ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, - bool HostVisible, ur_event_handle_t *RetEvent) { + bool IsMultiDevice, bool HostVisible, + ur_event_handle_t *RetEvent) { bool ProfilingEnabled = !Queue || Queue->isProfilingEnabled(); - if (auto CachedEvent = - Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) { + ur_device_handle_t Device = nullptr; + + if (!IsMultiDevice && Queue) { + Device = Queue->Device; + } + + if (auto CachedEvent = Context->getEventFromContextCache( + HostVisible, ProfilingEnabled, Device)) { *RetEvent = CachedEvent; return UR_RESULT_SUCCESS; } @@ -1062,7 +1071,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, size_t Index = 0; if (auto Res = Context->getFreeSlotInExistingOrNewPool( - ZeEventPool, Index, HostVisible, ProfilingEnabled)) + ZeEventPool, Index, HostVisible, ProfilingEnabled, Device)) return Res; ZeStruct ZeEventDesc; @@ -1264,9 +1273,45 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } std::shared_lock Lock(EventList[I]->Mutex); - this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent; - this->UrEventList[TmpListLength] = EventList[I]; - 
this->UrEventList[TmpListLength]->RefCount.increment(); + + if (Queue && Queue->Device != CurQueue->Device && + !EventList[I]->IsMultiDevice) { + ze_event_handle_t MultiDeviceZeEvent = nullptr; + ur_event_handle_t MultiDeviceEvent; + bool IsInternal = true; + bool IsMultiDevice = true; + + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + false, true)); + + UR_CALL(createEventAndAssociateQueue( + Queue, &MultiDeviceEvent, EventList[I]->CommandType, CommandList, + IsInternal, IsMultiDevice)); + MultiDeviceZeEvent = MultiDeviceEvent->ZeEvent; + const auto &ZeCommandList = CommandList->first; + EventList[I]->RefCount.increment(); + + zeCommandListAppendWaitOnEvents(ZeCommandList, 1u, + &EventList[I]->ZeEvent); + zeEventHostSignal(MultiDeviceZeEvent); + + UR_CALL(Queue->executeCommandList(CommandList, /* IsBlocking */ false, + /* OkToBatchCommand */ true)); + + // Acquire lock of newly created MultiDeviceEvent to increase its + // RefCount + std::shared_lock Lock(MultiDeviceEvent->Mutex); + + this->ZeEventList[TmpListLength] = MultiDeviceZeEvent; + this->UrEventList[TmpListLength] = MultiDeviceEvent; + this->UrEventList[TmpListLength]->RefCount.increment(); + } else { + this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent; + this->UrEventList[TmpListLength] = EventList[I]; + this->UrEventList[TmpListLength]->RefCount.increment(); + } + TmpListLength += 1; } } diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index d4e975012c..c266de8c0d 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -30,7 +30,8 @@ extern "C" { ur_result_t urEventReleaseInternal(ur_event_handle_t Event); ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, - bool HostVisible, ur_event_handle_t *RetEvent); + bool IsMultiDevice, bool HostVisible, + ur_event_handle_t *RetEvent); } // extern "C" // This is an experimental option that 
allows to disable caching of events in @@ -190,6 +191,11 @@ struct ur_event_handle_t_ : _ur_object { // plugin. bool IsDiscarded = {false}; + // Indicates that this event must be visible to multiple devices. + // When possible, allocate the event from a single-device pool for optimal + // performance. + bool IsMultiDevice = {false}; + // Besides each PI object keeping a total reference count in // _ur_object::RefCount we keep special track of the event *external* // references. This way we are able to tell when the event is not referenced diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 3b3fc7b154..b36c309092 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -206,7 +206,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, - CommandList, IsInternal)); + CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index a423e55b71..e977d1ac15 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -67,7 +67,7 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -117,7 +117,7 @@ ur_result_t enqueueMemCopyRectHelper( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -227,7 +227,7 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -361,7 +361,7 @@ static ur_result_t enqueueMemImageCommandHelper( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -911,9 +911,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( UR_CALL(TmpWaitList.createAndRetainUrZeEventList( NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); - UR_CALL( - createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, - Queue->CommandListMap.end(), IsInternal)); + UR_CALL(createEventAndAssociateQueue( + Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, Queue->CommandListMap.end(), + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -1071,7 +1071,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP, Queue->CommandListMap.end(), - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; } @@ -1262,7 +1262,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_PREFETCH, - CommandList, IsInternal)); + CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -1318,7 +1318,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_ADVISE, - CommandList, IsInternal)); + CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 0178d45d72..241e3a23a2 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1126,7 +1126,8 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, auto Res = createEventAndAssociateQueue( reinterpret_cast(this), &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ true); + /* IsInternal */ false, /* IsMultiDevice */ true, + /* HostVisible */ true); if (Res) return Res; @@ -1266,8 +1267,19 @@ ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { } ur_result_t ur_queue_handle_t_::addEventToQueueCache(ur_event_handle_t Event) { - auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1]; - Cache->emplace_back(Event); + if (!Event->IsMultiDevice && Event->UrQueue) { + auto Device = Event->UrQueue->Device; + auto EventCachesMap = Event->isHostVisible() ? &EventCachesDeviceMap[0] + : &EventCachesDeviceMap[1]; + if (EventCachesMap->find(Device) == EventCachesMap->end()) { + EventCaches.emplace_back(); + (*EventCachesMap)[Device] = &EventCaches.back(); + } + (*EventCachesMap)[Device]->emplace_back(Event); + } else { + auto Cache = Event->isHostVisible() ? 
&EventCaches[0] : &EventCaches[1]; + Cache->emplace_back(Event); + } return UR_RESULT_SUCCESS; } @@ -1444,8 +1456,20 @@ ur_result_t ur_queue_handle_t_::synchronize() { return UR_RESULT_SUCCESS; } -ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool HostVisible) { - auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; +ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool IsMultiDevice, + bool HostVisible) { + std::list *Cache; + + if (!IsMultiDevice) { + auto Device = this->Device; + Cache = HostVisible ? EventCachesDeviceMap[0][Device] + : EventCachesDeviceMap[1][Device]; + if (!Cache) { + return nullptr; + } + } else { + Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; + } // If we don't have any events, return nullptr. // If we have only a single event then it was used by the last command and we @@ -1470,13 +1494,15 @@ ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool HostVisible) { // \param CommandList is the command list where the event is added // \param IsInternal tells if the event is internal, i.e. visible in the L0 // plugin only. +// \param IsMultiDevice tells if the event must be created in the multi-device +// visible pool. // \param HostVisible tells if the event must be created in the // host-visible pool. If not set then this function will decide. ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, ur_command_list_ptr_t CommandList, - bool IsInternal, + bool IsInternal, bool IsMultiDevice, std::optional HostVisible) { if (!HostVisible.has_value()) { @@ -1485,15 +1511,18 @@ ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, } // If event is discarded then try to get event from the queue cache. - *Event = - IsInternal ? Queue->getEventFromQueueCache(HostVisible.value()) : nullptr; + *Event = IsInternal ? 
Queue->getEventFromQueueCache(IsMultiDevice, + HostVisible.value()) + : nullptr; if (*Event == nullptr) - UR_CALL(EventCreate(Queue->Context, Queue, HostVisible.value(), Event)); + UR_CALL(EventCreate(Queue->Context, Queue, IsMultiDevice, + HostVisible.value(), Event)); (*Event)->UrQueue = Queue; (*Event)->CommandType = CommandType; (*Event)->IsDiscarded = IsInternal; + (*Event)->IsMultiDevice = IsMultiDevice; (*Event)->CommandList = CommandList; // Discarded event doesn't own ze_event, it is used by multiple // ur_event_handle_t objects. We destroy corresponding ze_event by releasing @@ -1569,7 +1598,8 @@ ur_result_t ur_queue_handle_t_::signalEventFromCmdListIfLastEventDiscarded( UR_CALL(createEventAndAssociateQueue( reinterpret_cast(this), &Event, UR_EXT_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ false)); + /* IsInternal */ false, /* IsMultiDevice */ true, + /* HostVisible */ false)); UR_CALL(urEventReleaseInternal(Event)); LastCommandEvent = Event; @@ -1882,7 +1912,7 @@ ur_queue_handle_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, if (auto Res = createEventAndAssociateQueue( reinterpret_cast(this), &Event, UR_EXT_COMMAND_TYPE_USER, CmdList, - /*IsInternal*/ true)) + /* IsInternal */ true, /* IsMultiDevice */ true)) return Res; Event->WaitList = ActiveBarriersWaitList; diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 8022c45e0e..b255e5963e 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -343,6 +343,9 @@ struct ur_queue_handle_t_ : _ur_object { // inside all command lists in the queue as described in the 2-event model. // Leftover events in the cache are relased at the queue destruction. std::vector> EventCaches{2}; + std::vector< + std::unordered_map *>> + EventCachesDeviceMap{2}; // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. 
@@ -417,7 +420,8 @@ struct ur_queue_handle_t_ : _ur_object { // two times in a row and have to do round-robin between two events. Otherwise // it picks an event from the beginning of the cache and returns it. Event // from the last command is always appended to the end of the list. - ur_event_handle_t getEventFromQueueCache(bool HostVisible); + ur_event_handle_t getEventFromQueueCache(bool IsMultiDevice, + bool HostVisible); // Returns true if an OpenCommandList has commands that need to be submitted. // If IsCopy is 'true', then the OpenCommandList containing copy commands is @@ -532,13 +536,14 @@ struct ur_queue_handle_t_ : _ur_object { // \param CommandList is the command list where the event is added // \param IsInternal tells if the event is internal, i.e. visible in the L0 // plugin only. +// \param IsMultiDevice Indicates that this event must be visible by +// multiple devices. // \param ForceHostVisible tells if the event must be created in // the host-visible pool -ur_result_t -createEventAndAssociateQueue(ur_queue_handle_t Queue, ur_event_handle_t *Event, - ur_command_t CommandType, - ur_command_list_ptr_t CommandList, bool IsInternal, - std::optional HostVisible = std::nullopt); +ur_result_t createEventAndAssociateQueue( + ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, + ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, + std::optional HostVisible = std::nullopt); // Helper function to perform the necessary cleanup of the events from reset cmd // list.