From 4444b9c87a227f6b91aa12ff33abc48ab72f6386 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Tue, 16 Apr 2024 09:35:21 -0700
Subject: [PATCH 01/19] [L0] Enable Batching out of order commands without
 signal events

- Given a command being enqueued without dependencies or dependencies
  outside of a given batch of commands, then signal the event at the end
of the command list allowing for parallel execution of the commands.

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/command_buffer.cpp |  8 ++-
 source/adapters/level_zero/context.cpp        | 27 +++++++--
 source/adapters/level_zero/context.hpp        | 10 ++--
 source/adapters/level_zero/event.cpp          | 19 +++---
 source/adapters/level_zero/event.hpp          |  2 +
 source/adapters/level_zero/image.cpp          |  8 ++-
 source/adapters/level_zero/kernel.cpp         |  6 +-
 source/adapters/level_zero/memory.cpp         | 58 +++++++++++--------
 source/adapters/level_zero/queue.cpp          | 56 ++++++++++++++++++
 source/adapters/level_zero/queue.hpp          | 18 ++++++
 10 files changed, 163 insertions(+), 49 deletions(-)

diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index 46e2e33607..e7341e6a0f 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -922,8 +922,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
       // Create command-list to execute before `CommandListPtr` and will signal
       // when `EventWaitList` dependencies are complete.
       ur_command_list_ptr_t WaitCommandList{};
-      UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
-                                                      false, false));
+      UR_CALL(Queue->Context->getAvailableCommandList(
+          Queue, WaitCommandList, false, NumEventsInWaitList, EventWaitList,
+          false));
 
       ZE2UR_CALL(zeCommandListAppendBarrier,
                  (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
@@ -958,7 +959,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
   // Create a command-list to signal RetEvent on completion
   ur_command_list_ptr_t SignalCommandList{};
   UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
-                                                  false, false));
+                                                  false, NumEventsInWaitList,
+                                                  EventWaitList, false));
   // Reset the wait-event for the UR command-buffer that is signaled when its
   // submission dependencies have been satisfied.
   ZE2UR_CALL(zeCommandListAppendEventReset,
diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index f9cf5009fb..f30225fe3a 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -624,7 +624,8 @@ static const size_t CmdListsCleanupThreshold = [] {
 // Retrieve an available command list to be used in a PI call.
 ur_result_t ur_context_handle_t_::getAvailableCommandList(
     ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList,
-    bool UseCopyEngine, bool AllowBatching,
+    bool UseCopyEngine, uint32_t NumEventsInWaitList,
+    const ur_event_handle_t *EventWaitList, bool AllowBatching,
     ze_command_queue_handle_t *ForcedCmdQueue) {
   // Immediate commandlists have been pre-allocated and are always available.
   if (Queue->UsingImmCmdLists) {
@@ -656,9 +657,27 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
   // for this queue.
   if (Queue->hasOpenCommandList(UseCopyEngine)) {
     if (AllowBatching) {
-      CommandList = CommandBatch.OpenCommandList;
-      UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
-      return UR_RESULT_SUCCESS;
+      bool CannotBatch = false;
+      // If this command should be batched, but the command has a dependency on
+      // a command in the current batch, then the open command list must be
+      // executed and this command must be batched into a new command list.
+      if (NumEventsInWaitList > 0) {
+        for (auto &Event : CommandBatch.OpenCommandList->second.EventList) {
+          for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
+            if (Event == EventWaitList[i]) {
+              CannotBatch = true;
+              break;
+            }
+          }
+          if (CannotBatch)
+            break;
+        }
+      }
+      if (!CannotBatch) {
+        CommandList = CommandBatch.OpenCommandList;
+        UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
+        return UR_RESULT_SUCCESS;
+      }
     }
     // If this command isn't allowed to be batched or doesn't match the forced
     // command queue, then we need to go ahead and execute what is already in
diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp
index 6e4244eea0..f8e1f49b09 100644
--- a/source/adapters/level_zero/context.hpp
+++ b/source/adapters/level_zero/context.hpp
@@ -267,11 +267,11 @@ struct ur_context_handle_t_ : _ur_object {
   // When using immediate commandlists, retrieves an immediate command list
   // for executing on this device. Immediate commandlists are created only
   // once for each SYCL Queue and after that they are reused.
-  ur_result_t
-  getAvailableCommandList(ur_queue_handle_t Queue,
-                          ur_command_list_ptr_t &CommandList,
-                          bool UseCopyEngine, bool AllowBatching = false,
-                          ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
+  ur_result_t getAvailableCommandList(
+      ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList,
+      bool UseCopyEngine, uint32_t NumEventsInWaitList,
+      const ur_event_handle_t *EventWaitList, bool AllowBatching = false,
+      ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
 
   // Checks if Device is covered by this context.
   // For that the Device or its root devices need to be in the context.
diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index 97ffe2f19e..ca9ee6aaa6 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -84,8 +84,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
 
     // Get a new command list to be used on this call
     ur_command_list_ptr_t CommandList{};
-    UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
-                                                    UseCopyEngine));
+    UR_CALL(Queue->Context->getAvailableCommandList(
+        Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList));
 
     ze_event_handle_t ZeEvent = nullptr;
     ur_event_handle_t InternalEvent;
@@ -255,7 +255,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
     // Get an arbitrary command-list in the queue.
     ur_command_list_ptr_t CmdList;
     UR_CALL(Queue->Context->getAvailableCommandList(
-        Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch));
+        Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
+        EventWaitList, OkToBatch));
 
     // Insert the barrier into the command-list and execute.
     UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal));
@@ -310,7 +311,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
           if (ZeQueue) {
             ur_command_list_ptr_t CmdList;
             UR_CALL(Queue->Context->getAvailableCommandList(
-                Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue));
+                Queue, CmdList, UseCopyEngine, NumEventsInWaitList,
+                EventWaitList, OkToBatch, &ZeQueue));
             CmdLists.push_back(CmdList);
           }
         }
@@ -323,7 +325,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
     // Get any available command list.
     ur_command_list_ptr_t CmdList;
     UR_CALL(Queue->Context->getAvailableCommandList(
-        Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch));
+        Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
+        EventWaitList, OkToBatch));
     CmdLists.push_back(CmdList);
   }
 
@@ -611,7 +614,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
 
     ur_command_list_ptr_t CommandList{};
     UR_CALL(UrQueue->Context->getAvailableCommandList(
-        UrQueue, CommandList, false /* UseCopyEngine */, OkToBatch))
+        UrQueue, CommandList, false /* UseCopyEngine */, 0, nullptr, OkToBatch))
 
     // Create a "proxy" host-visible event.
     UR_CALL(createEventAndAssociateQueue(
@@ -1317,8 +1320,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
           // Get a command list prior to acquiring an event lock.
           // This prevents a potential deadlock with recursive
           // event locks.
-          UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
-                                                          false, true));
+          UR_CALL(Queue->Context->getAvailableCommandList(
+              Queue, CommandList, false, 0, nullptr, true));
         }
 
         std::shared_lock<ur_shared_mutex> Lock(EventList[I]->Mutex);
diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp
index a566c77825..ede243aa6a 100644
--- a/source/adapters/level_zero/event.hpp
+++ b/source/adapters/level_zero/event.hpp
@@ -196,6 +196,8 @@ struct ur_event_handle_t_ : _ur_object {
   // performance
   bool IsMultiDevice = {false};
 
+  bool IsInnerBatchedEvent = {false};
+
   // Besides each PI object keeping a total reference count in
   // _ur_object::RefCount we keep special track of the event *external*
   // references. This way we are able to tell when the event is not referenced
diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp
index 3b767f9127..b4b5662a33 100644
--- a/source/adapters/level_zero/image.cpp
+++ b/source/adapters/level_zero/image.cpp
@@ -790,8 +790,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
 
   // Get a new command list to be used on this call
   ur_command_list_ptr_t CommandList{};
-  UR_CALL(hQueue->Context->getAvailableCommandList(hQueue, CommandList,
-                                                   UseCopyEngine, OkToBatch));
+  UR_CALL(hQueue->Context->getAvailableCommandList(
+      hQueue, CommandList, UseCopyEngine, numEventsInWaitList, phEventWaitList,
+      OkToBatch));
 
   ze_event_handle_t ZeEvent = nullptr;
   ur_event_handle_t InternalEvent;
@@ -800,7 +801,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
   UR_CALL(createEventAndAssociateQueue(hQueue, Event, UR_COMMAND_MEM_IMAGE_COPY,
                                        CommandList, IsInternal,
                                        /*IsMultiDevice*/ false));
-  ZeEvent = (*Event)->ZeEvent;
+  UR_CALL(setSignalEvent(hQueue, UseCopyEngine, &ZeEvent, Event,
+                         numEventsInWaitList, phEventWaitList));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index d96351dd5d..f9c7f33cae 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -201,7 +201,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // Get a new command list to be used on this call
   ur_command_list_ptr_t CommandList{};
   UR_CALL(Queue->Context->getAvailableCommandList(
-      Queue, CommandList, UseCopyEngine, true /* AllowBatching */));
+      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
+      true /* AllowBatching */));
 
   ze_event_handle_t ZeEvent = nullptr;
   ur_event_handle_t InternalEvent{};
@@ -210,7 +211,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
 
   UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
                                        CommandList, IsInternal, false));
-  ZeEvent = (*Event)->ZeEvent;
+  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                         NumEventsInWaitList, EventWaitList));
   (*Event)->WaitList = TmpWaitList;
 
   // Save the kernel in the event, so that when the event is signalled
diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp
index 39a970063f..5ff63afd9d 100644
--- a/source/adapters/level_zero/memory.cpp
+++ b/source/adapters/level_zero/memory.cpp
@@ -61,8 +61,9 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType,
 
   // Get a new command list to be used on this call
   ur_command_list_ptr_t CommandList{};
-  UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
-                                                  UseCopyEngine, OkToBatch));
+  UR_CALL(Queue->Context->getAvailableCommandList(
+      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
+      OkToBatch));
 
   ze_event_handle_t ZeEvent = nullptr;
   ur_event_handle_t InternalEvent;
@@ -70,7 +71,8 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType,
   ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
   UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                        IsInternal, false));
-  ZeEvent = (*Event)->ZeEvent;
+  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                         NumEventsInWaitList, EventWaitList));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
@@ -111,8 +113,9 @@ ur_result_t enqueueMemCopyRectHelper(
 
   // Get a new command list to be used on this call
   ur_command_list_ptr_t CommandList{};
-  UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
-                                                  UseCopyEngine, OkToBatch));
+  UR_CALL(Queue->Context->getAvailableCommandList(
+      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
+      OkToBatch));
 
   ze_event_handle_t ZeEvent = nullptr;
   ur_event_handle_t InternalEvent;
@@ -120,8 +123,8 @@ ur_result_t enqueueMemCopyRectHelper(
   ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
   UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                        IsInternal, false));
-
-  ZeEvent = (*Event)->ZeEvent;
+  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                         NumEventsInWaitList, EventWaitList));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
@@ -221,8 +224,9 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType,
   ur_command_list_ptr_t CommandList{};
   // We want to batch these commands to avoid extra submissions (costly)
   bool OkToBatch = true;
-  UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
-                                                  UseCopyEngine, OkToBatch));
+  UR_CALL(Queue->Context->getAvailableCommandList(
+      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
+      OkToBatch));
 
   ze_event_handle_t ZeEvent = nullptr;
   ur_event_handle_t InternalEvent;
@@ -230,8 +234,8 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType,
   ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
   UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                        IsInternal, false));
-
-  ZeEvent = (*Event)->ZeEvent;
+  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                         NumEventsInWaitList, EventWaitList));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
@@ -312,8 +316,9 @@ static ur_result_t enqueueMemImageCommandHelper(
 
   // Get a new command list to be used on this call
   ur_command_list_ptr_t CommandList{};
-  UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
-                                                  UseCopyEngine, OkToBatch));
+  UR_CALL(Queue->Context->getAvailableCommandList(
+      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
+      OkToBatch));
 
   ze_event_handle_t ZeEvent = nullptr;
   ur_event_handle_t InternalEvent;
@@ -321,7 +326,8 @@ static ur_result_t enqueueMemImageCommandHelper(
   ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
   UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                        IsInternal, false));
-  ZeEvent = (*Event)->ZeEvent;
+  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                         NumEventsInWaitList, EventWaitList));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
@@ -878,7 +884,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
         Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, Queue->CommandListMap.end(),
         IsInternal, false));
 
-    ZeEvent = (*Event)->ZeEvent;
+    UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                           NumEventsInWaitList, EventWaitList));
     (*Event)->WaitList = TmpWaitList;
   }
 
@@ -968,8 +975,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
   } else {
     // For discrete devices we need a command list
     ur_command_list_ptr_t CommandList{};
-    UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
-                                                    UseCopyEngine));
+    UR_CALL(Queue->Context->getAvailableCommandList(
+        Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList));
 
     // Add the event to the command list.
     CommandList->second.append(reinterpret_cast<ur_event_handle_t>(*Event));
@@ -1035,7 +1042,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP,
                                          Queue->CommandListMap.end(),
                                          IsInternal, false));
-    ZeEvent = (*Event)->ZeEvent;
+    UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                           NumEventsInWaitList, EventWaitList));
     (*Event)->WaitList = TmpWaitList;
   }
 
@@ -1090,7 +1098,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
 
   ur_command_list_ptr_t CommandList{};
   UR_CALL(Queue->Context->getAvailableCommandList(
-      reinterpret_cast<ur_queue_handle_t>(Queue), CommandList, UseCopyEngine));
+      reinterpret_cast<ur_queue_handle_t>(Queue), CommandList, UseCopyEngine,
+      NumEventsInWaitList, EventWaitList));
 
   CommandList->second.append(reinterpret_cast<ur_event_handle_t>(*Event));
   (*Event)->RefCount.increment();
@@ -1217,8 +1226,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
   ur_command_list_ptr_t CommandList{};
   // TODO: Change UseCopyEngine argument to 'true' once L0 backend
   // support is added
-  UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
-                                                  UseCopyEngine));
+  UR_CALL(Queue->Context->getAvailableCommandList(
+      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList));
 
   // TODO: do we need to create a unique command type for this?
   ze_event_handle_t ZeEvent = nullptr;
@@ -1227,7 +1236,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
   ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
   UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_PREFETCH,
                                        CommandList, IsInternal, false));
-  ZeEvent = (*Event)->ZeEvent;
+  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                         NumEventsInWaitList, EventWaitList));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &WaitList = (*Event)->WaitList;
@@ -1274,7 +1284,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise(
   // TODO: Additional analysis is required to check if this operation will
   // run faster on copy engines.
   UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
-                                                  UseCopyEngine));
+                                                  UseCopyEngine, 0, nullptr));
 
   // TODO: do we need to create a unique command type for this?
   ze_event_handle_t ZeEvent = nullptr;
@@ -1283,7 +1293,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise(
   ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
   UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_ADVISE,
                                        CommandList, IsInternal, false));
-  ZeEvent = (*Event)->ZeEvent;
+  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, 0, nullptr));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index c6aaf4b034..c30fa822fc 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1395,6 +1395,13 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
               zeCommandListAppendBarrier,
               (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr));
         }
+        // Append Signalling of the inner events at the end of the batch
+        for (auto &Event : CommandList->second.EventList) {
+          if (Event->IsInnerBatchedEvent) {
+            ZE2UR_CALL(zeCommandListAppendSignalEvent,
+                       (CommandList->first, Event->ZeEvent));
+          }
+        }
       } else {
         // If we don't have host visible proxy then signal event if needed.
         this->signalEventFromCmdListIfLastEventDiscarded(CommandList);
@@ -1718,6 +1725,55 @@ ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool IsMultiDevice,
   return RetEvent;
 }
 
+// This helper function checks to see if an event for a command can be included
+// at the end of a command list batch. This will only be true if the event does
+// not have dependencies or the dependencies are not for events which exist in
+// this batch.
+bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine,
+                       uint32_t NumEventsInWaitList,
+                       const ur_event_handle_t *EventWaitList) {
+  auto &CommandBatch =
+      UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
+  // First see if there is an command-list open for batching commands
+  // for this queue.
+  if (Queue->hasOpenCommandList(UseCopyEngine)) {
+    // If this command should be batched, but the command has a dependency on a
+    // command in the current batch, then the command needs to have an event
+    // to track its completion so this event cannot be batched to the end of the
+    // command list.
+    if (NumEventsInWaitList > 0) {
+      for (auto &Event : CommandBatch.OpenCommandList->second.EventList) {
+        for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
+          if (Event == EventWaitList[i]) {
+            return false;
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+
+// This helper function checks to see if a signal event at the end of a command
+// should be set. If the Queue is out of order and the command has no
+// dependencies, then this command can be enqueued without a signal event set in
+// a command list batch. The signal event will be appended at the end of the
+// batch to be signalled at the end of the command list.
+ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine,
+                           ze_event_handle_t *ZeEvent, ur_event_handle_t *Event,
+                           uint32_t NumEventsInWaitList,
+                           const ur_event_handle_t *EventWaitList) {
+  if (eventCanBeBatched(Queue, UseCopyEngine, NumEventsInWaitList,
+                        EventWaitList) &&
+      !Queue->isInOrderQueue() && !Queue->UsingImmCmdLists) {
+    ZeEvent = nullptr;
+    (*Event)->IsInnerBatchedEvent = true;
+  } else {
+    (*ZeEvent) = (*Event)->ZeEvent;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
 // This helper function creates a ur_event_handle_t and associate a
 // ur_queue_handle_t. Note that the caller of this function must have acquired
 // lock on the Queue that is passed in.
diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp
index 03922bd2dc..009220a406 100644
--- a/source/adapters/level_zero/queue.hpp
+++ b/source/adapters/level_zero/queue.hpp
@@ -688,6 +688,24 @@ ur_result_t createEventAndAssociateQueue(
     ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice,
     std::optional<bool> HostVisible = std::nullopt);
 
+// This helper function checks to see if an event for a command can be included
+// at the end of a command list batch. This will only be true if the event does
+// not have dependencies or the dependencies are not for events which exist in
+// this batch.
+bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine,
+                       uint32_t NumEventsInWaitList,
+                       const ur_event_handle_t *EventWaitList);
+
+// This helper function checks to see if a signal event at the end of a command
+// should be set. If the Queue is out of order and the command has no
+// dependencies, then this command can be enqueued without a signal event set in
+// a command list batch. The signal event will be appended at the end of the
+// batch to be signalled at the end of the command list.
+ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine,
+                           ze_event_handle_t *ZeEvent, ur_event_handle_t *Event,
+                           uint32_t NumEventsInWaitList,
+                           const ur_event_handle_t *EventWaitList);
+
 // Helper function to perform the necessary cleanup of the events from reset cmd
 // list.
 ur_result_t CleanupEventListFromResetCmdList(

From c313bdf41692f1bebfbf7611bc481103472ccf19 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Thu, 18 Apr 2024 08:34:04 -0700
Subject: [PATCH 02/19] [L0] Enable optional signal event only for Integrated
 gpus

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/device.hpp | 4 ++++
 source/adapters/level_zero/queue.cpp  | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp
index 484890670b..3cdfcbce7e 100644
--- a/source/adapters/level_zero/device.hpp
+++ b/source/adapters/level_zero/device.hpp
@@ -176,6 +176,10 @@ struct ur_device_handle_t_ : _ur_object {
            (ZeDeviceProperties->deviceId & 0xff0) == 0xb60;
   }
 
+  bool isIntegrated() {
+    return (ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
+  }
+
   // Does this device represent a single compute slice?
   bool isCCS() const {
     return QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute]
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index c30fa822fc..5423de5d36 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1763,7 +1763,8 @@ ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine,
                            ze_event_handle_t *ZeEvent, ur_event_handle_t *Event,
                            uint32_t NumEventsInWaitList,
                            const ur_event_handle_t *EventWaitList) {
-  if (eventCanBeBatched(Queue, UseCopyEngine, NumEventsInWaitList,
+  if (Queue->Device->isIntegrated() &&
+      eventCanBeBatched(Queue, UseCopyEngine, NumEventsInWaitList,
                         EventWaitList) &&
       !Queue->isInOrderQueue() && !Queue->UsingImmCmdLists) {
     ZeEvent = nullptr;

From c1aa8397498692dc9ed122b3492276063949cf26 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Thu, 18 Apr 2024 08:45:04 -0700
Subject: [PATCH 03/19] [L0] Use check for event batch for integrated when
 retreiving cmd list

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/context.cpp | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index f30225fe3a..d61390ec37 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -657,23 +657,12 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
   // for this queue.
   if (Queue->hasOpenCommandList(UseCopyEngine)) {
     if (AllowBatching) {
-      bool CannotBatch = false;
-      // If this command should be batched, but the command has a dependency on
-      // a command in the current batch, then the open command list must be
-      // executed and this command must be batched into a new command list.
-      if (NumEventsInWaitList > 0) {
-        for (auto &Event : CommandBatch.OpenCommandList->second.EventList) {
-          for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
-            if (Event == EventWaitList[i]) {
-              CannotBatch = true;
-              break;
-            }
-          }
-          if (CannotBatch)
-            break;
-        }
+      bool batchingAllowed = true;
+      if (Queue->Device->isIntegrated()) {
+        batchingAllowed = eventCanBeBatched(Queue, UseCopyEngine,
+                                            NumEventsInWaitList, EventWaitList);
       }
-      if (!CannotBatch) {
+      if (batchingAllowed) {
         CommandList = CommandBatch.OpenCommandList;
         UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
         return UR_RESULT_SUCCESS;

From 71b1b6f1c9e5a3541a6390ea913c0803efbfde99 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Thu, 18 Apr 2024 12:59:00 -0700
Subject: [PATCH 04/19] [L0] Fix append of Signal Events to occur for all batch
 types

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/queue.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index 5423de5d36..ac5b0e900a 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1395,13 +1395,6 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
               zeCommandListAppendBarrier,
               (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr));
         }
-        // Append Signalling of the inner events at the end of the batch
-        for (auto &Event : CommandList->second.EventList) {
-          if (Event->IsInnerBatchedEvent) {
-            ZE2UR_CALL(zeCommandListAppendSignalEvent,
-                       (CommandList->first, Event->ZeEvent));
-          }
-        }
       } else {
         // If we don't have host visible proxy then signal event if needed.
         this->signalEventFromCmdListIfLastEventDiscarded(CommandList);
@@ -1410,6 +1403,13 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
       // If we don't have host visible proxy then signal event if needed.
       this->signalEventFromCmdListIfLastEventDiscarded(CommandList);
     }
+    // Append Signalling of the inner events at the end of the batch
+    for (auto &Event : CommandList->second.EventList) {
+      if (Event->IsInnerBatchedEvent) {
+        ZE2UR_CALL(zeCommandListAppendSignalEvent,
+                    (CommandList->first, Event->ZeEvent));
+      }
+    }
 
     // Close the command list and have it ready for dispatch.
     ZE2UR_CALL(zeCommandListClose, (CommandList->first));

From 5c504a766375d92b073d11869fb3edd2f69b9133 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Thu, 18 Apr 2024 13:11:48 -0700
Subject: [PATCH 05/19] [L0] Fix formatting issues in execute command lists

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/queue.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index ac5b0e900a..f7c8a93a9f 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1407,7 +1407,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
     for (auto &Event : CommandList->second.EventList) {
       if (Event->IsInnerBatchedEvent) {
         ZE2UR_CALL(zeCommandListAppendSignalEvent,
-                    (CommandList->first, Event->ZeEvent));
+                   (CommandList->first, Event->ZeEvent));
       }
     }
 

From bad94a5859c19959b40374ce1cd2d982d63e54b5 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Thu, 18 Apr 2024 15:45:43 -0700
Subject: [PATCH 06/19] [L0] fix buffer map/unmap host event usecase

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/memory.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp
index 5ff63afd9d..3a5b5141ae 100644
--- a/source/adapters/level_zero/memory.cpp
+++ b/source/adapters/level_zero/memory.cpp
@@ -884,8 +884,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
         Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, Queue->CommandListMap.end(),
         IsInternal, false));
 
-    UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                           NumEventsInWaitList, EventWaitList));
+    ZeEvent = (*Event)->ZeEvent;
     (*Event)->WaitList = TmpWaitList;
   }
 
@@ -988,6 +987,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     char *ZeHandleSrc;
     UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device));
 
+    UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                           NumEventsInWaitList, EventWaitList));
+
     ZE2UR_CALL(zeCommandListAppendMemoryCopy,
                (ZeCommandList, *RetMap, ZeHandleSrc + Offset, Size, ZeEvent,
                 WaitList.Length, WaitList.ZeEventList));
@@ -1042,8 +1044,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP,
                                          Queue->CommandListMap.end(),
                                          IsInternal, false));
-    UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                           NumEventsInWaitList, EventWaitList));
+    ZeEvent = (*Event)->ZeEvent;
     (*Event)->WaitList = TmpWaitList;
   }
 
@@ -1116,6 +1117,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
   UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
                               Queue->Device));
 
+  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+                         NumEventsInWaitList, EventWaitList));
+
   ZE2UR_CALL(zeCommandListAppendMemoryCopy,
              (ZeCommandList, ZeHandleDst + MapInfo.Offset, MappedPtr,
               MapInfo.Size, ZeEvent, (*Event)->WaitList.Length,

From cd3e6ecf7fa1ebc6c9b76f89860e554a1e025905 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Thu, 18 Apr 2024 17:32:07 -0700
Subject: [PATCH 07/19] [L0] Fix usm prefect and advise to always set signal
 event.

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/memory.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp
index 3a5b5141ae..93752f9bfd 100644
--- a/source/adapters/level_zero/memory.cpp
+++ b/source/adapters/level_zero/memory.cpp
@@ -1240,8 +1240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
   ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
   UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_PREFETCH,
                                        CommandList, IsInternal, false));
-  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                         NumEventsInWaitList, EventWaitList));
+  ZeEvent = (*Event)->ZeEvent;
   (*Event)->WaitList = TmpWaitList;
 
   const auto &WaitList = (*Event)->WaitList;
@@ -1297,7 +1296,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise(
   ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
   UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_ADVISE,
                                        CommandList, IsInternal, false));
-  UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, 0, nullptr));
+  ZeEvent = (*Event)->ZeEvent;
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;

From c14995157a518c95173f88e51e1ab42d44e22970 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Thu, 18 Apr 2024 19:27:54 -0700
Subject: [PATCH 08/19] [L0] Fix recursive lock with host visible events during
 wait

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/event.cpp | 9 ++++++++-
 source/adapters/level_zero/event.hpp | 4 ++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index ca9ee6aaa6..ec2a68d664 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -600,6 +600,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
                                                           this->Mutex);
 
   if (!HostVisibleEvent) {
+    this->IsCreatingHostProxyEvent = true;
     if (UrQueue->ZeEventsScope != OnDemandHostVisibleProxy)
       die("getOrCreateHostVisibleEvent: missing host-visible event");
 
@@ -628,6 +629,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
                (CommandList->first, HostVisibleEvent->ZeEvent));
 
     UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch))
+    this->IsCreatingHostProxyEvent = false;
   }
 
   ZeHostVisibleEvent = HostVisibleEvent->ZeEvent;
@@ -938,7 +940,12 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked,
   std::list<ur_event_handle_t> EventsToBeReleased;
   ur_queue_handle_t AssociatedQueue = nullptr;
   {
-    std::scoped_lock<ur_shared_mutex> EventLock(Event->Mutex);
+    // If the Event is already locked, then continue with the cleanup, otherwise
+    // block on locking the event.
+    std::unique_lock<ur_shared_mutex> EventLock(Event->Mutex, std::try_to_lock);
+    if (!EventLock.owns_lock() && !Event->IsCreatingHostProxyEvent) {
+      EventLock.lock();
+    }
     if (SetEventCompleted)
       Event->Completed = true;
     // Exit early of event was already cleanedup.
diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp
index ede243aa6a..35da70a9b5 100644
--- a/source/adapters/level_zero/event.hpp
+++ b/source/adapters/level_zero/event.hpp
@@ -196,8 +196,12 @@ struct ur_event_handle_t_ : _ur_object {
   // performance
   bool IsMultiDevice = {false};
 
+  // Indicates inner batched event which was not used as a signal event.
   bool IsInnerBatchedEvent = {false};
 
+  // Indicates within creation of proxy event.
+  bool IsCreatingHostProxyEvent = {false};
+
   // Besides each PI object keeping a total reference count in
   // _ur_object::RefCount we keep special track of the event *external*
   // references. This way we are able to tell when the event is not referenced

From 4c93ce220123b711a0c437889fee057df0bf35c6 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Mon, 22 Apr 2024 10:39:07 -0700
Subject: [PATCH 09/19] [L0] Add UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT to toggle
 event usage

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/common.hpp  | 11 +++++++++++
 source/adapters/level_zero/context.cpp |  3 ++-
 source/adapters/level_zero/queue.cpp   |  3 ++-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp
index 93bf407567..02d97a0094 100644
--- a/source/adapters/level_zero/common.hpp
+++ b/source/adapters/level_zero/common.hpp
@@ -241,6 +241,17 @@ static const uint32_t UrL0QueueSyncNonBlocking = [] {
   return L0QueueSyncLockingModeValue;
 }();
 
+static const uint32_t UrL0OutOfOrderIntegratedSignalEvent = [] {
+  const char *UrL0OutOfOrderIntegratedSignalEventEnv =
+      std::getenv("UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT");
+  uint32_t UrL0OutOfOrderIntegratedSignalEventValue = 0;
+  if (UrL0OutOfOrderIntegratedSignalEventEnv) {
+    UrL0OutOfOrderIntegratedSignalEventValue =
+        std::atoi(UrL0OutOfOrderIntegratedSignalEventEnv);
+  }
+  return UrL0OutOfOrderIntegratedSignalEventValue;
+}();
+
 // This class encapsulates actions taken along with a call to Level Zero API.
 class ZeCall {
 private:
diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index d61390ec37..f7946fa002 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -658,7 +658,8 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
   if (Queue->hasOpenCommandList(UseCopyEngine)) {
     if (AllowBatching) {
       bool batchingAllowed = true;
-      if (Queue->Device->isIntegrated()) {
+      if (Queue->Device->isIntegrated() &&
+          !UrL0OutOfOrderIntegratedSignalEvent) {
         batchingAllowed = eventCanBeBatched(Queue, UseCopyEngine,
                                             NumEventsInWaitList, EventWaitList);
       }
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index f7c8a93a9f..5632efa947 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1766,7 +1766,8 @@ ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine,
   if (Queue->Device->isIntegrated() &&
       eventCanBeBatched(Queue, UseCopyEngine, NumEventsInWaitList,
                         EventWaitList) &&
-      !Queue->isInOrderQueue() && !Queue->UsingImmCmdLists) {
+      !Queue->isInOrderQueue() && !Queue->UsingImmCmdLists &&
+      !UrL0OutOfOrderIntegratedSignalEvent) {
     ZeEvent = nullptr;
     (*Event)->IsInnerBatchedEvent = true;
   } else {

From ccd73c7df1f6440caa37fc6859f456a9747556f2 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Mon, 22 Apr 2024 17:06:03 -0700
Subject: [PATCH 10/19] [L0] Fix ondemand host and host signal events to append
 a barrier

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/event.cpp | 9 +++++++--
 source/adapters/level_zero/queue.cpp | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index ec2a68d664..b030c22d2c 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -623,8 +623,13 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
         /* IsInternal */ false, /* IsMultiDevice */ false,
         /* HostVisible */ true));
 
-    ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
-               (CommandList->first, 1, &ZeEvent));
+    if (this->IsInnerBatchedEvent) {
+      ZE2UR_CALL(zeCommandListAppendBarrier,
+                 (CommandList->first, ZeEvent, 0, nullptr));
+    } else {
+      ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
+                 (CommandList->first, 1, &ZeEvent));
+    }
     ZE2UR_CALL(zeCommandListAppendSignalEvent,
                (CommandList->first, HostVisibleEvent->ZeEvent));
 
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index 5632efa947..0d605f2e7a 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1323,6 +1323,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
     // in the command list is not empty, otherwise we are going to just create
     // and remove proxy event right away and dereference deleted object
     // afterwards.
+    bool AppendBarrierNeeded = true;
     if (ZeEventsScope == LastCommandInBatchHostVisible &&
         !CommandList->second.EventList.empty()) {
       // If there are only internal events in the command list then we don't
@@ -1391,6 +1392,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
           ZE2UR_CALL(zeCommandListAppendSignalEvent,
                      (CommandList->first, HostVisibleEvent->ZeEvent));
         } else {
+          AppendBarrierNeeded = false;
           ZE2UR_CALL(
               zeCommandListAppendBarrier,
               (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr));
@@ -1404,6 +1406,10 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
       this->signalEventFromCmdListIfLastEventDiscarded(CommandList);
     }
     // Append Signalling of the inner events at the end of the batch
+    if (CommandList->second.EventList.size() > 0 && AppendBarrierNeeded) {
+      ZE2UR_CALL(zeCommandListAppendBarrier,
+                 (CommandList->first, nullptr, 0, nullptr));
+    }
     for (auto &Event : CommandList->second.EventList) {
       if (Event->IsInnerBatchedEvent) {
         ZE2UR_CALL(zeCommandListAppendSignalEvent,

From 8ccf75a1459af3cab8d694d4f6c23d2d7b702927 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Tue, 23 Apr 2024 17:22:26 -0700
Subject: [PATCH 11/19] [L0] Queue Sync given event wait on inner batched event

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/event.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index b030c22d2c..f806d5f988 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -691,7 +691,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait(
 
           ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent;
           logger::debug("ZeEvent = {}", ur_cast<std::uintptr_t>(ZeEvent));
-          ZE2UR_CALL(zeHostSynchronize, (ZeEvent));
+          // If this event was an inner batched event, then lock and sync with
+          // the Queue instead of waiting on the event.
+          if (HostVisibleEvent->IsInnerBatchedEvent && Event->UrQueue) {
+            std::scoped_lock<ur_shared_mutex> Lock(Event->UrQueue->Mutex);
+            UR_CALL(Event->UrQueue->synchronize());
+          } else {
+            ZE2UR_CALL(zeHostSynchronize, (ZeEvent));
+          }
           Event->Completed = true;
         }
       }

From 6812c822cf7133339e55595f65270de2ec5e27af Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Wed, 24 Apr 2024 08:21:38 -0700
Subject: [PATCH 12/19] [L0] Track and sync the executing Queue directly during
 event wait

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/event.cpp  |  5 ++---
 source/adapters/level_zero/event.hpp  |  3 +++
 source/adapters/level_zero/image.cpp  |  3 ++-
 source/adapters/level_zero/kernel.cpp |  3 ++-
 source/adapters/level_zero/memory.cpp | 18 ++++++++++++------
 source/adapters/level_zero/queue.cpp  |  4 +++-
 source/adapters/level_zero/queue.hpp  |  3 ++-
 7 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index f806d5f988..79e2a5447b 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -693,9 +693,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait(
           logger::debug("ZeEvent = {}", ur_cast<std::uintptr_t>(ZeEvent));
           // If this event was an inner batched event, then lock and sync with
           // the Queue instead of waiting on the event.
-          if (HostVisibleEvent->IsInnerBatchedEvent && Event->UrQueue) {
-            std::scoped_lock<ur_shared_mutex> Lock(Event->UrQueue->Mutex);
-            UR_CALL(Event->UrQueue->synchronize());
+          if (HostVisibleEvent->IsInnerBatchedEvent && Event->ZeBatchedQueue) {
+            ZE2UR_CALL(zeHostSynchronize, (Event->ZeBatchedQueue));
           } else {
             ZE2UR_CALL(zeHostSynchronize, (ZeEvent));
           }
diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp
index 35da70a9b5..68608c620d 100644
--- a/source/adapters/level_zero/event.hpp
+++ b/source/adapters/level_zero/event.hpp
@@ -199,6 +199,9 @@ struct ur_event_handle_t_ : _ur_object {
   // Indicates inner batched event which was not used as a signal event.
   bool IsInnerBatchedEvent = {false};
 
+  // Queue where the batched command was executed.
+  ze_command_queue_handle_t ZeBatchedQueue = {nullptr};
+
   // Indicates within creation of proxy event.
   bool IsCreatingHostProxyEvent = {false};
 
diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp
index b4b5662a33..d9cb19c398 100644
--- a/source/adapters/level_zero/image.cpp
+++ b/source/adapters/level_zero/image.cpp
@@ -802,7 +802,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
                                        CommandList, IsInternal,
                                        /*IsMultiDevice*/ false));
   UR_CALL(setSignalEvent(hQueue, UseCopyEngine, &ZeEvent, Event,
-                         numEventsInWaitList, phEventWaitList));
+                         numEventsInWaitList, phEventWaitList,
+                         CommandList->second.ZeQueue));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index f9c7f33cae..65feaae511 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -212,7 +212,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
                                        CommandList, IsInternal, false));
   UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                         NumEventsInWaitList, EventWaitList));
+                         NumEventsInWaitList, EventWaitList,
+                         CommandList->second.ZeQueue));
   (*Event)->WaitList = TmpWaitList;
 
   // Save the kernel in the event, so that when the event is signalled
diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp
index 93752f9bfd..a31e988dea 100644
--- a/source/adapters/level_zero/memory.cpp
+++ b/source/adapters/level_zero/memory.cpp
@@ -72,7 +72,8 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType,
   UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                        IsInternal, false));
   UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                         NumEventsInWaitList, EventWaitList));
+                         NumEventsInWaitList, EventWaitList,
+                         CommandList->second.ZeQueue));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
@@ -124,7 +125,8 @@ ur_result_t enqueueMemCopyRectHelper(
   UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                        IsInternal, false));
   UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                         NumEventsInWaitList, EventWaitList));
+                         NumEventsInWaitList, EventWaitList,
+                         CommandList->second.ZeQueue));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
@@ -235,7 +237,8 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType,
   UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                        IsInternal, false));
   UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                         NumEventsInWaitList, EventWaitList));
+                         NumEventsInWaitList, EventWaitList,
+                         CommandList->second.ZeQueue));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
@@ -327,7 +330,8 @@ static ur_result_t enqueueMemImageCommandHelper(
   UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                        IsInternal, false));
   UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                         NumEventsInWaitList, EventWaitList));
+                         NumEventsInWaitList, EventWaitList,
+                         CommandList->second.ZeQueue));
   (*Event)->WaitList = TmpWaitList;
 
   const auto &ZeCommandList = CommandList->first;
@@ -988,7 +992,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device));
 
     UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                           NumEventsInWaitList, EventWaitList));
+                           NumEventsInWaitList, EventWaitList,
+                           CommandList->second.ZeQueue));
 
     ZE2UR_CALL(zeCommandListAppendMemoryCopy,
                (ZeCommandList, *RetMap, ZeHandleSrc + Offset, Size, ZeEvent,
@@ -1118,7 +1123,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
                               Queue->Device));
 
   UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
-                         NumEventsInWaitList, EventWaitList));
+                         NumEventsInWaitList, EventWaitList,
+                         CommandList->second.ZeQueue));
 
   ZE2UR_CALL(zeCommandListAppendMemoryCopy,
              (ZeCommandList, ZeHandleDst + MapInfo.Offset, MappedPtr,
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index 0d605f2e7a..56a78789fe 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1768,7 +1768,8 @@ bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine,
 ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine,
                            ze_event_handle_t *ZeEvent, ur_event_handle_t *Event,
                            uint32_t NumEventsInWaitList,
-                           const ur_event_handle_t *EventWaitList) {
+                           const ur_event_handle_t *EventWaitList,
+                           ze_command_queue_handle_t ZeQueue) {
   if (Queue->Device->isIntegrated() &&
       eventCanBeBatched(Queue, UseCopyEngine, NumEventsInWaitList,
                         EventWaitList) &&
@@ -1776,6 +1777,7 @@ ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine,
       !UrL0OutOfOrderIntegratedSignalEvent) {
     ZeEvent = nullptr;
     (*Event)->IsInnerBatchedEvent = true;
+    (*Event)->ZeBatchedQueue = ZeQueue;
   } else {
     (*ZeEvent) = (*Event)->ZeEvent;
   }
diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp
index 009220a406..4acfde91d3 100644
--- a/source/adapters/level_zero/queue.hpp
+++ b/source/adapters/level_zero/queue.hpp
@@ -704,7 +704,8 @@ bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine,
 ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine,
                            ze_event_handle_t *ZeEvent, ur_event_handle_t *Event,
                            uint32_t NumEventsInWaitList,
-                           const ur_event_handle_t *EventWaitList);
+                           const ur_event_handle_t *EventWaitList,
+                           ze_command_queue_handle_t ZeQueue);
 
 // Helper function to perform the necessary cleanup of the events from reset cmd
 // list.

From 33bf61cf032a84be2b124858a7b42b2a14188e5c Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Thu, 25 Apr 2024 07:21:51 -0700
Subject: [PATCH 13/19] [L0] fix Append Barrier to only occur when at least one
 inner event exists

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/event.cpp | 2 +-
 source/adapters/level_zero/queue.cpp | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index 79e2a5447b..34c5f5e91b 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -691,7 +691,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait(
 
           ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent;
           logger::debug("ZeEvent = {}", ur_cast<std::uintptr_t>(ZeEvent));
-          // If this event was an inner batched event, then lock and sync with
+          // If this event was an inner batched event, then sync with
           // the Queue instead of waiting on the event.
           if (HostVisibleEvent->IsInnerBatchedEvent && Event->ZeBatchedQueue) {
             ZE2UR_CALL(zeHostSynchronize, (Event->ZeBatchedQueue));
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index 56a78789fe..1e3df20e99 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1406,12 +1406,13 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
       this->signalEventFromCmdListIfLastEventDiscarded(CommandList);
     }
     // Append Signalling of the inner events at the end of the batch
-    if (CommandList->second.EventList.size() > 0 && AppendBarrierNeeded) {
-      ZE2UR_CALL(zeCommandListAppendBarrier,
-                 (CommandList->first, nullptr, 0, nullptr));
-    }
     for (auto &Event : CommandList->second.EventList) {
       if (Event->IsInnerBatchedEvent) {
+        if (AppendBarrierNeeded) {
+          ZE2UR_CALL(zeCommandListAppendBarrier,
+                     (CommandList->first, nullptr, 0, nullptr));
+          AppendBarrierNeeded = false;
+        }
         ZE2UR_CALL(zeCommandListAppendSignalEvent,
                    (CommandList->first, Event->ZeEvent));
       }

From d621b5072923774ccdc4168adf5652bd8f433c9e Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Thu, 25 Apr 2024 10:31:21 -0700
Subject: [PATCH 14/19] [L0] Set UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT as default
 on

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/common.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp
index 02d97a0094..e16d767b71 100644
--- a/source/adapters/level_zero/common.hpp
+++ b/source/adapters/level_zero/common.hpp
@@ -241,10 +241,12 @@ static const uint32_t UrL0QueueSyncNonBlocking = [] {
   return L0QueueSyncLockingModeValue;
 }();
 
+// Controls whether the L0 Adapter creates signal events for commands on
+// integrated gpu devices.
 static const uint32_t UrL0OutOfOrderIntegratedSignalEvent = [] {
   const char *UrL0OutOfOrderIntegratedSignalEventEnv =
       std::getenv("UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT");
-  uint32_t UrL0OutOfOrderIntegratedSignalEventValue = 0;
+  uint32_t UrL0OutOfOrderIntegratedSignalEventValue = 1;
   if (UrL0OutOfOrderIntegratedSignalEventEnv) {
     UrL0OutOfOrderIntegratedSignalEventValue =
         std::atoi(UrL0OutOfOrderIntegratedSignalEventEnv);

From 987c422c668455db04da12acbc5178084823c3b0 Mon Sep 17 00:00:00 2001
From: "Neil R. Spruit" <neil.r.spruit@intel.com>
Date: Fri, 26 Apr 2024 07:46:06 -0700
Subject: [PATCH 15/19] [L0] Reordered/added checks for OOQ signal events and
 added comments

Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
---
 source/adapters/level_zero/context.cpp |  4 ++--
 source/adapters/level_zero/queue.cpp   | 32 ++++++++++++++++----------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index f7946fa002..cb86026508 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -658,8 +658,8 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
   if (Queue->hasOpenCommandList(UseCopyEngine)) {
     if (AllowBatching) {
       bool batchingAllowed = true;
-      if (Queue->Device->isIntegrated() &&
-          !UrL0OutOfOrderIntegratedSignalEvent) {
+      if (!UrL0OutOfOrderIntegratedSignalEvent &&
+          Queue->Device->isIntegrated()) {
         batchingAllowed = eventCanBeBatched(Queue, UseCopyEngine,
                                             NumEventsInWaitList, EventWaitList);
       }
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index 1e3df20e99..011dfc6da0 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1405,16 +1405,25 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
       // If we don't have host visible proxy then signal event if needed.
       this->signalEventFromCmdListIfLastEventDiscarded(CommandList);
     }
-    // Append Signalling of the inner events at the end of the batch
-    for (auto &Event : CommandList->second.EventList) {
-      if (Event->IsInnerBatchedEvent) {
-        if (AppendBarrierNeeded) {
-          ZE2UR_CALL(zeCommandListAppendBarrier,
-                     (CommandList->first, nullptr, 0, nullptr));
-          AppendBarrierNeeded = false;
+    // Append Signalling of the inner events at the end of the batch if this is
+    // an integrated gpu and out of order signal events are not allowed.
+    if (!UrL0OutOfOrderIntegratedSignalEvent && this->Device->isIntegrated()) {
+      for (auto &Event : CommandList->second.EventList) {
+        // If the events scope does not apply a barrier already above, then we
+        // need to apply a barrier to wait on all the previous commands without
+        // signal events to complete before we can signal the batched events as
+        // completed. This functionality is only used if this command list is
+        // out of order and there are events created that were not used as
+        // signal events.
+        if (Event->IsInnerBatchedEvent) {
+          if (AppendBarrierNeeded) {
+            ZE2UR_CALL(zeCommandListAppendBarrier,
+                       (CommandList->first, nullptr, 0, nullptr));
+            AppendBarrierNeeded = false;
+          }
+          ZE2UR_CALL(zeCommandListAppendSignalEvent,
+                     (CommandList->first, Event->ZeEvent));
         }
-        ZE2UR_CALL(zeCommandListAppendSignalEvent,
-                   (CommandList->first, Event->ZeEvent));
       }
     }
 
@@ -1771,11 +1780,10 @@ ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine,
                            uint32_t NumEventsInWaitList,
                            const ur_event_handle_t *EventWaitList,
                            ze_command_queue_handle_t ZeQueue) {
-  if (Queue->Device->isIntegrated() &&
+  if (!UrL0OutOfOrderIntegratedSignalEvent && Queue->Device->isIntegrated() &&
       eventCanBeBatched(Queue, UseCopyEngine, NumEventsInWaitList,
                         EventWaitList) &&
-      !Queue->isInOrderQueue() && !Queue->UsingImmCmdLists &&
-      !UrL0OutOfOrderIntegratedSignalEvent) {
+      !Queue->isInOrderQueue() && !Queue->UsingImmCmdLists) {
     ZeEvent = nullptr;
     (*Event)->IsInnerBatchedEvent = true;
     (*Event)->ZeBatchedQueue = ZeQueue;

From 8f375039dfe8ceaacdb996a5b0d8879c2b907c07 Mon Sep 17 00:00:00 2001
From: Przemek Malon <przemek.malon@codeplay.com>
Date: Wed, 29 Nov 2023 11:25:34 +0000
Subject: [PATCH 16/19] [Bindless][Exp] Add device queries for sampled image
 fetch

Added the following queries for device capabilities of fetching
sampled images:

- DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP
- DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP
- DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP
- DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP
- DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP
- DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP
---
 include/ur_api.h                              | 404 +++++++++---------
 include/ur_print.hpp                          |  90 ++++
 scripts/core/EXP-BINDLESS-IMAGES.rst          |   8 +
 scripts/core/exp-bindless-images.yml          |  18 +
 source/adapters/cuda/device.cpp               |  24 ++
 source/loader/layers/validation/ur_valddi.cpp |   2 +-
 source/loader/ur_libapi.cpp                   |   2 +-
 source/ur_api.cpp                             |   2 +-
 tools/urinfo/urinfo.hpp                       |  18 +
 9 files changed, 369 insertions(+), 199 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index 8680dbeffb..93dd26cd50 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -1434,201 +1434,213 @@ urDeviceGetSelected(
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Supported device info
 typedef enum ur_device_info_t {
-    UR_DEVICE_INFO_TYPE = 0,                                        ///< [::ur_device_type_t] type of the device
-    UR_DEVICE_INFO_VENDOR_ID = 1,                                   ///< [uint32_t] vendor Id of the device
-    UR_DEVICE_INFO_DEVICE_ID = 2,                                   ///< [uint32_t] Id of the device
-    UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3,                           ///< [uint32_t] the number of compute units
-    UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4,                    ///< [uint32_t] max work item dimensions
-    UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5,                         ///< [size_t[]] return an array of max work item sizes
-    UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6,                         ///< [size_t] max work group size
-    UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7,                            ///< [::ur_device_fp_capability_flags_t] single precision floating point
-                                                                    ///< capability
-    UR_DEVICE_INFO_HALF_FP_CONFIG = 8,                              ///< [::ur_device_fp_capability_flags_t] half precision floating point
-                                                                    ///< capability
-    UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9,                            ///< [::ur_device_fp_capability_flags_t] double precision floating point
-                                                                    ///< capability
-    UR_DEVICE_INFO_QUEUE_PROPERTIES = 10,                           ///< [::ur_queue_flags_t] command queue properties supported by the device
-    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11,                ///< [uint32_t] preferred vector width for char
-    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12,               ///< [uint32_t] preferred vector width for short
-    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13,                 ///< [uint32_t] preferred vector width for int
-    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14,                ///< [uint32_t] preferred vector width for long
-    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15,               ///< [uint32_t] preferred vector width for float
-    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16,              ///< [uint32_t] preferred vector width for double
-    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17,                ///< [uint32_t] preferred vector width for half float
-    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18,                   ///< [uint32_t] native vector width for char
-    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19,                  ///< [uint32_t] native vector width for short
-    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20,                    ///< [uint32_t] native vector width for int
-    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21,                   ///< [uint32_t] native vector width for long
-    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22,                  ///< [uint32_t] native vector width for float
-    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23,                 ///< [uint32_t] native vector width for double
-    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24,                   ///< [uint32_t] native vector width for half float
-    UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25,                        ///< [uint32_t] max clock frequency in MHz
-    UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26,                          ///< [uint32_t] memory clock frequency in MHz
-    UR_DEVICE_INFO_ADDRESS_BITS = 27,                               ///< [uint32_t] address bits
-    UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28,                         ///< [uint64_t] max memory allocation size
-    UR_DEVICE_INFO_IMAGE_SUPPORTED = 29,                            ///< [::ur_bool_t] images are supported
-    UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30,                        ///< [uint32_t] max number of image objects arguments of a kernel declared
-                                                                    ///< with the read_only qualifier
-    UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31,                       ///< [uint32_t] max number of image objects arguments of a kernel declared
-                                                                    ///< with the write_only qualifier
-    UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32,                  ///< [uint32_t] max number of image objects arguments of a kernel declared
-                                                                    ///< with the read_write qualifier
-    UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33,                          ///< [size_t] max width of Image2D object
-    UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34,                         ///< [size_t] max height of Image2D object
-    UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35,                          ///< [size_t] max width of Image3D object
-    UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36,                         ///< [size_t] max height of Image3D object
-    UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37,                          ///< [size_t] max depth of Image3D object
-    UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38,                      ///< [size_t] max image buffer size
-    UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39,                       ///< [size_t] max image array size
-    UR_DEVICE_INFO_MAX_SAMPLERS = 40,                               ///< [uint32_t] max number of samplers that can be used in a kernel
-    UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41,                         ///< [size_t] max size in bytes of all arguments passed to a kernel
-    UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42,                        ///< [uint32_t] memory base address alignment
-    UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43,                      ///< [::ur_device_mem_cache_type_t] global memory cache type
-    UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44,                  ///< [uint32_t] global memory cache line size in bytes
-    UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45,                      ///< [uint64_t] size of global memory cache in bytes
-    UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46,                            ///< [uint64_t] size of global memory in bytes
-    UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47,                            ///< [uint64_t] size of global memory which is free in bytes
-    UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48,                   ///< [uint64_t] max constant buffer size in bytes
-    UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49,                          ///< [uint32_t] max number of __const declared arguments in a kernel
-    UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50,                             ///< [::ur_device_local_mem_type_t] local memory type
-    UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51,                             ///< [uint64_t] local memory size in bytes
-    UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52,                   ///< [::ur_bool_t] support error correction to global and local memory
-    UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53,                        ///< [::ur_bool_t] unified host device memory
-    UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54,                 ///< [size_t] profiling timer resolution in nanoseconds
-    UR_DEVICE_INFO_ENDIAN_LITTLE = 55,                              ///< [::ur_bool_t] little endian byte order
-    UR_DEVICE_INFO_AVAILABLE = 56,                                  ///< [::ur_bool_t] device is available
-    UR_DEVICE_INFO_COMPILER_AVAILABLE = 57,                         ///< [::ur_bool_t] device compiler is available
-    UR_DEVICE_INFO_LINKER_AVAILABLE = 58,                           ///< [::ur_bool_t] device linker is available
-    UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59,                     ///< [::ur_device_exec_capability_flags_t] device kernel execution
-                                                                    ///< capability bit-field
-    UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60,                 ///< [::ur_queue_flags_t] device command queue property bit-field
-    UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61,                   ///< [::ur_queue_flags_t] host queue property bit-field
-    UR_DEVICE_INFO_BUILT_IN_KERNELS = 62,                           ///< [char[]] a semi-colon separated list of built-in kernels
-    UR_DEVICE_INFO_PLATFORM = 63,                                   ///< [::ur_platform_handle_t] the platform associated with the device
-    UR_DEVICE_INFO_REFERENCE_COUNT = 64,                            ///< [uint32_t] Reference count of the device object.
-                                                                    ///< The reference count returned should be considered immediately stale.
-                                                                    ///< It is unsuitable for general use in applications. This feature is
-                                                                    ///< provided for identifying memory leaks.
-    UR_DEVICE_INFO_IL_VERSION = 65,                                 ///< [char[]] IL version
-    UR_DEVICE_INFO_NAME = 66,                                       ///< [char[]] Device name
-    UR_DEVICE_INFO_VENDOR = 67,                                     ///< [char[]] Device vendor
-    UR_DEVICE_INFO_DRIVER_VERSION = 68,                             ///< [char[]] Driver version
-    UR_DEVICE_INFO_PROFILE = 69,                                    ///< [char[]] Device profile
-    UR_DEVICE_INFO_VERSION = 70,                                    ///< [char[]] Device version
-    UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71,                    ///< [char[]] Version of backend runtime
-    UR_DEVICE_INFO_EXTENSIONS = 72,                                 ///< [char[]] Return a space separated list of extension names
-    UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73,                         ///< [size_t] Maximum size in bytes of internal printf buffer
-    UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74,                ///< [::ur_bool_t] prefer user synchronization when sharing object with
-                                                                    ///< other API
-    UR_DEVICE_INFO_PARENT_DEVICE = 75,                              ///< [::ur_device_handle_t] return parent device handle
-    UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76,                       ///< [::ur_device_partition_t[]] Returns an array of partition types
-                                                                    ///< supported by the device
-    UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77,                  ///< [uint32_t] maximum number of sub-devices when the device is
-                                                                    ///< partitioned
-    UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78,                  ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the
-                                                                    ///< supported affinity domains for partitioning.
-                                                                    ///< If the device does not support any affinity domains, then 0 will be returned.
-    UR_DEVICE_INFO_PARTITION_TYPE = 79,                             ///< [::ur_device_partition_property_t[]] return an array of
-                                                                    ///< ::ur_device_partition_property_t for properties specified in
-                                                                    ///< ::urDevicePartition
-    UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80,                         ///< [uint32_t] max number of sub groups
-    UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81,     ///< [::ur_bool_t] support sub group independent forward progress
-    UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82,                      ///< [uint32_t[]] return an array of sub group sizes supported on Intel
-                                                                    ///< device
-    UR_DEVICE_INFO_USM_HOST_SUPPORT = 83,                           ///< [::ur_device_usm_access_capability_flags_t] support USM host memory
-                                                                    ///< access
-    UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84,                         ///< [::ur_device_usm_access_capability_flags_t] support USM device memory
-                                                                    ///< access
-    UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85,                  ///< [::ur_device_usm_access_capability_flags_t] support USM single device
-                                                                    ///< shared memory access
-    UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86,                   ///< [::ur_device_usm_access_capability_flags_t] support USM cross device
-                                                                    ///< shared memory access
-    UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87,                  ///< [::ur_device_usm_access_capability_flags_t] support USM system wide
-                                                                    ///< shared memory access
-    UR_DEVICE_INFO_UUID = 88,                                       ///< [uint8_t[]] return device UUID
-    UR_DEVICE_INFO_PCI_ADDRESS = 89,                                ///< [char[]] return device PCI address
-    UR_DEVICE_INFO_GPU_EU_COUNT = 90,                               ///< [uint32_t] return Intel GPU EU count
-    UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91,                          ///< [uint32_t] return Intel GPU EU SIMD width
-    UR_DEVICE_INFO_GPU_EU_SLICES = 92,                              ///< [uint32_t] return Intel GPU number of slices
-    UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93,                  ///< [uint32_t] return Intel GPU EU count per subslice
-    UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94,                    ///< [uint32_t] return Intel GPU number of subslices per slice
-    UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95,                      ///< [uint32_t] return Intel GPU number of threads per EU
-    UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96,                       ///< [uint32_t] return max memory bandwidth in Mb/s
-    UR_DEVICE_INFO_IMAGE_SRGB = 97,                                 ///< [::ur_bool_t] device supports sRGB images
-    UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98,                         ///< [::ur_bool_t] Return true if sub-device should do its own program
-                                                                    ///< build
-    UR_DEVICE_INFO_ATOMIC_64 = 99,                                  ///< [::ur_bool_t] support 64 bit atomics
-    UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100,          ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
-                                                                    ///< memory order capabilities
-    UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101,          ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
-                                                                    ///< memory scope capabilities
-    UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102,           ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
-                                                                    ///< memory fence order capabilities
-    UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103,           ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
-                                                                    ///< memory fence scope capabilities
-    UR_DEVICE_INFO_BFLOAT16 = 104,                                  ///< [::ur_bool_t] support for bfloat16
-    UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105,                 ///< [uint32_t] Returns 1 if the device doesn't have a notion of a
-                                                                    ///< queue index. Otherwise, returns the number of queue indices that are
-                                                                    ///< available for this device.
-    UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106,       ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry
-                                                                    ///< point
-    UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107,                          ///< [uint32_t] return the width in bits of the memory bus interface of the
-                                                                    ///< device.
-    UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108,                        ///< [size_t[3]] return max 3D work groups
-    UR_DEVICE_INFO_ASYNC_BARRIER = 109,                             ///< [::ur_bool_t] return true if Async Barrier is supported
-    UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110,                       ///< [::ur_bool_t] return true if specifying memory channels is supported
-    UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111,            ///< [::ur_bool_t] Return true if the device supports enqueueing commands
-                                                                    ///< to read and write pipes from the host.
-    UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112,              ///< [uint32_t] The maximum number of registers available per block.
-    UR_DEVICE_INFO_IP_VERSION = 113,                                ///< [uint32_t] The device IP version. The meaning of the device IP version
-                                                                    ///< is implementation-defined, but newer devices should have a higher
-                                                                    ///< version than older devices.
-    UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114,                    ///< [::ur_bool_t] return true if the device supports virtual memory.
-    UR_DEVICE_INFO_ESIMD_SUPPORT = 115,                             ///< [::ur_bool_t] return true if the device supports ESIMD.
-    UR_DEVICE_INFO_COMPONENT_DEVICES = 116,                         ///< [::ur_device_handle_t[]] The set of component devices contained by
-                                                                    ///< this composite device.
-    UR_DEVICE_INFO_COMPOSITE_DEVICE = 117,                          ///< [::ur_device_handle_t] The composite device containing this component
-                                                                    ///< device.
-    UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000,             ///< [::ur_bool_t] Returns true if the device supports the use of
-                                                                    ///< command-buffers.
-    UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001,      ///< [::ur_bool_t] Returns true if the device supports updating the kernel
-                                                                    ///< commands in a command-buffer.
-    UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000,            ///< [::ur_bool_t] returns true if the device supports the creation of
-                                                                    ///< bindless images
-    UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of
-                                                                    ///< bindless images backed by shared USM
-    UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002,     ///< [::ur_bool_t] returns true if the device supports the creation of 1D
-                                                                    ///< bindless images backed by USM
-    UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003,     ///< [::ur_bool_t] returns true if the device supports the creation of 2D
-                                                                    ///< bindless images backed by USM
-    UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004,                  ///< [uint32_t] returns the required alignment of the pitch between two
-                                                                    ///< rows of an image in bytes
-    UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005,             ///< [size_t] returns the maximum linear width allowed for images allocated
-                                                                    ///< using USM
-    UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006,            ///< [size_t] returns the maximum linear height allowed for images
-                                                                    ///< allocated using USM
-    UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007,             ///< [size_t] returns the maximum linear pitch allowed for images allocated
-                                                                    ///< using USM
-    UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008,                     ///< [::ur_bool_t] returns true if the device supports allocating mipmap
-                                                                    ///< resources
-    UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009,          ///< [::ur_bool_t] returns true if the device supports sampling mipmap
-                                                                    ///< images with anisotropic filtering
-    UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A,              ///< [uint32_t] returns the maximum anisotropic ratio supported by the
-                                                                    ///< device
-    UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B,     ///< [::ur_bool_t] returns true if the device supports using images created
-                                                                    ///< from individual mipmap levels
-    UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C,      ///< [::ur_bool_t] returns true if the device supports importing external
-                                                                    ///< memory resources
-    UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D,      ///< [::ur_bool_t] returns true if the device supports exporting internal
-                                                                    ///< memory resources
-    UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E,   ///< [::ur_bool_t] returns true if the device supports importing external
-                                                                    ///< semaphore resources
-    UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F,   ///< [::ur_bool_t] returns true if the device supports exporting internal
-                                                                    ///< event resources
-    UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010,                    ///< [::ur_bool_t] returns true if the device supports allocating and
-                                                                    ///< accessing cubemap resources
-    UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device supports sampling cubemapped
-                                                                    ///< images across face boundaries
+    UR_DEVICE_INFO_TYPE = 0,                                         ///< [::ur_device_type_t] type of the device
+    UR_DEVICE_INFO_VENDOR_ID = 1,                                    ///< [uint32_t] vendor Id of the device
+    UR_DEVICE_INFO_DEVICE_ID = 2,                                    ///< [uint32_t] Id of the device
+    UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3,                            ///< [uint32_t] the number of compute units
+    UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4,                     ///< [uint32_t] max work item dimensions
+    UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5,                          ///< [size_t[]] return an array of max work item sizes
+    UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6,                          ///< [size_t] max work group size
+    UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7,                             ///< [::ur_device_fp_capability_flags_t] single precision floating point
+                                                                     ///< capability
+    UR_DEVICE_INFO_HALF_FP_CONFIG = 8,                               ///< [::ur_device_fp_capability_flags_t] half precision floating point
+                                                                     ///< capability
+    UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9,                             ///< [::ur_device_fp_capability_flags_t] double precision floating point
+                                                                     ///< capability
+    UR_DEVICE_INFO_QUEUE_PROPERTIES = 10,                            ///< [::ur_queue_flags_t] command queue properties supported by the device
+    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11,                 ///< [uint32_t] preferred vector width for char
+    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12,                ///< [uint32_t] preferred vector width for short
+    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13,                  ///< [uint32_t] preferred vector width for int
+    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14,                 ///< [uint32_t] preferred vector width for long
+    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15,                ///< [uint32_t] preferred vector width for float
+    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16,               ///< [uint32_t] preferred vector width for double
+    UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17,                 ///< [uint32_t] preferred vector width for half float
+    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18,                    ///< [uint32_t] native vector width for char
+    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19,                   ///< [uint32_t] native vector width for short
+    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20,                     ///< [uint32_t] native vector width for int
+    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21,                    ///< [uint32_t] native vector width for long
+    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22,                   ///< [uint32_t] native vector width for float
+    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23,                  ///< [uint32_t] native vector width for double
+    UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24,                    ///< [uint32_t] native vector width for half float
+    UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25,                         ///< [uint32_t] max clock frequency in MHz
+    UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26,                           ///< [uint32_t] memory clock frequency in MHz
+    UR_DEVICE_INFO_ADDRESS_BITS = 27,                                ///< [uint32_t] address bits
+    UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28,                          ///< [uint64_t] max memory allocation size
+    UR_DEVICE_INFO_IMAGE_SUPPORTED = 29,                             ///< [::ur_bool_t] images are supported
+    UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30,                         ///< [uint32_t] max number of image objects arguments of a kernel declared
+                                                                     ///< with the read_only qualifier
+    UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31,                        ///< [uint32_t] max number of image objects arguments of a kernel declared
+                                                                     ///< with the write_only qualifier
+    UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32,                   ///< [uint32_t] max number of image objects arguments of a kernel declared
+                                                                     ///< with the read_write qualifier
+    UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33,                           ///< [size_t] max width of Image2D object
+    UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34,                          ///< [size_t] max height of Image2D object
+    UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35,                           ///< [size_t] max width of Image3D object
+    UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36,                          ///< [size_t] max height of Image3D object
+    UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37,                           ///< [size_t] max depth of Image3D object
+    UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38,                       ///< [size_t] max image buffer size
+    UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39,                        ///< [size_t] max image array size
+    UR_DEVICE_INFO_MAX_SAMPLERS = 40,                                ///< [uint32_t] max number of samplers that can be used in a kernel
+    UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41,                          ///< [size_t] max size in bytes of all arguments passed to a kernel
+    UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42,                         ///< [uint32_t] memory base address alignment
+    UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43,                       ///< [::ur_device_mem_cache_type_t] global memory cache type
+    UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44,                   ///< [uint32_t] global memory cache line size in bytes
+    UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45,                       ///< [uint64_t] size of global memory cache in bytes
+    UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46,                             ///< [uint64_t] size of global memory in bytes
+    UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47,                             ///< [uint64_t] size of global memory which is free in bytes
+    UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48,                    ///< [uint64_t] max constant buffer size in bytes
+    UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49,                           ///< [uint32_t] max number of __const declared arguments in a kernel
+    UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50,                              ///< [::ur_device_local_mem_type_t] local memory type
+    UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51,                              ///< [uint64_t] local memory size in bytes
+    UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52,                    ///< [::ur_bool_t] support error correction to global and local memory
+    UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53,                         ///< [::ur_bool_t] unified host device memory
+    UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54,                  ///< [size_t] profiling timer resolution in nanoseconds
+    UR_DEVICE_INFO_ENDIAN_LITTLE = 55,                               ///< [::ur_bool_t] little endian byte order
+    UR_DEVICE_INFO_AVAILABLE = 56,                                   ///< [::ur_bool_t] device is available
+    UR_DEVICE_INFO_COMPILER_AVAILABLE = 57,                          ///< [::ur_bool_t] device compiler is available
+    UR_DEVICE_INFO_LINKER_AVAILABLE = 58,                            ///< [::ur_bool_t] device linker is available
+    UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59,                      ///< [::ur_device_exec_capability_flags_t] device kernel execution
+                                                                     ///< capability bit-field
+    UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60,                  ///< [::ur_queue_flags_t] device command queue property bit-field
+    UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61,                    ///< [::ur_queue_flags_t] host queue property bit-field
+    UR_DEVICE_INFO_BUILT_IN_KERNELS = 62,                            ///< [char[]] a semi-colon separated list of built-in kernels
+    UR_DEVICE_INFO_PLATFORM = 63,                                    ///< [::ur_platform_handle_t] the platform associated with the device
+    UR_DEVICE_INFO_REFERENCE_COUNT = 64,                             ///< [uint32_t] Reference count of the device object.
+                                                                     ///< The reference count returned should be considered immediately stale.
+                                                                     ///< It is unsuitable for general use in applications. This feature is
+                                                                     ///< provided for identifying memory leaks.
+    UR_DEVICE_INFO_IL_VERSION = 65,                                  ///< [char[]] IL version
+    UR_DEVICE_INFO_NAME = 66,                                        ///< [char[]] Device name
+    UR_DEVICE_INFO_VENDOR = 67,                                      ///< [char[]] Device vendor
+    UR_DEVICE_INFO_DRIVER_VERSION = 68,                              ///< [char[]] Driver version
+    UR_DEVICE_INFO_PROFILE = 69,                                     ///< [char[]] Device profile
+    UR_DEVICE_INFO_VERSION = 70,                                     ///< [char[]] Device version
+    UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71,                     ///< [char[]] Version of backend runtime
+    UR_DEVICE_INFO_EXTENSIONS = 72,                                  ///< [char[]] Return a space separated list of extension names
+    UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73,                          ///< [size_t] Maximum size in bytes of internal printf buffer
+    UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74,                 ///< [::ur_bool_t] prefer user synchronization when sharing object with
+                                                                     ///< other API
+    UR_DEVICE_INFO_PARENT_DEVICE = 75,                               ///< [::ur_device_handle_t] return parent device handle
+    UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76,                        ///< [::ur_device_partition_t[]] Returns an array of partition types
+                                                                     ///< supported by the device
+    UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77,                   ///< [uint32_t] maximum number of sub-devices when the device is
+                                                                     ///< partitioned
+    UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78,                   ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the
+                                                                     ///< supported affinity domains for partitioning.
+                                                                     ///< If the device does not support any affinity domains, then 0 will be returned.
+    UR_DEVICE_INFO_PARTITION_TYPE = 79,                              ///< [::ur_device_partition_property_t[]] return an array of
+                                                                     ///< ::ur_device_partition_property_t for properties specified in
+                                                                     ///< ::urDevicePartition
+    UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80,                          ///< [uint32_t] max number of sub groups
+    UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81,      ///< [::ur_bool_t] support sub group independent forward progress
+    UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82,                       ///< [uint32_t[]] return an array of sub group sizes supported on Intel
+                                                                     ///< device
+    UR_DEVICE_INFO_USM_HOST_SUPPORT = 83,                            ///< [::ur_device_usm_access_capability_flags_t] support USM host memory
+                                                                     ///< access
+    UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84,                          ///< [::ur_device_usm_access_capability_flags_t] support USM device memory
+                                                                     ///< access
+    UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85,                   ///< [::ur_device_usm_access_capability_flags_t] support USM single device
+                                                                     ///< shared memory access
+    UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86,                    ///< [::ur_device_usm_access_capability_flags_t] support USM cross device
+                                                                     ///< shared memory access
+    UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87,                   ///< [::ur_device_usm_access_capability_flags_t] support USM system wide
+                                                                     ///< shared memory access
+    UR_DEVICE_INFO_UUID = 88,                                        ///< [uint8_t[]] return device UUID
+    UR_DEVICE_INFO_PCI_ADDRESS = 89,                                 ///< [char[]] return device PCI address
+    UR_DEVICE_INFO_GPU_EU_COUNT = 90,                                ///< [uint32_t] return Intel GPU EU count
+    UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91,                           ///< [uint32_t] return Intel GPU EU SIMD width
+    UR_DEVICE_INFO_GPU_EU_SLICES = 92,                               ///< [uint32_t] return Intel GPU number of slices
+    UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93,                   ///< [uint32_t] return Intel GPU EU count per subslice
+    UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94,                     ///< [uint32_t] return Intel GPU number of subslices per slice
+    UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95,                       ///< [uint32_t] return Intel GPU number of threads per EU
+    UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96,                        ///< [uint32_t] return max memory bandwidth in Mb/s
+    UR_DEVICE_INFO_IMAGE_SRGB = 97,                                  ///< [::ur_bool_t] device supports sRGB images
+    UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98,                          ///< [::ur_bool_t] Return true if sub-device should do its own program
+                                                                     ///< build
+    UR_DEVICE_INFO_ATOMIC_64 = 99,                                   ///< [::ur_bool_t] support 64 bit atomics
+    UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100,           ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
+                                                                     ///< memory order capabilities
+    UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101,           ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
+                                                                     ///< memory scope capabilities
+    UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102,            ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
+                                                                     ///< memory fence order capabilities
+    UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103,            ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
+                                                                     ///< memory fence scope capabilities
+    UR_DEVICE_INFO_BFLOAT16 = 104,                                   ///< [::ur_bool_t] support for bfloat16
+    UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105,                  ///< [uint32_t] Returns 1 if the device doesn't have a notion of a
+                                                                     ///< queue index. Otherwise, returns the number of queue indices that are
+                                                                     ///< available for this device.
+    UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106,        ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry
+                                                                     ///< point
+    UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107,                           ///< [uint32_t] return the width in bits of the memory bus interface of the
+                                                                     ///< device.
+    UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108,                         ///< [size_t[3]] return max 3D work groups
+    UR_DEVICE_INFO_ASYNC_BARRIER = 109,                              ///< [::ur_bool_t] return true if Async Barrier is supported
+    UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110,                        ///< [::ur_bool_t] return true if specifying memory channels is supported
+    UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111,             ///< [::ur_bool_t] Return true if the device supports enqueueing commands
+                                                                     ///< to read and write pipes from the host.
+    UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112,               ///< [uint32_t] The maximum number of registers available per block.
+    UR_DEVICE_INFO_IP_VERSION = 113,                                 ///< [uint32_t] The device IP version. The meaning of the device IP version
+                                                                     ///< is implementation-defined, but newer devices should have a higher
+                                                                     ///< version than older devices.
+    UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114,                     ///< [::ur_bool_t] return true if the device supports virtual memory.
+    UR_DEVICE_INFO_ESIMD_SUPPORT = 115,                              ///< [::ur_bool_t] return true if the device supports ESIMD.
+    UR_DEVICE_INFO_COMPONENT_DEVICES = 116,                          ///< [::ur_device_handle_t[]] The set of component devices contained by
+                                                                     ///< this composite device.
+    UR_DEVICE_INFO_COMPOSITE_DEVICE = 117,                           ///< [::ur_device_handle_t] The composite device containing this component
+                                                                     ///< device.
+    UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000,              ///< [::ur_bool_t] Returns true if the device supports the use of
+                                                                     ///< command-buffers.
+    UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001,       ///< [::ur_bool_t] Returns true if the device supports updating the kernel
+                                                                     ///< commands in a command-buffer.
+    UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000,             ///< [::ur_bool_t] returns true if the device supports the creation of
+                                                                     ///< bindless images
+    UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001,  ///< [::ur_bool_t] returns true if the device supports the creation of
+                                                                     ///< bindless images backed by shared USM
+    UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002,      ///< [::ur_bool_t] returns true if the device supports the creation of 1D
+                                                                     ///< bindless images backed by USM
+    UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003,      ///< [::ur_bool_t] returns true if the device supports the creation of 2D
+                                                                     ///< bindless images backed by USM
+    UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004,                   ///< [uint32_t] returns the required alignment of the pitch between two
+                                                                     ///< rows of an image in bytes
+    UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005,              ///< [size_t] returns the maximum linear width allowed for images allocated
+                                                                     ///< using USM
+    UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006,             ///< [size_t] returns the maximum linear height allowed for images
+                                                                     ///< allocated using USM
+    UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007,              ///< [size_t] returns the maximum linear pitch allowed for images allocated
+                                                                     ///< using USM
+    UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008,                      ///< [::ur_bool_t] returns true if the device supports allocating mipmap
+                                                                     ///< resources
+    UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009,           ///< [::ur_bool_t] returns true if the device supports sampling mipmap
+                                                                     ///< images with anisotropic filtering
+    UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A,               ///< [uint32_t] returns the maximum anisotropic ratio supported by the
+                                                                     ///< device
+    UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B,      ///< [::ur_bool_t] returns true if the device supports using images created
+                                                                     ///< from individual mipmap levels
+    UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C,       ///< [::ur_bool_t] returns true if the device supports importing external
+                                                                     ///< memory resources
+    UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D,       ///< [::ur_bool_t] returns true if the device supports exporting internal
+                                                                     ///< memory resources
+    UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E,    ///< [::ur_bool_t] returns true if the device supports importing external
+                                                                     ///< semaphore resources
+    UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F,    ///< [::ur_bool_t] returns true if the device supports exporting internal
+                                                                     ///< event resources
+    UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010,                     ///< [::ur_bool_t] returns true if the device supports allocating and
+                                                                     ///< accessing cubemap resources
+    UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011,  ///< [::ur_bool_t] returns true if the device supports sampling cubemapped
+                                                                     ///< images across face boundaries
+    UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP = 0x2012, ///< [::ur_bool_t] returns true if the device is capable of fetching USM
+                                                                     ///< backed 1D sampled image data.
+    UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP = 0x2013,     ///< [::ur_bool_t] returns true if the device is capable of fetching
+                                                                     ///< non-USM backed 1D sampled image data.
+    UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP = 0x2014, ///< [::ur_bool_t] returns true if the device is capable of fetching USM
+                                                                     ///< backed 2D sampled image data.
+    UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP = 0x2015,     ///< [::ur_bool_t] returns true if the device is capable of fetching
+                                                                     ///< non-USM backed 2D sampled image data.
+    UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP = 0x2016, ///< [::ur_bool_t] returns true if the device is capable of fetching USM
+                                                                     ///< backed 3D sampled image data.
+    UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP = 0x2017,     ///< [::ur_bool_t] returns true if the device is capable of fetching
+                                                                     ///< non-USM backed 3D sampled image data.
     /// @cond
     UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -1654,7 +1666,7 @@ typedef enum ur_device_info_t {
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName`
+///         + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
 ///         + If `propName` is not supported by the adapter.
 ///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index a5074c5da1..3d48ae9a35 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -2553,6 +2553,24 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) {
     case UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP:
         os << "UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP";
         break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP:
+        os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP";
+        break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP:
+        os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP";
+        break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP:
+        os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP";
+        break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP:
+        os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP";
+        break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP:
+        os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP";
+        break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP:
+        os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -4190,6 +4208,78 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info
 
         os << ")";
     } break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: {
+        const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+        if (sizeof(ur_bool_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: {
+        const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+        if (sizeof(ur_bool_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: {
+        const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+        if (sizeof(ur_bool_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: {
+        const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+        if (sizeof(ur_bool_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: {
+        const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+        if (sizeof(ur_bool_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
+    case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: {
+        const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+        if (sizeof(ur_bool_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
     default:
         os << "unknown enumerator";
         return UR_RESULT_ERROR_INVALID_ENUMERATION;
diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst
index af90c1ea0f..ee54c69291 100644
--- a/scripts/core/EXP-BINDLESS-IMAGES.rst
+++ b/scripts/core/EXP-BINDLESS-IMAGES.rst
@@ -91,6 +91,12 @@ Enums
     * ${X}_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP
     * ${X}_DEVICE_INFO_CUBEMAP_SUPPORT_EXP
     * ${X}_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP
+    * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP
+    * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP
+    * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP
+    * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP
+    * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP
+    * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP
 
 * ${x}_command_t
     * ${X}_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP
@@ -198,6 +204,8 @@ Changelog
 +------------------------------------------------------------------------+
 | 10.0     | Added cubemap image type, sampling properties, and device   |
 |          | queries.                                                    |
++------------------------------------------------------------------------+
+| 11.0     | Added device queries for sampled image fetch capabilities.  |
 +----------+-------------------------------------------------------------+
 
 Contributors
diff --git a/scripts/core/exp-bindless-images.yml b/scripts/core/exp-bindless-images.yml
index a6f17b1a74..42c9701433 100644
--- a/scripts/core/exp-bindless-images.yml
+++ b/scripts/core/exp-bindless-images.yml
@@ -92,6 +92,24 @@ etors:
     - name: CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP
       value: "0x2011"
       desc: "[$x_bool_t] returns true if the device supports sampling cubemapped images across face boundaries"
+    - name: BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP
+      value: "0x2012"
+      desc: "[$x_bool_t] returns true if the device is capable of fetching USM backed 1D sampled image data."
+    - name: BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP
+      value: "0x2013"
+      desc: "[$x_bool_t] returns true if the device is capable of fetching non-USM backed 1D sampled image data."
+    - name: BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP
+      value: "0x2014"
+      desc: "[$x_bool_t] returns true if the device is capable of fetching USM backed 2D sampled image data."
+    - name: BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP
+      value: "0x2015"
+      desc: "[$x_bool_t] returns true if the device is capable of fetching non-USM backed 2D sampled image data."
+    - name: BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP
+      value: "0x2016"
+      desc: "[$x_bool_t] returns true if the device is capable of fetching USM backed 3D sampled image data."
+    - name: BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP
+      value: "0x2017"
+      desc: "[$x_bool_t] returns true if the device is capable of fetching non-USM backed 3D sampled image data."
 --- #--------------------------------------------------------------------------
 type: enum
 extend: true
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index da11f3f1bf..cca00c0b85 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -926,6 +926,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     // CUDA supports cubemap seamless filtering.
     return ReturnValue(true);
   }
+  case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: {
+    // CUDA does support fetching 1D USM sampled image data.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: {
+    // CUDA does not support fetching 1D non-USM sampled image data.
+    return ReturnValue(false);
+  }
+  case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: {
+    // CUDA does support fetching 2D USM sampled image data.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: {
+    // CUDA does support fetching 2D non-USM sampled image data.
+    return ReturnValue(true);
+  }
+  case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: {
+    // CUDA does not support 3D USM sampled textures
+    return ReturnValue(false);
+  }
+  case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: {
+    // CUDA does support fetching 3D non-USM sampled image data.
+    return ReturnValue(true);
+  }
   case UR_DEVICE_INFO_DEVICE_ID: {
     int Value = 0;
     UR_CHECK_ERROR(cuDeviceGetAttribute(
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index 1e14552b4e..7939ca21b9 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -496,7 +496,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo(
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
 
-        if (UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName) {
+        if (UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName) {
             return UR_RESULT_ERROR_INVALID_ENUMERATION;
         }
 
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 66f4835c56..dba668e61b 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -842,7 +842,7 @@ ur_result_t UR_APICALL urDeviceGetSelected(
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName`
+///         + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
 ///         + If `propName` is not supported by the adapter.
 ///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index e6410ee99b..7f4746fcb7 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -736,7 +736,7 @@ ur_result_t UR_APICALL urDeviceGetSelected(
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName`
+///         + `::UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP < propName`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
 ///         + If `propName` is not supported by the adapter.
 ///     - ::UR_RESULT_ERROR_INVALID_SIZE
diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp
index 111726f6cc..752a3a839f 100644
--- a/tools/urinfo/urinfo.hpp
+++ b/tools/urinfo/urinfo.hpp
@@ -383,5 +383,23 @@ inline void printDeviceInfos(ur_device_handle_t hDevice,
     std::cout << prefix;
     printDeviceInfo<ur_bool_t>(
         hDevice, UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP);
+    std::cout << prefix;
+    printDeviceInfo<ur_bool_t>(
+        hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP);
+    std::cout << prefix;
+    printDeviceInfo<ur_bool_t>(
+        hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP);
+    std::cout << prefix;
+    printDeviceInfo<ur_bool_t>(
+        hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP);
+    std::cout << prefix;
+    printDeviceInfo<ur_bool_t>(
+        hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP);
+    std::cout << prefix;
+    printDeviceInfo<ur_bool_t>(
+        hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP);
+    std::cout << prefix;
+    printDeviceInfo<ur_bool_t>(
+        hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP);
 }
 } // namespace urinfo

From 388ce451edc2fda561a55996fd54ea1bb7fac21b Mon Sep 17 00:00:00 2001
From: "Kenneth Benzie (Benie)" <k.benzie@codeplay.com>
Date: Wed, 1 May 2024 11:47:16 +0100
Subject: [PATCH 17/19] Update CODEOWNERS to include test directories

---
 .github/CODEOWNERS | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d53176133b..56f72c49e9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,16 +2,21 @@
 
 # Level Zero adapter
 source/adapters/level_zero      @oneapi-src/unified-runtime-level-zero-write
+test/adapters/level_zero        @oneapi-src/unified-runtime-level-zero-write
 
 # CUDA and HIP adapters
 source/adapters/cuda            @oneapi-src/unified-runtime-cuda-write
+test/adapters/cuda              @oneapi-src/unified-runtime-cuda-write
 source/adapters/hip             @oneapi-src/unified-runtime-hip-write
+test/adapters/hip               @oneapi-src/unified-runtime-hip-write
 
 # OpenCL adapter
 source/adapters/opencl          @oneapi-src/unified-runtime-opencl-write
+test/adapters/opencl            @oneapi-src/unified-runtime-opencl-write
 
 # Native CPU adapter
 source/adapters/native_cpu          @oneapi-src/unified-runtime-native-cpu-write
+test/adapters/native_cpu            @oneapi-src/unified-runtime-native-cpu-write
 
 # Command-buffer experimental feature
 source/adapters/**/command_buffer.*     @oneapi-src/unified-runtime-command-buffer-write
@@ -20,6 +25,7 @@ scripts/core/exp-command-buffer.yml     @oneapi-src/unified-runtime-command-buff
 test/conformance/exp_command_buffer**   @oneapi-src/unified-runtime-command-buffer-write
 
 # Bindless Images experimental feature
-scripts/core/EXP-BINDLESS-IMAGES.rst @oneapi-src/unified-runtime-bindless-images-write
-scripts/core/exp-bindless-images.yml @oneapi-src/unified-runtime-bindless-images-write
-source/adapters/**/image.*           @oneapi-src/unified-runtime-bindless-images-write
+source/adapters/**/image.*              @oneapi-src/unified-runtime-bindless-images-write
+scripts/core/EXP-BINDLESS-IMAGES.rst    @oneapi-src/unified-runtime-bindless-images-write
+scripts/core/exp-bindless-images.yml    @oneapi-src/unified-runtime-bindless-images-write
+test/conformance/exp_bindless_images**  @oneapi-src/unified-runtime-bindless-images-write

From d3391b3c6295f115fead3b1dbf6e7703af09ff9d Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 May 2024 10:49:28 +0100
Subject: [PATCH 18/19] [CTS] Allow users to pass build flags to DPC++

These can be passed as a space-separated list of options - analogous to
CMAKE_CXX_FLAGS and co - and are passed to DPC++ when building device
programs.
---
 CMakeLists.txt                              | 1 +
 README.md                                   | 1 +
 test/conformance/device_code/CMakeLists.txt | 7 ++++++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fcd74e729..a8f5f2ad96 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -48,6 +48,7 @@ option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF)
 option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF)
 option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)
 set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
+set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs")
 set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
     "Path of the SYCL runtime library directory")
 set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING
diff --git a/README.md b/README.md
index cb43c380b9..b1c6420420 100644
--- a/README.md
+++ b/README.md
@@ -140,6 +140,7 @@ List of options provided by CMake:
 | UR_HIP_PLATFORM         | Build HIP adapter for AMD or NVIDIA platform           | AMD/NVIDIA | AMD     |
 | UR_ENABLE_COMGR         | Enable comgr lib usage           | AMD/NVIDIA | AMD     |
 | UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` |
+| UR_DPCXX_BUILD_FLAGS | Build flags to pass to DPC++ when compiling device programs | Space-separated options list | `""` |
 | UR_SYCL_LIBRARY_DIR | Path of the SYCL runtime library directory to build CTS device binaries | Directory path | `""` |
 | UR_HIP_ROCM_DIR | Path of the default ROCm HIP installation | Directory path | `/opt/rocm` |
 | UR_HIP_INCLUDE_DIR | Path of the ROCm HIP include directory | Directory path | `${UR_HIP_ROCM_DIR}/include` |
diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt
index ee16b0eb43..26358d49f6 100644
--- a/test/conformance/device_code/CMakeLists.txt
+++ b/test/conformance/device_code/CMakeLists.txt
@@ -52,6 +52,11 @@ macro(add_device_binary SOURCE_FILE)
             set(EXTRA_ENV DYLD_FALLBACK_LIBRARY_PATH=${UR_SYCL_LIBRARY_DIR})
         endif()
     endif()
+
+    # Convert build flags to a regular CMake list, splitting by unquoted white
+    # space as necessary.
+    separate_arguments(DPCXX_BUILD_FLAGS_LIST NATIVE_COMMAND "${UR_DPCXX_BUILD_FLAGS}")
+
     foreach(TRIPLE ${TARGET_TRIPLES})
         set(EXE_PATH "${DEVICE_BINARY_DIR}/${KERNEL_NAME}_${TRIPLE}")
         if(${TRIPLE} MATCHES "amd")
@@ -79,7 +84,7 @@ macro(add_device_binary SOURCE_FILE)
         add_custom_command(OUTPUT ${EXE_PATH}
             COMMAND ${UR_DPCXX} -fsycl -fsycl-targets=${TRIPLE} -fsycl-device-code-split=off 
             ${AMD_TARGET_BACKEND} ${AMD_OFFLOAD_ARCH} ${AMD_NOGPULIB}
-            ${SOURCE_FILE} -o ${EXE_PATH}
+            ${DPCXX_BUILD_FLAGS_LIST} ${SOURCE_FILE} -o ${EXE_PATH}
 
             COMMAND ${CMAKE_COMMAND} -E env ${EXTRA_ENV} SYCL_DUMP_IMAGES=true
             ${EXE_PATH} || exit 0

From 089d914db1779221983fc579364346669b4d6790 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Thu, 2 May 2024 12:23:59 +0100
Subject: [PATCH 19/19] [CTS] Replace use of deprecated <CL/sycl.hpp>

---
 test/conformance/device_code/bar.cpp          | 10 ++---
 test/conformance/device_code/fill.cpp         | 15 ++++---
 test/conformance/device_code/fill_2d.cpp      | 18 ++++----
 test/conformance/device_code/fill_3d.cpp      | 19 ++++-----
 test/conformance/device_code/fill_usm.cpp     | 12 +++---
 test/conformance/device_code/foo.cpp          | 10 ++---
 test/conformance/device_code/image_copy.cpp   | 42 +++++++++----------
 test/conformance/device_code/indexers_usm.cpp | 25 ++++++-----
 test/conformance/device_code/mean.cpp         | 24 +++++------
 test/conformance/device_code/saxpy.cpp        | 25 +++++------
 test/conformance/device_code/saxpy_usm.cpp    | 16 +++----
 11 files changed, 103 insertions(+), 113 deletions(-)

diff --git a/test/conformance/device_code/bar.cpp b/test/conformance/device_code/bar.cpp
index fecac40c75..58f2696bf8 100644
--- a/test/conformance/device_code/bar.cpp
+++ b/test/conformance/device_code/bar.cpp
@@ -3,14 +3,14 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
-    cl::sycl::queue deviceQueue;
-    cl::sycl::range<1> numOfItems{1};
+    sycl::queue deviceQueue;
+    sycl::range<1> numOfItems{1};
 
-    deviceQueue.submit([&](cl::sycl::handler &cgh) {
-        auto kern = [=](cl::sycl::id<1>) {};
+    deviceQueue.submit([&](sycl::handler &cgh) {
+        auto kern = [=](sycl::id<1>) {};
         cgh.parallel_for<class Bar>(numOfItems, kern);
     });
 
diff --git a/test/conformance/device_code/fill.cpp b/test/conformance/device_code/fill.cpp
index 443373edf2..fabcbcf8ec 100644
--- a/test/conformance/device_code/fill.cpp
+++ b/test/conformance/device_code/fill.cpp
@@ -3,19 +3,18 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
     size_t array_size = 16;
     std::vector<uint32_t> A(array_size, 1);
     uint32_t val = 42;
-    cl::sycl::queue sycl_queue;
-    auto A_buff =
-        cl::sycl::buffer<uint32_t>(A.data(), cl::sycl::range<1>(array_size));
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
-        auto A_acc = A_buff.get_access<cl::sycl::access::mode::write>(cgh);
-        cgh.parallel_for<class fill>(cl::sycl::range<1>{array_size},
-                                     [A_acc, val](cl::sycl::item<1> itemId) {
+    sycl::queue sycl_queue;
+    auto A_buff = sycl::buffer<uint32_t>(A.data(), sycl::range<1>(array_size));
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        auto A_acc = A_buff.get_access<sycl::access::mode::write>(cgh);
+        cgh.parallel_for<class fill>(sycl::range<1>{array_size},
+                                     [A_acc, val](sycl::item<1> itemId) {
                                          auto id = itemId.get_id(0);
                                          A_acc[id] = val;
                                      });
diff --git a/test/conformance/device_code/fill_2d.cpp b/test/conformance/device_code/fill_2d.cpp
index d4dd6c704f..5fed417ed8 100644
--- a/test/conformance/device_code/fill_2d.cpp
+++ b/test/conformance/device_code/fill_2d.cpp
@@ -3,24 +3,24 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
     size_t nd_range_x = 8;
     size_t nd_range_y = 8;
-    auto nd_range = cl::sycl::range<2>(nd_range_x, nd_range_y);
+    auto nd_range = sycl::range<2>(nd_range_x, nd_range_y);
 
     std::vector<uint32_t> A(nd_range_x * nd_range_y, 1);
     uint32_t val = 42;
-    cl::sycl::queue sycl_queue;
+    sycl::queue sycl_queue;
 
-    auto work_range = cl::sycl::nd_range<2>(nd_range, cl::sycl::range<2>(1, 1));
-    auto A_buff = cl::sycl::buffer<uint32_t>(
-        A.data(), cl::sycl::range<1>(nd_range_x * nd_range_y));
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
-        auto A_acc = A_buff.get_access<cl::sycl::access::mode::write>(cgh);
+    auto work_range = sycl::nd_range<2>(nd_range, sycl::range<2>(1, 1));
+    auto A_buff = sycl::buffer<uint32_t>(
+        A.data(), sycl::range<1>(nd_range_x * nd_range_y));
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        auto A_acc = A_buff.get_access<sycl::access::mode::write>(cgh);
         cgh.parallel_for<class fill_2d>(
-            work_range, [A_acc, val](cl::sycl::nd_item<2> item_id) {
+            work_range, [A_acc, val](sycl::nd_item<2> item_id) {
                 auto id = item_id.get_global_linear_id();
                 A_acc[id] = val;
             });
diff --git a/test/conformance/device_code/fill_3d.cpp b/test/conformance/device_code/fill_3d.cpp
index a1f172ba6a..fd835222a2 100644
--- a/test/conformance/device_code/fill_3d.cpp
+++ b/test/conformance/device_code/fill_3d.cpp
@@ -3,26 +3,25 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
     size_t nd_range_x = 4;
     size_t nd_range_y = 4;
     size_t nd_range_z = 4;
-    auto nd_range = cl::sycl::range<3>(nd_range_x, nd_range_y, nd_range_z);
+    auto nd_range = sycl::range<3>(nd_range_x, nd_range_y, nd_range_z);
 
     std::vector<uint32_t> A(nd_range_x * nd_range_y * nd_range_y, 1);
     uint32_t val = 42;
-    cl::sycl::queue sycl_queue;
+    sycl::queue sycl_queue;
 
-    auto work_range =
-        cl::sycl::nd_range<3>(nd_range, cl::sycl::range<3>(1, 1, 1));
-    auto A_buff = cl::sycl::buffer<uint32_t>(
-        A.data(), cl::sycl::range<1>(nd_range_x * nd_range_y));
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
-        auto A_acc = A_buff.get_access<cl::sycl::access::mode::write>(cgh);
+    auto work_range = sycl::nd_range<3>(nd_range, sycl::range<3>(1, 1, 1));
+    auto A_buff = sycl::buffer<uint32_t>(
+        A.data(), sycl::range<1>(nd_range_x * nd_range_y));
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        auto A_acc = A_buff.get_access<sycl::access::mode::write>(cgh);
         cgh.parallel_for<class fill_3d>(
-            work_range, [A_acc, val](cl::sycl::nd_item<3> item_id) {
+            work_range, [A_acc, val](sycl::nd_item<3> item_id) {
                 auto id = item_id.get_global_linear_id();
                 A_acc[id] = val;
             });
diff --git a/test/conformance/device_code/fill_usm.cpp b/test/conformance/device_code/fill_usm.cpp
index 92cd255399..d57309cb59 100644
--- a/test/conformance/device_code/fill_usm.cpp
+++ b/test/conformance/device_code/fill_usm.cpp
@@ -3,17 +3,17 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
     size_t array_size = 16;
     std::vector<uint32_t> A(array_size, 1);
     uint32_t val = 42;
-    cl::sycl::queue sycl_queue;
-    uint32_t *data = cl::sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
-        cgh.parallel_for<class fill_usm>(cl::sycl::range<1>{array_size},
-                                         [data, val](cl::sycl::item<1> itemId) {
+    sycl::queue sycl_queue;
+    uint32_t *data = sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        cgh.parallel_for<class fill_usm>(sycl::range<1>{array_size},
+                                         [data, val](sycl::item<1> itemId) {
                                              auto id = itemId.get_id(0);
                                              data[id] = val;
                                          });
diff --git a/test/conformance/device_code/foo.cpp b/test/conformance/device_code/foo.cpp
index dc108b9606..20ad92be4b 100644
--- a/test/conformance/device_code/foo.cpp
+++ b/test/conformance/device_code/foo.cpp
@@ -3,14 +3,14 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
-    cl::sycl::queue deviceQueue;
-    cl::sycl::range<1> numOfItems{1};
+    sycl::queue deviceQueue;
+    sycl::range<1> numOfItems{1};
 
-    deviceQueue.submit([&](cl::sycl::handler &cgh) {
-        auto kern = [=](cl::sycl::id<1>) {};
+    deviceQueue.submit([&](sycl::handler &cgh) {
+        auto kern = [=](sycl::id<1>) {};
         cgh.parallel_for<class Foo>(numOfItems, kern);
     });
 
diff --git a/test/conformance/device_code/image_copy.cpp b/test/conformance/device_code/image_copy.cpp
index a64b601213..d04398c2fe 100644
--- a/test/conformance/device_code/image_copy.cpp
+++ b/test/conformance/device_code/image_copy.cpp
@@ -3,44 +3,40 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
-    cl::sycl::queue sycl_queue;
+    sycl::queue sycl_queue;
 
     const int height = 8;
     const int width = 8;
-    auto image_range = cl::sycl::range<2>(height, width);
+    auto image_range = sycl::range<2>(height, width);
     const int channels = 4;
     std::vector<float> in_data(height * width * channels, 0.5f);
     std::vector<float> out_data(height * width * channels, 0);
 
-    cl::sycl::image<2> image_in(
-        in_data.data(), cl::sycl::image_channel_order::rgba,
-        cl::sycl::image_channel_type::fp32, image_range);
-    cl::sycl::image<2> image_out(
-        out_data.data(), cl::sycl::image_channel_order::rgba,
-        cl::sycl::image_channel_type::fp32, image_range);
+    sycl::image<2> image_in(in_data.data(), sycl::image_channel_order::rgba,
+                            sycl::image_channel_type::fp32, image_range);
+    sycl::image<2> image_out(out_data.data(), sycl::image_channel_order::rgba,
+                             sycl::image_channel_type::fp32, image_range);
 
-    auto work_range =
-        cl::sycl::nd_range<2>(image_range, cl::sycl::range<2>(1, 1));
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
-        cl::sycl::accessor<cl::sycl::float4, 2, cl::sycl::access::mode::read,
-                           cl::sycl::access::target::image>
+    auto work_range = sycl::nd_range<2>(image_range, sycl::range<2>(1, 1));
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        sycl::accessor<sycl::float4, 2, sycl::access::mode::read,
+                       sycl::access::target::image>
             in_acc(image_in, cgh);
-        cl::sycl::accessor<cl::sycl::float4, 2, cl::sycl::access::mode::write,
-                           cl::sycl::access::target::image>
+        sycl::accessor<sycl::float4, 2, sycl::access::mode::write,
+                       sycl::access::target::image>
             out_acc(image_out, cgh);
 
-        cl::sycl::sampler smpl(
-            cl::sycl::coordinate_normalization_mode::unnormalized,
-            cl::sycl::addressing_mode::clamp,
-            cl::sycl::filtering_mode::nearest);
+        sycl::sampler smpl(sycl::coordinate_normalization_mode::unnormalized,
+                           sycl::addressing_mode::clamp,
+                           sycl::filtering_mode::nearest);
 
         cgh.parallel_for<class image_copy>(
-            work_range, [=](cl::sycl::nd_item<2> item_id) {
-                auto coords = cl::sycl::int2(item_id.get_global_id(0),
-                                             item_id.get_global_id(1));
+            work_range, [=](sycl::nd_item<2> item_id) {
+                auto coords = sycl::int2(item_id.get_global_id(0),
+                                         item_id.get_global_id(1));
                 out_acc.write(coords, in_acc.read(coords, smpl));
             });
     });
diff --git a/test/conformance/device_code/indexers_usm.cpp b/test/conformance/device_code/indexers_usm.cpp
index 76b0751730..e055fa47cc 100644
--- a/test/conformance/device_code/indexers_usm.cpp
+++ b/test/conformance/device_code/indexers_usm.cpp
@@ -3,25 +3,24 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
-    const cl::sycl::range<3> global_range(8, 8, 8);
-    const cl::sycl::range<3> local_range(2, 2, 2);
-    const cl::sycl::id<3> global_offset(4, 4, 4);
-    const cl::sycl::nd_range<3> nd_range(global_range, local_range,
-                                         global_offset);
+    const sycl::range<3> global_range(8, 8, 8);
+    const sycl::range<3> local_range(2, 2, 2);
+    const sycl::id<3> global_offset(4, 4, 4);
+    const sycl::nd_range<3> nd_range(global_range, local_range, global_offset);
 
-    cl::sycl::queue sycl_queue;
+    sycl::queue sycl_queue;
     const size_t elements_per_work_item = 6;
-    int *ptr = cl::sycl::malloc_shared<int>(global_range[0] * global_range[1] *
-                                                global_range[2] *
-                                                elements_per_work_item,
-                                            sycl_queue);
+    int *ptr =
+        sycl::malloc_shared<int>(global_range[0] * global_range[1] *
+                                     global_range[2] * elements_per_work_item,
+                                 sycl_queue);
 
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
+    sycl_queue.submit([&](sycl::handler &cgh) {
         cgh.parallel_for<class indexers>(
-            nd_range, [ptr](cl::sycl::nd_item<3> index) {
+            nd_range, [ptr](sycl::nd_item<3> index) {
                 int *wi_ptr =
                     ptr + index.get_global_linear_id() * elements_per_work_item;
 
diff --git a/test/conformance/device_code/mean.cpp b/test/conformance/device_code/mean.cpp
index 61623e0914..6d5a571374 100644
--- a/test/conformance/device_code/mean.cpp
+++ b/test/conformance/device_code/mean.cpp
@@ -3,34 +3,34 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
     const int array_size = 16;
     const int wg_size = 4;
     std::vector<uint32_t> in(array_size * wg_size, 1);
     std::vector<uint32_t> out(array_size, 0);
-    cl::sycl::queue sycl_queue;
-    auto in_buff = cl::sycl::buffer<uint32_t>(
-        in.data(), cl::sycl::range<1>(array_size * wg_size));
+    sycl::queue sycl_queue;
+    auto in_buff =
+        sycl::buffer<uint32_t>(in.data(), sycl::range<1>(array_size * wg_size));
     auto out_buff =
-        cl::sycl::buffer<uint32_t>(out.data(), cl::sycl::range<1>(array_size));
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
+        sycl::buffer<uint32_t>(out.data(), sycl::range<1>(array_size));
+    sycl_queue.submit([&](sycl::handler &cgh) {
         sycl::local_accessor<uint32_t> local_mem(wg_size, cgh);
-        auto in_acc = in_buff.get_access<cl::sycl::access::mode::read>(cgh);
-        auto out_acc = out_buff.get_access<cl::sycl::access::mode::write>(cgh);
+        auto in_acc = in_buff.get_access<sycl::access::mode::read>(cgh);
+        auto out_acc = out_buff.get_access<sycl::access::mode::write>(cgh);
 
-        cl::sycl::range<1> num_groups{array_size};
-        cl::sycl::range<1> group_size{wg_size};
+        sycl::range<1> num_groups{array_size};
+        sycl::range<1> group_size{wg_size};
         cgh.parallel_for_work_group<class mean>(
-            num_groups, group_size, [=](cl::sycl::group<1> group) {
+            num_groups, group_size, [=](sycl::group<1> group) {
                 auto group_id = group.get_group_id();
                 group.parallel_for_work_item([&](sycl::h_item<1> item) {
                     auto local_id = item.get_local_id(0);
                     auto in_index = (group_id * wg_size) + local_id;
                     local_mem[local_id] = in_acc[in_index];
                 });
-                cl::sycl::group_barrier(group);
+                sycl::group_barrier(group);
                 uint32_t total = 0;
                 for (int i = 0; i < wg_size; i++) {
                     total += local_mem[i];
diff --git a/test/conformance/device_code/saxpy.cpp b/test/conformance/device_code/saxpy.cpp
index 593e8e2435..ac113884b9 100644
--- a/test/conformance/device_code/saxpy.cpp
+++ b/test/conformance/device_code/saxpy.cpp
@@ -3,7 +3,7 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
     size_t array_size = 16;
@@ -11,20 +11,17 @@ int main() {
     std::vector<uint32_t> Y(array_size, 2);
     std::vector<uint32_t> Z(array_size, 0);
     uint32_t A = 42;
-    auto x_buff =
-        cl::sycl::buffer<uint32_t>(X.data(), cl::sycl::range<1>(array_size));
-    auto y_buff =
-        cl::sycl::buffer<uint32_t>(Y.data(), cl::sycl::range<1>(array_size));
-    auto z_buff =
-        cl::sycl::buffer<uint32_t>(Z.data(), cl::sycl::range<1>(array_size));
+    auto x_buff = sycl::buffer<uint32_t>(X.data(), sycl::range<1>(array_size));
+    auto y_buff = sycl::buffer<uint32_t>(Y.data(), sycl::range<1>(array_size));
+    auto z_buff = sycl::buffer<uint32_t>(Z.data(), sycl::range<1>(array_size));
 
-    cl::sycl::queue sycl_queue;
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
-        auto x_acc = x_buff.get_access<cl::sycl::access::mode::read>(cgh);
-        auto y_acc = y_buff.get_access<cl::sycl::access::mode::read>(cgh);
-        auto z_acc = z_buff.get_access<cl::sycl::access::mode::write>(cgh);
-        cgh.parallel_for<class saxpy>(cl::sycl::range<1>{array_size},
-                                      [=](cl::sycl::item<1> itemId) {
+    sycl::queue sycl_queue;
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        auto x_acc = x_buff.get_access<sycl::access::mode::read>(cgh);
+        auto y_acc = y_buff.get_access<sycl::access::mode::read>(cgh);
+        auto z_acc = z_buff.get_access<sycl::access::mode::write>(cgh);
+        cgh.parallel_for<class saxpy>(sycl::range<1>{array_size},
+                                      [=](sycl::item<1> itemId) {
                                           auto i = itemId.get_id(0);
                                           z_acc[i] = A * x_acc[i] + y_acc[i];
                                       });
diff --git a/test/conformance/device_code/saxpy_usm.cpp b/test/conformance/device_code/saxpy_usm.cpp
index 8772a7e25d..774686ab21 100644
--- a/test/conformance/device_code/saxpy_usm.cpp
+++ b/test/conformance/device_code/saxpy_usm.cpp
@@ -3,20 +3,20 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
     size_t array_size = 16;
 
-    cl::sycl::queue sycl_queue;
-    uint32_t *X = cl::sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
-    uint32_t *Y = cl::sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
-    uint32_t *Z = cl::sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
+    sycl::queue sycl_queue;
+    uint32_t *X = sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
+    uint32_t *Y = sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
+    uint32_t *Z = sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
     uint32_t A = 42;
 
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
-        cgh.parallel_for<class saxpy>(cl::sycl::range<1>{array_size},
-                                      [=](cl::sycl::item<1> itemId) {
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        cgh.parallel_for<class saxpy>(sycl::range<1>{array_size},
+                                      [=](sycl::item<1> itemId) {
                                           auto i = itemId.get_id(0);
                                           Z[i] = A * X[i] + Y[i];
                                       });