diff --git a/.github/workflows/benchmarks_core.yml b/.github/workflows/benchmarks_compute.yml
similarity index 69%
rename from .github/workflows/benchmarks_core.yml
rename to .github/workflows/benchmarks_compute.yml
index 34ffb7134e..2264b315b9 100644
--- a/.github/workflows/benchmarks_core.yml
+++ b/.github/workflows/benchmarks_compute.yml
@@ -1,39 +1,39 @@
 name: Compute Benchmarks
 
 on:
-  # this workflow can by only triggered by other workflows
-  # for example by: e2e_cuda.yml or e2e_opencl.yml
-  workflow_call:
-    # acceptable input from adapter-specific workflows
+  # Can be triggered via manual "dispatch" (from workflow view in GitHub Actions tab)
+  workflow_dispatch:
+    # acceptable input for adapter-specific runs
     inputs:
-      name:
-        description: Adapter name
-        type: string
-        required: true
       str_name:
         description: Formatted adapter name
-        type: string
-        required: true
-      config:
-        description: Params for sycl configuration
-        type: string
+        type: choice
         required: true
+        default: 'level_zero'
+        options:
+          - level_zero
       unit:
         description: Test unit (cpu/gpu)
-        type: string
+        type: choice
         required: true
-      runner_tag:
-        description: Tag defined for the runner
-        type: string
+        default: 'gpu'
+        options:
+          - cpu
+          - gpu
+      pr_no:
+        description: PR number (if 0, it'll run on the main)
+        type: number
         required: true
-      trigger:
-        description: Type of workflow trigger
+      bench_script_params:
+        description: Parameters passed to script executing benchmark
         type: string
-        required: true
-      comment:
-        description: Text if triggered by a comment
+        required: false
+        default: ''
+      sycl_config_params:
+        description: Extra params for SYCL configuration
         type: string
         required: false
+        default: ''
 
 permissions:
   contents: read
@@ -41,20 +41,20 @@ permissions:
 
 jobs:
   e2e-build-hw:
-    if: github.repository == 'oneapi-src/unified-runtime'  # run only on upstream; forks will not have the HW
+    # Run only on upstream; forks will not have the HW
+    if: github.repository == 'oneapi-src/unified-runtime'
     name: Build SYCL, UR, run Compute Benchmarks
     strategy:
       matrix:
         adapter: [
-          {name: "${{inputs.name}}",
-          str_name: "${{inputs.str_name}}",
-          config: "${{inputs.config}}",
+          {str_name: "${{inputs.str_name}}",
+          sycl_config: "${{inputs.sycl_config_params}}",
           unit: "${{inputs.unit}}"}
         ]
         build_type: [Release]
         compiler: [{c: clang, cxx: clang++}]
 
-    runs-on: ${{inputs.runner_tag}}
+    runs-on: "${{inputs.str_name}}_PERF"
 
     steps:
     # Workspace on self-hosted runners is not cleaned automatically.
@@ -67,15 +67,17 @@ jobs:
 
     - name: Add comment to PR
       uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-      if: ${{ always() && inputs.trigger != 'schedule' }}
+      if: ${{ always() && inputs.pr_no != 0 }}
       with:
         script: |
-          const adapter = '${{ matrix.adapter.name }}';
+          const pr_no = '${{ inputs.pr_no }}';
+          const adapter = '${{ matrix.adapter.str_name }}';
           const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
-          const body = `Compute Benchmarks ${adapter} run: \n${url}`;
+          const params = ${{ toJSON(inputs.bench_script_params) }}; // JSON-encoded: free-form input must not be able to break out of a JS string literal
+          const body = `Compute Benchmarks ${adapter} run (with params: ${params}):\n${url}`;
 
           github.rest.issues.createComment({
-            issue_number: context.issue.number,
+            issue_number: pr_no,
             owner: context.repo.owner,
             repo: context.repo.repo,
             body: body
@@ -86,13 +88,12 @@ jobs:
       with:
         path: ur-repo
 
-    # On issue_comment trigger (for PRs) we need to fetch special ref for
-    # proper PR's merge commit. Note, this ref may be absent if the PR is already merged.
+    # Fetch the special ref that holds the PR's merge commit. Note: this ref may be absent if the PR is already merged.
     - name: Fetch PR's merge commit
-      if: ${{ inputs.trigger != 'schedule' }}
+      if: ${{ inputs.pr_no != 0 }}
       working-directory: ${{github.workspace}}/ur-repo
       env:
-        PR_NO: ${{github.event.issue.number}}
+        PR_NO: ${{ inputs.pr_no }}
       run: |
         git fetch -- https://github.com/${{github.repository}} +refs/pull/${PR_NO}/*:refs/remotes/origin/pr/${PR_NO}/*
         git checkout origin/pr/${PR_NO}/merge
@@ -108,7 +109,7 @@ jobs:
         fetch-tags: false
 
     - name: Set CUDA env vars
-      if: matrix.adapter.name == 'CUDA'
+      if: matrix.adapter.str_name == 'cuda'
       run: |
         echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV
         echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
@@ -119,7 +120,7 @@ jobs:
         -t ${{matrix.build_type}}
         -o ${{github.workspace}}/sycl_build
         --cmake-gen "Ninja"
-        --ci-defaults ${{matrix.adapter.config}}
+        --ci-defaults ${{matrix.adapter.sycl_config}}
         --cmake-opt="-DLLVM_INSTALL_UTILS=ON"
         --cmake-opt="-DSYCL_PI_TESTS=OFF"
         --cmake-opt="-DSYCL_PI_UR_USE_FETCH_CONTENT=OFF"
@@ -165,17 +166,13 @@ jobs:
       run: |
         echo "ONEAPI_DEVICE_SELECTOR=${{ matrix.adapter.str_name }}:${{ matrix.adapter.unit }}" >> $GITHUB_ENV
 
-    - name: Extract arguments from comment
-      id: args
-      run: echo "ARGS=$(echo '${{ inputs.comment }}' | sed -n 's/.*\/benchmarks-[^ ]* \(.*\)/\1/p')" >> $GITHUB_ENV
-
     - name: Run SYCL API Overhead benchmark
       id: benchmarks
-      run: ${{github.workspace}}/ur-repo/.github/scripts/compute_benchmarks.py ${{github.workspace}}/compute-benchmarks-build/bin/ $ARGS
+      run: ${{ github.workspace }}/ur-repo/.github/scripts/compute_benchmarks.py ${{ github.workspace }}/compute-benchmarks-build/bin/ ${{ inputs.bench_script_params }}
 
     - name: Add comment to PR
       uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-      if: ${{ always() && inputs.trigger != 'schedule' }}
+      if: ${{ always() && inputs.pr_no != 0 }}
       with:
         script: |
           let markdown = ""
@@ -185,14 +182,16 @@ jobs:
           } catch(err) {
           }
 
-          const adapter = '${{ matrix.adapter.name }}';
+          const pr_no = '${{ inputs.pr_no }}';
+          const adapter = '${{ matrix.adapter.str_name }}';
           const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
           const test_status = '${{ steps.benchmarks.outcome }}';
           const job_status = '${{ job.status }}';
-          const body = `Compute Benchmarks ${adapter} run:\n${url}\nJob status: ${job_status}. Test status: ${test_status}.\n ${markdown}`;
+          const params = ${{ toJSON(inputs.bench_script_params) }}; // JSON-encoded: free-form input must not be able to break out of a JS string literal
+          const body = `Compute Benchmarks ${adapter} run (with params: ${params}):\n${url}\nJob status: ${job_status}. Test status: ${test_status}.\n ${markdown}`;
 
           github.rest.issues.createComment({
-            issue_number: context.issue.number,
+            issue_number: pr_no,
             owner: context.repo.owner,
             repo: context.repo.repo,
             body: body
diff --git a/.github/workflows/benchmarks_level_zero.yml b/.github/workflows/benchmarks_level_zero.yml
deleted file mode 100644
index 6b38aaeedd..0000000000
--- a/.github/workflows/benchmarks_level_zero.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: Compute Benchmarks Level Zero
-
-on:
-  issue_comment:
-    types: [created, edited]
-
-permissions:
-  contents: read
-  pull-requests: write
-
-jobs:
-  e2e-build-hw:
-    # trigger only if PR comment contains "benchmarks-level-zero"
-    if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/benchmarks-level-zero'))}}
-    name: Start benchmarks job
-    # use core flow, run it with L0 specific parameters
-    uses: ./.github/workflows/benchmarks_core.yml
-    # parameters that we pass to the core flow
-    with:
-      name: "L0"
-      runner_tag: "L0_PERF"
-      str_name: "level_zero"
-      config: ""
-      unit: "gpu"
-      trigger: "${{github.event_name}}"
-      comment: ${{github.event.comment.body}}
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 6f04308154..49e68c6b86 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -8,6 +8,7 @@ concurrency:
 
 permissions:
   contents: read
+  pull-requests: write
 
 jobs:
   ubuntu-build:
@@ -161,14 +162,12 @@ jobs:
       run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-short" --verbose
 
   level-zero:
-    if: github.repository == 'oneapi-src/unified-runtime'
     name: Level Zero
     uses: ./.github/workflows/build-hw-reusable.yml
     with:
       name: L0
 
   opencl:
-    if: github.repository == 'oneapi-src/unified-runtime'
     name: OpenCL
     uses: ./.github/workflows/build-hw-reusable.yml
     with:
@@ -176,26 +175,47 @@ jobs:
       platform: "Intel(R) OpenCL"
 
   cuda:
-    if: github.repository == 'oneapi-src/unified-runtime'
     name: CUDA
     uses: ./.github/workflows/build-hw-reusable.yml
     with:
       name: CUDA
 
   hip:
-    if: github.repository == 'oneapi-src/unified-runtime'
     name: HIP
     uses: ./.github/workflows/build-hw-reusable.yml
     with:
       name: HIP
 
   native-cpu:
-    if: github.repository == 'oneapi-src/unified-runtime'
     name: Native CPU
     uses: ./.github/workflows/build-hw-reusable.yml
     with:
       name: NATIVE_CPU
 
+  e2e-level-zero:
+    name: E2E L0
+    permissions:
+      contents: read
+      pull-requests: write
+    needs: [ubuntu-build, level-zero]
+    uses: ./.github/workflows/e2e_level_zero.yml
+
+  e2e-opencl:
+    name: E2E OpenCL
+    permissions:
+      contents: read
+      pull-requests: write
+    needs: [ubuntu-build, opencl]
+    uses: ./.github/workflows/e2e_opencl.yml
+
+  e2e-cuda:
+    name: E2E CUDA
+    permissions:
+      contents: read
+      pull-requests: write
+    needs: [ubuntu-build, cuda]
+    uses: ./.github/workflows/e2e_cuda.yml
+
   windows-build:
     name: Build - Windows
     strategy:
@@ -211,6 +231,9 @@ jobs:
            adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'}
          - adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'}
            compiler: {c: clang-cl, cxx: clang-cl}
+         # TODO: testing is flaky on windows-2022 in Release mode
+         - os: 'windows-2022'
+           build_type: Release
 
         build_type: [Debug, Release]
         compiler: [{c: cl, cxx: cl}, {c: clang-cl, cxx: clang-cl}]
@@ -262,7 +285,6 @@ jobs:
       working-directory: ${{github.workspace}}/build
       run: ctest -C ${{matrix.build_type}} --output-on-failure -L "umf|loader|validation|tracing|unit|urtrace"
 
-
   macos-build:
     name: Build - MacOS
     strategy:
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index f1ae35587f..b0ed45d6b5 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -1,6 +1,6 @@
 name: "CodeQL"
 
-on: [push, pull_request]
+on: [push]
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -24,7 +24,7 @@ jobs:
       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
 
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2
+      uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7
       with:
         languages: cpp, python
 
@@ -38,7 +38,7 @@ jobs:
       run: cmake --build ${{github.workspace}}/build -j $(nproc)
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2
+      uses: github/codeql-action/analyze@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7
 
   analyze-windows:
     name: Analyze on Windows
@@ -54,7 +54,7 @@ jobs:
       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
 
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2
+      uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7
       with:
         languages: cpp, python
 
@@ -68,4 +68,4 @@ jobs:
       run: cmake --build ${{github.workspace}}/build -j $(nproc) --config Release
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2
+      uses: github/codeql-action/analyze@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7
diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml
index 7f2050d83d..c15f1df101 100644
--- a/.github/workflows/e2e_core.yml
+++ b/.github/workflows/e2e_core.yml
@@ -30,10 +30,6 @@ on:
         description: Tag defifned for the runner
         type: string
         required: true
-      trigger:
-        description: Type of workflow trigger
-        type: string
-        required: true
       xfail:
         description: Allow test failures
         type: string
@@ -56,9 +52,31 @@ permissions:
   pull-requests: write
 
 jobs:
+  changed-files:
+    name: Check for changed files
+    runs-on: ubuntu-22.04
+    outputs:
+      any_changed: ${{ steps.get-changed.outputs.any_changed }}
+    steps:
+    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+    - name: Get changed files
+      id: get-changed
+      uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78 # v44.5.2
+      with:
+        files: |
+          source/adapters/${{inputs.str_name}}/**
+
   e2e-build-hw:
-    if: github.repository == 'oneapi-src/unified-runtime'  # run only on upstream; forks will not have the HW
+    # We want to run the job only if there are changes in the specific adapter
+    if: needs.changed-files.outputs.any_changed == 'true'
     name: Build SYCL, UR, run E2E
+    needs: changed-files
+    permissions:
+      contents: read
+      pull-requests: write
+
+    # Allow failures, since SYCL tests and API may be not stable
+    continue-on-error: true
     strategy:
       matrix:
         adapter: [
@@ -83,39 +101,11 @@ jobs:
         ls -la ./
         rm -rf ./* || true
 
-    - name: Add comment to PR
-      uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-      if: ${{ always() && inputs.trigger != 'schedule' }}
-      with:
-        script: |
-          const adapter = '${{ matrix.adapter.name }}';
-          const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
-          const body = `E2E ${adapter} build: \n${url}`;
-
-          github.rest.issues.createComment({
-            issue_number: context.issue.number,
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            body: body
-          })
-
     - name: Checkout UR
       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         path: ur-repo
 
-    # On issue_comment trigger (for PRs) we need to fetch special ref for
-    # proper PR's merge commit. Note, this ref may be absent if the PR is already merged.
-    - name: Fetch PR's merge commit
-      if: ${{ inputs.trigger != 'schedule' }}
-      working-directory: ${{github.workspace}}/ur-repo
-      env:
-        PR_NO: ${{github.event.issue.number}}
-      run: |
-        git fetch -- https://github.com/${{github.repository}} +refs/pull/${PR_NO}/*:refs/remotes/origin/pr/${PR_NO}/*
-        git checkout origin/pr/${PR_NO}/merge
-        git rev-parse origin/pr/${PR_NO}/merge
-
     - name: Checkout SYCL
       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
@@ -188,20 +178,23 @@ jobs:
       id: tests
       run: ninja -C build-e2e check-sycl-e2e
 
-    - name: Add comment to PR
-      uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-      if: ${{ always() && inputs.trigger != 'schedule' }}
-      with:
-        script: |
-          const adapter = '${{ matrix.adapter.name }}';
-          const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
-          const test_status = '${{ steps.tests.outcome }}';
-          const job_status = '${{ job.status }}';
-          const body = `E2E ${adapter} build:\n${url}\nJob status: ${job_status}. Test status: ${test_status}`;
-
-          github.rest.issues.createComment({
-            issue_number: context.issue.number,
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            body: body
-          })
+    # FIXME: Requires the `pull-requests: write` permission, but for pull
+    # requests from forks this is only granted when using the
+    # pull_request_target workflow trigger, not the pull_request trigger.
+    # - name: Add comment to PR
+    #   uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+    #   if: ${{ always() }}
+    #   with:
+    #     script: |
+    #       const adapter = '${{ matrix.adapter.name }}';
+    #       const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
+    #       const test_status = '${{ steps.tests.outcome }}';
+    #       const job_status = '${{ job.status }}';
+    #       const body = `E2E ${adapter} build:\n${url}\nJob status: ${job_status}. Test status: ${test_status}`;
+
+    #       github.rest.issues.createComment({
+    #         issue_number: context.issue.number,
+    #         owner: context.repo.owner,
+    #         repo: context.repo.repo,
+    #         body: body
+    #       })
diff --git a/.github/workflows/e2e_cuda.yml b/.github/workflows/e2e_cuda.yml
index 433e455eba..6bf181b0a4 100644
--- a/.github/workflows/e2e_cuda.yml
+++ b/.github/workflows/e2e_cuda.yml
@@ -1,11 +1,7 @@
 name: E2E Cuda
 
 on:
-  schedule:
-    # Run every day at 23:00 UTC
-    - cron: '0 23 * * *'
-  issue_comment:
-    types: [created, edited]
+  workflow_call:
 
 permissions:
   contents: read
@@ -13,12 +9,10 @@ permissions:
 
 jobs:
   e2e-build-hw:
-    # trigger only if PR comment contains "e2e-cuda"
-    if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/e2e-cuda')) || (github.event_name == 'schedule') }}
+    if: github.repository == 'oneapi-src/unified-runtime'  # run only on upstream; forks will not have the HW
     name: Start e2e job
     # use core flow, run it with cuda specific parameters
     uses: ./.github/workflows/e2e_core.yml
-    # parameters that we pass to the core flow
     with:
       name: "CUDA"
       runner_tag: "CUDA_E2E"
@@ -26,4 +20,3 @@ jobs:
       prefix: "ext_oneapi_"
       config: "--cuda"
       unit: "gpu"
-      trigger: "${{github.event_name}}"
diff --git a/.github/workflows/e2e_level_zero.yml b/.github/workflows/e2e_level_zero.yml
index 66c23715ea..5397dabb50 100644
--- a/.github/workflows/e2e_level_zero.yml
+++ b/.github/workflows/e2e_level_zero.yml
@@ -1,11 +1,7 @@
 name: E2E Level Zero
 
 on:
-  schedule:
-    # Run every day at 23:00 UTC
-    - cron: '0 23 * * *'
-  issue_comment:
-    types: [created, edited]
+  workflow_call:
 
 permissions:
   contents: read
@@ -13,12 +9,10 @@ permissions:
 
 jobs:
   e2e-build-hw:
-    # trigger only if PR comment contains "e2e-level-zero"
-    if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/e2e-level-zero')) || (github.event_name == 'schedule') }}
+    if: github.repository == 'oneapi-src/unified-runtime'  # run only on upstream; forks will not have the HW
     name: Start e2e job
     # use core flow, run it with L0 specific parameters
     uses: ./.github/workflows/e2e_core.yml
-    # parameters that we pass to the core flow
     with:
       name: "L0"
       runner_tag: "L0_E2E"
@@ -26,7 +20,6 @@ jobs:
       prefix: "ext_oneapi_"
       config: ""
       unit: "gpu"
-      trigger: "${{github.event_name}}"
       # Failing tests
       xfail: "ESIMD/preemption.cpp;syclcompat/atomic/atomic_class.cpp;ProgramManager/uneven_kernel_split.cpp;Plugin/level_zero_ext_intel_queue_index.cpp;Plugin/level_zero_ext_intel_cslice.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Matrix/element_wise_ops.cpp;Matrix/element_wise_all_ops.cpp;Matrix/SG32/element_wise_all_ops.cpp"
       # Flaky tests
diff --git a/.github/workflows/e2e_opencl.yml b/.github/workflows/e2e_opencl.yml
index 302a9c995a..5264b668f2 100644
--- a/.github/workflows/e2e_opencl.yml
+++ b/.github/workflows/e2e_opencl.yml
@@ -1,11 +1,7 @@
 name: E2E OpenCL
 
 on:
-  schedule:
-    # Run every day at 23:00 UTC
-    - cron: '0 23 * * *'
-  issue_comment:
-    types: [created, edited]
+  workflow_call:
 
 permissions:
   contents: read
@@ -13,12 +9,10 @@ permissions:
 
 jobs:
   e2e-build-hw:
-    # trigger only if PR comment contains "e2e-opencl"
-    if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/e2e-opencl')) || (github.event_name == 'schedule') }}
+    if: github.repository == 'oneapi-src/unified-runtime'  # run only on upstream; forks will not have the HW
     name: Start e2e job
     # use core flow, run it with OpenCL specific parameters
     uses: ./.github/workflows/e2e_core.yml
-    # parameters that we pass to the core flow
     with:
       name: "OPENCL"
       runner_tag: "OPENCL"
@@ -26,4 +20,3 @@ jobs:
       prefix: ""
       config: ""
       unit: "cpu"
-      trigger: "${{github.event_name}}"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cfd8752657..40fbebcf65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED YES)
 
 # Build Options
+option(UR_BUILD_EXAMPLES "Build example applications." ON)
 option(UR_BUILD_TESTS "Build unit tests." ON)
 option(UR_BUILD_TOOLS "build ur tools" ON)
 option(UR_FORMAT_CPP_STYLE "format code style of C++ sources" OFF)
@@ -258,7 +259,9 @@ install(
     EXPORT ${PROJECT_NAME}-targets)
 
 add_subdirectory(source)
-add_subdirectory(examples)
+if(UR_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+endif()
 if(UR_BUILD_TESTS)
     add_subdirectory(test)
 endif()
diff --git a/README.md b/README.md
index 3d53d2117e..ae61b76b09 100644
--- a/README.md
+++ b/README.md
@@ -118,6 +118,7 @@ List of options provided by CMake:
 
 | Name | Description | Values | Default |
 | - | - | - | - |
+| UR_BUILD_EXAMPLES | Build example applications | ON/OFF | ON |
 | UR_BUILD_TESTS | Build the tests | ON/OFF | ON |
 | UR_BUILD_TOOLS | Build tools | ON/OFF | ON |
 | UR_FORMAT_CPP_STYLE | Format code style | ON/OFF | OFF |
diff --git a/cmake/FindCUDACupti.cmake b/cmake/FindCUDACupti.cmake
new file mode 100644
index 0000000000..3e8447bcdd
--- /dev/null
+++ b/cmake/FindCUDACupti.cmake
@@ -0,0 +1,26 @@
+# Copyright (C) 2024 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# This is lifted from intel-llvm's FindCUDACupti implementation
+# https://github.com/intel/llvm/blob/0cd04144d9ca83371c212e8e4709a59c968291b9/sycl/cmake/modules/FindCUDACupti.cmake
+
+macro(find_cuda_cupti_library)
+  find_library(CUDA_cupti_LIBRARY
+    NAMES cupti
+    HINTS ${CUDA_TOOLKIT_ROOT_DIR}
+          ENV CUDA_PATH
+    PATH_SUFFIXES nvidia/current lib64 lib/x64 lib
+                  ../extras/CUPTI/lib64/
+                  ../extras/CUPTI/lib/
+  )
+endmacro()
+
+macro(find_cuda_cupti_include_dir)
+  find_path(CUDA_CUPTI_INCLUDE_DIR cupti.h PATHS
+      "${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include"
+      "${CUDA_INCLUDE_DIRS}/../extras/CUPTI/include"
+      "${CUDA_INCLUDE_DIRS}"
+      NO_DEFAULT_PATH)
+endmacro()
diff --git a/cmake/match.py b/cmake/match.py
index e138577165..5b96d3008f 100755
--- a/cmake/match.py
+++ b/cmake/match.py
@@ -63,6 +63,7 @@ def check_status(input_lines, match_lines):
 class Tag(Enum):
     OPT = "{{OPT}}"         # makes the line optional
     IGNORE = "{{IGNORE}}"   # ignores all input until next match or end of input file
+    COMMENT = "#"           # comment - line ignored
 
 
 ## @brief main function for the match file processing script
@@ -76,7 +77,15 @@ def main():
 
     with open(input_file, 'r') as input, open(match_file, 'r') as match:
         input_lines = input.readlines()
-        match_lines = match.readlines()
+        # Keep only the meaningful match lines: blank lines are dropped, and
+        # so are comment lines, i.e. lines whose first non-whitespace
+        # characters are the comment tag.
+        match_lines = [
+            line
+            for line in match.readlines()
+            if line.strip()
+            and not line.lstrip().startswith(Tag.COMMENT.value)
+        ]
 
     ignored_lines = []
 
diff --git a/include/ur_api.h b/include/ur_api.h
index 9d88eecbc6..d2d4ca6394 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -224,6 +224,7 @@ typedef enum ur_function_t {
     UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222,                     ///< Enumerator for ::urCommandBufferCommandGetInfoExp
     UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223,                         ///< Enumerator for ::urEnqueueTimestampRecordingExp
     UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP = 224,                        ///< Enumerator for ::urEnqueueKernelLaunchCustomExp
+    UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225,                    ///< Enumerator for ::urKernelGetSuggestedLocalWorkSize
     /// @cond
     UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -451,51 +452,50 @@ typedef enum ur_result_t {
                                                                               ///< device
     UR_RESULT_ERROR_INVALID_IMAGE_SIZE = 33,                                  ///< Invalid image size
     UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR = 34,                     ///< Invalid image format descriptor
-    UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED = 35,                          ///< Image format not supported
-    UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE = 36,                       ///< Memory object allocation failure
-    UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE = 37,                          ///< Program object parameter is invalid.
-    UR_RESULT_ERROR_UNINITIALIZED = 38,                                       ///< [Validation] adapter is not initialized or specific entry-point is not
+    UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE = 35,                       ///< Memory object allocation failure
+    UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE = 36,                          ///< Program object parameter is invalid.
+    UR_RESULT_ERROR_UNINITIALIZED = 37,                                       ///< [Validation] adapter is not initialized or specific entry-point is not
                                                                               ///< implemented
-    UR_RESULT_ERROR_OUT_OF_HOST_MEMORY = 39,                                  ///< Insufficient host memory to satisfy call
-    UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY = 40,                                ///< Insufficient device memory to satisfy call
-    UR_RESULT_ERROR_OUT_OF_RESOURCES = 41,                                    ///< Out of resources
-    UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE = 42,                               ///< Error occurred when building program, see build log for details
-    UR_RESULT_ERROR_PROGRAM_LINK_FAILURE = 43,                                ///< Error occurred when linking programs, see build log for details
-    UR_RESULT_ERROR_UNSUPPORTED_VERSION = 44,                                 ///< [Validation] generic error code for unsupported versions
-    UR_RESULT_ERROR_UNSUPPORTED_FEATURE = 45,                                 ///< [Validation] generic error code for unsupported features
-    UR_RESULT_ERROR_INVALID_ARGUMENT = 46,                                    ///< [Validation] generic error code for invalid arguments
-    UR_RESULT_ERROR_INVALID_NULL_HANDLE = 47,                                 ///< [Validation] handle argument is not valid
-    UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE = 48,                                ///< [Validation] object pointed to by handle still in-use by device
-    UR_RESULT_ERROR_INVALID_NULL_POINTER = 49,                                ///< [Validation] pointer argument may not be nullptr
-    UR_RESULT_ERROR_INVALID_SIZE = 50,                                        ///< [Validation] invalid size or dimensions (e.g., must not be zero, or is
+    UR_RESULT_ERROR_OUT_OF_HOST_MEMORY = 38,                                  ///< Insufficient host memory to satisfy call
+    UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY = 39,                                ///< Insufficient device memory to satisfy call
+    UR_RESULT_ERROR_OUT_OF_RESOURCES = 40,                                    ///< Out of resources
+    UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE = 41,                               ///< Error occurred when building program, see build log for details
+    UR_RESULT_ERROR_PROGRAM_LINK_FAILURE = 42,                                ///< Error occurred when linking programs, see build log for details
+    UR_RESULT_ERROR_UNSUPPORTED_VERSION = 43,                                 ///< [Validation] generic error code for unsupported versions
+    UR_RESULT_ERROR_UNSUPPORTED_FEATURE = 44,                                 ///< [Validation] generic error code for unsupported features
+    UR_RESULT_ERROR_INVALID_ARGUMENT = 45,                                    ///< [Validation] generic error code for invalid arguments
+    UR_RESULT_ERROR_INVALID_NULL_HANDLE = 46,                                 ///< [Validation] handle argument is not valid
+    UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE = 47,                                ///< [Validation] object pointed to by handle still in-use by device
+    UR_RESULT_ERROR_INVALID_NULL_POINTER = 48,                                ///< [Validation] pointer argument may not be nullptr
+    UR_RESULT_ERROR_INVALID_SIZE = 49,                                        ///< [Validation] invalid size or dimensions (e.g., must not be zero, or is
                                                                               ///< out of bounds)
-    UR_RESULT_ERROR_UNSUPPORTED_SIZE = 51,                                    ///< [Validation] size argument is not supported by the device (e.g., too
+    UR_RESULT_ERROR_UNSUPPORTED_SIZE = 50,                                    ///< [Validation] size argument is not supported by the device (e.g., too
                                                                               ///< large)
-    UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT = 52,                               ///< [Validation] alignment argument is not supported by the device (e.g.,
+    UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT = 51,                               ///< [Validation] alignment argument is not supported by the device (e.g.,
                                                                               ///< too small)
-    UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT = 53,                      ///< [Validation] synchronization object in invalid state
-    UR_RESULT_ERROR_INVALID_ENUMERATION = 54,                                 ///< [Validation] enumerator argument is not valid
-    UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION = 55,                             ///< [Validation] enumerator argument is not supported by the device
-    UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT = 56,                            ///< [Validation] image format is not supported by the device
-    UR_RESULT_ERROR_INVALID_NATIVE_BINARY = 57,                               ///< [Validation] native binary is not supported by the device
-    UR_RESULT_ERROR_INVALID_GLOBAL_NAME = 58,                                 ///< [Validation] global variable is not found in the program
-    UR_RESULT_ERROR_INVALID_FUNCTION_NAME = 59,                               ///< [Validation] function name is not found in the program
-    UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION = 60,                        ///< [Validation] group size dimension is not valid for the kernel or
+    UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT = 52,                      ///< [Validation] synchronization object in invalid state
+    UR_RESULT_ERROR_INVALID_ENUMERATION = 53,                                 ///< [Validation] enumerator argument is not valid
+    UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION = 54,                             ///< [Validation] enumerator argument is not supported by the device
+    UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT = 55,                            ///< [Validation] image format is not supported by the device
+    UR_RESULT_ERROR_INVALID_NATIVE_BINARY = 56,                               ///< [Validation] native binary is not supported by the device
+    UR_RESULT_ERROR_INVALID_GLOBAL_NAME = 57,                                 ///< [Validation] global variable is not found in the program
+    UR_RESULT_ERROR_INVALID_FUNCTION_NAME = 58,                               ///< [Validation] function name is not found in the program
+    UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION = 59,                        ///< [Validation] group size dimension is not valid for the kernel or
                                                                               ///< device
-    UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION = 61,                      ///< [Validation] global width dimension is not valid for the kernel or
+    UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION = 60,                      ///< [Validation] global width dimension is not valid for the kernel or
                                                                               ///< device
-    UR_RESULT_ERROR_PROGRAM_UNLINKED = 62,                                    ///< [Validation] compiled program or program with imports needs to be
+    UR_RESULT_ERROR_PROGRAM_UNLINKED = 61,                                    ///< [Validation] compiled program or program with imports needs to be
                                                                               ///< linked before kernels can be created from it.
-    UR_RESULT_ERROR_OVERLAPPING_REGIONS = 63,                                 ///< [Validation] copy operations do not support overlapping regions of
+    UR_RESULT_ERROR_OVERLAPPING_REGIONS = 62,                                 ///< [Validation] copy operations do not support overlapping regions of
                                                                               ///< memory
-    UR_RESULT_ERROR_INVALID_HOST_PTR = 64,                                    ///< Invalid host pointer
-    UR_RESULT_ERROR_INVALID_USM_SIZE = 65,                                    ///< Invalid USM size
-    UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE = 66,                           ///< Objection allocation failure
-    UR_RESULT_ERROR_ADAPTER_SPECIFIC = 67,                                    ///< An adapter specific warning/error has been reported and can be
+    UR_RESULT_ERROR_INVALID_HOST_PTR = 63,                                    ///< Invalid host pointer
+    UR_RESULT_ERROR_INVALID_USM_SIZE = 64,                                    ///< Invalid USM size
+    UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE = 65,                           ///< Objection allocation failure
+    UR_RESULT_ERROR_ADAPTER_SPECIFIC = 66,                                    ///< An adapter specific warning/error has been reported and can be
                                                                               ///< retrieved via the urPlatformGetLastError entry point.
-    UR_RESULT_ERROR_LAYER_NOT_PRESENT = 68,                                   ///< A requested layer was not found by the loader.
-    UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS = 69,                           ///< An event in the provided wait list has ::UR_EVENT_STATUS_ERROR.
-    UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE = 70,                                ///< Device in question has `::UR_DEVICE_INFO_AVAILABLE == false`
+    UR_RESULT_ERROR_LAYER_NOT_PRESENT = 67,                                   ///< A requested layer was not found by the loader.
+    UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS = 68,                           ///< An event in the provided wait list has ::UR_EVENT_STATUS_ERROR.
+    UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE = 69,                                ///< Device in question has `::UR_DEVICE_INFO_AVAILABLE == false`
     UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP = 0x1000,                      ///< Invalid Command-Buffer
     UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP = 0x1001,           ///< Sync point is not valid for the command-buffer
     UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP = 0x1002, ///< Sync point wait list is invalid
@@ -1188,6 +1188,8 @@ typedef struct ur_platform_native_properties_t {
 ///     - ::UR_RESULT_ERROR_UNINITIALIZED
 ///     - ::UR_RESULT_ERROR_DEVICE_LOST
 ///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hAdapter`
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `NULL == phPlatform`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
@@ -1195,6 +1197,7 @@ typedef struct ur_platform_native_properties_t {
 UR_APIEXPORT ur_result_t UR_APICALL
 urPlatformCreateWithNativeHandle(
     ur_native_handle_t hNativePlatform,                 ///< [in][nocheck] the native handle of the platform.
+    ur_adapter_handle_t hAdapter,                       ///< [in] handle of the adapter associated with the native backend.
     const ur_platform_native_properties_t *pProperties, ///< [in][optional] pointer to native platform properties struct.
     ur_platform_handle_t *phPlatform                    ///< [out] pointer to the handle of the platform object created.
 );
@@ -2441,13 +2444,11 @@ typedef enum ur_mem_flag_t {
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Memory types
 typedef enum ur_mem_type_t {
-    UR_MEM_TYPE_BUFFER = 0,                 ///< Buffer object
-    UR_MEM_TYPE_IMAGE2D = 1,                ///< 2D image object
-    UR_MEM_TYPE_IMAGE3D = 2,                ///< 3D image object
-    UR_MEM_TYPE_IMAGE2D_ARRAY = 3,          ///< 2D image array object
-    UR_MEM_TYPE_IMAGE1D = 4,                ///< 1D image object
-    UR_MEM_TYPE_IMAGE1D_ARRAY = 5,          ///< 1D image array object
-    UR_MEM_TYPE_IMAGE1D_BUFFER = 6,         ///< 1D image buffer object
+    UR_MEM_TYPE_IMAGE2D = 0,                ///< 2D image object
+    UR_MEM_TYPE_IMAGE3D = 1,                ///< 3D image object
+    UR_MEM_TYPE_IMAGE2D_ARRAY = 2,          ///< 2D image array object
+    UR_MEM_TYPE_IMAGE1D = 3,                ///< 1D image object
+    UR_MEM_TYPE_IMAGE1D_ARRAY = 4,          ///< 1D image array object
     UR_MEM_TYPE_IMAGE_CUBEMAP_EXP = 0x2000, ///< Experimental cubemap image object
     /// @cond
     UR_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
@@ -2558,6 +2559,22 @@ typedef struct ur_image_desc_t {
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Create an image object
 ///
+/// @details
+///     - The primary ::ur_image_format_t that must be supported by all the
+///       adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA,
+///       UR_IMAGE_CHANNEL_TYPE_UNORM_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}.
+///
 /// @remarks
 ///   _Analogues_
 ///     - **clCreateImage**
@@ -2578,12 +2595,13 @@ typedef struct ur_image_desc_t {
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ///     - ::UR_RESULT_ERROR_INVALID_HOST_PTR
 ///         + `pHost == NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) != 0`
 ///         + `pHost != NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) == 0`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT
 ///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -5230,6 +5248,43 @@ urKernelCreateWithNativeHandle(
     ur_kernel_handle_t *phKernel                      ///< [out] pointer to the handle of the kernel object created.
 );
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///       each dimension.
+///     - The application may call this function from simultaneous threads for
+///       the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel,      ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,        ///< [in] handle of the queue object
+    uint32_t numWorkDim,             ///< [in] number of dimensions, from 1 to 3, to specify the global
+                                     ///< and work-group work-items
+    const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+                                     ///< the offset used to calculate the global ID of a work-item
+    const size_t *pGlobalWorkSize,   ///< [in] pointer to an array of numWorkDim unsigned values that specify
+                                     ///< the number of global work-items in workDim that will execute the
+                                     ///< kernel function
+    size_t *pSuggestedLocalWorkSize  ///< [out] pointer to an array of numWorkDim unsigned values that specify
+                                     ///< suggested local work size that will contain the result of the query
+);
+
 #if !defined(__GNUC__)
 #pragma endregion
 #endif
@@ -5250,7 +5305,9 @@ typedef enum ur_queue_info_t {
                                        ///< The reference count returned should be considered immediately stale.
                                        ///< It is unsuitable for general use in applications. This feature is
                                        ///< provided for identifying memory leaks.
-    UR_QUEUE_INFO_SIZE = 5,            ///< [uint32_t] The size of the queue
+    UR_QUEUE_INFO_SIZE = 5,            ///< [uint32_t] The size of the queue on the device. Only a valid query
+                                       ///< if the queue was created with the `ON_DEVICE` queue flag, otherwise
+                                       ///< `::urQueueGetInfo` will return `::UR_RESULT_ERROR_INVALID_QUEUE`.
     UR_QUEUE_INFO_EMPTY = 6,           ///< [::ur_bool_t] return true if the queue was empty at the time of the
                                        ///< query
     /// @cond
@@ -5265,7 +5322,8 @@ typedef uint32_t ur_queue_flags_t;
 typedef enum ur_queue_flag_t {
     UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE = UR_BIT(0), ///< Enable/disable out of order execution
     UR_QUEUE_FLAG_PROFILING_ENABLE = UR_BIT(1),              ///< Enable/disable profiling
-    UR_QUEUE_FLAG_ON_DEVICE = UR_BIT(2),                     ///< Is a device queue
+    UR_QUEUE_FLAG_ON_DEVICE = UR_BIT(2),                     ///< Is a device queue. If this is enabled `OUT_OF_ORDER_EXEC_MODE_ENABLE`
+                                                             ///< must also be enabled.
     UR_QUEUE_FLAG_ON_DEVICE_DEFAULT = UR_BIT(3),             ///< Is the default queue for a device
     UR_QUEUE_FLAG_DISCARD_EVENTS = UR_BIT(4),                ///< Events will be discarded
     UR_QUEUE_FLAG_PRIORITY_LOW = UR_BIT(5),                  ///< Low priority queue
@@ -5310,7 +5368,7 @@ typedef enum ur_queue_flag_t {
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `propSize != 0 && pPropValue == NULL`
 ///         + `pPropValue == NULL && pPropSizeRet == NULL`
-///     - ::UR_RESULT_ERROR_INVALID_QUEUE
+///     - ::UR_RESULT_ERROR_INVALID_QUEUE - "If `hQueue` isn't a valid queue handle or if `propName` isn't supported by `hQueue`."
 ///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -7499,7 +7557,7 @@ urBindlessImagesSampledImageHandleDestroyExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -7559,7 +7617,7 @@ urBindlessImagesImageFreeExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -7596,7 +7654,7 @@ urBindlessImagesUnsampledImageCreateExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_SAMPLER
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
@@ -7638,7 +7696,7 @@ urBindlessImagesSampledImageCreateExp(
 ///     - ::UR_RESULT_ERROR_INVALID_QUEUE
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -7799,7 +7857,7 @@ urBindlessImagesImportOpaqueFDExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
@@ -8704,7 +8762,9 @@ urCommandBufferReleaseCommandExp(
 );
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Update a kernel launch command in a finalized command-buffer.
+/// @brief Update a kernel launch command in a finalized command-buffer. This
+///        entry-point is synchronous and may block if the command-buffer is
+///        executing when the entry-point is called.
 ///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
@@ -9498,6 +9558,7 @@ typedef struct ur_platform_get_native_handle_params_t {
 ///     allowing the callback the ability to modify the parameter's value
 typedef struct ur_platform_create_with_native_handle_params_t {
     ur_native_handle_t *phNativePlatform;
+    ur_adapter_handle_t *phAdapter;
     const ur_platform_native_properties_t **ppProperties;
     ur_platform_handle_t **pphPlatform;
 } ur_platform_create_with_native_handle_params_t;
@@ -9943,6 +10004,19 @@ typedef struct ur_kernel_create_with_native_handle_params_t {
     ur_kernel_handle_t **pphKernel;
 } ur_kernel_create_with_native_handle_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for urKernelGetSuggestedLocalWorkSize
+/// @details Each entry is a pointer to the parameter passed to the function;
+///     allowing the callback the ability to modify the parameter's value
+typedef struct ur_kernel_get_suggested_local_work_size_params_t {
+    ur_kernel_handle_t *phKernel;
+    ur_queue_handle_t *phQueue;
+    uint32_t *pnumWorkDim;
+    const size_t **ppGlobalWorkOffset;
+    const size_t **ppGlobalWorkSize;
+    size_t **ppSuggestedLocalWorkSize;
+} ur_kernel_get_suggested_local_work_size_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for urKernelSetArgValue
 /// @details Each entry is a pointer to the parameter passed to the function;
diff --git a/include/ur_ddi.h b/include/ur_ddi.h
index fb1f1823b3..deff91472c 100644
--- a/include/ur_ddi.h
+++ b/include/ur_ddi.h
@@ -49,6 +49,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnPlatformGetNativeHandle_t)(
 /// @brief Function-pointer for urPlatformCreateWithNativeHandle
 typedef ur_result_t(UR_APICALL *ur_pfnPlatformCreateWithNativeHandle_t)(
     ur_native_handle_t,
+    ur_adapter_handle_t,
     const ur_platform_native_properties_t *,
     ur_platform_handle_t *);
 
@@ -535,6 +536,16 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelCreateWithNativeHandle_t)(
     const ur_kernel_native_properties_t *,
     ur_kernel_handle_t *);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function-pointer for urKernelGetSuggestedLocalWorkSize
+typedef ur_result_t(UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSize_t)(
+    ur_kernel_handle_t,
+    ur_queue_handle_t,
+    uint32_t,
+    const size_t *,
+    const size_t *,
+    size_t *);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function-pointer for urKernelSetArgValue
 typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgValue_t)(
@@ -603,6 +614,7 @@ typedef struct ur_kernel_dditable_t {
     ur_pfnKernelRelease_t pfnRelease;
     ur_pfnKernelGetNativeHandle_t pfnGetNativeHandle;
     ur_pfnKernelCreateWithNativeHandle_t pfnCreateWithNativeHandle;
+    ur_pfnKernelGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize;
     ur_pfnKernelSetArgValue_t pfnSetArgValue;
     ur_pfnKernelSetArgLocal_t pfnSetArgLocal;
     ur_pfnKernelSetArgPointer_t pfnSetArgPointer;
diff --git a/include/ur_print.h b/include/ur_print.h
index 753875ace9..c8fb41753e 100644
--- a/include/ur_print.h
+++ b/include/ur_print.h
@@ -1442,6 +1442,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelGetNativeHandleParams(const str
 ///         - `buff_size < out_size`
 UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelCreateWithNativeHandleParams(const struct ur_kernel_create_with_native_handle_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_kernel_get_suggested_local_work_size_params_t struct
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_SIZE
+///         - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelGetSuggestedLocalWorkSizeParams(const struct ur_kernel_get_suggested_local_work_size_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print ur_kernel_set_arg_value_params_t struct
 /// @returns
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index db230c91d7..7bd6cc75ab 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -929,6 +929,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
     case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP:
         os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP";
         break;
+    case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE:
+        os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -1451,9 +1454,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_result_t value) {
     case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
         os << "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR";
         break;
-    case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED:
-        os << "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED";
-        break;
     case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
         os << "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE";
         break;
@@ -5463,9 +5463,6 @@ inline ur_result_t printFlag<ur_mem_flag_t>(std::ostream &os, uint32_t flag) {
 ///     std::ostream &
 inline std::ostream &operator<<(std::ostream &os, enum ur_mem_type_t value) {
     switch (value) {
-    case UR_MEM_TYPE_BUFFER:
-        os << "UR_MEM_TYPE_BUFFER";
-        break;
     case UR_MEM_TYPE_IMAGE2D:
         os << "UR_MEM_TYPE_IMAGE2D";
         break;
@@ -5481,9 +5478,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_mem_type_t value) {
     case UR_MEM_TYPE_IMAGE1D_ARRAY:
         os << "UR_MEM_TYPE_IMAGE1D_ARRAY";
         break;
-    case UR_MEM_TYPE_IMAGE1D_BUFFER:
-        os << "UR_MEM_TYPE_IMAGE1D_BUFFER";
-        break;
     case UR_MEM_TYPE_IMAGE_CUBEMAP_EXP:
         os << "UR_MEM_TYPE_IMAGE_CUBEMAP_EXP";
         break;
@@ -10220,6 +10214,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
     ur::details::printPtr(os,
                           *(params->phNativePlatform));
 
+    os << ", ";
+    os << ".hAdapter = ";
+
+    ur::details::printPtr(os,
+                          *(params->phAdapter));
+
     os << ", ";
     os << ".pProperties = ";
 
@@ -11462,6 +11462,49 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
     return os;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_kernel_get_suggested_local_work_size_params_t type
+/// @returns
+///     std::ostream &
+inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_kernel_get_suggested_local_work_size_params_t *params) {
+
+    os << ".hKernel = ";
+
+    ur::details::printPtr(os,
+                          *(params->phKernel));
+
+    os << ", ";
+    os << ".hQueue = ";
+
+    ur::details::printPtr(os,
+                          *(params->phQueue));
+
+    os << ", ";
+    os << ".numWorkDim = ";
+
+    os << *(params->pnumWorkDim);
+
+    os << ", ";
+    os << ".pGlobalWorkOffset = ";
+
+    ur::details::printPtr(os,
+                          *(params->ppGlobalWorkOffset));
+
+    os << ", ";
+    os << ".pGlobalWorkSize = ";
+
+    ur::details::printPtr(os,
+                          *(params->ppGlobalWorkSize));
+
+    os << ", ";
+    os << ".pSuggestedLocalWorkSize = ";
+
+    ur::details::printPtr(os,
+                          *(params->ppSuggestedLocalWorkSize));
+
+    return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Print operator for the ur_kernel_set_arg_value_params_t type
 /// @returns
@@ -17143,6 +17186,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_
     case UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE: {
         os << (const struct ur_kernel_create_with_native_handle_params_t *)params;
     } break;
+    case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: {
+        os << (const struct ur_kernel_get_suggested_local_work_size_params_t *)params;
+    } break;
     case UR_FUNCTION_KERNEL_SET_ARG_VALUE: {
         os << (const struct ur_kernel_set_arg_value_params_t *)params;
     } break;
diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst
index ee54c69291..d609c3b3d9 100644
--- a/scripts/core/EXP-BINDLESS-IMAGES.rst
+++ b/scripts/core/EXP-BINDLESS-IMAGES.rst
@@ -50,6 +50,7 @@ Runtime:
   * Sampled images
   * Unsampled images
   * Mipmaps
+  * Image arrays
   * Cubemaps
   * USM backed images
 
@@ -207,6 +208,8 @@ Changelog
 +------------------------------------------------------------------------+
 | 11.0     | Added device queries for sampled image fetch capabilities.  |
 +----------+-------------------------------------------------------------+
+| 12.0     | Added image arrays to list of supported bindless images.    |
++----------+-------------------------------------------------------------+
 
 Contributors
 --------------------------------------------------------------------------------
diff --git a/scripts/core/PROG.rst b/scripts/core/PROG.rst
index e7bf24986b..4e71cce8c7 100644
--- a/scripts/core/PROG.rst
+++ b/scripts/core/PROG.rst
@@ -277,6 +277,39 @@ native handle to a driver handle. For example, OpenCL platform
 may expose an extension ${x}ProgramCreateWithNativeHandle to retrieve
 a cl_program.
 
+Memory
+======
+
+UR Mem Handles
+--------------
+
+A ${x}_mem_handle_t can represent an untyped memory buffer object, created by
+${x}MemBufferCreate, or a memory image object, created by ${x}MemImageCreate.
+A ${x}_mem_handle_t manages the internal allocation and deallocation of native
+memory objects across all devices in a ${x}_context_handle_t. A
+${x}_mem_handle_t may only be used by queues that share the same
+${x}_context_handle_t.
+
+If multiple queues in the same ${x}_context_handle_t use the same
+${x}_mem_handle_t across dependent commands, a dependency must be defined by the
+user using the enqueue entry point's phEventWaitList parameter. Provided that
+dependencies are explicitly passed to UR entry points, a UR adapter will manage
+memory migration of native memory objects across all devices in a context, if
+memory migration is indeed necessary in the backend API.
+
+.. parsed-literal::
+
+    // Q1 and Q2 are both in hContext
+    ${x}_mem_handle_t hBuffer;
+    ${x}MemBufferCreate(hContext,,,,&hBuffer);
+    ${x}EnqueueMemBufferWrite(Q1, hBuffer,,,,,,, &outEv);
+    ${x}EnqueueMemBufferRead(Q2, hBuffer,,,,, 1, &outEv /*phEventWaitList*/, );
+
+As such, the buffer written to in ${x}EnqueueMemBufferWrite can be
+successfully read using ${x}EnqueueMemBufferRead from another queue in the same
+context, since the event associated with the write operation has been passed as
+a dependency to the read operation.
+
 Memory Pooling
 ----------------------------------
 
diff --git a/scripts/core/common.yml b/scripts/core/common.yml
index 0dad27d028..8aeb8ea248 100644
--- a/scripts/core/common.yml
+++ b/scripts/core/common.yml
@@ -205,8 +205,6 @@ etors:
       desc: "Invalid image size"
     - name: ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
       desc: "Invalid image format descriptor"
-    - name: ERROR_IMAGE_FORMAT_NOT_SUPPORTED
-      desc: "Image format not supported"
     - name: ERROR_MEM_OBJECT_ALLOCATION_FAILURE
       desc: "Memory object allocation failure"
     - name: ERROR_INVALID_PROGRAM_EXECUTABLE
diff --git a/scripts/core/exp-bindless-images.yml b/scripts/core/exp-bindless-images.yml
index c5a3d5d5e3..622e378f0b 100644
--- a/scripts/core/exp-bindless-images.yml
+++ b/scripts/core/exp-bindless-images.yml
@@ -387,7 +387,7 @@ returns:
     - $X_RESULT_ERROR_INVALID_CONTEXT
     - $X_RESULT_ERROR_INVALID_VALUE
     - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`"
+      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`"
     - $X_RESULT_ERROR_INVALID_IMAGE_SIZE
     - $X_RESULT_ERROR_INVALID_OPERATION
 --- #--------------------------------------------------------------------------
@@ -442,7 +442,7 @@ returns:
     - $X_RESULT_ERROR_INVALID_CONTEXT
     - $X_RESULT_ERROR_INVALID_VALUE
     - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`"
+      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`"
     - $X_RESULT_ERROR_INVALID_IMAGE_SIZE
     - $X_RESULT_ERROR_INVALID_OPERATION
 --- #--------------------------------------------------------------------------
@@ -479,7 +479,7 @@ returns:
     - $X_RESULT_ERROR_INVALID_CONTEXT
     - $X_RESULT_ERROR_INVALID_VALUE
     - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`"
+      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`"
     - $X_RESULT_ERROR_INVALID_IMAGE_SIZE
     - $X_RESULT_ERROR_INVALID_SAMPLER
     - $X_RESULT_ERROR_INVALID_OPERATION
@@ -542,7 +542,7 @@ returns:
     - $X_RESULT_ERROR_INVALID_QUEUE
     - $X_RESULT_ERROR_INVALID_VALUE
     - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`"
+      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`"
     - $X_RESULT_ERROR_INVALID_IMAGE_SIZE
     - $X_RESULT_ERROR_INVALID_OPERATION
 --- #--------------------------------------------------------------------------
@@ -679,7 +679,7 @@ returns:
     - $X_RESULT_ERROR_INVALID_CONTEXT
     - $X_RESULT_ERROR_INVALID_VALUE
     - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`"
+      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`"
     - $X_RESULT_ERROR_INVALID_IMAGE_SIZE
     - $X_RESULT_ERROR_INVALID_OPERATION
     - $X_RESULT_ERROR_OUT_OF_RESOURCES
diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml
index 78a1b020ef..6e276eac88 100644
--- a/scripts/core/exp-command-buffer.yml
+++ b/scripts/core/exp-command-buffer.yml
@@ -900,7 +900,7 @@ returns:
     - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
 --- #--------------------------------------------------------------------------
 type: function
-desc: "Update a kernel launch command in a finalized command-buffer."
+desc: "Update a kernel launch command in a finalized command-buffer. This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called."
 class: $xCommandBuffer
 name: UpdateKernelLaunchExp
 params:
diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml
index 4a0bf0bab1..5446f3bc1d 100644
--- a/scripts/core/kernel.yml
+++ b/scripts/core/kernel.yml
@@ -534,3 +534,44 @@ params:
 returns:
     - $X_RESULT_ERROR_UNSUPPORTED_FEATURE:
         - "If the adapter has no underlying equivalent handle."
+--- #--------------------------------------------------------------------------
+type: function
+desc: "Get the suggested local work size for a kernel."
+class: $xKernel
+name: GetSuggestedLocalWorkSize
+ordinal: "0"
+details:
+    - "Query a suggested local work size for a kernel given a global size for each dimension."
+    - "The application may call this function from simultaneous threads for the same context."
+params:
+    - type: $x_kernel_handle_t
+      name: hKernel
+      desc: |
+            [in] handle of the kernel
+    - type: $x_queue_handle_t
+      name: hQueue
+      desc: |
+            [in] handle of the queue object
+    - type: uint32_t
+      name: numWorkDim
+      desc: |
+            [in] number of dimensions, from 1 to 3, to specify the global
+            and work-group work-items
+    - type: const size_t*
+      name: pGlobalWorkOffset
+      desc: |
+            [in] pointer to an array of numWorkDim unsigned values that specify
+            the offset used to calculate the global ID of a work-item
+    - type: const size_t*
+      name: pGlobalWorkSize
+      desc: |
+            [in] pointer to an array of numWorkDim unsigned values that specify
+            the number of global work-items in each dimension that will execute
+            the kernel function
+    - type: size_t*
+      name: pSuggestedLocalWorkSize
+      desc: |
+            [out] pointer to an array of numWorkDim unsigned values that specify
+            suggested local work size that will contain the result of the query
+returns:
+    - $X_RESULT_ERROR_UNSUPPORTED_FEATURE
diff --git a/scripts/core/memory.yml b/scripts/core/memory.yml
index 6f88b10cdc..c4009bc56e 100644
--- a/scripts/core/memory.yml
+++ b/scripts/core/memory.yml
@@ -41,8 +41,6 @@ desc: "Memory types"
 class: $xMem
 name: $x_mem_type_t
 etors:
-    - name: BUFFER
-      desc: "Buffer object"
     - name: IMAGE2D
       desc: "2D image object"
     - name: IMAGE3D
@@ -53,8 +51,6 @@ etors:
       desc: "1D image object"
     - name: IMAGE1D_ARRAY
       desc: "1D image array object"
-    - name: IMAGE1D_BUFFER
-      desc: "1D image buffer object"
 --- #--------------------------------------------------------------------------
 type: enum
 desc: "Memory Information type"
@@ -213,6 +209,15 @@ name: ImageCreate
 ordinal: "0"
 analogue:
     - "**clCreateImage**"
+details:
+    - |
+      The primary $x_image_format_t values that must be supported by all adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT8},
+      {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8},
+      {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8},
+      {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32},
+      {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16},
+      {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT},
+      {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}.
 params:
     - type: $x_context_handle_t
       name: hContext
@@ -236,12 +241,13 @@ returns:
     - $X_RESULT_ERROR_INVALID_CONTEXT
     - $X_RESULT_ERROR_INVALID_VALUE
     - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
-      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`"
+      - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`"
     - $X_RESULT_ERROR_INVALID_IMAGE_SIZE
     - $X_RESULT_ERROR_INVALID_OPERATION
     - $X_RESULT_ERROR_INVALID_HOST_PTR:
       - "`pHost == NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) != 0`"
       - "`pHost != NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) == 0`"
+    - $X_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT
     - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
     - $X_RESULT_ERROR_OUT_OF_RESOURCES
 --- #--------------------------------------------------------------------------
diff --git a/scripts/core/platform.yml b/scripts/core/platform.yml
index c35b71622d..f3069005ab 100644
--- a/scripts/core/platform.yml
+++ b/scripts/core/platform.yml
@@ -211,6 +211,9 @@ params:
     - type: $x_native_handle_t
       name: hNativePlatform
       desc: "[in][nocheck] the native handle of the platform."
+    - type: $x_adapter_handle_t
+      name: hAdapter
+      desc: "[in] handle of the adapter associated with the native backend."
     - type: const $x_platform_native_properties_t*
       name: pProperties
       desc: "[in][optional] pointer to native platform properties struct."
diff --git a/scripts/core/queue.yml b/scripts/core/queue.yml
index 816da179ba..27e8077ba2 100644
--- a/scripts/core/queue.yml
+++ b/scripts/core/queue.yml
@@ -32,7 +32,10 @@ etors:
             The reference count returned should be considered immediately stale. 
             It is unsuitable for general use in applications. This feature is provided for identifying memory leaks.
     - name: SIZE
-      desc: "[uint32_t] The size of the queue"
+      desc: |
+            [uint32_t] The size of the queue on the device. Only a valid query
+            if the queue was created with the `ON_DEVICE` queue flag, otherwise
+            `$xQueueGetInfo` will return `$X_RESULT_ERROR_INVALID_QUEUE`.
     - name: EMPTY
       desc: "[$x_bool_t] return true if the queue was empty at the time of the query"
 --- #--------------------------------------------------------------------------
@@ -49,7 +52,7 @@ etors:
       desc: "Enable/disable profiling"
     - name: ON_DEVICE
       value: "$X_BIT(2)"
-      desc: "Is a device queue"
+      desc: "Is a device queue. If this is enabled `OUT_OF_ORDER_EXEC_MODE_ENABLE` must also be enabled."
     - name: ON_DEVICE_DEFAULT
       value: "$X_BIT(3)"
       desc: "Is the default queue for a device"
@@ -108,6 +111,7 @@ returns:
         - "`propSize != 0 && pPropValue == NULL`"
         - "`pPropValue == NULL && pPropSizeRet == NULL`"
     - $X_RESULT_ERROR_INVALID_QUEUE
+        - "If `hQueue` isn't a valid queue handle or if `propName` isn't supported by `hQueue`."
     - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
     - $X_RESULT_ERROR_OUT_OF_RESOURCES
 --- #--------------------------------------------------------------------------
diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml
index b0a61e7f88..52585ade3a 100644
--- a/scripts/core/registry.yml
+++ b/scripts/core/registry.yml
@@ -586,6 +586,9 @@ etors:
 - name: ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP
   desc: Enumerator for $xEnqueueKernelLaunchCustomExp
   value: '224'
+- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE
+  desc: Enumerator for $xKernelGetSuggestedLocalWorkSize
+  value: '225'
 ---
 type: enum
 desc: Defines structure types
diff --git a/scripts/generate_code.py b/scripts/generate_code.py
index b8bfa97ba5..bdaa475a3e 100644
--- a/scripts/generate_code.py
+++ b/scripts/generate_code.py
@@ -328,12 +328,12 @@ def _mako_info_hpp(path, namespace, tags, version, specs, meta):
         specs=specs,
         meta=meta)
 
+
 """
 Entry-point:
     generates linker version scripts
 """
-def _mako_linker_scripts(path, ext, namespace, tags, version, specs, meta):
-    name = "adapter"
+def _mako_linker_scripts(path, name, ext, namespace, tags, version, specs, meta):
     filename = f"{name}.{ext}.in"
     fin = os.path.join(templates_dir, f"{filename}.mako")
     fout = os.path.join(path, filename)
@@ -347,6 +347,7 @@ def _mako_linker_scripts(path, ext, namespace, tags, version, specs, meta):
         specs=specs,
         meta=meta)
 
+
 """
 Entry-point:
     generates lib code
@@ -370,6 +371,12 @@ def generate_loader(path, section, namespace, tags, version, specs, meta):
     loc = 0
     loc += _mako_loader_cpp(dstpath, namespace, tags, version, specs, meta)
     loc += _mako_print_cpp(dstpath, namespace, tags, version, specs, meta)
+    loc += _mako_linker_scripts(
+        dstpath, "loader", "map", namespace, tags, version, specs, meta
+    )
+    loc += _mako_linker_scripts(
+        dstpath, "loader", "def", namespace, tags, version, specs, meta
+    )
     print("Generated %s lines of code.\n"%loc)
 
 """
@@ -382,8 +389,12 @@ def generate_adapters(path, section, namespace, tags, version, specs, meta):
 
     loc = 0
     loc += _mako_null_adapter_cpp(dstpath, namespace, tags, version, specs, meta)
-    loc += _mako_linker_scripts(dstpath, "map", namespace, tags, version, specs, meta)
-    loc += _mako_linker_scripts(dstpath, "def", namespace, tags, version, specs, meta)
+    loc += _mako_linker_scripts(
+        dstpath, "adapter", "map", namespace, tags, version, specs, meta
+    )
+    loc += _mako_linker_scripts(
+        dstpath, "adapter", "def", namespace, tags, version, specs, meta
+    )
     print("Generated %s lines of code.\n"%loc)
 
 """
diff --git a/scripts/templates/helper.py b/scripts/templates/helper.py
index 1d539d70fe..0c90f4da8e 100644
--- a/scripts/templates/helper.py
+++ b/scripts/templates/helper.py
@@ -12,8 +12,10 @@
 
 # allow imports from top-level scripts directory
 sys.path.append("..")
+from .print_helper import get_api_types_funcs
 from version import Version
 
+
 """
     Extracts traits from a spec object
 """
@@ -651,11 +653,37 @@ def get_adapter_handles(specs):
     objs = []
     for s in specs:
         for obj in s['objects']:
-            if obj_traits.is_handle(obj) and not obj_traits.is_loader_only(obj):
+            if obj_traits.is_handle(obj) and not (obj_traits.is_loader_only(obj) or 'native' in obj['name']):
                 objs.append(obj)
 
     return objs
 
+"""
+Public:
+    returns a list of all loader API functions' names
+"""
+def get_loader_functions(specs, meta, n, tags):
+    func_names = []
+    
+    # Main API functions
+    for s in specs:
+        for obj in s["objects"]:
+            if obj_traits.is_function(obj):
+                func_names.append(make_func_name(n, tags, obj))
+
+    # Process address tables functions
+    for tbl in get_pfntables(specs, meta, n, tags):
+        func_names.append(tbl['export']['name'])
+
+    # Print functions
+    api_types_funcs = get_api_types_funcs(specs, meta, n, tags)
+    for func in api_types_funcs:
+        func_names.append(func.c_name)
+    func_names.append(f"{tags['$x']}PrintFunctionParams")
+
+    return sorted(func_names)
+
+
 """
 Private:
     removes 'const' from c++ type
diff --git a/scripts/templates/ldrddi.cpp.mako b/scripts/templates/ldrddi.cpp.mako
index bbc7c7c7d0..4da75d1f91 100644
--- a/scripts/templates/ldrddi.cpp.mako
+++ b/scripts/templates/ldrddi.cpp.mako
@@ -132,7 +132,7 @@ namespace ur_loader
         %else:
         <%param_replacements={}%>
         %for i, item in enumerate(th.get_loader_prologue(n, tags, obj, meta)):
-        %if not '_native_object_' in item['obj'] or th.make_func_name(n, tags, obj) == 'urPlatformCreateWithNativeHandle':
+        %if not '_native_object_' in item['obj']:
         // extract platform's function pointer table
         auto dditable = reinterpret_cast<${item['obj']}*>( ${item['pointer']}${item['name']} )->dditable;
         auto ${th.make_pfn_name(n, tags, obj)} = dditable->${n}.${th.get_table_name(n, tags, obj)}.${th.make_pfn_name(n, tags, obj)};
@@ -151,7 +151,7 @@ namespace ur_loader
         for( size_t i = ${item['range'][0]}; i < ${item['range'][1]}; ++i )
             ${item['name']}Local[ i ] = reinterpret_cast<${item['obj']}*>( ${item['name']}[ i ] )->handle;
         %else:
-        %if not '_native_object_' in item['obj'] or th.make_func_name(n, tags, obj) == 'urPlatformCreateWithNativeHandle':
+        %if not '_native_object_' in item['obj']:
         // convert loader handle to platform handle
         %if item['optional']:
         ${item['name']} = ( ${item['name']} ) ? reinterpret_cast<${item['obj']}*>( ${item['name']} )->handle : nullptr;
@@ -279,7 +279,7 @@ namespace ur_loader
         %if item['release']:
         // release loader handle
         ${item['factory']}.release( ${item['name']} );
-        %elif not '_native_object_' in item['obj'] or th.make_func_name(n, tags, obj) == 'urPlatformCreateWithNativeHandle':
+        %elif not '_native_object_' in item['obj']:
         try
         {
             %if 'typename' in item:
diff --git a/scripts/templates/loader.def.in.mako b/scripts/templates/loader.def.in.mako
new file mode 100644
index 0000000000..1bdfbe9fa7
--- /dev/null
+++ b/scripts/templates/loader.def.in.mako
@@ -0,0 +1,11 @@
+<%!
+import re
+from templates import helper as th
+%><%
+	n=namespace
+%>\
+LIBRARY @TARGET_LIBNAME@
+EXPORTS
+%for line in th.get_loader_functions(specs, meta, n, tags):
+	${line}
+%endfor
diff --git a/scripts/templates/loader.map.in.mako b/scripts/templates/loader.map.in.mako
new file mode 100644
index 0000000000..0df1250440
--- /dev/null
+++ b/scripts/templates/loader.map.in.mako
@@ -0,0 +1,14 @@
+<%!
+import re
+from templates import helper as th
+%><%
+    n=namespace
+%>\
+@TARGET_LIBNAME@ {
+	global:
+%for line in th.get_loader_functions(specs, meta, n, tags):
+		${line};
+%endfor
+	local:
+		*;
+};
diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt
index cd2a003a32..baa67e5961 100644
--- a/source/adapters/cuda/CMakeLists.txt
+++ b/source/adapters/cuda/CMakeLists.txt
@@ -76,8 +76,38 @@ else()
   message(WARNING "CUDA adapter USM pools are disabled, set UMF_ENABLE_POOL_TRACKING to enable them")
 endif()
 
+if (UR_ENABLE_TRACING)
+  include(FindCUDACupti)
+  # The following two ifs can be removed when FindCUDA -> FindCUDAToolkit.
+  # CUDA_CUPTI_INCLUDE_DIR -> CUDAToolkit_CUPTI_INCLUDE_DIR
+  if(NOT CUDA_CUPTI_INCLUDE_DIR)
+    find_cuda_cupti_include_dir()
+  endif()
+  # CUDA_cupti_LIBRARY -> CUDAToolkit_cupti_LIBRARY
+  if(NOT CUDA_cupti_LIBRARY)
+    find_cuda_cupti_library()
+  endif()
+
+  if (NOT XPTI_INCLUDES)
+    get_target_property(XPTI_INCLUDES xpti INCLUDE_DIRECTORIES)
+  endif()
+  if (NOT XPTI_PROXY_SRC)
+    get_target_property(XPTI_SRC_DIR xpti SOURCE_DIR)
+    set(XPTI_PROXY_SRC "${XPTI_SRC_DIR}/xpti_proxy.cpp")
+  endif()
+  target_compile_definitions(${TARGET_NAME} PRIVATE
+    XPTI_ENABLE_INSTRUMENTATION
+    )
+  target_include_directories(${TARGET_NAME} PUBLIC
+    ${XPTI_INCLUDES}
+    ${CUDA_CUPTI_INCLUDE_DIR}
+  )
+  target_sources(${TARGET_NAME} PRIVATE ${XPTI_PROXY_SRC})
+endif()
+
 if (CUDA_cupti_LIBRARY)
   target_compile_definitions("ur_adapter_cuda" PRIVATE CUPTI_LIB_PATH="${CUDA_cupti_LIBRARY}")
+  list(APPEND EXTRA_LIBS ${CUDA_cupti_LIBRARY})
 endif()
 
 target_link_libraries(${TARGET_NAME} PRIVATE
@@ -85,6 +115,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE
     ${PROJECT_NAME}::common
     Threads::Threads
     cudadrv
+    ${EXTRA_LIBS}
 )
 
 target_include_directories(${TARGET_NAME} PRIVATE
diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp
index f28e58afe7..a10e8e9ca7 100644
--- a/source/adapters/cuda/context.hpp
+++ b/source/adapters/cuda/context.hpp
@@ -116,6 +116,13 @@ struct ur_context_handle_t_ {
     return Devices;
   }
 
+  // Gets the index of the device relative to other devices in the context
+  size_t getDeviceIndex(ur_device_handle_t hDevice) {
+    auto It = std::find(Devices.begin(), Devices.end(), hDevice);
+    assert(It != Devices.end());
+    return std::distance(Devices.begin(), It);
+  }
+
   uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
 
   uint32_t decrementReferenceCount() noexcept { return --RefCount; }
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index e6389c5ee2..375f6a98f4 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -215,7 +215,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     int Major = 0;
     UR_CHECK_ERROR(cuDeviceGetAttribute(
         &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get()));
-    uint64_t Capabilities =
+    ur_memory_scope_capability_flags_t Capabilities =
         (Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
                            UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
                            UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP |
@@ -270,7 +270,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     int WarpSize = 0;
     UR_CHECK_ERROR(cuDeviceGetAttribute(
         &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get()));
-    size_t Sizes[1] = {static_cast<size_t>(WarpSize)};
+    uint32_t Sizes[1] = {static_cast<uint32_t>(WarpSize)};
     return ReturnValue(Sizes, 1);
   }
   case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: {
@@ -418,7 +418,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(static_cast<size_t>(Min));
   }
   case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: {
-    return ReturnValue(0lu);
+    return ReturnValue(size_t(0));
   }
   case UR_DEVICE_INFO_MAX_SAMPLERS: {
     // This call is kind of meaningless for cuda, as samplers don't exist.
@@ -429,7 +429,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters
     // __global__ function parameters are passed to the device via constant
     // memory and are limited to 4 KB.
-    return ReturnValue(4000lu);
+    return ReturnValue(size_t(4000));
   }
   case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: {
     int MemBaseAddrAlign = 0;
@@ -542,7 +542,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: {
     // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX
     // 1060 3GB
-    return ReturnValue(1000lu);
+    return ReturnValue(size_t(1000));
   }
   case UR_DEVICE_INFO_ENDIAN_LITTLE: {
     return ReturnValue(true);
@@ -569,10 +569,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
         ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE |
                         UR_QUEUE_FLAG_PROFILING_ENABLE));
   case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: {
-    // The mandated minimum capability:
-    ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
-                                  UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
-    return ReturnValue(Capability);
+    return ReturnValue(ur_queue_flags_t(0));
   }
   case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: {
     // The mandated minimum capability:
@@ -647,7 +644,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   }
   case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
     // The minimum value for the FULL profile is 1 MB.
-    return ReturnValue(1024lu);
+    return ReturnValue(size_t(1024));
   }
   case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: {
     return ReturnValue(true);
@@ -692,8 +689,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
         // respect to other CPUs and GPUs in the system
         Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS |
                 UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS |
-                UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS |
-                UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS;
+                UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS;
       } else {
         // on GPU architectures with compute capability lower than 6.x, atomic
         // operations from the GPU to CPU memory will not be atomic with respect
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 279426a41e..906fd49d1d 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -237,7 +237,7 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
 
         if (hasExceededMaxRegistersPerBlock(Device, Kernel,
                                             KernelLocalWorkGroupSize)) {
-          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+          return UR_RESULT_ERROR_OUT_OF_RESOURCES;
         }
       } else {
         guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
@@ -414,37 +414,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
   UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 
-  std::vector<ur_event_handle_t> DepEvents(
-      phEventWaitList, phEventWaitList + numEventsInWaitList);
-  std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
-
-  // phEventWaitList only contains events that are handed to UR by the SYCL
-  // runtime. However since UR handles memory dependencies within a context
-  // we may need to add more events to our dependent events list if the UR
-  // context contains multiple devices
-  if (hQueue->getContext()->Devices.size() > 1) {
-    MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size());
-    for (auto &MemArg : hKernel->Args.MemObjArgs) {
-      bool PushBack = false;
-      if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
-          MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
-                                   MemDepEvent) == DepEvents.end()) {
-        DepEvents.push_back(MemDepEvent);
-        PushBack = true;
-      }
-      if ((MemArg.AccessFlags &
-           (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) ||
-          PushBack) {
-        if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(),
-                         [MemArg](auto &Lock) {
-                           return Lock.first == MemArg.Mem;
-                         }) == MemMigrationLocks.end())
-          MemMigrationLocks.emplace_back(
-              std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}});
-      }
-    }
-  }
-
   // Early exit for zero size kernel
   if (*pGlobalWorkSize == 0) {
     return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
@@ -477,15 +446,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
-    if (DepEvents.size()) {
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(),
-                                       DepEvents.data()));
-    }
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     // For memory migration across devices in the same context
     if (hQueue->getContext()->Devices.size() > 1) {
       for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+        enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(),
+                                             CuStream);
+        if (MemArg.AccessFlags &
+            (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) {
+          MemArg.Mem->setLastQueueWritingToMemObj(hQueue);
+        }
       }
     }
 
@@ -496,20 +468,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
-    // Once event has been started we can unlock MemoryMigrationMutex
-    if (hQueue->getContext()->Devices.size() > 1) {
-      for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        // Telling the ur_mem_handle_t that it will need to wait on this kernel
-        // if it has been written to
-        if (phEvent && (MemArg.AccessFlags &
-                        (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) {
-          MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get());
-        }
-      }
-      // We can release the MemoryMigrationMutexes now
-      MemMigrationLocks.clear();
-    }
-
     auto &ArgIndices = hKernel->getArgIndices();
     UR_CHECK_ERROR(cuLaunchKernel(
         CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
@@ -523,7 +481,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
     }
-
   } catch (ur_result_t Err) {
     return Err;
   }
@@ -535,6 +492,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
     const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  if (pGlobalWorkOffset == nullptr || *pGlobalWorkOffset == 0) {
+    ur_exp_launch_property_t coop_prop;
+    coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
+    coop_prop.value.cooperative = 1;
+    return urEnqueueKernelLaunchCustomExp(
+        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
+        &coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
+  }
   return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
                                pGlobalWorkSize, pLocalWorkSize,
                                numEventsInWaitList, phEventWaitList, phEvent);
@@ -553,7 +518,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
                           pLocalWorkSize, numEventsInWaitList, phEventWaitList,
                           phEvent);
   }
-
+#if CUDA_VERSION >= 11080
   // Preconditions
   UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
             UR_RESULT_ERROR_INVALID_KERNEL);
@@ -595,37 +560,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
   }
 
-  std::vector<ur_event_handle_t> DepEvents(
-      phEventWaitList, phEventWaitList + numEventsInWaitList);
-  std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
-
-  // phEventWaitList only contains events that are handed to UR by the SYCL
-  // runtime. However since UR handles memory dependencies within a context
-  // we may need to add more events to our dependent events list if the UR
-  // context contains multiple devices
-  if (hQueue->getContext()->Devices.size() > 1) {
-    MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size());
-    for (auto &MemArg : hKernel->Args.MemObjArgs) {
-      bool PushBack = false;
-      if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
-          MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
-                                   MemDepEvent) == DepEvents.end()) {
-        DepEvents.push_back(MemDepEvent);
-        PushBack = true;
-      }
-      if ((MemArg.AccessFlags &
-           (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) ||
-          PushBack) {
-        if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(),
-                         [MemArg](auto &Lock) {
-                           return Lock.first == MemArg.Mem;
-                         }) == MemMigrationLocks.end())
-          MemMigrationLocks.emplace_back(
-              std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}});
-      }
-    }
-  }
-
   // Early exit for zero size kernel
   if (*pGlobalWorkSize == 0) {
     return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
@@ -658,15 +592,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
-    if (DepEvents.size()) {
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(),
-                                       DepEvents.data()));
-    }
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     // For memory migration across devices in the same context
     if (hQueue->getContext()->Devices.size() > 1) {
       for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+        enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(),
+                                             CuStream);
+        if (MemArg.AccessFlags &
+            (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) {
+          MemArg.Mem->setLastQueueWritingToMemObj(hQueue);
+        }
       }
     }
 
@@ -677,20 +614,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
-    // Once event has been started we can unlock MemoryMigrationMutex
-    if (hQueue->getContext()->Devices.size() > 1) {
-      for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        // Telling the ur_mem_handle_t that it will need to wait on this kernel
-        // if it has been written to
-        if (phEvent && (MemArg.AccessFlags &
-                        (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) {
-          MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get());
-        }
-      }
-      // We can release the MemoryMigrationMutexes now
-      MemMigrationLocks.clear();
-    }
-
     auto &ArgIndices = hKernel->getArgIndices();
 
     CUlaunchConfig launch_config;
@@ -717,11 +640,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
     }
-
   } catch (ur_result_t Err) {
     return Err;
   }
   return UR_RESULT_SUCCESS;
+#else
+  setErrorMessage("This feature requires cuda 11.8 or later.",
+                  UR_RESULT_ERROR_ADAPTER_SPECIFIC);
+  return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+#endif // CUDA_VERSION >= 11080
 }
 
 /// Set parameters for general 3D memory copy.
@@ -807,28 +734,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
     ur_event_handle_t *phEvent) {
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
 
-  ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
-  auto Device = hQueue->getDevice();
-  ScopedContext Active(Device);
-  CUstream Stream = hQueue->getNextTransferStream();
-
   try {
     // Note that this entry point may be called on a queue that may not be the
     // last queue to write to the MemBuffer, meaning we must perform the copy
     // from a different device
-    if (hBuffer->LastEventWritingToMemObj &&
-        hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
-            hQueue->getDevice()) {
-      hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
-      Device = hQueue->getDevice();
-      ScopedContext Active(Device);
-      Stream = CUstream{0}; // Default stream for different device
-      // We may have to wait for an event on another queue if it is the last
-      // event writing to mem obj
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1,
-                                       &hBuffer->LastEventWritingToMemObj));
+    if (hBuffer->LastQueueWritingToMemObj &&
+        hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) {
+      hQueue = hBuffer->LastQueueWritingToMemObj;
     }
 
+    auto Device = hQueue->getDevice();
+    ScopedContext Active(Device);
+    CUstream Stream = hQueue->getNextTransferStream();
+
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
                                      phEventWaitList));
 
@@ -873,6 +791,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
   CUdeviceptr DevPtr =
       std::get<BufferMem>(hBuffer->Mem).getPtr(hQueue->getDevice());
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+  hBuffer->setLastQueueWritingToMemObj(hQueue);
 
   try {
     ScopedContext Active(hQueue->getDevice());
@@ -903,7 +822,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
     if (phEvent) {
       *phEvent = RetImplEvent.release();
     }
-
   } catch (ur_result_t Err) {
     return Err;
   }
@@ -1041,20 +959,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
     ur_event_handle_t *phEvent) {
   UR_ASSERT(size + offset <= std::get<BufferMem>(hBuffer->Mem).getSize(),
             UR_RESULT_ERROR_INVALID_SIZE);
-
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+  hBuffer->setLastQueueWritingToMemObj(hQueue);
 
   try {
     ScopedContext Active(hQueue->getDevice());
 
     auto Stream = hQueue->getNextTransferStream();
-    ur_result_t Result =
-        enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList);
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
+                                     phEventWaitList));
 
     if (phEvent) {
       RetImplEvent =
           std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
               UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream));
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
@@ -1080,8 +998,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
       break;
     }
     default: {
-      Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern,
-                                        DstDevice);
+      UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size,
+                                              pPattern, DstDevice));
       break;
     }
     }
@@ -1090,13 +1008,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
     }
-
-    return Result;
   } catch (ur_result_t Err) {
     return Err;
   } catch (...) {
     return UR_RESULT_ERROR_UNKNOWN;
   }
+  return UR_RESULT_SUCCESS;
 }
 
 static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) {
@@ -1197,28 +1114,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
 
   UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
 
-  ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex};
-  auto Device = hQueue->getDevice();
-  CUstream Stream = hQueue->getNextTransferStream();
-
   try {
     // Note that this entry point may be called on a queue that may not be the
     // last queue to write to the Image, meaning we must perform the copy
     // from a different device
-    if (hImage->LastEventWritingToMemObj &&
-        hImage->LastEventWritingToMemObj->getQueue()->getDevice() !=
-            hQueue->getDevice()) {
-      hQueue = hImage->LastEventWritingToMemObj->getQueue();
-      Device = hQueue->getDevice();
-      ScopedContext Active(Device);
-      Stream = CUstream{0}; // Default stream for different device
-      // We may have to wait for an event on another queue if it is the last
-      // event writing to mem obj
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1,
-                                       &hImage->LastEventWritingToMemObj));
+    if (hImage->LastQueueWritingToMemObj &&
+        hImage->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) {
+      hQueue = hImage->LastQueueWritingToMemObj;
     }
 
+    auto Device = hQueue->getDevice();
     ScopedContext Active(Device);
+    CUstream Stream = hQueue->getNextTransferStream();
+
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
                                      phEventWaitList));
 
@@ -1821,28 +1729,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
   UR_ASSERT(offset + size <= std::get<BufferMem>(hBuffer->Mem).Size,
             UR_RESULT_ERROR_INVALID_SIZE);
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
-  ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
-  auto Device = hQueue->getDevice();
-  ScopedContext Active(Device);
-  CUstream Stream = hQueue->getNextTransferStream();
 
   try {
     // Note that this entry point may be called on a queue that may not be the
     // last queue to write to the MemBuffer, meaning we must perform the copy
     // from a different device
-    if (hBuffer->LastEventWritingToMemObj &&
-        hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
-            hQueue->getDevice()) {
-      hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
-      Device = hQueue->getDevice();
-      ScopedContext Active(Device);
-      Stream = CUstream{0}; // Default stream for different device
-      // We may have to wait for an event on another queue if it is the last
-      // event writing to mem obj
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1,
-                                       &hBuffer->LastEventWritingToMemObj));
+    if (hBuffer->LastQueueWritingToMemObj &&
+        hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) {
+      hQueue = hBuffer->LastQueueWritingToMemObj;
     }
 
+    auto Device = hQueue->getDevice();
+    ScopedContext Active(Device);
+    CUstream Stream = hQueue->getNextTransferStream();
+
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
                                      phEventWaitList));
 
@@ -1884,17 +1784,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
   UR_ASSERT(offset + size <= std::get<BufferMem>(hBuffer->Mem).Size,
             UR_RESULT_ERROR_INVALID_SIZE);
 
-  ur_result_t Result = UR_RESULT_SUCCESS;
   CUdeviceptr DevPtr =
       std::get<BufferMem>(hBuffer->Mem).getPtr(hQueue->getDevice());
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+  hBuffer->setLastQueueWritingToMemObj(hQueue);
 
   try {
     ScopedContext Active(hQueue->getDevice());
     CUstream CuStream = hQueue->getNextTransferStream();
 
-    Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
-                               phEventWaitList);
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     if (phEvent) {
       RetImplEvent =
@@ -1917,9 +1817,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
       *phEvent = RetImplEvent.release();
     }
   } catch (ur_result_t Err) {
-    Result = Err;
+    return Err;
   }
-  return Result;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite(
diff --git a/source/adapters/cuda/enqueue.hpp b/source/adapters/cuda/enqueue.hpp
index c925a27295..be141f7b20 100644
--- a/source/adapters/cuda/enqueue.hpp
+++ b/source/adapters/cuda/enqueue.hpp
@@ -17,6 +17,10 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
                               uint32_t NumEventsInWaitList,
                               const ur_event_handle_t *EventWaitList);
 
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        ur_kernel_handle_t Kernel);
+
 bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
                                      ur_kernel_handle_t Kernel,
                                      size_t BlockSize);
diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp
index f9889a3f46..99446a3b7b 100644
--- a/source/adapters/cuda/event.cpp
+++ b/source/adapters/cuda/event.cpp
@@ -55,8 +55,7 @@ ur_result_t ur_event_handle_t_::start() {
 
   try {
     if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
-      // NOTE: This relies on the default stream to be unused.
-      UR_CHECK_ERROR(cuEventRecord(EvQueued, 0));
+      UR_CHECK_ERROR(cuEventRecord(EvQueued, Queue->getHostSubmitTimeStream()));
       UR_CHECK_ERROR(cuEventRecord(EvStart, Stream));
     }
   } catch (ur_result_t Err) {
@@ -176,7 +175,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent,
   case UR_EVENT_INFO_CONTEXT:
     return ReturnValue(hEvent->getContext());
   default:
-    detail::ur::die("Event info request not implemented");
+    break;
   }
 
   return UR_RESULT_ERROR_INVALID_ENUMERATION;
@@ -207,8 +206,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
   default:
     break;
   }
-  detail::ur::die("Event Profiling info request not implemented");
-  return {};
+  return UR_RESULT_ERROR_INVALID_ENUMERATION;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t,
@@ -280,8 +278,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle(
 
   std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
 
-  *phEvent = ur_event_handle_t_::makeWithNative(
-      hContext, reinterpret_cast<CUevent>(hNativeEvent));
+  try {
+    EventPtr =
+        std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeWithNative(
+            hContext, reinterpret_cast<CUevent>(hNativeEvent)));
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+
+  *phEvent = EventPtr.release();
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp
index 5ed68f0f25..3cb11469ce 100644
--- a/source/adapters/cuda/event.hpp
+++ b/source/adapters/cuda/event.hpp
@@ -90,6 +90,9 @@ struct ur_event_handle_t_ {
     const bool RequiresTimings =
         Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE ||
         Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP;
+    if (RequiresTimings) {
+      Queue->createHostSubmitTimeStream();
+    }
     native_type EvEnd = nullptr, EvQueued = nullptr, EvStart = nullptr;
     UR_CHECK_ERROR(cuEventCreate(
         &EvEnd, RequiresTimings ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));
diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp
index 95dc2e258e..b641685920 100644
--- a/source/adapters/cuda/image.cpp
+++ b/source/adapters/cuda/image.cpp
@@ -35,7 +35,7 @@ ur_result_t urCalculateNumChannels(ur_image_channel_order_t order,
     *NumChannels = 2;
     return UR_RESULT_SUCCESS;
   case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGB:
-    return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED;
+    return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT;
   case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGBA:
   case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_ARGB:
   case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_BGRA:
@@ -49,7 +49,7 @@ ur_result_t urCalculateNumChannels(ur_image_channel_order_t order,
   case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_INTENSITY:
   case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_LUMINANCE:
   default:
-    return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED;
+    return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT;
   }
 }
 
@@ -147,7 +147,7 @@ urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type,
       cuda_format = cuda_format_and_size.first;
       pixel_size_bytes = cuda_format_and_size.second;
     } catch (const std::out_of_range &) {
-      return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED;
+      return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT;
     }
   }
 
@@ -228,7 +228,7 @@ cudaToUrImageChannelFormat(CUarray_format cuda_format,
 #endif
 #undef MAP
   default:
-    return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED;
+    return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT;
   }
 }
 
diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp
index 675fdbe0a3..5e01845a56 100644
--- a/source/adapters/cuda/kernel.cpp
+++ b/source/adapters/cuda/kernel.cpp
@@ -9,7 +9,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "kernel.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
+#include "queue.hpp"
 #include "sampler.hpp"
 
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -380,3 +382,38 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex,
   }
   return Result;
 }
+
+/// Suggests a local work size (threads per block) for launching hKernel on
+/// hQueue's device over the given global range.
+///
+/// The queue and kernel must belong to the same context and workDim must be
+/// 1, 2 or 3; pGlobalWorkOffset is not used. On success the first workDim
+/// entries of pSuggestedLocalWorkSize are written.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
+  // Preconditions
+  UR_ASSERT(hQueue->getContext() == hKernel->getContext(),
+            UR_RESULT_ERROR_INVALID_KERNEL);
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  // guessLocalWorkSize is handed this pointer, so reject null up front.
+  UR_ASSERT(pGlobalWorkSize != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+  UR_ASSERT(pSuggestedLocalWorkSize != nullptr,
+            UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+  ur_device_handle_t Device = hQueue->Device;
+  size_t ThreadsPerBlock[3] = {};
+
+  // Set the active context here as guessLocalWorkSize needs an active context
+  ScopedContext Active(Device);
+
+  guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim,
+                     hKernel);
+
+  // Hand back only the dimensions the caller asked for.
+  std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
+            pSuggestedLocalWorkSize);
+  return UR_RESULT_SUCCESS;
+}
diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp
index c28ce98748..9ea62a2c1b 100644
--- a/source/adapters/cuda/memory.cpp
+++ b/source/adapters/cuda/memory.cpp
@@ -12,6 +12,7 @@
 
 #include "common.hpp"
 #include "context.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
 
 /// Creates a UR Memory object using a CUDA memory allocation.
@@ -211,7 +212,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
 
   UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC,
             UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
-  UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER,
+  UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_ARRAY,
             UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
   UR_ASSERT(pImageDesc->numMipLevel == 0,
             UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
@@ -227,15 +228,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
   // We only support RBGA channel order
   // TODO: check SYCL CTS and spec. May also have to support BGRA
   UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA,
-            UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
+            UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
 
   auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(
       new ur_mem_handle_t_{hContext, flags, *pImageFormat, *pImageDesc, pHost});
 
+  UR_ASSERT(std::get<SurfaceMem>(URMemObj->Mem).PixelTypeSizeBytes,
+            UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
+
   try {
     if (PerformInitialCopy) {
       for (const auto &Device : hContext->getDevices()) {
-        UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device));
+        // Synchronous behaviour is best in this case
+        ScopedContext Active(Device);
+        CUstream Stream{0}; // Use default stream
+        UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(),
+                                                            Device, Stream));
+        UR_CHECK_ERROR(cuStreamSynchronize(Stream));
       }
     }
 
@@ -429,11 +438,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
 ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
                                            const ur_device_handle_t hDevice) {
   ScopedContext Active(hDevice);
+  auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice);
   ur_lock LockGuard(Mem->MemoryAllocationMutex);
 
   if (Mem->isBuffer()) {
     auto &Buffer = std::get<BufferMem>(Mem->Mem);
-    auto &DevPtr = Buffer.Ptrs[hDevice->getIndex() % Buffer.Ptrs.size()];
+    auto &DevPtr = Buffer.Ptrs[DeviceIdx];
 
     // Allocation has already been made
     if (DevPtr != BufferMem::native_type{0}) {
@@ -456,11 +466,11 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
     try {
       auto &Image = std::get<SurfaceMem>(Mem->Mem);
       // Allocation has already been made
-      if (Image.Arrays[hDevice->getIndex() % Image.Arrays.size()]) {
+      if (Image.Arrays[DeviceIdx]) {
         return UR_RESULT_SUCCESS;
       }
       UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &Image.ArrayDesc));
-      Image.Arrays[hDevice->getIndex() % Image.Arrays.size()] = ImageArray;
+      Image.Arrays[DeviceIdx] = ImageArray;
 
       // CUDA_RESOURCE_DESC is a union of different structs, shown here
       // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
@@ -475,7 +485,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
       ImageResDesc.flags = 0;
 
       UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc));
-      Image.SurfObjs[hDevice->getIndex() % Image.SurfObjs.size()] = Surface;
+      Image.SurfObjs[DeviceIdx] = Surface;
     } catch (ur_result_t Err) {
       if (ImageArray) {
         UR_CHECK_ERROR(cuArrayDestroy(ImageArray));
@@ -492,27 +502,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
 }
 
 namespace {
-ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
-                                  ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem,
+                                         ur_device_handle_t hDevice,
+                                         CUstream Stream) {
   auto &Buffer = std::get<BufferMem>(Mem->Mem);
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     // Device allocation being initialized from host for the first time
     if (Buffer.HostPtr) {
-      UR_CHECK_ERROR(
-          cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+      UR_CHECK_ERROR(cuMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr,
+                                       Buffer.Size, Stream));
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
-    UR_CHECK_ERROR(cuMemcpyDtoD(
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
+    UR_CHECK_ERROR(cuMemcpyDtoDAsync(
         Buffer.getPtr(hDevice),
-        Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
-        Buffer.Size));
+        Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size,
+        Stream));
   }
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
-                                 ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem,
+                                        ur_device_handle_t hDevice,
+                                        CUstream Stream) {
   auto &Image = std::get<SurfaceMem>(Mem->Mem);
   // When a dimension isn't used image_desc has the size set to 1
   size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
@@ -543,40 +554,42 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
     CpyDesc3D.Depth = Image.ImageDesc.depth;
   }
 
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     if (Image.HostPtr) {
       if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
-        UR_CHECK_ERROR(
-            cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+        UR_CHECK_ERROR(cuMemcpyHtoAAsync(ImageArray, 0, Image.HostPtr,
+                                         ImageSizeBytes, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
         CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc2D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+        UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
       } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
         CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
         CpyDesc3D.srcHost = Image.HostPtr;
-        UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+        UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
       }
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
     if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      // Blocking wait needed
+      UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj));
       // FIXME: 1D memcpy from DtoD going through the host.
       UR_CHECK_ERROR(cuMemcpyAtoH(
           Image.HostPtr,
-          Image.getArray(
-              Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()),
           0 /*srcOffset*/, ImageSizeBytes));
       UR_CHECK_ERROR(
           cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
-      CpyDesc2D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+      CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+      CpyDesc2D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
-      CpyDesc3D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+      CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+      CpyDesc3D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream));
     }
   }
   return UR_RESULT_SUCCESS;
@@ -585,26 +598,52 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
 
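-// If calling this entry point it is necessary to lock the memoryMigrationMutex
-// beforehand
+// Enqueues onto Stream the copy (if any) needed to bring hDevice's allocation
+// of Mem up to date; callers are expected to order later work after Stream.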
-ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
-                                          const ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(
+    ur_mem_handle_t Mem, const ur_device_handle_t hDevice, CUstream Stream) {
   UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   // Device allocation has already been initialized with most up to date
   // data in buffer
-  if (Mem->HaveMigratedToDeviceSinceLastWrite
-          [hDevice->getIndex() %
-           Mem->HaveMigratedToDeviceSinceLastWrite.size()]) {
+  if (Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
+          hDevice)]) {
     return UR_RESULT_SUCCESS;
   }
 
   ScopedContext Active(hDevice);
   if (Mem->isBuffer()) {
-    UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream));
   } else {
-    UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream));
   }
 
-  Mem->HaveMigratedToDeviceSinceLastWrite
-      [hDevice->getIndex() % Mem->HaveMigratedToDeviceSinceLastWrite.size()] =
-      true;
+  Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex(
+      hDevice)] = true;
   return UR_RESULT_SUCCESS;
 }
+
+BufferMem::native_type
+BufferMem::getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
+  if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+      Err != UR_RESULT_SUCCESS) {
+    throw Err;
+  }
+  return reinterpret_cast<native_type>(
+      reinterpret_cast<uint8_t *>(
+          Ptrs[OuterMemStruct->getContext()->getDeviceIndex(Device)]) +
+      Offset);
+}
+
+CUarray SurfaceMem::getArray(const ur_device_handle_t Device) {
+  if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+      Err != UR_RESULT_SUCCESS) {
+    throw Err;
+  }
+  return Arrays[OuterMemStruct->getContext()->getDeviceIndex(Device)];
+}
+
+CUsurfObject SurfaceMem::getSurface(const ur_device_handle_t Device) {
+  if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+      Err != UR_RESULT_SUCCESS) {
+    throw Err;
+  }
+  return SurfObjs[OuterMemStruct->getContext()->getDeviceIndex(Device)];
+}
diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp
index 6b7e9d0156..a67e9295cc 100644
--- a/source/adapters/cuda/memory.hpp
+++ b/source/adapters/cuda/memory.hpp
@@ -22,8 +22,9 @@
 
 ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
                                            const ur_device_handle_t);
-ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
-                                          const ur_device_handle_t);
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                                 const ur_device_handle_t,
+                                                 CUstream);
 
 // Handler for plain, pointer-based CUDA allocations
 struct BufferMem {
@@ -97,16 +98,7 @@ struct BufferMem {
 
   BufferMem(const BufferMem &Buffer) = default;
 
-  native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
-    if (ur_result_t Err =
-            allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
-        Err != UR_RESULT_SUCCESS) {
-      throw Err;
-    }
-    return reinterpret_cast<native_type>(
-        reinterpret_cast<uint8_t *>(Ptrs[Device->getIndex() % Ptrs.size()]) +
-        Offset);
-  }
+  native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset);
 
   native_type getPtr(const ur_device_handle_t Device) {
     return getPtrWithOffset(Device, 0);
@@ -199,6 +191,7 @@ struct SurfaceMem {
   CUDA_ARRAY3D_DESCRIPTOR ArrayDesc;
   size_t PixelTypeSizeBytes;
   void *HostPtr;
+  ur_result_t error = UR_RESULT_SUCCESS;
 
   SurfaceMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct,
              ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc,
@@ -233,6 +226,7 @@ struct SurfaceMem {
       ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
       PixelTypeSizeBytes = 1;
       break;
+    case UR_IMAGE_CHANNEL_TYPE_SNORM_INT8:
     case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
       ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8;
       PixelTypeSizeBytes = 1;
@@ -242,6 +236,7 @@ struct SurfaceMem {
       ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
       PixelTypeSizeBytes = 2;
       break;
+    case UR_IMAGE_CHANNEL_TYPE_SNORM_INT16:
     case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
       ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16;
       PixelTypeSizeBytes = 2;
@@ -263,29 +258,15 @@ struct SurfaceMem {
       PixelTypeSizeBytes = 4;
       break;
     default:
-      detail::ur::die(
-          "urMemImageCreate given unsupported image_channel_data_type");
+      break;
     }
   }
 
   // Will allocate a new array on device if not already allocated
-  CUarray getArray(const ur_device_handle_t Device) {
-    if (ur_result_t Err =
-            allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
-        Err != UR_RESULT_SUCCESS) {
-      throw Err;
-    }
-    return Arrays[Device->getIndex() % Arrays.size()];
-  }
+  CUarray getArray(const ur_device_handle_t Device);
+
   // Will allocate a new surface on device if not already allocated
-  CUsurfObject getSurface(const ur_device_handle_t Device) {
-    if (ur_result_t Err =
-            allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
-        Err != UR_RESULT_SUCCESS) {
-      throw Err;
-    }
-    return SurfObjs[Device->getIndex() % SurfObjs.size()];
-  }
+  CUsurfObject getSurface(const ur_device_handle_t Device);
 
   ur_mem_type_t getType() { return ImageDesc.type; }
 
@@ -313,7 +294,7 @@ struct SurfaceMem {
 ///
 /// The ur_mem_handle_t is responsible for memory allocation and migration
 /// across devices in the same ur_context_handle_t. If a kernel writes to a
-/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all
+/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all
 /// subsequent operations that want to read from the ur_mem_handle_t must wait
 /// on the event referring to the last write.
 ///
@@ -333,61 +314,7 @@ struct SurfaceMem {
 ///
 /// Migrations will occur in both cases if the most recent version of data
 /// is on a different device, marked by
-/// LastEventWritingToMemObj->getQueue()->getDevice()
-///
-/// Example trace:
-/// ~~~~~~~~~~~~~~
-///
-/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1]
-///             -> OUT: hContext
-///
-/// =====> urMemBufferCreate(hContext,...);
-///             -> No native allocations made
-///             -> OUT: hBuffer
-///
-/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...);
-///             -> Allocation made on q0 ie device0
-///             -> New allocation initialized with host data.
-///
-/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...);
-///             -> ur_kernel_handle_t associated with a ur_program_handle_t,
-///                which is in turn unique to a device. So we can set the kernel
-///                arg with the ptr of the device specific allocation.
-///             -> hKernel0->getProgram()->getDevice() == device0
-///             -> allocateMemObjOnDeviceIfNeeded(device0);
-///                   -> Native allocation already made on device0, continue.
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///             -> Suppose that hKernel0 writes to hBuffer.
-///             -> Call hBuffer->setLastEventWritingToMemObj with return event
-///                from this operation
-///             -> Enqueue native kernel launch
-///
-/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...);
-///             -> hKernel1->getProgram()->getDevice() == device1
-///             -> New allocation will be made on device1 when calling
-///                getPtr(device1)
-///                   -> No native allocation on device1
-///                   -> Make native allocation on device1
-///
-/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...);
-///             -> Suppose hKernel1 wants to read from hBuffer and not write.
-///             -> migrateMemoryToDeviceIfNeeded(device1);
-///                   -> hBuffer->LastEventWritingToMemObj is not nullptr
-///                   -> Check if memory has been migrated to device1 since the
-///                      last write
-///                        -> Hasn't been migrated
-///                   -> Wait on LastEventWritingToMemObj.
-///                   -> Migrate memory from device0's native allocation to
-///                      device1's native allocation.
-///             -> Enqueue native kernel launch
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///             -> migrateMemoryToDeviceIfNeeded(device0);
-///                   -> hBuffer->LastEventWritingToMemObj refers to an event
-///                      from q0
-///                        -> Migration not necessary
-///             -> Enqueue native kernel launch
+/// LastQueueWritingToMemObj->getDevice()
 ///
 struct ur_mem_handle_t_ {
   // Context where the memory object is accessible
@@ -406,15 +333,13 @@ struct ur_mem_handle_t_ {
   // Has the memory been migrated to a device since the last write?
   std::vector<bool> HaveMigratedToDeviceSinceLastWrite;
 
-  // We should wait on this event prior to migrating memory across allocations
-  // in this ur_mem_handle_t_
-  ur_event_handle_t LastEventWritingToMemObj{nullptr};
+  // Queue with most up to date data of ur_mem_handle_t_
+  ur_queue_handle_t LastQueueWritingToMemObj{nullptr};
 
   // Enumerates all possible types of accesses.
   enum access_mode_t { unknown, read_write, read_only, write_only };
 
   ur_mutex MemoryAllocationMutex; // A mutex for allocations
-  ur_mutex MemoryMigrationMutex;  // A mutex for memory transfers
 
   /// A UR Memory object represents either plain memory allocations ("Buffers"
   /// in OpenCL) or typed allocations ("Images" in OpenCL).
@@ -503,21 +428,22 @@ struct ur_mem_handle_t_ {
 
   uint32_t getReferenceCount() const noexcept { return RefCount; }
 
-  void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) {
-    assert(NewEvent && "Invalid event!");
-    // This entry point should only ever be called when using multi device ctx
-    assert(Context->Devices.size() > 1);
-    urEventRetain(NewEvent);
-    if (LastEventWritingToMemObj != nullptr) {
-      urEventRelease(LastEventWritingToMemObj);
+  // Records WritingQueue as the last queue that wrote to this memory object.
+  // The new queue is retained before the previously recorded one is released.
+  void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) {
+    urQueueRetain(WritingQueue);
+    if (LastQueueWritingToMemObj != nullptr) {
+      urQueueRelease(LastQueueWritingToMemObj);
     }
-    LastEventWritingToMemObj = NewEvent;
+    LastQueueWritingToMemObj = WritingQueue;
     for (const auto &Device : Context->getDevices()) {
-      // This event is never an interop event so will always have an associated
-      // queue
-      HaveMigratedToDeviceSinceLastWrite
-          [Device->getIndex() % HaveMigratedToDeviceSinceLastWrite.size()] =
-              Device == NewEvent->getQueue()->getDevice();
+      // Only the writing queue's device is flagged as up to date; the flag is
+      // cleared for every other device in the context
+      HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] =
+          Device == WritingQueue->getDevice();
     }
   }
 };
+
+ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                          const ur_device_handle_t);
diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp
index 27b94f756f..d89a5b7a19 100644
--- a/source/adapters/cuda/platform.cpp
+++ b/source/adapters/cuda/platform.cpp
@@ -141,12 +141,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle(
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
-    ur_native_handle_t hNativePlatform,
-    const ur_platform_native_properties_t *pProperties,
-    ur_platform_handle_t *phPlatform) {
-  std::ignore = hNativePlatform;
-  std::ignore = pProperties;
-  std::ignore = phPlatform;
+    ur_native_handle_t, ur_adapter_handle_t,
+    const ur_platform_native_properties_t *, ur_platform_handle_t *) {
+  // There is no CUDA equivalent to ur_platform_handle_t
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp
index 773126f4d5..35a2272cbc 100644
--- a/source/adapters/cuda/queue.cpp
+++ b/source/adapters/cuda/queue.cpp
@@ -201,6 +201,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
       UR_CHECK_ERROR(cuStreamDestroy(S));
     });
 
+    if (hQueue->getHostSubmitTimeStream() != CUstream{0}) {
+      UR_CHECK_ERROR(cuStreamSynchronize(hQueue->getHostSubmitTimeStream()));
+      UR_CHECK_ERROR(cuStreamDestroy(hQueue->getHostSubmitTimeStream()));
+    }
+
     return UR_RESULT_SUCCESS;
   } catch (ur_result_t Err) {
     return Err;
diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp
index 46e9968fa9..727df68573 100644
--- a/source/adapters/cuda/queue.hpp
+++ b/source/adapters/cuda/queue.hpp
@@ -9,10 +9,12 @@
 //===----------------------------------------------------------------------===//
 #pragma once
 
+#include "common.hpp"
 #include <ur/ur.hpp>
 
 #include <algorithm>
 #include <cuda.h>
+#include <mutex>
 #include <vector>
 
 using ur_stream_guard_ = std::unique_lock<std::mutex>;
@@ -27,6 +29,10 @@ struct ur_queue_handle_t_ {
 
   std::vector<native_type> ComputeStreams;
   std::vector<native_type> TransferStreams;
+  // Stream used for recording EvQueue, which holds information about when the
+  // command in question is enqueued on host, as opposed to started. It is
+  // created only if profiling is enabled - either for queue or per event.
+  native_type HostSubmitTimeStream{0};
   // delay_compute_ keeps track of which streams have been recently reused and
   // their next use should be delayed. If a stream has been recently reused it
   // will be skipped the next time it would be selected round-robin style. When
@@ -99,6 +105,18 @@ struct ur_queue_handle_t_ {
   native_type get() { return getNextComputeStream(); };
   ur_device_handle_t getDevice() const noexcept { return Device; };
 
+  // Creates the profiling stream on first use. Called only from makeNative
+  // event when profiling is required. Flag is a member so each queue gets one.
+  std::once_flag HostSubmitTimeStreamFlag;
+  void createHostSubmitTimeStream() {
+    std::call_once(HostSubmitTimeStreamFlag, [&]() {
+      UR_CHECK_ERROR(cuStreamCreateWithPriority(&HostSubmitTimeStream,
+                                                CU_STREAM_NON_BLOCKING, 0));
+    });
+  }
+
+  native_type getHostSubmitTimeStream() { return HostSubmitTimeStream; }
+
   bool hasBeenSynchronized(uint32_t StreamToken) {
     // stream token not associated with one of the compute streams
     if (StreamToken == std::numeric_limits<uint32_t>::max()) {
diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp
index b70198b227..fc8cad9d43 100644
--- a/source/adapters/cuda/ur_interface_loader.cpp
+++ b/source/adapters/cuda/ur_interface_loader.cpp
@@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgValue = urKernelSetArgValue;
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = nullptr;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp
index c0035052d8..89f4b9a6a2 100644
--- a/source/adapters/cuda/usm.cpp
+++ b/source/adapters/cuda/usm.cpp
@@ -261,16 +261,15 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
                                            CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
                                            (CUdeviceptr)pMem));
 
-      // currently each device is in its own platform, so find the platform at
-      // the same index
-      std::vector<ur_platform_handle_t> Platforms;
-      Platforms.resize(DeviceIndex + 1);
+      // cuda backend has only one platform containing all devices
+      ur_platform_handle_t Platform = nullptr;
       ur_adapter_handle_t AdapterHandle = &adapter;
-      Result = urPlatformGet(&AdapterHandle, 1, DeviceIndex + 1,
-                             Platforms.data(), nullptr);
+      Result = urPlatformGet(&AdapterHandle, 1, 1, &Platform, nullptr);
+      if (Result != UR_RESULT_SUCCESS)
+        return Result;
 
       // get the device from the platform
-      ur_device_handle_t Device = Platforms[DeviceIndex]->Devices[0].get();
+      ur_device_handle_t Device = Platform->Devices[DeviceIndex].get();
       return ReturnValue(Device);
     }
     case UR_USM_ALLOC_INFO_POOL: {
diff --git a/source/adapters/hip/common.hpp b/source/adapters/hip/common.hpp
index be332c280b..98799d58f5 100644
--- a/source/adapters/hip/common.hpp
+++ b/source/adapters/hip/common.hpp
@@ -204,3 +204,38 @@ template <typename T> class ReleaseGuard {
   /// UR object.
   void dismiss() { Captive = nullptr; }
 };
+
+// Helper method to return a (non-null) pointer's attributes, or std::nullopt in
+// the case that the pointer is unknown to the HIP subsystem.
+inline static std::optional<hipPointerAttribute_t>
+getPointerAttributes(const void *pMem) {
+  // do not throw if hipPointerGetAttributes returns hipErrorInvalidValue
+  hipPointerAttribute_t hipPointerAttributes;
+  hipError_t Ret = hipPointerGetAttributes(&hipPointerAttributes, pMem);
+  if (Ret == hipErrorInvalidValue && pMem) {
+    // pointer non-null but not known to the HIP subsystem
+    return std::nullopt;
+  }
+  // Direct usage of the function, instead of UR_CHECK_ERROR, so we can get
+  // the line offset.
+  checkErrorUR(Ret, __func__, __LINE__ - 7, __FILE__);
+  // ROCm 6.0.0 introduces hipMemoryTypeUnregistered in the hipMemoryType
+  // enum to mark unregistered allocations (i.e., via system allocators).
+#if HIP_VERSION_MAJOR >= 6
+  if (hipPointerAttributes.type == hipMemoryTypeUnregistered) {
+    // pointer not known to the HIP subsystem
+    return std::nullopt;
+  }
+#endif
+  return hipPointerAttributes;
+}
+
+// Helper method to abstract away the fact that retrieving a pointer's memory
+// type differs depending on the version of HIP.
+inline static unsigned getMemoryType(hipPointerAttribute_t hipPointerAttrs) {
+#if HIP_VERSION >= 50600000
+  return hipPointerAttrs.type;
+#else
+  return hipPointerAttrs.memoryType;
+#endif
+}
diff --git a/source/adapters/hip/context.cpp b/source/adapters/hip/context.cpp
index 73ac777edb..c3fcb3a1a3 100644
--- a/source/adapters/hip/context.cpp
+++ b/source/adapters/hip/context.cpp
@@ -47,18 +47,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(
     // Create a scoped context.
     ContextPtr = std::unique_ptr<ur_context_handle_t_>(
         new ur_context_handle_t_{phDevices, DeviceCount});
-
-    static std::once_flag InitFlag;
-    std::call_once(
-        InitFlag,
-        [](ur_result_t &) {
-          // Use default stream to record base event counter
-          UR_CHECK_ERROR(hipEventCreateWithFlags(&ur_platform_handle_t_::EvBase,
-                                                 hipEventDefault));
-          UR_CHECK_ERROR(hipEventRecord(ur_platform_handle_t_::EvBase, 0));
-        },
-        RetErr);
-
     *phContext = ContextPtr.release();
   } catch (ur_result_t Err) {
     RetErr = Err;
@@ -78,7 +66,8 @@ urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName,
   case UR_CONTEXT_INFO_NUM_DEVICES:
     return ReturnValue(static_cast<uint32_t>(hContext->Devices.size()));
   case UR_CONTEXT_INFO_DEVICES:
-    return ReturnValue(hContext->getDevices());
+    return ReturnValue(hContext->getDevices().data(),
+                       hContext->getDevices().size());
   case UR_CONTEXT_INFO_REFERENCE_COUNT:
     return ReturnValue(hContext->getReferenceCount());
   case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES:
diff --git a/source/adapters/hip/context.hpp b/source/adapters/hip/context.hpp
index 69d4df9b6d..90366436e2 100644
--- a/source/adapters/hip/context.hpp
+++ b/source/adapters/hip/context.hpp
@@ -112,6 +112,13 @@ struct ur_context_handle_t_ {
     return Devices;
   }
 
+  // Gets the index of the device relative to other devices in the context
+  size_t getDeviceIndex(ur_device_handle_t hDevice) {
+    auto It = std::find(Devices.begin(), Devices.end(), hDevice);
+    assert(It != Devices.end());
+    return std::distance(Devices.begin(), It);
+  }
+
   uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
 
   uint32_t decrementReferenceCount() noexcept { return --RefCount; }
diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp
index dd20a4f50f..0e9b50f94e 100644
--- a/source/adapters/hip/device.cpp
+++ b/source/adapters/hip/device.cpp
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "device.hpp"
+#include "adapter.hpp"
 #include "context.hpp"
 #include "event.hpp"
 
@@ -20,6 +21,18 @@ int getAttribute(ur_device_handle_t Device, hipDeviceAttribute_t Attribute) {
   return Value;
 }
 
+uint64_t ur_device_handle_t_::getElapsedTime(hipEvent_t ev) const {
+  float Milliseconds = 0.0f;
+
+  // hipEventSynchronize waits till the event is ready for call to
+  // hipEventElapsedTime.
+  UR_CHECK_ERROR(hipEventSynchronize(EvBase));
+  UR_CHECK_ERROR(hipEventSynchronize(ev));
+  UR_CHECK_ERROR(hipEventElapsedTime(&Milliseconds, EvBase, ev));
+
+  return static_cast<uint64_t>(Milliseconds * 1.0e6);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
                                                     ur_device_info_t propName,
                                                     size_t propSize,
@@ -177,7 +190,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     int WarpSize = 0;
     UR_CHECK_ERROR(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize,
                                          hDevice->get()));
-    size_t Sizes[1] = {static_cast<size_t>(WarpSize)};
+    uint32_t Sizes[1] = {static_cast<uint32_t>(WarpSize)};
     return ReturnValue(Sizes, 1);
   }
   case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: {
@@ -321,7 +334,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(static_cast<size_t>(Min));
   }
   case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: {
-    return ReturnValue(0lu);
+    return ReturnValue(size_t(0));
   }
   case UR_DEVICE_INFO_MAX_SAMPLERS: {
     // This call is kind of meaningless for HIP, as samplers don't exist.
@@ -331,7 +344,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: {
     // __global__ function parameters are passed to the device via constant
     // memory and are limited to 4 KB.
-    return ReturnValue(4000lu);
+    return ReturnValue(size_t(4000));
   }
   case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: {
     int MemBaseAddrAlign = 0;
@@ -442,7 +455,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: {
     // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX
     // 1060 3GB
-    return ReturnValue(1000lu);
+    return ReturnValue(size_t(1000));
   }
   case UR_DEVICE_INFO_ENDIAN_LITTLE: {
     return ReturnValue(true);
@@ -465,10 +478,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(Capability);
   }
   case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: {
-    // The mandated minimum capability:
-    ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
-                                  UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
-    return ReturnValue(Capability);
+    return ReturnValue(0);
   }
   case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES:
   case UR_DEVICE_INFO_QUEUE_PROPERTIES: {
@@ -569,7 +579,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   }
   case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: {
     // The minimum value for the FULL profile is 1 MB.
-    return ReturnValue(1024lu);
+    return ReturnValue(size_t(1024));
   }
   case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: {
     return ReturnValue(true);
@@ -761,6 +771,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return ReturnValue(int32_t{1});
   }
 
+  case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS: {
+    return ReturnValue(ur_bool_t{false});
+  }
+
   case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
     ur_memory_order_capability_flags_t Capabilities =
         UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
@@ -768,16 +782,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
         UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE;
     return ReturnValue(Capabilities);
   }
-  case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
-  case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: {
+  case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
     // SYCL2020 4.6.4.2 minimum mandated capabilities for
     // atomic_fence/memory_scope_capabilities.
     // Because scopes are hierarchical, wider scopes support all narrower
     // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and
     // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382)
-    uint64_t Capabilities = UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
-                            UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
-                            UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP;
+    ur_memory_scope_capability_flags_t Capabilities =
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP;
+    return ReturnValue(Capabilities);
+  }
+  case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: {
+    constexpr ur_memory_scope_capability_flags_t Capabilities =
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE |
+        UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM;
     return ReturnValue(Capabilities);
   }
   case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: {
@@ -788,6 +811,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
         UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
         UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE |
         UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL;
+#ifdef __HIP_PLATFORM_NVIDIA__
+    // Nvidia introduced fence.sc for seq_cst only since SM 7.0.
+    int Major = 0;
+    UR_CHECK_ERROR(hipDeviceGetAttribute(
+        &Major, hipDeviceAttributeComputeCapabilityMajor, hDevice->get()));
+    if (Major >= 7)
+      Capabilities |= UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST;
+#else
+    Capabilities |= UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST;
+#endif
     return ReturnValue(Capabilities);
   }
   case UR_DEVICE_INFO_DEVICE_ID: {
@@ -950,8 +983,57 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle(
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle(
-    ur_native_handle_t, ur_platform_handle_t,
-    const ur_device_native_properties_t *, ur_device_handle_t *) {
+    ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform,
+    [[maybe_unused]] const ur_device_native_properties_t *pProperties,
+    ur_device_handle_t *phDevice) {
+  // We can't cast between ur_native_handle_t and hipDevice_t, so memcpy the
+  // bits instead
+  hipDevice_t HIPDevice = 0;
+  memcpy(&HIPDevice, &hNativeDevice, sizeof(hipDevice_t));
+
+  auto IsDevice = [=](std::unique_ptr<ur_device_handle_t_> &Dev) {
+    return Dev->get() == HIPDevice;
+  };
+
+  // If a platform is provided just check if the device is in it
+  if (hPlatform) {
+    auto SearchRes = std::find_if(begin(hPlatform->Devices),
+                                  end(hPlatform->Devices), IsDevice);
+    if (SearchRes != end(hPlatform->Devices)) {
+      *phDevice = SearchRes->get();
+      return UR_RESULT_SUCCESS;
+    }
+  }
+
+  // Get list of platforms
+  uint32_t NumPlatforms = 0;
+  ur_adapter_handle_t AdapterHandle = &adapter;
+  ur_result_t Result =
+      urPlatformGet(&AdapterHandle, 1, 0, nullptr, &NumPlatforms);
+  if (Result != UR_RESULT_SUCCESS)
+    return Result;
+
+  // We can only have a maximum of one platform.
+  if (NumPlatforms != 1)
+    return UR_RESULT_ERROR_INVALID_OPERATION;
+
+  ur_platform_handle_t Platform = nullptr;
+
+  Result = urPlatformGet(&AdapterHandle, 1, NumPlatforms, &Platform, nullptr);
+  if (Result != UR_RESULT_SUCCESS)
+    return Result;
+
+  // Iterate through the platform's devices to find the device that matches
+  // nativeHandle
+  auto SearchRes = std::find_if(std::begin(Platform->Devices),
+                                std::end(Platform->Devices), IsDevice);
+  if (SearchRes != end(Platform->Devices)) {
+    *phDevice = static_cast<ur_device_handle_t>((*SearchRes).get());
+    return UR_RESULT_SUCCESS;
+  }
+
+  // If the provided nativeHandle cannot be matched to an
+  // existing device return error
   return UR_RESULT_ERROR_INVALID_OPERATION;
 }
 
@@ -995,11 +1077,7 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
   if (pDeviceTimestamp) {
     UR_CHECK_ERROR(hipEventCreateWithFlags(&Event, hipEventDefault));
     UR_CHECK_ERROR(hipEventRecord(Event));
-    UR_CHECK_ERROR(hipEventSynchronize(Event));
-    float ElapsedTime = 0.0f;
-    UR_CHECK_ERROR(hipEventElapsedTime(&ElapsedTime,
-                                       ur_platform_handle_t_::EvBase, Event));
-    *pDeviceTimestamp = (uint64_t)(ElapsedTime * (double)1e6);
+    *pDeviceTimestamp = hDevice->getElapsedTime(Event);
   }
 
   if (pHostTimestamp) {
diff --git a/source/adapters/hip/device.hpp b/source/adapters/hip/device.hpp
index 269848f3cd..5fd11bfc2f 100644
--- a/source/adapters/hip/device.hpp
+++ b/source/adapters/hip/device.hpp
@@ -25,7 +25,9 @@ struct ur_device_handle_t_ {
   std::atomic_uint32_t RefCount;
   ur_platform_handle_t Platform;
   hipCtx_t HIPContext;
+  hipEvent_t EvBase; // HIP event used as base counter
   uint32_t DeviceIndex;
+
   int MaxWorkGroupSize{0};
   int MaxBlockDimX{0};
   int MaxBlockDimY{0};
@@ -36,9 +38,10 @@ struct ur_device_handle_t_ {
 
 public:
   ur_device_handle_t_(native_type HipDevice, hipCtx_t Context,
-                      ur_platform_handle_t Platform, uint32_t DeviceIndex)
+                      hipEvent_t EvBase, ur_platform_handle_t Platform,
+                      uint32_t DeviceIndex)
       : HIPDevice(HipDevice), RefCount{1}, Platform(Platform),
-        HIPContext(Context), DeviceIndex(DeviceIndex) {
+        HIPContext(Context), EvBase(EvBase), DeviceIndex(DeviceIndex) {
 
     UR_CHECK_ERROR(hipDeviceGetAttribute(
         &MaxWorkGroupSize, hipDeviceAttributeMaxThreadsPerBlock, HIPDevice));
@@ -68,6 +71,8 @@ struct ur_device_handle_t_ {
 
   ur_platform_handle_t getPlatform() const noexcept { return Platform; };
 
+  uint64_t getElapsedTime(hipEvent_t) const;
+
   hipCtx_t getNativeContext() const noexcept { return HIPContext; };
 
   // Returns the index of the device relative to the other devices in the same
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
index 4b98a9c491..4fc4f95f75 100644
--- a/source/adapters/hip/enqueue.cpp
+++ b/source/adapters/hip/enqueue.cpp
@@ -160,8 +160,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
             UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
   UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
 
-  ur_result_t Result = UR_RESULT_SUCCESS;
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+  hBuffer->setLastQueueWritingToMemObj(hQueue);
 
   try {
     ScopedContext Active(hQueue->getDevice());
@@ -193,9 +193,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
       *phEvent = RetImplEvent.release();
     }
   } catch (ur_result_t Err) {
-    Result = Err;
+    return Err;
   }
-  return Result;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
@@ -210,29 +210,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
 
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
 
-  ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
-  auto Device = hQueue->getDevice();
-  hipStream_t HIPStream = hQueue->getNextTransferStream();
-
   try {
     // Note that this entry point may be called on a queue that may not be the
     // last queue to write to the MemBuffer, meaning we must perform the copy
     // from a different device
-    if (hBuffer->LastEventWritingToMemObj &&
-        hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
-            hQueue->getDevice()) {
-      // This event is never created with interop so getQueue is never null
-      hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
-      Device = hQueue->getDevice();
-      ScopedContext Active(Device);
-      HIPStream = hipStream_t{0}; // Default stream for different device
-      // We may have to wait for an event on another queue if it is the last
-      // event writing to mem obj
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1,
-                                       &hBuffer->LastEventWritingToMemObj));
+    if (hBuffer->LastQueueWritingToMemObj &&
+        hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) {
+      hQueue = hBuffer->LastQueueWritingToMemObj;
     }
 
+    auto Device = hQueue->getDevice();
     ScopedContext Active(Device);
+    hipStream_t HIPStream = hQueue->getNextTransferStream();
 
     // Use the default stream if copying from another device
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
@@ -280,44 +269,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
   UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 
-  std::vector<ur_event_handle_t> DepEvents(
-      phEventWaitList, phEventWaitList + numEventsInWaitList);
-  std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
-
-  // phEventWaitList only contains events that are handed to UR by the SYCL
-  // runtime. However since UR handles memory dependencies within a context
-  // we may need to add more events to our dependent events list if the UR
-  // context contains multiple devices
-  if (hQueue->getContext()->Devices.size() > 1) {
-    MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size());
-    for (auto &MemArg : hKernel->Args.MemObjArgs) {
-      bool PushBack = false;
-      if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
-          MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
-                                   MemDepEvent) == DepEvents.end()) {
-        DepEvents.push_back(MemDepEvent);
-        PushBack = true;
-      }
-      if ((MemArg.AccessFlags &
-           (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) ||
-          PushBack) {
-        if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(),
-                         [MemArg](auto &Lock) {
-                           return Lock.first == MemArg.Mem;
-                         }) == MemMigrationLocks.end())
-          MemMigrationLocks.emplace_back(
-              std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}});
-      }
-    }
-  }
-
   // Early exit for zero size range kernel
   if (*pGlobalWorkSize == 0) {
-    if (DepEvents.size()) {
-      return urEnqueueEventsWaitWithBarrier(hQueue, DepEvents.size(),
-                                            phEventWaitList, phEvent);
-    }
-    return UR_RESULT_SUCCESS;
+    return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
+                                          phEventWaitList, phEvent);
   }
 
   // Set the number of threads per block to the number of threads per warp
@@ -325,7 +280,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  ur_result_t Result = UR_RESULT_SUCCESS;
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
 
   try {
@@ -343,20 +297,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     hipStream_t HIPStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
 
-    if (DepEvents.size()) {
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, DepEvents.size(),
-                                       DepEvents.data()));
-    }
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     // For memory migration across devices in the same context
     if (hQueue->getContext()->Devices.size() > 1) {
       for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+        enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(),
+                                             HIPStream);
+        if (MemArg.AccessFlags &
+            (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) {
+          MemArg.Mem->setLastQueueWritingToMemObj(hQueue);
+        }
       }
     }
 
     auto ArgIndices = hKernel->getArgIndices();
 
+    // A return event is created only when the caller passes phEvent; buffer
+    // writes are tracked per queue via setLastQueueWritingToMemObj above
     if (phEvent) {
       RetImplEvent =
           std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
@@ -364,20 +323,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
-    // Once event has been started we can unlock MemoryMigrationMutex
-    if (hQueue->getContext()->Devices.size() > 1) {
-      for (auto &MemArg : hKernel->Args.MemObjArgs) {
-        // Telling the ur_mem_handle_t that it will need to wait on this kernel
-        // if it has been written to
-        if (phEvent && (MemArg.AccessFlags &
-                        (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) {
-          MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get());
-        }
-      }
-      // We can release the MemoryMigrationMutexes now
-      MemMigrationLocks.clear();
-    }
-
     UR_CHECK_ERROR(hipModuleLaunchKernel(
         HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
         ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2],
@@ -390,9 +335,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       *phEvent = RetImplEvent.release();
     }
   } catch (ur_result_t err) {
-    Result = err;
+    return err;
   }
-  return Result;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
@@ -578,30 +523,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
 
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
 
-  ur_result_t Result = UR_RESULT_SUCCESS;
-  ur_lock MemoryMigrationLock(hBuffer->MemoryMigrationMutex);
-  auto Device = hQueue->getDevice();
-  hipStream_t HIPStream = hQueue->getNextTransferStream();
-
   try {
     // Note that this entry point may be called on a queue that may not be the
     // last queue to write to the MemBuffer, meaning we must perform the copy
     // from a different device
-    if (hBuffer->LastEventWritingToMemObj &&
-        hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
-            hQueue->getDevice()) {
-      // This event is never created with interop so getQueue is never null
-      hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
-      Device = hQueue->getDevice();
-      ScopedContext Active(Device);
-      HIPStream = hipStream_t{0}; // Default stream for different device
-      // We may have to wait for an event on another queue if it is the last
-      // event writing to mem obj
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1,
-                                       &hBuffer->LastEventWritingToMemObj));
+    if (hBuffer->LastQueueWritingToMemObj &&
+        hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) {
+      hQueue = hBuffer->LastQueueWritingToMemObj;
     }
 
+    auto Device = hQueue->getDevice();
     ScopedContext Active(Device);
+    hipStream_t HIPStream = hQueue->getNextTransferStream();
 
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
                                      phEventWaitList));
@@ -632,9 +565,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
     }
 
   } catch (ur_result_t Err) {
-    Result = Err;
+    return Err;
   }
-  return Result;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
@@ -644,27 +577,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
     size_t hostRowPitch, size_t hostSlicePitch, void *pSrc,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
-  ur_result_t Result = UR_RESULT_SUCCESS;
   void *DevPtr = std::get<BufferMem>(hBuffer->Mem).getVoid(hQueue->getDevice());
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+  hBuffer->setLastQueueWritingToMemObj(hQueue);
 
   try {
     ScopedContext Active(hQueue->getDevice());
     hipStream_t HIPStream = hQueue->getNextTransferStream();
-    Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
-                               phEventWaitList);
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     if (phEvent) {
       RetImplEvent =
           std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
-              UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, HIPStream));
+              UR_COMMAND_MEM_BUFFER_WRITE, hQueue, HIPStream));
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
-    Result = commonEnqueueMemBufferCopyRect(
+    UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect(
         HIPStream, region, pSrc, hipMemoryTypeHost, hostOrigin, hostRowPitch,
         hostSlicePitch, &DevPtr, hipMemoryTypeDevice, bufferOrigin,
-        bufferRowPitch, bufferSlicePitch);
+        bufferRowPitch, bufferSlicePitch));
 
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
@@ -677,11 +610,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
     if (phEvent) {
       *phEvent = RetImplEvent.release();
     }
-
   } catch (ur_result_t Err) {
-    Result = Err;
+    return Err;
   }
-  return Result;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
@@ -803,8 +735,8 @@ static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
 // HIP has no memset functions that allow setting values more than 4 bytes. UR
 // API lets you pass an arbitrary "pattern" to the buffer fill, which can be
 // more than 4 bytes. We must break up the pattern into 1 byte values, and set
-// the buffer using multiple strided calls.  The first 4 patterns are set using
-// hipMemsetD32Async then all subsequent 1 byte patterns are set using
+// the buffer using multiple strided calls.  The first 4 patterns are set
+// using hipMemsetD32Async then all subsequent 1 byte patterns are set using
 // hipMemset2DAsync which is called for each pattern.
 ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
                                      size_t Size, const void *pPattern,
@@ -823,8 +755,8 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
   UR_CHECK_ERROR(hipPointerGetAttributes(&ptrAttribs, (const void *)Ptr));
 
   // The hostPointer attribute is non-null also for shared memory allocations.
-  // To make sure that this workaround only executes for host pinned memory, we
-  // need to check that isManaged attribute is false.
+  // To make sure that this workaround only executes for host pinned memory,
+  // we need to check that isManaged attribute is false.
   if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) {
     const auto NumOfCopySteps = Size / PatternSize;
     const auto Offset = sizeof(uint32_t);
@@ -857,38 +789,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
     ur_event_handle_t *phEvent) {
   UR_ASSERT(size + offset <= std::get<BufferMem>(hBuffer->Mem).getSize(),
             UR_RESULT_ERROR_INVALID_SIZE);
-  auto ArgsAreMultiplesOfPatternSize =
-      (offset % patternSize == 0) || (size % patternSize == 0);
-
-  auto PatternIsValid = (pPattern != nullptr);
-
-  auto PatternSizeIsValid =
-      ((patternSize & (patternSize - 1)) == 0) && // is power of two
-      (patternSize > 0) && (patternSize <= 128);  // falls within valid range
-
-  UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid &&
-                PatternSizeIsValid,
-            UR_RESULT_ERROR_INVALID_VALUE);
-  std::ignore = ArgsAreMultiplesOfPatternSize;
-  std::ignore = PatternIsValid;
-  std::ignore = PatternSizeIsValid;
 
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+  hBuffer->setLastQueueWritingToMemObj(hQueue);
 
   try {
     ScopedContext Active(hQueue->getDevice());
 
     auto Stream = hQueue->getNextTransferStream();
-    ur_result_t Result = UR_RESULT_SUCCESS;
     if (phEventWaitList) {
-      Result = enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
-                                 phEventWaitList);
+      UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
+                                       phEventWaitList));
     }
 
     if (phEvent) {
       RetImplEvent =
           std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
-              UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream));
+              UR_COMMAND_MEM_BUFFER_WRITE, hQueue, Stream));
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
@@ -915,8 +832,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
     }
 
     default: {
-      Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern,
-                                        DstDevice);
+      UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size,
+                                              pPattern, DstDevice));
       break;
     }
     }
@@ -925,13 +842,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
       UR_CHECK_ERROR(RetImplEvent->record());
       *phEvent = RetImplEvent.release();
     }
-
-    return Result;
   } catch (ur_result_t Err) {
     return Err;
   } catch (...) {
     return UR_RESULT_ERROR_UNKNOWN;
   }
+  return UR_RESULT_SUCCESS;
 }
 
 /// General ND memory copy operation for images (where N > 1).
@@ -948,7 +864,7 @@ static ur_result_t commonEnqueueMemImageNDCopy(
   UR_ASSERT(DstType == hipMemoryTypeArray || DstType == hipMemoryTypeHost,
             UR_RESULT_ERROR_INVALID_VALUE);
 
-  if (ImgType == UR_MEM_TYPE_IMAGE2D) {
+  if (ImgType == UR_MEM_TYPE_IMAGE1D || ImgType == UR_MEM_TYPE_IMAGE2D) {
     hip_Memcpy2D CpyDesc;
     memset(&CpyDesc, 0, sizeof(CpyDesc));
     CpyDesc.srcMemoryType = SrcType;
@@ -956,7 +872,7 @@ static ur_result_t commonEnqueueMemImageNDCopy(
       CpyDesc.srcArray =
           reinterpret_cast<hipCUarray>(const_cast<void *>(SrcPtr));
       CpyDesc.srcXInBytes = SrcOffset[0];
-      CpyDesc.srcY = SrcOffset[1];
+      CpyDesc.srcY = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 0 : SrcOffset[1];
     } else {
       CpyDesc.srcHost = SrcPtr;
     }
@@ -965,12 +881,12 @@ static ur_result_t commonEnqueueMemImageNDCopy(
       CpyDesc.dstArray =
           reinterpret_cast<hipCUarray>(const_cast<void *>(DstPtr));
       CpyDesc.dstXInBytes = DstOffset[0];
-      CpyDesc.dstY = DstOffset[1];
+      CpyDesc.dstY = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 0 : DstOffset[1];
     } else {
       CpyDesc.dstHost = DstPtr;
     }
     CpyDesc.WidthInBytes = Region[0];
-    CpyDesc.Height = Region[1];
+    CpyDesc.Height = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 1 : Region[1];
     UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc, HipStream));
     return UR_RESULT_SUCCESS;
   }
@@ -1015,28 +931,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
   UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
 
-  ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex};
-  auto Device = hQueue->getDevice();
-  hipStream_t HIPStream = hQueue->getNextTransferStream();
-
   try {
     // Note that this entry point may be called on a queue that may not be the
-    // last queue to write to the MemBuffer, meaning we must perform the copy
+    // last queue to write to the MemImage, meaning we must perform the copy
     // from a different device
-    if (hImage->LastEventWritingToMemObj &&
-        hImage->LastEventWritingToMemObj->getQueue()->getDevice() !=
-            hQueue->getDevice()) {
-      hQueue = hImage->LastEventWritingToMemObj->getQueue();
-      Device = hQueue->getDevice();
-      ScopedContext Active(Device);
-      HIPStream = hipStream_t{0}; // Default stream for different device
-      // We may have to wait for an event on another queue if it is the last
-      // event writing to mem obj
-      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1,
-                                       &hImage->LastEventWritingToMemObj));
+    if (hImage->LastQueueWritingToMemObj &&
+        hImage->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) {
+      hQueue = hImage->LastQueueWritingToMemObj;
     }
 
+    auto Device = hQueue->getDevice();
     ScopedContext Active(Device);
+    hipStream_t HIPStream = hQueue->getNextTransferStream();
 
     if (phEventWaitList) {
       UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
@@ -1052,11 +958,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
     int ElementByteSize = imageElementByteSize(Format);
 
     size_t ByteOffsetX = origin.x * ElementByteSize * NumChannels;
-    size_t BytesToCopy = ElementByteSize * NumChannels * region.depth;
+    size_t BytesToCopy = ElementByteSize * NumChannels * region.width;
 
     auto ImgType = std::get<SurfaceMem>(hImage->Mem).getImageType();
 
-    size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.height};
+    size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.depth};
     size_t SrcOffset[3] = {ByteOffsetX, origin.y, origin.z};
 
     std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
@@ -1113,11 +1019,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
     int ElementByteSize = imageElementByteSize(Format);
 
     size_t ByteOffsetX = origin.x * ElementByteSize * NumChannels;
-    size_t BytesToCopy = ElementByteSize * NumChannels * region.depth;
+    size_t BytesToCopy = ElementByteSize * NumChannels * region.width;
 
     auto ImgType = std::get<SurfaceMem>(hImage->Mem).getImageType();
 
-    size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.height};
+    size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.depth};
     size_t DstOffset[3] = {ByteOffsetX, origin.y, origin.z};
 
     std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
@@ -1186,13 +1092,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
 
     int ElementByteSize = imageElementByteSize(SrcFormat);
 
-    size_t DstByteOffsetX = dstOrigin.x * ElementByteSize * SrcNumChannels;
-    size_t SrcByteOffsetX = srcOrigin.x * ElementByteSize * DstNumChannels;
-    size_t BytesToCopy = ElementByteSize * SrcNumChannels * region.depth;
+    size_t DstByteOffsetX = dstOrigin.x * ElementByteSize * DstNumChannels;
+    size_t SrcByteOffsetX = srcOrigin.x * ElementByteSize * SrcNumChannels;
+    size_t BytesToCopy = ElementByteSize * SrcNumChannels * region.width;
 
     auto ImgType = std::get<SurfaceMem>(hImageSrc->Mem).getImageType();
 
-    size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.width};
+    size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.depth};
     size_t SrcOffset[3] = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z};
     size_t DstOffset[3] = {DstByteOffsetX, dstOrigin.y, dstOrigin.z};
 
@@ -1299,7 +1205,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     if (!IsPinned &&
         (Map->getMapFlags() &
          (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) {
-      // Pinned host memory is only on host so it doesn't need to be written to.
+      // Pinned host memory is only on host so it doesn't need to be written
+      // to.
       UR_CHECK_ERROR(urEnqueueMemBufferWrite(
           hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(),
           pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent));
@@ -1475,10 +1382,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
 
     hipPointerAttribute_t attribs;
     // TODO: hipPointerGetAttributes will fail if pMem is non-HIP allocated
-    // memory, as it is neither registered as host memory, nor into the address
-    // space for the current device, meaning the pMem ptr points to a
-    // system-allocated memory. This means we may need to check system-alloacted
-    // memory and handle the failure more gracefully.
+    // memory, as it is neither registered as host memory, nor into the
+    // address space for the current device, meaning the pMem ptr points to a
+    // system-allocated memory. This means we may need to check
+    // system-allocated memory and handle the failure more gracefully.
     UR_CHECK_ERROR(hipPointerGetAttributes(&attribs, pMem));
     // async prefetch requires USM pointer (or hip SVM) to work.
     if (!attribs.isManaged) {
@@ -1507,8 +1414,9 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
   ur_device_handle_t Device = hQueue->getDevice();
 
 #if HIP_VERSION_MAJOR >= 5
-  // NOTE: The hipPointerGetAttribute API is marked as beta, meaning, while this
-  // is feature complete, it is still open to changes and outstanding issues.
+  // NOTE: The hipPointerGetAttribute API is marked as beta, meaning, while
+  // this is feature complete, it is still open to changes and outstanding
+  // issues.
   size_t PointerRangeSize = 0;
   UR_CHECK_ERROR(hipPointerGetAttribute(
       &PointerRangeSize, HIP_POINTER_ATTRIBUTE_RANGE_SIZE,
@@ -1548,9 +1456,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
     }
 
     // Passing MEM_ADVICE_SET/MEM_ADVICE_CLEAR_PREFERRED_LOCATION to
-    // hipMemAdvise on a GPU device requires the GPU device to report a non-zero
-    // value for hipDeviceAttributeConcurrentManagedAccess. Therefore, ignore
-    // the mem advice if concurrent managed memory access is not available.
+    // hipMemAdvise on a GPU device requires the GPU device to report a
+    // non-zero value for hipDeviceAttributeConcurrentManagedAccess.
+    // Therefore, ignore the mem advice if concurrent managed memory access is
+    // not available.
     if (advice & (UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION |
                   UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION |
                   UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE |
@@ -1585,9 +1494,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
 #endif
     } else {
       Result = setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID);
-      // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid but
-      // currently unmapped advice arguments as not supported by this platform.
-      // Therefore, warn the user instead of throwing and aborting the runtime.
+      // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid
+      // but currently unmapped advice arguments as not supported by this
+      // platform. Therefore, warn the user instead of throwing and aborting
+      // the runtime.
       if (Result == UR_RESULT_ERROR_INVALID_ENUMERATION) {
         releaseEvent();
         setErrorMessage("mem_advise is ignored as the advice argument is not "
@@ -1648,15 +1558,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
-    // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used, which
-    // makes the HIP runtime not correctly derive the copy kind (direction) for
-    // the copies since ROCm 5.6.0+. See: https://github.com/ROCm/clr/issues/40
+    // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used,
+    // which makes the HIP runtime not correctly derive the copy kind
+    // (direction) for the copies since ROCm 5.6.0+. See:
+    // https://github.com/ROCm/clr/issues/40
     // TODO: Add maximum HIP_VERSION when bug has been fixed.
 #if HIP_VERSION >= 50600000
     hipPointerAttribute_t srcAttribs{};
     hipPointerAttribute_t dstAttribs{};
 
-    // Determine if pSrc and/or pDst are system allocated pageable host memory.
+    // Determine if pSrc and/or pDst are system allocated pageable host
+    // memory.
     bool srcIsSystemAlloc{false};
     bool dstIsSystemAlloc{false};
 
@@ -1851,9 +1763,9 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
                     UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
           UR_ASSERT(LocalWorkSize[dim] <= MaxThreadsPerBlock[dim],
                     UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
-          // Checks that local work sizes are a divisor of the global work sizes
-          // which includes that the local work sizes are neither larger than
-          // the global work sizes and not 0.
+          // Checks that local work sizes are a divisor of the global work
+          // sizes which includes that the local work sizes are neither larger
+          // than the global work sizes and not 0.
           UR_ASSERT(LocalWorkSize != 0,
                     UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
           UR_ASSERT((GlobalWorkSize[dim] % LocalWorkSize[dim]) == 0,
diff --git a/source/adapters/hip/enqueue.hpp b/source/adapters/hip/enqueue.hpp
index a1f86b3678..eacac72a82 100644
--- a/source/adapters/hip/enqueue.hpp
+++ b/source/adapters/hip/enqueue.hpp
@@ -30,3 +30,7 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr,
                        const hipMemoryType DstType, ur_rect_offset_t DstOffset,
                        size_t DstRowPitch, size_t DstSlicePitch,
                        hipMemcpy3DParms &Params);
+
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        const size_t MaxThreadsPerBlock[3]);
diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp
index 5327c43a3b..4b99dd97d7 100644
--- a/source/adapters/hip/event.cpp
+++ b/source/adapters/hip/event.cpp
@@ -16,23 +16,13 @@
 ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type,
                                        ur_context_handle_t Context,
                                        ur_queue_handle_t Queue,
-                                       hipStream_t Stream, uint32_t StreamToken)
+                                       hipEvent_t EvEnd, hipEvent_t EvQueued,
+                                       hipEvent_t EvStart, hipStream_t Stream,
+                                       uint32_t StreamToken)
     : CommandType{Type}, RefCount{1}, HasOwnership{true},
       HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false},
-      StreamToken{StreamToken}, EventId{0}, EvEnd{nullptr}, EvStart{nullptr},
-      EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} {
-
-  bool ProfilingEnabled =
-      Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent();
-
-  UR_CHECK_ERROR(hipEventCreateWithFlags(
-      &EvEnd, ProfilingEnabled ? hipEventDefault : hipEventDisableTiming));
-
-  if (ProfilingEnabled) {
-    UR_CHECK_ERROR(hipEventCreateWithFlags(&EvQueued, hipEventDefault));
-    UR_CHECK_ERROR(hipEventCreateWithFlags(&EvStart, hipEventDefault));
-  }
-
+      StreamToken{StreamToken}, EventId{0}, EvEnd{EvEnd}, EvStart{EvStart},
+      EvQueued{EvQueued}, Queue{Queue}, Stream{Stream}, Context{Context} {
   urQueueRetain(Queue);
   urContextRetain(Context);
 }
@@ -60,9 +50,9 @@ ur_result_t ur_event_handle_t_::start() {
 
   try {
     if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
-      // NOTE: This relies on the default stream to be unused.
-      UR_CHECK_ERROR(hipEventRecord(EvQueued, 0));
-      UR_CHECK_ERROR(hipEventRecord(EvStart, Queue->get()));
+      UR_CHECK_ERROR(
+          hipEventRecord(EvQueued, Queue->getHostSubmitTimeStream()));
+      UR_CHECK_ERROR(hipEventRecord(EvStart, Stream));
     }
   } catch (ur_result_t Error) {
     Result = Error;
@@ -90,44 +80,18 @@ bool ur_event_handle_t_::isCompleted() const {
 }
 
 uint64_t ur_event_handle_t_::getQueuedTime() const {
-  float MilliSeconds = 0.0f;
   assert(isStarted());
-
-  // hipEventSynchronize waits till the event is ready for call to
-  // hipEventElapsedTime.
-  UR_CHECK_ERROR(hipEventSynchronize(EvStart));
-  UR_CHECK_ERROR(hipEventSynchronize(EvEnd));
-
-  UR_CHECK_ERROR(hipEventElapsedTime(&MilliSeconds, EvStart, EvEnd));
-  return static_cast<uint64_t>(MilliSeconds * 1.0e6);
+  return Queue->getDevice()->getElapsedTime(EvQueued);
 }
 
 uint64_t ur_event_handle_t_::getStartTime() const {
-  float MiliSeconds = 0.0f;
   assert(isStarted());
-
-  // hipEventSynchronize waits till the event is ready for call to
-  // hipEventElapsedTime.
-  UR_CHECK_ERROR(hipEventSynchronize(ur_platform_handle_t_::EvBase));
-  UR_CHECK_ERROR(hipEventSynchronize(EvStart));
-
-  UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds,
-                                     ur_platform_handle_t_::EvBase, EvStart));
-  return static_cast<uint64_t>(MiliSeconds * 1.0e6);
+  return Queue->getDevice()->getElapsedTime(EvStart);
 }
 
 uint64_t ur_event_handle_t_::getEndTime() const {
-  float MiliSeconds = 0.0f;
   assert(isStarted() && isRecorded());
-
-  // hipEventSynchronize waits till the event is ready for call to
-  // hipEventElapsedTime.
-  UR_CHECK_ERROR(hipEventSynchronize(ur_platform_handle_t_::EvBase));
-  UR_CHECK_ERROR(hipEventSynchronize(EvEnd));
-
-  UR_CHECK_ERROR(
-      hipEventElapsedTime(&MiliSeconds, ur_platform_handle_t_::EvBase, EvEnd));
-  return static_cast<uint64_t>(MiliSeconds * 1.0e6);
+  return Queue->getDevice()->getElapsedTime(EvEnd);
 }
 
 ur_result_t ur_event_handle_t_::record() {
@@ -327,8 +291,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle(
     ur_event_handle_t *phEvent) {
   std::ignore = pProperties;
 
-  *phEvent = ur_event_handle_t_::makeWithNative(
-      hContext, reinterpret_cast<hipEvent_t>(hNativeEvent));
+  std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
+
+  try {
+    EventPtr =
+        std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeWithNative(
+            hContext, reinterpret_cast<hipEvent_t>(hNativeEvent)));
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+
+  *phEvent = EventPtr.release();
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp
index 5e7c1d7e7d..af333c8613 100644
--- a/source/adapters/hip/event.hpp
+++ b/source/adapters/hip/event.hpp
@@ -80,8 +80,23 @@ struct ur_event_handle_t_ {
   static ur_event_handle_t
   makeNative(ur_command_t Type, ur_queue_handle_t Queue, hipStream_t Stream,
              uint32_t StreamToken = std::numeric_limits<uint32_t>::max()) {
-    return new ur_event_handle_t_(Type, Queue->getContext(), Queue, Stream,
-                                  StreamToken);
+    const bool RequiresTimings =
+        Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE ||
+        Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP;
+    if (RequiresTimings) {
+      Queue->createHostSubmitTimeStream();
+    }
+    native_type EvEnd{nullptr}, EvQueued{nullptr}, EvStart{nullptr};
+    UR_CHECK_ERROR(hipEventCreateWithFlags(
+        &EvEnd, RequiresTimings ? hipEventDefault : hipEventDisableTiming));
+
+    if (RequiresTimings) {
+      UR_CHECK_ERROR(hipEventCreateWithFlags(&EvQueued, hipEventDefault));
+      UR_CHECK_ERROR(hipEventCreateWithFlags(&EvStart, hipEventDefault));
+    }
+
+    return new ur_event_handle_t_(Type, Queue->getContext(), Queue, EvEnd,
+                                  EvQueued, EvStart, Stream, StreamToken);
   }
 
   static ur_event_handle_t makeWithNative(ur_context_handle_t context,
@@ -97,8 +112,9 @@ struct ur_event_handle_t_ {
   // This constructor is private to force programmers to use the makeNative /
   // make_user static members in order to create a ur_event_handle_t for HIP.
   ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context,
-                     ur_queue_handle_t Queue, hipStream_t Stream,
-                     uint32_t StreamToken);
+                     ur_queue_handle_t Queue, native_type EvEnd,
+                     native_type EvQueued, native_type EvStart,
+                     hipStream_t Stream, uint32_t StreamToken);
 
   // This constructor is private to force programmers to use the
   // makeWithNative for event interop
diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp
index f35d3957bc..b433c06852 100644
--- a/source/adapters/hip/kernel.cpp
+++ b/source/adapters/hip/kernel.cpp
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "kernel.hpp"
+#include "enqueue.hpp"
 #include "memory.hpp"
 #include "sampler.hpp"
 
@@ -349,3 +350,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
     [[maybe_unused]] const ur_specialization_constant_info_t *pSpecConstants) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
+
+/// Suggests a local work size for \p pGlobalWorkSize on the device of
+/// \p hQueue; writes \p workDim entries to \p pSuggestedLocalWorkSize.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    [[maybe_unused]] ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue,
+    uint32_t workDim, [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
+  UR_ASSERT(hQueue->getContext() == hKernel->getContext(),
+            UR_RESULT_ERROR_INVALID_QUEUE);
+  UR_ASSERT(workDim > 0 && workDim < 4,
+            UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  // pGlobalWorkSize is forwarded to guessLocalWorkSize, so reject null too.
+  UR_ASSERT(pGlobalWorkSize != nullptr && pSuggestedLocalWorkSize != nullptr,
+            UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+  ur_device_handle_t Device = hQueue->getDevice();
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; // initial value for the helper
+  size_t MaxThreadsPerBlock[3];
+  MaxThreadsPerBlock[0] = Device->getMaxBlockDimX();
+  MaxThreadsPerBlock[1] = Device->getMaxBlockDimY();
+  MaxThreadsPerBlock[2] = Device->getMaxBlockDimZ();
+  ScopedContext Active(Device);
+  guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim,
+                     MaxThreadsPerBlock);
+  std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim,
+            pSuggestedLocalWorkSize);
+  return UR_RESULT_SUCCESS;
+}
diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp
index ff209884ce..eb91f1620a 100644
--- a/source/adapters/hip/memory.cpp
+++ b/source/adapters/hip/memory.cpp
@@ -10,6 +10,7 @@
 
 #include "memory.hpp"
 #include "context.hpp"
+#include "enqueue.hpp"
 #include <cassert>
 #include <ur_util.hpp>
 
@@ -32,6 +33,28 @@ size_t imageElementByteSize(hipArray_Format ArrayFormat) {
   return 0;
 }
 
+/// Succeeds only for the channel types listed; others are unsupported.
+ur_result_t
+checkSupportedImageChannelType(ur_image_channel_type_t ImageChannelType) {
+  switch (ImageChannelType) {
+  case UR_IMAGE_CHANNEL_TYPE_SNORM_INT8:
+  case UR_IMAGE_CHANNEL_TYPE_SNORM_INT16:
+  case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8:
+  case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
+  case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
+  case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16:
+  case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
+  case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
+  case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
+  case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
+  case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
+  case UR_IMAGE_CHANNEL_TYPE_FLOAT:
+    return UR_RESULT_SUCCESS;
+  default:
+    return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT;
+  }
+}
+
 /// Decreases the reference count of the Mem object.
 /// If this is zero, calls the relevant HIP Free function
 /// \return UR_RESULT_SUCCESS unless deallocation error
@@ -339,7 +362,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
 
   UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC,
             UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
-  UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER,
+  UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_ARRAY,
             UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
   UR_ASSERT(pImageDesc->numMipLevel == 0,
             UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
@@ -355,7 +378,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
   // We only support RBGA channel order
   // TODO: check SYCL CTS and spec. May also have to support BGRA
   UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA,
-            UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
+            UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT);
+
+  UR_CHECK_ERROR(checkSupportedImageChannelType(pImageFormat->channelType));
 
   auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(
       new ur_mem_handle_t_{hContext, flags, *pImageFormat, *pImageDesc, pHost});
@@ -366,7 +391,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
 
   if (PerformInitialCopy) {
     for (const auto &Dev : hContext->getDevices()) {
-      UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev));
+      ScopedContext Active(Dev);
+      hipStream_t Stream{0}; // Use default stream
+      UR_CHECK_ERROR(
+          enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev, Stream));
+      UR_CHECK_ERROR(hipStreamSynchronize(Stream));
     }
   }
   *phMem = URMemObj.release();
@@ -455,11 +484,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) {
 ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
                                            const ur_device_handle_t hDevice) {
   ScopedContext Active(hDevice);
+  auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice);
   ur_lock LockGuard(Mem->MemoryAllocationMutex);
 
   if (Mem->isBuffer()) {
     auto &Buffer = std::get<BufferMem>(Mem->Mem);
-    hipDeviceptr_t &DevPtr = Buffer.Ptrs[hDevice->getIndex()];
+    hipDeviceptr_t &DevPtr = Buffer.Ptrs[DeviceIdx];
 
     // Allocation has already been made
     if (DevPtr != BufferMem::native_type{0}) {
@@ -482,12 +512,12 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
     try {
       auto &Image = std::get<SurfaceMem>(Mem->Mem);
       // Allocation has already been made
-      if (Image.Arrays[hDevice->getIndex()]) {
+      if (Image.Arrays[DeviceIdx]) {
         return UR_RESULT_SUCCESS;
       }
       UR_CHECK_ERROR(hipArray3DCreate(
           reinterpret_cast<hipCUarray *>(&ImageArray), &Image.ArrayDesc));
-      Image.Arrays[hDevice->getIndex()] = ImageArray;
+      Image.Arrays[DeviceIdx] = ImageArray;
       // HIP_RESOURCE_DESC is a union of different structs, shown here
       // We need to fill it as described here to use it for a surface or texture
       // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and
@@ -499,7 +529,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
       ImageResDesc.resType = hipResourceTypeArray;
 
       UR_CHECK_ERROR(hipCreateSurfaceObject(&Surface, &ImageResDesc));
-      Image.SurfObjs[hDevice->getIndex()] = Surface;
+      Image.SurfObjs[DeviceIdx] = Surface;
     } catch (ur_result_t Err) {
       if (ImageArray) {
         UR_CHECK_ERROR(hipFreeArray(ImageArray));
@@ -516,27 +546,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
 }
 
 namespace {
-inline ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
-                                         ur_device_handle_t hDevice) {
+inline ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem,
+                                                ur_device_handle_t hDevice,
+                                                hipStream_t Stream) {
   auto &Buffer = std::get<BufferMem>(Mem->Mem);
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     // Device allocation being initialized from host for the first time
     if (Buffer.HostPtr) {
-      UR_CHECK_ERROR(
-          hipMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+      UR_CHECK_ERROR(hipMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr,
+                                        Buffer.Size, Stream));
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
-    UR_CHECK_ERROR(hipMemcpyDtoD(
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
+    UR_CHECK_ERROR(hipMemcpyDtoDAsync(
         Buffer.getPtr(hDevice),
-        Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
-        Buffer.Size));
+        Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size,
+        Stream));
   }
   return UR_RESULT_SUCCESS;
 }
 
-inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
-                                        ur_device_handle_t hDevice) {
+inline ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem,
+                                               ur_device_handle_t hDevice,
+                                               hipStream_t Stream) {
   auto &Image = std::get<SurfaceMem>(Mem->Mem);
   // When a dimension isn't used image_desc has the size set to 1
   size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
@@ -567,36 +598,40 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
     CpyDesc3D.Depth = Image.ImageDesc.depth;
   }
 
-  if (Mem->LastEventWritingToMemObj == nullptr) {
+  if (Mem->LastQueueWritingToMemObj == nullptr) {
     if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      UR_CHECK_ERROR(hipStreamSynchronize(Stream));
       UR_CHECK_ERROR(
           hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
       CpyDesc2D.srcHost = Image.HostPtr;
-      UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D));
+      UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc2D, Stream));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
       CpyDesc3D.srcHost = Image.HostPtr;
+      CpyDesc3D.srcMemoryType = hipMemoryTypeHost;
       UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D));
     }
-  } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
-             hDevice) {
+  } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) {
     if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      // Blocking wait needed
+      UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj));
       // FIXME: 1D memcpy from DtoD going through the host.
       UR_CHECK_ERROR(hipMemcpyAtoH(
           Image.HostPtr,
-          Image.getArray(
-              Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()),
           0 /*srcOffset*/, ImageSizeBytes));
       UR_CHECK_ERROR(
           hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
-      CpyDesc2D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D));
+      CpyDesc2D.srcMemoryType = hipMemoryTypeDevice;
+      CpyDesc2D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc2D, Stream));
     } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
-      CpyDesc3D.srcArray = Image.getArray(
-          Mem->LastEventWritingToMemObj->getQueue()->getDevice());
-      UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D));
+      CpyDesc3D.srcMemoryType = hipMemoryTypeDevice;
+      CpyDesc3D.srcArray =
+          Image.getArray(Mem->LastQueueWritingToMemObj->getDevice());
+      UR_CHECK_ERROR(hipDrvMemcpy3DAsync(&CpyDesc3D, Stream));
     }
   }
   return UR_RESULT_SUCCESS;
@@ -605,22 +640,50 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
 
 // If calling this entry point it is necessary to lock the memoryMigrationMutex
 // beforehand
-ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
-                                          const ur_device_handle_t hDevice) {
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(
+    ur_mem_handle_t Mem, const ur_device_handle_t hDevice, hipStream_t Stream) {
   UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice);
   // Device allocation has already been initialized with most up to date
   // data in buffer
-  if (Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()]) {
+  if (Mem->HaveMigratedToDeviceSinceLastWrite[DeviceIdx])
     return UR_RESULT_SUCCESS;
-  }
 
   ScopedContext Active(hDevice);
   if (Mem->isBuffer()) {
-    UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream));
   } else {
-    UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+    UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream));
   }
 
-  Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()] = true;
+  Mem->HaveMigratedToDeviceSinceLastWrite[DeviceIdx] = true;
   return UR_RESULT_SUCCESS;
 }
+
+// Returns the allocation for Device advanced by Offset bytes, allocating it
+// first if needed; throws the ur_result_t error when allocation fails.
+BufferMem::native_type
+BufferMem::getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
+  const auto Res = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+  if (Res != UR_RESULT_SUCCESS)
+    throw Res;
+  const auto DeviceIdx = OuterMemStruct->getContext()->getDeviceIndex(Device);
+  auto *Base = reinterpret_cast<uint8_t *>(Ptrs[DeviceIdx]);
+  return reinterpret_cast<native_type>(Base + Offset);
+}
+
+// Will allocate a new array on Device if needed; throws on allocation failure
+hipArray *SurfaceMem::getArray(const ur_device_handle_t Device) {
+  const auto Res = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+  if (Res != UR_RESULT_SUCCESS)
+    throw Res;
+  return Arrays[OuterMemStruct->getContext()->getDeviceIndex(Device)];
+}
+
+// Will allocate a new surface on Device if needed; throws on allocation failure
+hipSurfaceObject_t SurfaceMem::getSurface(const ur_device_handle_t Device) {
+  const auto Res = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+  if (Res != UR_RESULT_SUCCESS)
+    throw Res;
+  return SurfObjs[OuterMemStruct->getContext()->getDeviceIndex(Device)];
+}
diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp
index 5d2aa6f9a5..425c2e7f53 100644
--- a/source/adapters/hip/memory.hpp
+++ b/source/adapters/hip/memory.hpp
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 #pragma once
 
+#include "common.hpp"
 #include "context.hpp"
 #include "event.hpp"
 #include <cassert>
@@ -16,12 +17,11 @@
 #include <unordered_map>
 #include <variant>
 
-#include "common.hpp"
-
 ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
                                            const ur_device_handle_t);
-ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
-                                          const ur_device_handle_t);
+ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                                 const ur_device_handle_t,
+                                                 hipStream_t);
 
 // Handler for plain, pointer-based HIP allocations
 struct BufferMem {
@@ -95,15 +95,7 @@ struct BufferMem {
 
   // This will allocate memory on device with index Index if there isn't already
   // an active allocation on the device
-  native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
-    if (ur_result_t Err =
-            allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
-        Err != UR_RESULT_SUCCESS) {
-      throw Err;
-    }
-    return reinterpret_cast<native_type>(
-        reinterpret_cast<uint8_t *>(Ptrs[Device->getIndex()]) + Offset);
-  }
+  native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset);
 
   // This will allocate memory on device if there isn't already an active
   // allocation on the device
@@ -224,6 +216,7 @@ struct SurfaceMem {
       ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT8;
       PixelTypeSizeBytes = 1;
       break;
+    case UR_IMAGE_CHANNEL_TYPE_SNORM_INT8:
     case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
       ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT8;
       PixelTypeSizeBytes = 1;
@@ -233,6 +226,7 @@ struct SurfaceMem {
       ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT16;
       PixelTypeSizeBytes = 2;
       break;
+    case UR_IMAGE_CHANNEL_TYPE_SNORM_INT16:
     case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
       ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT16;
       PixelTypeSizeBytes = 2;
@@ -260,24 +254,10 @@ struct SurfaceMem {
   }
 
   // Will allocate a new array on device if not already allocated
-  hipArray *getArray(const ur_device_handle_t Device) {
-    if (ur_result_t Err =
-            allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
-        Err != UR_RESULT_SUCCESS) {
-      throw Err;
-    }
-    return Arrays[Device->getIndex()];
-  }
+  hipArray *getArray(const ur_device_handle_t Device);
 
   // Will allocate a new surface on device if not already allocated
-  hipSurfaceObject_t getSurface(const ur_device_handle_t Device) {
-    if (ur_result_t Err =
-            allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
-        Err != UR_RESULT_SUCCESS) {
-      throw Err;
-    }
-    return SurfObjs[Device->getIndex()];
-  }
+  hipSurfaceObject_t getSurface(const ur_device_handle_t Device);
 
   ur_mem_type_t getImageType() const noexcept { return ImageDesc.type; }
 
@@ -306,7 +286,7 @@ struct SurfaceMem {
 ///
 /// The ur_mem_handle_t is responsible for memory allocation and migration
 /// across devices in the same ur_context_handle_t. If a kernel writes to a
-/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all
+/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all
 /// subsequent operations that want to read from the ur_mem_handle_t must wait
 /// on the event referring to the last write.
 ///
@@ -325,61 +305,7 @@ struct SurfaceMem {
 ///   2. urEnqueueMem(Buffer|Image)Read(Rect)
 ///
 /// Migrations will occur in both cases if the most recent version of data
-/// is on a different device, marked by LastEventWritingToMemObj->getDevice().
-///
-/// Example trace:
-/// ~~~~~~~~~~~~~~
-///
-/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1]
-///             -> OUT: hContext
-///
-/// =====> urMemBufferCreate(hContext,...);
-///             -> No native allocations made
-///             -> OUT: hBuffer
-///
-/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...);
-///             -> Allocation made on q0 ie device0
-///             -> New allocation initialized with host data.
-///
-/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...);
-///             -> ur_kernel_handle_t associated with a ur_program_handle_t,
-///                which is in turn unique to a device. So we can set the kernel
-///                arg with the ptr of the device specific allocation.
-///             -> hKernel0->getProgram()->getDevice() == device0
-///             -> allocateMemObjOnDeviceIfNeeded(device0);
-///                   -> Native allocation already made on device0, continue.
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///             -> Suppose that hKernel0 writes to hBuffer.
-///             -> Call hBuffer->setLastEventWritingToMemObj with return event
-///                from this operation
-///             -> Enqueue native kernel launch
-///
-/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...);
-///             -> hKernel1->getProgram()->getDevice() == device1
-///             -> New allocation will be made on device1 when calling
-///                getPtr(device1)
-///                   -> No native allocation on device1
-///                   -> Make native allocation on device1
-///
-/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...);
-///             -> Suppose hKernel1 wants to read from hBuffer and not write.
-///             -> migrateMemoryToDeviceIfNeeded(device1);
-///                   -> hBuffer->LastEventWritingToMemObj is not nullptr
-///                   -> Check if memory has been migrated to device1 since the
-///                      last write
-///                        -> Hasn't been migrated
-///                   -> Wait on LastEventWritingToMemObj.
-///                   -> Migrate memory from device0's native allocation to
-///                      device1's native allocation.
-///             -> Enqueue native kernel launch
-///
-/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
-///             -> migrateMemoryToDeviceIfNeeded(device0);
-///                   -> hBuffer->LastEventWritingToMemObj refers to an event
-///                      from q0
-///                        -> Migration not necessary
-///             -> Enqueue native kernel launch
+/// is on a different device, marked by LastQueueWritingToMemObj->getDevice().
 ///
 struct ur_mem_handle_t_ {
 
@@ -403,15 +329,13 @@ struct ur_mem_handle_t_ {
   // Has the memory been migrated to a device since the last write?
   std::vector<bool> HaveMigratedToDeviceSinceLastWrite;
 
-  // We should wait on this event prior to migrating memory across allocations
-  // in this ur_mem_handle_t_
-  ur_event_handle_t LastEventWritingToMemObj{nullptr};
+  // Queue with most up to date data of ur_mem_handle_t_
+  ur_queue_handle_t LastQueueWritingToMemObj{nullptr};
 
   // Enumerates all possible types of accesses.
   enum access_mode_t { unknown, read_write, read_only, write_only };
 
   ur_mutex MemoryAllocationMutex; // A mutex for allocations
-  ur_mutex MemoryMigrationMutex;  // A mutex for memory transfers
 
   /// A UR Memory object represents either plain memory allocations ("Buffers"
   /// in OpenCL) or typed allocations ("Images" in OpenCL).
@@ -500,18 +424,18 @@ struct ur_mem_handle_t_ {
 
   uint32_t getReferenceCount() const noexcept { return RefCount; }
 
-  void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) {
-    assert(NewEvent && "Invalid event!");
-    // This entry point should only ever be called when using multi device ctx
-    assert(Context->Devices.size() > 1);
-    if (LastEventWritingToMemObj != nullptr) {
-      urEventRelease(LastEventWritingToMemObj);
+  void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) {
+    if (LastQueueWritingToMemObj != nullptr) {
+      urQueueRelease(LastQueueWritingToMemObj);
     }
-    urEventRetain(NewEvent);
-    LastEventWritingToMemObj = NewEvent;
+    urQueueRetain(WritingQueue);
+    LastQueueWritingToMemObj = WritingQueue;
     for (const auto &Device : Context->getDevices()) {
-      HaveMigratedToDeviceSinceLastWrite[Device->getIndex()] =
-          Device == NewEvent->getQueue()->getDevice();
+      HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] =
+          Device == WritingQueue->getDevice();
     }
   }
 };
+
+ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                          const ur_device_handle_t);
diff --git a/source/adapters/hip/platform.cpp b/source/adapters/hip/platform.cpp
index 287f941c30..8671d70a57 100644
--- a/source/adapters/hip/platform.cpp
+++ b/source/adapters/hip/platform.cpp
@@ -11,8 +11,6 @@
 #include "platform.hpp"
 #include "context.hpp"
 
-hipEvent_t ur_platform_handle_t_::EvBase{nullptr};
-
 UR_APIEXPORT ur_result_t UR_APICALL
 urPlatformGetInfo(ur_platform_handle_t, ur_platform_info_t propName,
                   size_t propSize, void *pPropValue, size_t *pSizeRet) {
@@ -81,18 +79,15 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
               UR_CHECK_ERROR(hipDeviceGet(&Device, i));
               hipCtx_t Context;
               UR_CHECK_ERROR(hipDevicePrimaryCtxRetain(&Context, Device));
-              Platform.Devices.emplace_back(
-                  new ur_device_handle_t_{Device, Context, &Platform, i});
-            }
-
-            // Setup EvBase
-            {
-              ScopedContext Active(Platform.Devices.front().get());
               hipEvent_t EvBase;
               UR_CHECK_ERROR(hipEventCreate(&EvBase));
+
+              // Use the default stream to record base event counter
               UR_CHECK_ERROR(hipEventRecord(EvBase, 0));
+              Platform.Devices.emplace_back(new ur_device_handle_t_{
+                  Device, Context, EvBase, &Platform, i});
 
-              ur_platform_handle_t_::EvBase = EvBase;
+              ScopedContext Active(Platform.Devices.front().get());
             }
           } catch (const std::bad_alloc &) {
             // Signal out-of-memory situation
@@ -140,12 +135,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle(
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
-    ur_native_handle_t hNativePlatform,
-    const ur_platform_native_properties_t *pProperties,
-    ur_platform_handle_t *phPlatform) {
-  std::ignore = hNativePlatform;
-  std::ignore = pProperties;
-  std::ignore = phPlatform;
+    ur_native_handle_t, ur_adapter_handle_t,
+    const ur_platform_native_properties_t *, ur_platform_handle_t *) {
+  // There is no HIP equivalent to ur_platform_handle_t
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
diff --git a/source/adapters/hip/platform.hpp b/source/adapters/hip/platform.hpp
index f925692ec8..7b96de6473 100644
--- a/source/adapters/hip/platform.hpp
+++ b/source/adapters/hip/platform.hpp
@@ -20,6 +20,5 @@
 ///  when devices are used.
 ///
 struct ur_platform_handle_t_ {
-  static hipEvent_t EvBase; // HIP event used as base counter
   std::vector<std::unique_ptr<ur_device_handle_t_>> Devices;
 };
diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp
index 8e3653ee02..f7cd6eebf1 100644
--- a/source/adapters/hip/program.cpp
+++ b/source/adapters/hip/program.cpp
@@ -283,7 +283,11 @@ urProgramCreateWithIL(ur_context_handle_t, const void *, size_t,
 UR_APIEXPORT ur_result_t UR_APICALL
 urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
                  const char *pOptions) {
-  return urProgramBuild(hContext, hProgram, pOptions);
+  UR_CHECK_ERROR(urProgramBuild(hContext, hProgram, pOptions));
+  // urProgramBuild sets the BinaryType to UR_PROGRAM_BINARY_TYPE_EXECUTABLE, so
+  // set it to the correct value for urProgramCompile post-hoc.
+  hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urProgramCompileExp(ur_program_handle_t,
@@ -312,6 +316,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t,
     ScopedContext Active(hProgram->getDevice());
 
     hProgram->buildProgram(pOptions);
+    hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
 
   } catch (ur_result_t Err) {
     Result = Err;
@@ -355,13 +360,14 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t,
   UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
 
   switch (propName) {
-  case UR_PROGRAM_BUILD_INFO_STATUS: {
+  case UR_PROGRAM_BUILD_INFO_STATUS:
     return ReturnValue(hProgram->BuildStatus);
-  }
   case UR_PROGRAM_BUILD_INFO_OPTIONS:
     return ReturnValue(hProgram->BuildOptions.c_str());
   case UR_PROGRAM_BUILD_INFO_LOG:
     return ReturnValue(hProgram->InfoLog, hProgram->MAX_LOG_SIZE);
+  case UR_PROGRAM_BUILD_INFO_BINARY_TYPE:
+    return ReturnValue(hProgram->BinaryType);
   default:
     break;
   }
@@ -494,6 +500,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
   UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
 
   *phProgram = RetProgram.release();
+  (*phProgram)->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
 
   return Result;
 }
diff --git a/source/adapters/hip/program.hpp b/source/adapters/hip/program.hpp
index cc5da40af5..25a70e6a7a 100644
--- a/source/adapters/hip/program.hpp
+++ b/source/adapters/hip/program.hpp
@@ -27,6 +27,12 @@ struct ur_program_handle_t_ {
   ur_device_handle_t Device;
   std::string ExecutableCache;
 
+  // The ur_program_binary_type_t property is defined individually for every
+  // device in a program. However, since the HIP adapter only has 1 device per
+  // program, there is no need to keep track of its value for each
+  // device.
+  ur_program_binary_type_t BinaryType = UR_PROGRAM_BINARY_TYPE_NONE;
+
   // Metadata
   bool IsRelocatable = false;
 
diff --git a/source/adapters/hip/queue.cpp b/source/adapters/hip/queue.cpp
index 6e6496fec1..4f7b4060cb 100644
--- a/source/adapters/hip/queue.cpp
+++ b/source/adapters/hip/queue.cpp
@@ -117,12 +117,17 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
   try {
     std::unique_ptr<ur_queue_handle_t_> QueueImpl{nullptr};
 
-    unsigned int Flags = 0;
+    unsigned int Flags = hipStreamNonBlocking;
     ur_queue_flags_t URFlags = 0;
     int Priority = 0; // Not guaranteed, but, in ROCm 5.0-6.0, 0 is the default
-
     if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) {
       URFlags = pProps->flags;
+      // These are bit flags, so test with & (they may be OR-ed with others).
+      if (URFlags & UR_QUEUE_FLAG_USE_DEFAULT_STREAM)
+        Flags = hipStreamDefault;
+      else if (URFlags & UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM)
+        Flags = 0;
+
       if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) {
         ScopedContext Active(hDevice);
         UR_CHECK_ERROR(hipDeviceGetStreamPriorityRange(nullptr, &Priority));
@@ -143,7 +148,7 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
 
     QueueImpl = std::unique_ptr<ur_queue_handle_t_>(new ur_queue_handle_t_{
         std::move(ComputeHipStreams), std::move(TransferHipStreams), hContext,
-        hDevice, Flags, pProps ? pProps->flags : 0, Priority});
+        hDevice, Flags, URFlags, Priority});
 
     *phQueue = QueueImpl.release();
 
@@ -186,10 +191,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
     });
     return ReturnValue(IsReady);
   }
+  case UR_QUEUE_INFO_DEVICE_DEFAULT:
+  case UR_QUEUE_INFO_SIZE:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   default:
-    break;
+    return UR_RESULT_ERROR_INVALID_ENUMERATION;
   }
-  return {};
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) {
@@ -217,6 +224,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
       UR_CHECK_ERROR(hipStreamDestroy(S));
     });
 
+    if (hQueue->getHostSubmitTimeStream() != hipStream_t{0}) {
+      UR_CHECK_ERROR(hipStreamSynchronize(hQueue->getHostSubmitTimeStream()));
+      UR_CHECK_ERROR(hipStreamDestroy(hQueue->getHostSubmitTimeStream()));
+    }
+
     return UR_RESULT_SUCCESS;
   } catch (ur_result_t Err) {
     return Err;
diff --git a/source/adapters/hip/queue.hpp b/source/adapters/hip/queue.hpp
index ad2f0f016e..cfabd29bf7 100644
--- a/source/adapters/hip/queue.hpp
+++ b/source/adapters/hip/queue.hpp
@@ -10,6 +10,9 @@
 #pragma once
 
 #include "common.hpp"
+#include <hip/hip_runtime.h>
+#include <mutex>
+#include <vector>
 
 using ur_stream_quard = std::unique_lock<std::mutex>;
 
@@ -22,6 +25,10 @@ struct ur_queue_handle_t_ {
 
   std::vector<native_type> ComputeStreams;
   std::vector<native_type> TransferStreams;
+  // Stream used for recording EvQueue, which holds information about when the
+  // command in question is enqueued on host, as opposed to started. It is
+  // created only if profiling is enabled - either for queue or per event.
+  native_type HostSubmitTimeStream{0};
   // DelayCompute keeps track of which streams have been recently reused and
   // their next use should be delayed. If a stream has been recently reused it
   // will be skipped the next time it would be selected round-robin style. When
@@ -95,6 +102,17 @@ struct ur_queue_handle_t_ {
   native_type getNextTransferStream();
   native_type get() { return getNextComputeStream(); };
 
+  // Creates the profiling stream; called only from makeNative event when
+  // profiling is required. The flag is per-queue so each queue gets a stream.
+  std::once_flag HostSubmitTimeStreamFlag;
+  void createHostSubmitTimeStream() {
+    std::call_once(HostSubmitTimeStreamFlag, [&]() {
+      UR_CHECK_ERROR(hipStreamCreateWithFlags(&HostSubmitTimeStream,
+                                              hipStreamNonBlocking));
+    });
+  }
+  native_type getHostSubmitTimeStream() { return HostSubmitTimeStream; }
+
   bool hasBeenSynchronized(uint32_t StreamToken) {
     // stream token not associated with one of the compute streams
     if (StreamToken == std::numeric_limits<uint32_t>::max()) {
diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp
index 71979b75b1..7a28623e0b 100644
--- a/source/adapters/hip/ur_interface_loader.cpp
+++ b/source/adapters/hip/ur_interface_loader.cpp
@@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgValue = urKernelSetArgValue;
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp
index 7c4f43c4ac..275125b2ac 100644
--- a/source/adapters/hip/usm.cpp
+++ b/source/adapters/hip/usm.cpp
@@ -161,41 +161,21 @@ UR_APIEXPORT ur_result_t UR_APICALL
 urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
                      ur_usm_alloc_info_t propName, size_t propValueSize,
                      void *pPropValue, size_t *pPropValueSizeRet) {
-  ur_result_t Result = UR_RESULT_SUCCESS;
-  hipPointerAttribute_t hipPointerAttributeType;
-
   UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet);
 
   try {
     switch (propName) {
     case UR_USM_ALLOC_INFO_TYPE: {
-      // do not throw if hipPointerGetAttribute returns hipErrorInvalidValue
-      hipError_t Ret = hipPointerGetAttributes(&hipPointerAttributeType, pMem);
-      if (Ret == hipErrorInvalidValue) {
-        // pointer not known to the HIP subsystem
-        return ReturnValue(UR_USM_TYPE_UNKNOWN);
-      }
-      // Direct usage of the function, instead of UR_CHECK_ERROR, so we can
-      // get the line offset.
-      checkErrorUR(Ret, __func__, __LINE__ - 5, __FILE__);
-      // ROCm 6.0.0 introduces hipMemoryTypeUnregistered in the hipMemoryType
-      // enum to mark unregistered allocations (i.e., via system allocators).
-#if HIP_VERSION_MAJOR >= 6
-      if (hipPointerAttributeType.type == hipMemoryTypeUnregistered) {
+      auto MaybePointerAttrs = getPointerAttributes(pMem);
+      if (!MaybePointerAttrs.has_value()) {
         // pointer not known to the HIP subsystem
         return ReturnValue(UR_USM_TYPE_UNKNOWN);
       }
-#endif
-      unsigned int Value;
-#if HIP_VERSION >= 50600000
-      Value = hipPointerAttributeType.type;
-#else
-      Value = hipPointerAttributeType.memoryType;
-#endif
+      auto Value = getMemoryType(*MaybePointerAttrs);
       UR_ASSERT(Value == hipMemoryTypeDevice || Value == hipMemoryTypeHost ||
                     Value == hipMemoryTypeManaged,
                 UR_RESULT_ERROR_INVALID_MEM_OBJECT);
-      if (hipPointerAttributeType.isManaged || Value == hipMemoryTypeManaged) {
+      if (MaybePointerAttrs->isManaged || Value == hipMemoryTypeManaged) {
         // pointer to managed memory
         return ReturnValue(UR_USM_TYPE_SHARED);
       }
@@ -211,21 +191,21 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
       ur::unreachable();
     }
     case UR_USM_ALLOC_INFO_DEVICE: {
-      // get device index associated with this pointer
-      UR_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, pMem));
+      auto MaybePointerAttrs = getPointerAttributes(pMem);
+      if (!MaybePointerAttrs.has_value()) {
+        // pointer not known to the HIP subsystem, so it has no owning device
+        return UR_RESULT_ERROR_INVALID_VALUE;
+      }
 
-      int DeviceIdx = hipPointerAttributeType.device;
+      int DeviceIdx = MaybePointerAttrs->device;
 
-      // currently each device is in its own platform, so find the platform at
-      // the same index
-      std::vector<ur_platform_handle_t> Platforms;
-      Platforms.resize(DeviceIdx + 1);
+      // hip backend has only one platform containing all devices
+      ur_platform_handle_t platform;
       ur_adapter_handle_t AdapterHandle = &adapter;
-      Result = urPlatformGet(&AdapterHandle, 1, DeviceIdx + 1, Platforms.data(),
-                             nullptr);
+      UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, 1, &platform, nullptr));
 
       // get the device from the platform
-      ur_device_handle_t Device = Platforms[DeviceIdx]->Devices[0].get();
+      ur_device_handle_t Device = platform->Devices[DeviceIdx].get();
       return ReturnValue(Device);
     }
     case UR_USM_ALLOC_INFO_POOL: {
@@ -240,15 +220,31 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
       return ReturnValue(Pool);
     }
     case UR_USM_ALLOC_INFO_BASE_PTR:
-    case UR_USM_ALLOC_INFO_SIZE:
-      return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+      // HIP gives us the ability to query the base pointer for a device
+      // pointer, so check whether we've got one of those.
+      if (auto MaybePointerAttrs = getPointerAttributes(pMem)) {
+        if (getMemoryType(*MaybePointerAttrs) == hipMemoryTypeDevice) {
+          void *Base = nullptr;
+          UR_CHECK_ERROR(hipPointerGetAttribute(
+              &Base, HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+              (hipDeviceptr_t)pMem));
+          return ReturnValue(Base);
+        }
+      }
+      // If not, we can't be sure.
+      return UR_RESULT_ERROR_INVALID_VALUE;
+    case UR_USM_ALLOC_INFO_SIZE: {
+      size_t RangeSize = 0;
+      UR_CHECK_ERROR(hipMemPtrGetInfo(const_cast<void *>(pMem), &RangeSize));
+      return ReturnValue(RangeSize);
+    }
     default:
       return UR_RESULT_ERROR_INVALID_ENUMERATION;
     }
   } catch (ur_result_t Error) {
-    Result = Error;
+    return Error;
   }
-  return Result;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urUSMImportExp(ur_context_handle_t Context,
diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt
index d26d0aeb26..41f8ce8d12 100644
--- a/source/adapters/level_zero/CMakeLists.txt
+++ b/source/adapters/level_zero/CMakeLists.txt
@@ -5,22 +5,27 @@
 
 
 set(TARGET_NAME ur_adapter_level_zero)
+set(UR_LEVEL_ZERO_LOADER_LIBRARY "" CACHE FILEPATH "Path of the Level Zero Loader library")
+set(UR_LEVEL_ZERO_INCLUDE_DIR "" CACHE PATH "Directory containing the Level Zero Headers")
+set(UR_LEVEL_ZERO_LOADER_REPO "" CACHE STRING "Github repo to get the Level Zero loader sources from")
+set(UR_LEVEL_ZERO_LOADER_TAG "" CACHE STRING "GIT tag of the Level Zero Loader taken from github repo")
 
-# Copy L0 loader/headers locally to the build to avoid leaking their path.
+# Copy Level Zero loader/headers locally to the build to avoid leaking their path.
 set(LEVEL_ZERO_COPY_DIR ${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader)
-if (DEFINED L0_LIBRARY)
-  get_filename_component(LEVEL_ZERO_LIB_NAME "${L0_LIBRARY}" NAME)
+if (NOT UR_LEVEL_ZERO_LOADER_LIBRARY STREQUAL "")
+  get_filename_component(LEVEL_ZERO_LIB_NAME "${UR_LEVEL_ZERO_LOADER_LIBRARY}" NAME)
   set(LEVEL_ZERO_LIBRARY ${LEVEL_ZERO_COPY_DIR}/${LEVEL_ZERO_LIB_NAME})
-  message(STATUS "Copying Level Zero loader and headers to local build tree")
-  file(COPY ${L0_LIBRARY} DESTINATION ${LEVEL_ZERO_COPY_DIR} FOLLOW_SYMLINK_CHAIN)
+  message(STATUS "Level Zero Adapter: Copying Level Zero loader to local build tree")
+  file(COPY ${UR_LEVEL_ZERO_LOADER_LIBRARY} DESTINATION ${LEVEL_ZERO_COPY_DIR} FOLLOW_SYMLINK_CHAIN)
 endif()
-if (DEFINED L0_INCLUDE_DIR)
+if (NOT UR_LEVEL_ZERO_INCLUDE_DIR STREQUAL "")
   set(LEVEL_ZERO_INCLUDE_DIR ${LEVEL_ZERO_COPY_DIR}/level_zero)
-  file(COPY ${L0_INCLUDE_DIR}/level_zero DESTINATION ${LEVEL_ZERO_COPY_DIR})
+  message(STATUS "Level Zero Adapter: Copying Level Zero headers to local build tree")
+  file(COPY ${UR_LEVEL_ZERO_INCLUDE_DIR}/level_zero DESTINATION ${LEVEL_ZERO_COPY_DIR})
 endif()
 
 if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR)
-    message(STATUS "Download Level Zero loader and headers from github.com")
+    message(STATUS "Level Zero Adapter: Download Level Zero loader and headers from github.com")
 
     # Workaround warnings/errors for Level Zero build
     set(CMAKE_CXX_FLAGS_BAK "${CMAKE_CXX_FLAGS}")
@@ -33,19 +38,23 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option")
     endif()
 
-    set(LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git")
-    set(LEVEL_ZERO_LOADER_TAG v1.16.1)
+    if (UR_LEVEL_ZERO_LOADER_REPO STREQUAL "")
+        set(UR_LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git")
+    endif()
+    if (UR_LEVEL_ZERO_LOADER_TAG STREQUAL "")
+        set(UR_LEVEL_ZERO_LOADER_TAG v1.16.1)
+    endif()
 
     # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104
     set(CMAKE_INCLUDE_CURRENT_DIR OFF)
     # Prevent L0 loader from exporting extra symbols
     set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF)
 
-    message(STATUS "Will fetch Level Zero Loader from ${LEVEL_ZERO_LOADER_REPO}")
+    message(STATUS "Level Zero Adapter: Will fetch Level Zero Loader from ${UR_LEVEL_ZERO_LOADER_REPO}")
     include(FetchContent)
     FetchContent_Declare(level-zero-loader
-        GIT_REPOSITORY    ${LEVEL_ZERO_LOADER_REPO}
-        GIT_TAG           ${LEVEL_ZERO_LOADER_TAG}
+        GIT_REPOSITORY    ${UR_LEVEL_ZERO_LOADER_REPO}
+        GIT_TAG           ${UR_LEVEL_ZERO_LOADER_TAG}
     )
     if(MSVC)
         set(USE_Z7 ON)
diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index 67415a0de0..6d0d2aea02 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -15,6 +15,9 @@
 https://github.com/intel/llvm/blob/sycl/sycl/doc/design/CommandGraph.md#level-zero
 */
 
+// Print the name of a variable and its value in the L0 debug log
+#define DEBUG_LOG(VAR) logger::debug(#VAR " {}", VAR);
+
 namespace {
 /// Checks the version of the level-zero driver.
 /// @param Context Execution context
@@ -38,18 +41,31 @@ bool IsDriverVersionNewerOrSimilar(ur_context_handle_t Context,
           (DriverVersionMinor >= VersionMinor) &&
           (DriverVersionBuild >= VersionBuild));
 }
+
+// Default to using compute engine for fill operation, but allow to
+// override this with an environment variable.
+bool PreferCopyEngineForFill = [] {
+  const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL");
+  const char *PiRet =
+      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL");
+  return (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0));
+}();
+
 }; // namespace
 
 ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
     ur_context_handle_t Context, ur_device_handle_t Device,
     ze_command_list_handle_t CommandList,
     ze_command_list_handle_t CommandListResetEvents,
+    ze_command_list_handle_t CopyCommandList,
     ZeStruct<ze_command_list_desc_t> ZeDesc,
+    ZeStruct<ze_command_list_desc_t> ZeCopyDesc,
     const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList)
-    : Context(Context), Device(Device), ZeCommandList(CommandList),
+    : Context(Context), Device(Device), ZeComputeCommandList(CommandList),
       ZeCommandListResetEvents(CommandListResetEvents),
-      ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(),
-      SyncPoints(), NextSyncPoint(0),
+      ZeCommandListDesc(ZeDesc), ZeCopyCommandList(CopyCommandList),
+      ZeCopyCommandListDesc(ZeCopyDesc), ZeFencesMap(), ZeActiveFence(nullptr),
+      QueueProperties(), SyncPoints(), NextSyncPoint(0),
       IsUpdatable(Desc ? Desc->isUpdatable : false),
       IsProfilingEnabled(Desc ? Desc->enableProfiling : false),
       IsInOrderCmdList(IsInOrderCmdList) {
@@ -68,8 +84,11 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
 
   // Release the memory allocated to the CommandList stored in the
   // command_buffer
-  if (ZeCommandList) {
-    ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
+  if (ZeComputeCommandList) {
+    ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeComputeCommandList));
+  }
+  if (UseCopyEngine() && ZeCopyCommandList) {
+    ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCopyCommandList));
   }
 
   // Release the memory allocated to the CommandListResetEvents stored in the
@@ -99,8 +118,9 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
     urEventReleaseInternal(Event);
   }
 
-  // Release Fences allocated to command_buffer
-  for (auto &ZeFence : ZeFencesList) {
+  // Release fences allocated to command-buffer
+  for (auto &ZeFencePair : ZeFencesMap) {
+    auto &ZeFence = ZeFencePair.second;
     ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
   }
 
@@ -300,13 +320,14 @@ static ur_result_t getEventsFromSyncPoints(
 // buffer.
 static ur_result_t enqueueCommandBufferMemCopyHelper(
     ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer,
-    void *Dst, const void *Src, size_t Size, uint32_t NumSyncPointsInWaitList,
+    void *Dst, const void *Src, size_t Size, bool PreferCopyEngine,
+    uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
-    ur_exp_command_buffer_sync_point_t *SyncPoint) {
+    ur_exp_command_buffer_sync_point_t *RetSyncPoint) {
   if (CommandBuffer->IsInOrderCmdList) {
-    ZE2UR_CALL(
-        zeCommandListAppendMemoryCopy,
-        (CommandBuffer->ZeCommandList, Dst, Src, Size, nullptr, 0, nullptr));
+    ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+               (CommandBuffer->ZeComputeCommandList, Dst, Src, Size, nullptr, 0,
+                nullptr));
 
     logger::debug("calling zeCommandListAppendMemoryCopy()");
   } else {
@@ -320,12 +341,26 @@ static ur_result_t enqueueCommandBufferMemCopyHelper(
     LaunchEvent->CommandType = CommandType;
 
     // Get sync point and register the event with it.
-    *SyncPoint = CommandBuffer->GetNextSyncPoint();
-    CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
+    ur_exp_command_buffer_sync_point_t SyncPoint =
+        CommandBuffer->GetNextSyncPoint();
+    CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent);
+    if (RetSyncPoint) {
+      *RetSyncPoint = SyncPoint;
+    }
 
+    ze_command_list_handle_t ZeCommandList =
+        CommandBuffer->ZeComputeCommandList;
+    // If the copy engine is available, the command is enqueued in the
+    // ZeCopyCommandList.
+    if (PreferCopyEngine && CommandBuffer->UseCopyEngine()) {
+      ZeCommandList = CommandBuffer->ZeCopyCommandList;
+      // We indicate that the ZeCopyCommandList contains commands to be
+      // submitted.
+      CommandBuffer->MCopyCommandListEmpty = false;
+    }
     ZE2UR_CALL(zeCommandListAppendMemoryCopy,
-               (CommandBuffer->ZeCommandList, Dst, Src, Size,
-                LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data()));
+               (ZeCommandList, Dst, Src, Size, LaunchEvent->ZeEvent,
+                ZeEventList.size(), ZeEventList.data()));
 
     logger::debug("calling zeCommandListAppendMemoryCopy() with"
                   "  ZeEvent {}",
@@ -341,9 +376,9 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper(
     void *Dst, const void *Src, ur_rect_offset_t SrcOrigin,
     ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch,
     size_t DstRowPitch, size_t SrcSlicePitch, size_t DstSlicePitch,
-    uint32_t NumSyncPointsInWaitList,
+    bool PreferCopyEngine, uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
-    ur_exp_command_buffer_sync_point_t *SyncPoint) {
+    ur_exp_command_buffer_sync_point_t *RetSyncPoint) {
 
   uint32_t SrcOriginX = ur_cast<uint32_t>(SrcOrigin.x);
   uint32_t SrcOriginY = ur_cast<uint32_t>(SrcOrigin.y);
@@ -378,9 +413,9 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper(
 
   if (CommandBuffer->IsInOrderCmdList) {
     ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion,
-               (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch,
-                DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch,
-                nullptr, 0, nullptr));
+               (CommandBuffer->ZeComputeCommandList, Dst, &ZeDstRegion,
+                DstPitch, DstSlicePitch, Src, &ZeSrcRegion, SrcPitch,
+                SrcSlicePitch, nullptr, 0, nullptr));
 
     logger::debug("calling zeCommandListAppendMemoryCopyRegion()");
   } else {
@@ -395,13 +430,28 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper(
     LaunchEvent->CommandType = CommandType;
 
     // Get sync point and register the event with it.
-    *SyncPoint = CommandBuffer->GetNextSyncPoint();
-    CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
+    ur_exp_command_buffer_sync_point_t SyncPoint =
+        CommandBuffer->GetNextSyncPoint();
+    CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent);
+    if (RetSyncPoint) {
+      *RetSyncPoint = SyncPoint;
+    }
+
+    ze_command_list_handle_t ZeCommandList =
+        CommandBuffer->ZeComputeCommandList;
+    // If the copy engine is available, the command is enqueued in the
+    // ZeCopyCommandList.
+    if (PreferCopyEngine && CommandBuffer->UseCopyEngine()) {
+      ZeCommandList = CommandBuffer->ZeCopyCommandList;
+      // We indicate that the ZeCopyCommandList contains commands to be
+      // submitted.
+      CommandBuffer->MCopyCommandListEmpty = false;
+    }
 
     ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion,
-               (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch,
-                DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch,
-                LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data()));
+               (ZeCommandList, Dst, &ZeDstRegion, DstPitch, DstSlicePitch, Src,
+                &ZeSrcRegion, SrcPitch, SrcSlicePitch, LaunchEvent->ZeEvent,
+                ZeEventList.size(), ZeEventList.data()));
 
     logger::debug("calling zeCommandListAppendMemoryCopyRegion() with"
                   "  ZeEvent {}",
@@ -415,25 +465,43 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper(
 static ur_result_t enqueueCommandBufferFillHelper(
     ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer,
     void *Ptr, const void *Pattern, size_t PatternSize, size_t Size,
-    uint32_t NumSyncPointsInWaitList,
+    bool PreferCopyEngine, uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
-    ur_exp_command_buffer_sync_point_t *SyncPoint) {
+    ur_exp_command_buffer_sync_point_t *RetSyncPoint) {
   // Pattern size must be a power of two.
   UR_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0),
             UR_RESULT_ERROR_INVALID_VALUE);
 
-  // Pattern size must fit the compute queue capabilities.
-  UR_ASSERT(
+  ze_command_list_handle_t ZeCommandList;
+  // If the copy engine is available, the command-buffer is out-of-order and
+  // the pattern size is valid, the command is enqueued in the
+  // ZeCopyCommandList, otherwise it is enqueued in the compute command list.
+  if (!CommandBuffer->IsInOrderCmdList && PreferCopyEngine &&
+      CommandBuffer->UseCopyEngine() &&
       PatternSize <=
           CommandBuffer->Device
-              ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute]
-              .ZeProperties.maxMemoryFillPatternSize,
-      UR_RESULT_ERROR_INVALID_VALUE);
+              ->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy]
+              .ZeProperties.maxMemoryFillPatternSize) {
+
+    ZeCommandList = CommandBuffer->ZeCopyCommandList;
+    // We indicate that the ZeCopyCommandList contains commands to be
+    // submitted.
+    CommandBuffer->MCopyCommandListEmpty = false;
+  } else {
+    // Pattern size must fit the compute queue capabilities.
+    UR_ASSERT(
+        PatternSize <=
+            CommandBuffer->Device
+                ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute]
+                .ZeProperties.maxMemoryFillPatternSize,
+        UR_RESULT_ERROR_INVALID_VALUE);
+    ZeCommandList = CommandBuffer->ZeComputeCommandList;
+  }
 
   if (CommandBuffer->IsInOrderCmdList) {
     ZE2UR_CALL(zeCommandListAppendMemoryFill,
-               (CommandBuffer->ZeCommandList, Ptr, Pattern, PatternSize, Size,
-                nullptr, 0, nullptr));
+               (CommandBuffer->ZeComputeCommandList, Ptr, Pattern, PatternSize,
+                Size, nullptr, 0, nullptr));
 
     logger::debug("calling zeCommandListAppendMemoryFill()");
   } else {
@@ -448,11 +516,15 @@ static ur_result_t enqueueCommandBufferFillHelper(
     LaunchEvent->CommandType = CommandType;
 
     // Get sync point and register the event with it.
-    *SyncPoint = CommandBuffer->GetNextSyncPoint();
-    CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
+    ur_exp_command_buffer_sync_point_t SyncPoint =
+        CommandBuffer->GetNextSyncPoint();
+    CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent);
+    if (RetSyncPoint) {
+      *RetSyncPoint = SyncPoint;
+    }
 
     ZE2UR_CALL(zeCommandListAppendMemoryFill,
-               (CommandBuffer->ZeCommandList, Ptr, Pattern, PatternSize, Size,
+               (ZeCommandList, Ptr, Pattern, PatternSize, Size,
                 LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data()));
 
     logger::debug("calling zeCommandListAppendMemoryFill() with"
@@ -474,8 +546,6 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
           ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false)
           : false;
 
-  // Force compute queue type for now. Copy engine types may be better suited
-  // for host to device copies.
   uint32_t QueueGroupOrdinal =
       Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute]
           .ZeOrdinal;
@@ -495,21 +565,51 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
   ZeCommandListDesc.flags = IsInOrder ? ZE_COMMAND_LIST_FLAG_IN_ORDER
                                       : ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
 
+  DEBUG_LOG(ZeCommandListDesc.flags);
+
   ZeStruct<ze_mutable_command_list_exp_desc_t> ZeMutableCommandListDesc;
   if (CommandBufferDesc && CommandBufferDesc->isUpdatable) {
     ZeMutableCommandListDesc.flags = 0;
     ZeCommandListDesc.pNext = &ZeMutableCommandListDesc;
   }
 
-  ze_command_list_handle_t ZeCommandList;
+  ze_command_list_handle_t ZeComputeCommandList;
   // TODO We could optimize this by pooling both Level Zero command-lists and UR
   // command-buffers, then reusing them.
   ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice,
-                                   &ZeCommandListDesc, &ZeCommandList));
+                                   &ZeCommandListDesc, &ZeComputeCommandList));
+
+  // Create a list for copy commands.
+  // Note that to simplify the implementation, the current implementation only
+  // uses the main copy engine and does not use the link engine even if
+  // available.
+  ze_command_list_handle_t ZeCopyCommandList = nullptr;
+  ZeStruct<ze_command_list_desc_t> ZeCopyCommandListDesc;
+  if (Device->hasMainCopyEngine()) {
+    uint32_t QueueGroupOrdinalCopy =
+        Device
+            ->QueueGroup
+                [ur_device_handle_t_::queue_group_info_t::type::MainCopy]
+            .ZeOrdinal;
+
+    ZeCopyCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinalCopy;
+    // Dependencies between commands are explicitly enforced by sync points when
+    // enqueuing. Consequently, relaxing the command ordering in the command
+    // list can enable the backend to further optimize the workload.
+    ZeCopyCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
+
+    // TODO We could optimize this by pooling both Level Zero command-lists and
+    // UR command-buffers, then reusing them.
+    ZE2UR_CALL(zeCommandListCreate,
+               (Context->ZeContext, Device->ZeDevice, &ZeCopyCommandListDesc,
+                &ZeCopyCommandList));
+  }
+
   try {
     *CommandBuffer = new ur_exp_command_buffer_handle_t_(
-        Context, Device, ZeCommandList, ZeCommandListResetEvents,
-        ZeCommandListDesc, CommandBufferDesc, IsInOrder);
+        Context, Device, ZeComputeCommandList, ZeCommandListResetEvents,
+        ZeCopyCommandList, ZeCommandListDesc, ZeCopyCommandListDesc,
+        CommandBufferDesc, IsInOrder);
   } catch (const std::bad_alloc &) {
     return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
   } catch (...) {
@@ -536,9 +636,18 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
   std::vector<ze_event_handle_t> PrecondEvents = {
       RetCommandBuffer->WaitEvent->ZeEvent,
       RetCommandBuffer->AllResetEvent->ZeEvent};
-  ZE2UR_CALL(
-      zeCommandListAppendBarrier,
-      (ZeCommandList, nullptr, PrecondEvents.size(), PrecondEvents.data()));
+  ZE2UR_CALL(zeCommandListAppendBarrier,
+             (ZeComputeCommandList, nullptr, PrecondEvents.size(),
+              PrecondEvents.data()));
+
+  if (Device->hasMainCopyEngine()) {
+    // The copy command-list must be executed once the preconditions have been
+    // met. We therefore begin this command-list with a barrier on the
+    // preconditions.
+    ZE2UR_CALL(zeCommandListAppendBarrier,
+               (ZeCopyCommandList, nullptr, PrecondEvents.size(),
+                PrecondEvents.data()));
+  }
   return UR_RESULT_SUCCESS;
 }
 
@@ -579,9 +688,9 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
               CommandBuffer->AllResetEvent->ZeEvent));
 
   if (CommandBuffer->IsInOrderCmdList) {
-    ZE2UR_CALL(
-        zeCommandListAppendSignalEvent,
-        (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent));
+    ZE2UR_CALL(zeCommandListAppendSignalEvent,
+               (CommandBuffer->ZeComputeCommandList,
+                CommandBuffer->SignalEvent->ZeEvent));
   } else {
     // Create a list of events for our signal event to wait on
     const size_t NumEvents = CommandBuffer->SyncPoints.size();
@@ -592,15 +701,21 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
 
     // Wait for all the user added commands to complete, and signal the
     // command-buffer signal-event when they are done.
-    ZE2UR_CALL(zeCommandListAppendBarrier, (CommandBuffer->ZeCommandList,
+    ZE2UR_CALL(zeCommandListAppendBarrier, (CommandBuffer->ZeComputeCommandList,
                                             CommandBuffer->SignalEvent->ZeEvent,
                                             NumEvents, WaitEventList.data()));
   }
 
   // Close the command lists and have them ready for dispatch.
-  ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList));
+  ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeComputeCommandList));
   ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandListResetEvents));
+
+  if (CommandBuffer->UseCopyEngine()) {
+    ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCopyCommandList));
+  }
+
   CommandBuffer->IsFinalized = true;
+
   return UR_RESULT_SUCCESS;
 }
 
@@ -610,7 +725,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
     uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
-    ur_exp_command_buffer_sync_point_t *SyncPoint,
+    ur_exp_command_buffer_sync_point_t *RetSyncPoint,
     ur_exp_command_buffer_command_handle_t *Command) {
   UR_ASSERT(CommandBuffer && Kernel && Kernel->Program,
             UR_RESULT_ERROR_INVALID_NULL_POINTER);
@@ -675,9 +790,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     auto Plt = CommandBuffer->Context->getPlatform();
     UR_ASSERT(Plt->ZeMutableCmdListExt.Supported,
               UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
-    ZE2UR_CALL(
-        Plt->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp,
-        (CommandBuffer->ZeCommandList, &ZeMutableCommandDesc, &CommandId));
+    ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp,
+               (CommandBuffer->ZeComputeCommandList, &ZeMutableCommandDesc,
+                &CommandId));
+    DEBUG_LOG(CommandId);
   }
   try {
     if (Command)
@@ -691,7 +807,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
 
   if (CommandBuffer->IsInOrderCmdList) {
     ZE2UR_CALL(zeCommandListAppendLaunchKernel,
-               (CommandBuffer->ZeCommandList, Kernel->ZeKernel,
+               (CommandBuffer->ZeComputeCommandList, Kernel->ZeKernel,
                 &ZeThreadGroupDimensions, nullptr, 0, nullptr));
 
     logger::debug("calling zeCommandListAppendLaunchKernel()");
@@ -705,14 +821,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
                         !CommandBuffer->IsProfilingEnabled));
     LaunchEvent->CommandType = UR_COMMAND_KERNEL_LAUNCH;
 
-    if (SyncPoint) {
-      // Get sync point and register the event with it.
-      *SyncPoint = CommandBuffer->GetNextSyncPoint();
-      CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
+    // Get sync point and register the event with it.
+    ur_exp_command_buffer_sync_point_t SyncPoint =
+        CommandBuffer->GetNextSyncPoint();
+    CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent);
+    if (RetSyncPoint) {
+      *RetSyncPoint = SyncPoint;
     }
 
     ZE2UR_CALL(zeCommandListAppendLaunchKernel,
-               (CommandBuffer->ZeCommandList, Kernel->ZeKernel,
+               (CommandBuffer->ZeComputeCommandList, Kernel->ZeKernel,
                 &ZeThreadGroupDimensions, LaunchEvent->ZeEvent,
                 ZeEventList.size(), ZeEventList.data()));
 
@@ -729,8 +847,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp(
     size_t Size, uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *SyncPoint) {
+
+  bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) ||
+                          !IsDevicePointer(CommandBuffer->Context, Dst);
+
+  PreferCopyEngine |= UseCopyEngineForD2DCopy;
+
   return enqueueCommandBufferMemCopyHelper(
-      UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size,
+      UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, PreferCopyEngine,
       NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
 }
 
@@ -740,8 +864,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
     uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *SyncPoint) {
-  auto SrcBuffer = ur_cast<ur_mem_handle_t>(SrcMem);
-  auto DstBuffer = ur_cast<ur_mem_handle_t>(DstMem);
+  auto SrcBuffer = ur_cast<_ur_buffer *>(SrcMem);
+  auto DstBuffer = ur_cast<_ur_buffer *>(DstMem);
 
   std::shared_lock<ur_shared_mutex> SrcLock(SrcBuffer->Mutex, std::defer_lock);
   std::scoped_lock<std::shared_lock<ur_shared_mutex>, ur_shared_mutex> LockAll(
@@ -754,10 +878,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
   UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
                                  CommandBuffer->Device));
 
+  bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);
+
+  PreferCopyEngine |= UseCopyEngineForD2DCopy;
+
   return enqueueCommandBufferMemCopyHelper(
       UR_COMMAND_MEM_BUFFER_COPY, CommandBuffer, ZeHandleDst + DstOffset,
-      ZeHandleSrc + SrcOffset, Size, NumSyncPointsInWaitList, SyncPointWaitList,
-      SyncPoint);
+      ZeHandleSrc + SrcOffset, Size, PreferCopyEngine, NumSyncPointsInWaitList,
+      SyncPointWaitList, SyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
@@ -768,8 +896,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
     uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *SyncPoint) {
-  auto SrcBuffer = ur_cast<ur_mem_handle_t>(SrcMem);
-  auto DstBuffer = ur_cast<ur_mem_handle_t>(DstMem);
+  auto SrcBuffer = ur_cast<_ur_buffer *>(SrcMem);
+  auto DstBuffer = ur_cast<_ur_buffer *>(DstMem);
 
   std::shared_lock<ur_shared_mutex> SrcLock(SrcBuffer->Mutex, std::defer_lock);
   std::scoped_lock<std::shared_lock<ur_shared_mutex>, ur_shared_mutex> LockAll(
@@ -782,10 +910,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
   UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
                                  CommandBuffer->Device));
 
+  bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);
+
+  PreferCopyEngine |= UseCopyEngineForD2DCopy;
+
   return enqueueCommandBufferMemCopyRectHelper(
       UR_COMMAND_MEM_BUFFER_COPY_RECT, CommandBuffer, ZeHandleDst, ZeHandleSrc,
       SrcOrigin, DstOrigin, Region, SrcRowPitch, DstRowPitch, SrcSlicePitch,
-      DstSlicePitch, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
+      DstSlicePitch, PreferCopyEngine, NumSyncPointsInWaitList,
+      SyncPointWaitList, SyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp(
@@ -799,12 +932,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp(
   char *ZeHandleDst = nullptr;
   UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
                               CommandBuffer->Device));
+  // Always prefer copy engine for writes
+  bool PreferCopyEngine = true;
 
   return enqueueCommandBufferMemCopyHelper(
       UR_COMMAND_MEM_BUFFER_WRITE, CommandBuffer,
       ZeHandleDst + Offset, // dst
       Src,                  // src
-      Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
+      Size, PreferCopyEngine, NumSyncPointsInWaitList, SyncPointWaitList,
+      SyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp(
@@ -820,11 +956,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp(
   char *ZeHandleDst = nullptr;
   UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
                               CommandBuffer->Device));
+
+  // Always prefer copy engine for writes
+  bool PreferCopyEngine = true;
+
   return enqueueCommandBufferMemCopyRectHelper(
       UR_COMMAND_MEM_BUFFER_WRITE_RECT, CommandBuffer, ZeHandleDst,
       const_cast<char *>(static_cast<const char *>(Src)), HostOffset,
       BufferOffset, Region, HostRowPitch, BufferRowPitch, HostSlicePitch,
-      BufferSlicePitch, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
+      BufferSlicePitch, PreferCopyEngine, NumSyncPointsInWaitList,
+      SyncPointWaitList, SyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp(
@@ -837,9 +978,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp(
   char *ZeHandleSrc = nullptr;
   UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
                               CommandBuffer->Device));
+
+  // Always prefer copy engine for reads
+  bool PreferCopyEngine = true;
+
   return enqueueCommandBufferMemCopyHelper(
       UR_COMMAND_MEM_BUFFER_READ, CommandBuffer, Dst, ZeHandleSrc + Offset,
-      Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
+      Size, PreferCopyEngine, NumSyncPointsInWaitList, SyncPointWaitList,
+      SyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
@@ -855,25 +1001,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
   char *ZeHandleSrc;
   UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
                               CommandBuffer->Device));
+
+  // Always prefer copy engine for reads
+  bool PreferCopyEngine = true;
+
   return enqueueCommandBufferMemCopyRectHelper(
       UR_COMMAND_MEM_BUFFER_READ_RECT, CommandBuffer, Dst, ZeHandleSrc,
       BufferOffset, HostOffset, Region, BufferRowPitch, HostRowPitch,
-      BufferSlicePitch, HostSlicePitch, NumSyncPointsInWaitList,
-      SyncPointWaitList, SyncPoint);
+      BufferSlicePitch, HostSlicePitch, PreferCopyEngine,
+      NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp(
     ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Size,
     ur_usm_migration_flags_t Flags, uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
-    ur_exp_command_buffer_sync_point_t *SyncPoint) {
+    ur_exp_command_buffer_sync_point_t *RetSyncPoint) {
   std::ignore = Flags;
 
   if (CommandBuffer->IsInOrderCmdList) {
     // Add the prefetch command to the command buffer.
     // Note that L0 does not handle migration flags.
     ZE2UR_CALL(zeCommandListAppendMemoryPrefetch,
-               (CommandBuffer->ZeCommandList, Mem, Size));
+               (CommandBuffer->ZeComputeCommandList, Mem, Size));
   } else {
     std::vector<ze_event_handle_t> ZeEventList;
     UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
@@ -881,7 +1031,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp(
 
     if (NumSyncPointsInWaitList) {
       ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
-                 (CommandBuffer->ZeCommandList, NumSyncPointsInWaitList,
+                 (CommandBuffer->ZeComputeCommandList, NumSyncPointsInWaitList,
                   ZeEventList.data()));
     }
 
@@ -892,18 +1042,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp(
     LaunchEvent->CommandType = UR_COMMAND_USM_PREFETCH;
 
     // Get sync point and register the event with it.
-    *SyncPoint = CommandBuffer->GetNextSyncPoint();
-    CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
+    ur_exp_command_buffer_sync_point_t SyncPoint =
+        CommandBuffer->GetNextSyncPoint();
+    CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent);
+    if (RetSyncPoint) {
+      *RetSyncPoint = SyncPoint;
+    }
 
     // Add the prefetch command to the command buffer.
     // Note that L0 does not handle migration flags.
     ZE2UR_CALL(zeCommandListAppendMemoryPrefetch,
-               (CommandBuffer->ZeCommandList, Mem, Size));
+               (CommandBuffer->ZeComputeCommandList, Mem, Size));
 
     // Level Zero does not have a completion "event" with the prefetch API,
     // so manually add command to signal our event.
     ZE2UR_CALL(zeCommandListAppendSignalEvent,
-               (CommandBuffer->ZeCommandList, LaunchEvent->ZeEvent));
+               (CommandBuffer->ZeComputeCommandList, LaunchEvent->ZeEvent));
   }
 
   return UR_RESULT_SUCCESS;
@@ -913,7 +1067,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
     ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Size,
     ur_usm_advice_flags_t Advice, uint32_t NumSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
-    ur_exp_command_buffer_sync_point_t *SyncPoint) {
+    ur_exp_command_buffer_sync_point_t *RetSyncPoint) {
   // A memory chunk can be advised with muliple memory advices
   // We therefore prefer if statements to switch cases to combine all potential
   // flags
@@ -943,8 +1097,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
 
   if (CommandBuffer->IsInOrderCmdList) {
     ZE2UR_CALL(zeCommandListAppendMemAdvise,
-               (CommandBuffer->ZeCommandList, CommandBuffer->Device->ZeDevice,
-                Mem, Size, ZeAdvice));
+               (CommandBuffer->ZeComputeCommandList,
+                CommandBuffer->Device->ZeDevice, Mem, Size, ZeAdvice));
   } else {
     std::vector<ze_event_handle_t> ZeEventList;
     UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
@@ -952,7 +1106,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
 
     if (NumSyncPointsInWaitList) {
       ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
-                 (CommandBuffer->ZeCommandList, NumSyncPointsInWaitList,
+                 (CommandBuffer->ZeComputeCommandList, NumSyncPointsInWaitList,
                   ZeEventList.data()));
     }
 
@@ -963,17 +1117,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp(
     LaunchEvent->CommandType = UR_COMMAND_USM_ADVISE;
 
     // Get sync point and register the event with it.
-    *SyncPoint = CommandBuffer->GetNextSyncPoint();
-    CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
+    ur_exp_command_buffer_sync_point_t SyncPoint =
+        CommandBuffer->GetNextSyncPoint();
+    CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent);
+    if (RetSyncPoint) {
+      *RetSyncPoint = SyncPoint;
+    }
 
     ZE2UR_CALL(zeCommandListAppendMemAdvise,
-               (CommandBuffer->ZeCommandList, CommandBuffer->Device->ZeDevice,
-                Mem, Size, ZeAdvice));
+               (CommandBuffer->ZeComputeCommandList,
+                CommandBuffer->Device->ZeDevice, Mem, Size, ZeAdvice));
 
     // Level Zero does not have a completion "event" with the advise API,
     // so manually add command to signal our event.
     ZE2UR_CALL(zeCommandListAppendSignalEvent,
-               (CommandBuffer->ZeCommandList, LaunchEvent->ZeEvent));
+               (CommandBuffer->ZeComputeCommandList, LaunchEvent->ZeEvent));
   }
 
   return UR_RESULT_SUCCESS;
@@ -997,7 +1155,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
       UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, ZeHandleDst + Offset,
       Pattern,     // It will be interpreted as an 8-bit value,
       PatternSize, // which is indicated with this pattern_size==1
-      Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
+      Size, PreferCopyEngineForFill, NumSyncPointsInWaitList, SyncPointWaitList,
+      SyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
@@ -1011,7 +1170,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
       UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, Ptr,
       Pattern,     // It will be interpreted as an 8-bit value,
       PatternSize, // which is indicated with this pattern_size==1
-      Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
+      Size, PreferCopyEngineForFill, NumSyncPointsInWaitList, SyncPointWaitList,
+      SyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
@@ -1025,11 +1185,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
   uint32_t QueueGroupOrdinal;
   auto &ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal);
 
-  ze_fence_handle_t ZeFence;
-  ZeStruct<ze_fence_desc_t> ZeFenceDesc;
-
-  ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
-  CommandBuffer->ZeFencesList.push_back(ZeFence);
+  // If we already have created a fence for this queue, first reset then reuse
+  // it, otherwise create a new fence.
+  ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence;
+  auto ZeWorkloadFenceForQueue =
+      CommandBuffer->ZeFencesMap.find(ZeCommandQueue);
+  if (ZeWorkloadFenceForQueue == CommandBuffer->ZeFencesMap.end()) {
+    ZeStruct<ze_fence_desc_t> ZeFenceDesc;
+    ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
+    CommandBuffer->ZeFencesMap.insert({{ZeCommandQueue, ZeFence}});
+  } else {
+    ZeFence = ZeWorkloadFenceForQueue->second;
+    ZE2UR_CALL(zeFenceReset, (ZeFence));
+  }
 
   bool MustSignalWaitEvent = true;
   if (NumEventsInWaitList) {
@@ -1078,8 +1246,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
   // type, regardless of the UR Queue type. We therefore need to submit the list
   // directly using the Level-Zero API to avoid type mismatches if using UR
   // functions.
-  ZE2UR_CALL(zeCommandQueueExecuteCommandLists,
-             (ZeCommandQueue, 1, &CommandBuffer->ZeCommandList, ZeFence));
+  ZE2UR_CALL(
+      zeCommandQueueExecuteCommandLists,
+      (ZeCommandQueue, 1, &CommandBuffer->ZeComputeCommandList, ZeFence));
+
+  // The Copy command-list is submitted to the main copy queue if it is not
+  // empty.
+  if (!CommandBuffer->MCopyCommandListEmpty) {
+    auto &QGroupCopy = Queue->getQueueGroup(true);
+    uint32_t QueueGroupOrdinal;
+    auto &ZeCopyCommandQueue = QGroupCopy.getZeQueue(&QueueGroupOrdinal);
+    ZE2UR_CALL(
+        zeCommandQueueExecuteCommandLists,
+        (ZeCopyCommandQueue, 1, &CommandBuffer->ZeCopyCommandList, nullptr));
+  }
 
   // Execution event for this enqueue of the UR command-buffer
   ur_event_handle_t RetEvent{};
@@ -1089,6 +1269,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
   UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
                                                   false, NumEventsInWaitList,
                                                   EventWaitList, false));
+
   // Reset the wait-event for the UR command-buffer that is signaled when its
   // submission dependencies have been satisfied.
   ZE2UR_CALL(zeCommandListAppendEventReset,
@@ -1201,6 +1382,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
   auto SupportedFeatures =
       Command->CommandBuffer->Device->ZeDeviceMutableCmdListsProperties
           ->mutableCommandFlags;
+  logger::debug("Mutable features supported by device {}", SupportedFeatures);
 
   // We need the created descriptors to live till the point when
   // zexCommandListUpdateMutableCommandsExp is called at the end of the
@@ -1228,10 +1410,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
     auto MutableGroupOffestDesc =
         std::make_unique<ZeStruct<ze_mutable_global_offset_exp_desc_t>>();
     MutableGroupOffestDesc->commandId = Command->CommandId;
+    DEBUG_LOG(MutableGroupOffestDesc->commandId);
     MutableGroupOffestDesc->pNext = NextDesc;
+    DEBUG_LOG(MutableGroupOffestDesc->pNext);
     MutableGroupOffestDesc->offsetX = NewGlobalWorkOffset[0];
+    DEBUG_LOG(MutableGroupOffestDesc->offsetX);
     MutableGroupOffestDesc->offsetY = Dim >= 2 ? NewGlobalWorkOffset[1] : 0;
+    DEBUG_LOG(MutableGroupOffestDesc->offsetY);
     MutableGroupOffestDesc->offsetZ = Dim == 3 ? NewGlobalWorkOffset[2] : 0;
+    DEBUG_LOG(MutableGroupOffestDesc->offsetZ);
     NextDesc = MutableGroupOffestDesc.get();
     OffsetDescs.push_back(std::move(MutableGroupOffestDesc));
   }
@@ -1245,10 +1432,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
     auto MutableGroupSizeDesc =
         std::make_unique<ZeStruct<ze_mutable_group_size_exp_desc_t>>();
     MutableGroupSizeDesc->commandId = Command->CommandId;
+    DEBUG_LOG(MutableGroupSizeDesc->commandId);
     MutableGroupSizeDesc->pNext = NextDesc;
+    DEBUG_LOG(MutableGroupSizeDesc->pNext);
     MutableGroupSizeDesc->groupSizeX = NewLocalWorkSize[0];
+    DEBUG_LOG(MutableGroupSizeDesc->groupSizeX);
     MutableGroupSizeDesc->groupSizeY = Dim >= 2 ? NewLocalWorkSize[1] : 1;
+    DEBUG_LOG(MutableGroupSizeDesc->groupSizeY);
     MutableGroupSizeDesc->groupSizeZ = Dim == 3 ? NewLocalWorkSize[2] : 1;
+    DEBUG_LOG(MutableGroupSizeDesc->groupSizeZ);
     NextDesc = MutableGroupSizeDesc.get();
     GroupSizeDescs.push_back(std::move(MutableGroupSizeDesc));
   }
@@ -1261,8 +1453,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
   UR_ASSERT(!(NewGlobalWorkSize && !NewLocalWorkSize) ||
                 (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE),
             UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
+
+  ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
   if (NewGlobalWorkSize && Dim > 0) {
-    ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
     uint32_t WG[3];
     // If new global work size is provided but new local work size is not
     // provided then we still need to update local work size based on size
@@ -1273,9 +1466,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
         Dim, NewGlobalWorkSize, NewLocalWorkSize));
     auto MutableGroupCountDesc =
         std::make_unique<ZeStruct<ze_mutable_group_count_exp_desc_t>>();
-    MutableGroupCountDesc->pNext = NextDesc;
     MutableGroupCountDesc->commandId = Command->CommandId;
+    DEBUG_LOG(MutableGroupCountDesc->commandId);
+    MutableGroupCountDesc->pNext = NextDesc;
+    DEBUG_LOG(MutableGroupCountDesc->pNext);
     MutableGroupCountDesc->pGroupCount = &ZeThreadGroupDimensions;
+    DEBUG_LOG(MutableGroupCountDesc->pGroupCount->groupCountX);
+    DEBUG_LOG(MutableGroupCountDesc->pGroupCount->groupCountY);
+    DEBUG_LOG(MutableGroupCountDesc->pGroupCount->groupCountZ);
     NextDesc = MutableGroupCountDesc.get();
     GroupCountDescs.push_back(std::move(MutableGroupCountDesc));
 
@@ -1283,10 +1481,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
       auto MutableGroupSizeDesc =
           std::make_unique<ZeStruct<ze_mutable_group_size_exp_desc_t>>();
       MutableGroupSizeDesc->commandId = Command->CommandId;
+      DEBUG_LOG(MutableGroupSizeDesc->commandId);
       MutableGroupSizeDesc->pNext = NextDesc;
+      DEBUG_LOG(MutableGroupSizeDesc->pNext);
       MutableGroupSizeDesc->groupSizeX = WG[0];
+      DEBUG_LOG(MutableGroupSizeDesc->groupSizeX);
       MutableGroupSizeDesc->groupSizeY = WG[1];
+      DEBUG_LOG(MutableGroupSizeDesc->groupSizeY);
       MutableGroupSizeDesc->groupSizeZ = WG[2];
+      DEBUG_LOG(MutableGroupSizeDesc->groupSizeZ);
+
       NextDesc = MutableGroupSizeDesc.get();
       GroupSizeDescs.push_back(std::move(MutableGroupSizeDesc));
     }
@@ -1333,10 +1537,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
     auto ZeMutableArgDesc =
         std::make_unique<ZeStruct<ze_mutable_kernel_argument_exp_desc_t>>();
     ZeMutableArgDesc->commandId = Command->CommandId;
+    DEBUG_LOG(ZeMutableArgDesc->commandId);
     ZeMutableArgDesc->pNext = NextDesc;
+    DEBUG_LOG(ZeMutableArgDesc->pNext);
     ZeMutableArgDesc->argIndex = NewMemObjArgDesc.argIndex;
+    DEBUG_LOG(ZeMutableArgDesc->argIndex);
     ZeMutableArgDesc->argSize = sizeof(void *);
+    DEBUG_LOG(ZeMutableArgDesc->argSize);
     ZeMutableArgDesc->pArgValue = ZeHandlePtr;
+    DEBUG_LOG(ZeMutableArgDesc->pArgValue);
 
     NextDesc = ZeMutableArgDesc.get();
     ArgDescs.push_back(std::move(ZeMutableArgDesc));
@@ -1350,10 +1559,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
     auto ZeMutableArgDesc =
         std::make_unique<ZeStruct<ze_mutable_kernel_argument_exp_desc_t>>();
     ZeMutableArgDesc->commandId = Command->CommandId;
+    DEBUG_LOG(ZeMutableArgDesc->commandId);
     ZeMutableArgDesc->pNext = NextDesc;
+    DEBUG_LOG(ZeMutableArgDesc->pNext);
     ZeMutableArgDesc->argIndex = NewPointerArgDesc.argIndex;
+    DEBUG_LOG(ZeMutableArgDesc->argIndex);
     ZeMutableArgDesc->argSize = sizeof(void *);
+    DEBUG_LOG(ZeMutableArgDesc->argSize);
     ZeMutableArgDesc->pArgValue = NewPointerArgDesc.pNewPointerArg;
+    DEBUG_LOG(ZeMutableArgDesc->pArgValue);
 
     NextDesc = ZeMutableArgDesc.get();
     ArgDescs.push_back(std::move(ZeMutableArgDesc));
@@ -1367,9 +1581,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
     auto ZeMutableArgDesc =
         std::make_unique<ZeStruct<ze_mutable_kernel_argument_exp_desc_t>>();
     ZeMutableArgDesc->commandId = Command->CommandId;
+    DEBUG_LOG(ZeMutableArgDesc->commandId);
     ZeMutableArgDesc->pNext = NextDesc;
+    DEBUG_LOG(ZeMutableArgDesc->pNext);
     ZeMutableArgDesc->argIndex = NewValueArgDesc.argIndex;
+    DEBUG_LOG(ZeMutableArgDesc->argIndex);
     ZeMutableArgDesc->argSize = NewValueArgDesc.argSize;
+    DEBUG_LOG(ZeMutableArgDesc->argSize);
     // OpenCL: "the arg_value pointer can be NULL or point to a NULL value
     // in which case a NULL value will be used as the value for the argument
     // declared as a pointer to global or constant memory in the kernel"
@@ -1383,6 +1601,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
       ArgValuePtr = nullptr;
     }
     ZeMutableArgDesc->pArgValue = ArgValuePtr;
+    DEBUG_LOG(ZeMutableArgDesc->pArgValue);
     NextDesc = ZeMutableArgDesc.get();
     ArgDescs.push_back(std::move(ZeMutableArgDesc));
   }
@@ -1392,15 +1611,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
   MutableCommandDesc.flags = 0;
 
   // We must synchronize mutable command list execution before mutating.
-  ZE2UR_CALL(zeEventHostSynchronize,
-             (CommandBuffer->SignalEvent->ZeEvent, UINT64_MAX));
+  if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) {
+    ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX));
+  }
 
-  auto Plt = Command->CommandBuffer->Context->getPlatform();
+  auto Plt = CommandBuffer->Context->getPlatform();
   UR_ASSERT(Plt->ZeMutableCmdListExt.Supported,
             UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
   ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp,
-             (CommandBuffer->ZeCommandList, &MutableCommandDesc));
-  ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList));
+             (CommandBuffer->ZeComputeCommandList, &MutableCommandDesc));
+  ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeComputeCommandList));
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp
index 04d6a7d269..48f1c68330 100644
--- a/source/adapters/level_zero/command_buffer.hpp
+++ b/source/adapters/level_zero/command_buffer.hpp
@@ -29,7 +29,9 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
       ur_context_handle_t Context, ur_device_handle_t Device,
       ze_command_list_handle_t CommandList,
       ze_command_list_handle_t CommandListResetEvents,
+      ze_command_list_handle_t CopyCommandList,
       ZeStruct<ze_command_list_desc_t> ZeDesc,
+      ZeStruct<ze_command_list_desc_t> ZeCopyDesc,
       const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList);
 
   ~ur_exp_command_buffer_handle_t_();
@@ -44,20 +46,33 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
     return NextSyncPoint;
   }
 
+  // Indicates if a copy engine is available for use
+  bool UseCopyEngine() const { return ZeCopyCommandList != nullptr; }
+
   // UR context associated with this command-buffer
   ur_context_handle_t Context;
   // Device associated with this command buffer
   ur_device_handle_t Device;
   // Level Zero command list handle
-  ze_command_list_handle_t ZeCommandList;
+  ze_command_list_handle_t ZeComputeCommandList;
   // Level Zero command list handle
   ze_command_list_handle_t ZeCommandListResetEvents;
   // Level Zero command list descriptor
   ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
-  // List of Level Zero fences created when submitting a graph.
-  // This list is needed to release all fences retained by the
-  // command_buffer.
-  std::vector<ze_fence_handle_t> ZeFencesList;
+  // Level Zero Copy command list handle
+  ze_command_list_handle_t ZeCopyCommandList;
+  // Level Zero Copy command list descriptor
+  ZeStruct<ze_command_list_desc_t> ZeCopyCommandListDesc;
+  // This flag must be set to false if at least one copy command has been
+  // added to `ZeCopyCommandList`
+  bool MCopyCommandListEmpty = true;
+  // Level Zero fences for each queue the command-buffer has been enqueued to.
+  // These should be destroyed when the command-buffer is released.
+  std::unordered_map<ze_command_queue_handle_t, ze_fence_handle_t> ZeFencesMap;
+  // The Level Zero fence from the most recent enqueue of the command-buffer.
+  // Must be an element in ZeFencesMap, so is not required to be destroyed
+  // itself.
+  ze_fence_handle_t ZeActiveFence;
   // Queue properties from command-buffer descriptor
   // TODO: Do we need these?
   ur_queue_properties_t QueueProperties;
diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp
index e16d767b71..55cf1af5ca 100644
--- a/source/adapters/level_zero/common.hpp
+++ b/source/adapters/level_zero/common.hpp
@@ -99,8 +99,8 @@ static auto getUrResultString = [](ur_result_t Result) {
     return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE";
   case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
     return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR";
-  case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED:
-    return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED";
+  case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
+    return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT";
   case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
     return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE";
   case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE:
@@ -141,8 +141,6 @@ static auto getUrResultString = [](ur_result_t Result) {
     return "UR_RESULT_ERROR_INVALID_ENUMERATION";
   case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION:
     return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION";
-  case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
-    return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT";
   case UR_RESULT_ERROR_INVALID_NATIVE_BINARY:
     return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY";
   case UR_RESULT_ERROR_INVALID_GLOBAL_NAME:
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 7f832f30f2..087d459506 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -256,7 +256,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
   // > The application must only use the module for the device, or its
   // > sub-devices, which was provided during creation.
   case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE:
-    return ReturnValue(uint32_t{0});
+    return ReturnValue(ur_bool_t{0});
   case UR_DEVICE_INFO_COMPILER_AVAILABLE:
     return ReturnValue(static_cast<ur_bool_t>(true));
   case UR_DEVICE_INFO_LINKER_AVAILABLE:
@@ -326,10 +326,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
     return ReturnValue(Device->ZeDeviceImageProperties->maxImageDims1D > 0);
   case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY:
     return ReturnValue(
-        static_cast<uint32_t>((Device->ZeDeviceProperties->flags &
-                               ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0));
+        static_cast<ur_bool_t>((Device->ZeDeviceProperties->flags &
+                                ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0));
   case UR_DEVICE_INFO_AVAILABLE:
-    return ReturnValue(static_cast<uint32_t>(ZeDevice ? true : false));
+    return ReturnValue(static_cast<ur_bool_t>(ZeDevice ? true : false));
   case UR_DEVICE_INFO_VENDOR:
     // TODO: Level-Zero does not return vendor's name at the moment
     // only the ID.
@@ -448,8 +448,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
   case UR_DEVICE_INFO_ENDIAN_LITTLE:
     return ReturnValue(static_cast<ur_bool_t>(true));
   case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT:
-    return ReturnValue(static_cast<uint32_t>(Device->ZeDeviceProperties->flags &
-                                             ZE_DEVICE_PROPERTY_FLAG_ECC));
+    return ReturnValue(static_cast<ur_bool_t>(
+        Device->ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_ECC));
   case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION:
     return ReturnValue(
         static_cast<size_t>(Device->ZeDeviceProperties->timerResolution));
@@ -626,11 +626,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
     return ReturnValue(static_cast<ur_bool_t>(false));
   }
   case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
-    // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the
-    // expected return is size_t datatype. size_t can be 8 bytes of data.
-    return ReturnValue.template operator()<size_t>(
-        Device->ZeDeviceComputeProperties->subGroupSizes,
-        Device->ZeDeviceComputeProperties->numSubGroupSizes);
+    return ReturnValue(Device->ZeDeviceComputeProperties->subGroupSizes,
+                       Device->ZeDeviceComputeProperties->numSubGroupSizes);
   }
   case UR_DEVICE_INFO_IL_VERSION: {
     // Set to a space separated list of IL version strings of the form
@@ -875,13 +872,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
     return ReturnValue(static_cast<ur_bool_t>(true));
   }
   case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: {
-    return ReturnValue(static_cast<uint32_t>(true));
+    return ReturnValue(static_cast<ur_bool_t>(true));
   }
 
   case UR_DEVICE_INFO_ESIMD_SUPPORT: {
     // ESIMD is only supported by Intel GPUs.
-    uint32_t result = Device->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU &&
-                      Device->ZeDeviceProperties->vendorId == 0x8086;
+    ur_bool_t result = Device->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU &&
+                       Device->ZeDeviceProperties->vendorId == 0x8086;
     return ReturnValue(result);
   }
 
@@ -947,18 +944,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
   case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP:
     return ReturnValue(true);
   case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: {
-    // TODO: Level Zero API allows to check support for all sub-features:
-    // ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS,
-    // ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT,
-    // ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE,
-    // ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET,
-    // ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT,
-    // ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS
-    // but UR has only one property to check the mutable command lists feature
-    // support. For now return true if kernel arguments can be updated.
-    auto KernelArgUpdateSupport =
-        Device->ZeDeviceMutableCmdListsProperties->mutableCommandFlags &
-        ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS;
+    // Update support requires being able to update kernel arguments and all
+    // aspects of the kernel NDRange.
+    const ze_mutable_command_exp_flags_t UpdateMask =
+        ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS |
+        ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT |
+        ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE |
+        ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET;
+
+    const bool KernelArgUpdateSupport =
+        (Device->ZeDeviceMutableCmdListsProperties->mutableCommandFlags &
+         UpdateMask) == UpdateMask;
     return ReturnValue(KernelArgUpdateSupport &&
                        Device->Platform->ZeMutableCmdListExt.Supported);
   }
diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp
index b953e69f06..e2026b9dc7 100644
--- a/source/adapters/level_zero/image.cpp
+++ b/source/adapters/level_zero/image.cpp
@@ -766,7 +766,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
             UR_RESULT_ERROR_INVALID_NULL_POINTER);
   UR_ASSERT(!(UR_EXP_IMAGE_COPY_FLAGS_MASK & imageCopyFlags),
             UR_RESULT_ERROR_INVALID_ENUMERATION);
-  UR_ASSERT(!(pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type),
+  UR_ASSERT(!(pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type),
             UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
 
   ZeStruct<ze_image_desc_t> ZeImageDesc;
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index 40d6260ac9..3f8e8c6986 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -13,6 +13,123 @@
 #include "ur_api.h"
 #include "ur_level_zero.hpp"
 
+/// Returns a suggested local work size for launching hKernel on hQueue's
+/// device over the given global range.
+///
+/// Only the first workDim entries of pGlobalWorkSize are read, and only the
+/// first workDim entries of pSuggestedLocalWorkSize are written.
+/// pGlobalWorkOffset is not used by this implementation.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) {
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  // pGlobalWorkSize is dereferenced by std::copy below, so it is validated
+  // the same way as the output pointer.
+  UR_ASSERT(pGlobalWorkSize != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+  UR_ASSERT(pSuggestedLocalWorkSize != nullptr,
+            UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+  uint32_t LocalWorkSize[3];
+  // Dimensions beyond workDim keep a global size of 1.
+  size_t GlobalWorkSize3D[3]{1, 1, 1};
+  std::copy(pGlobalWorkSize, pGlobalWorkSize + workDim, GlobalWorkSize3D);
+
+  ze_kernel_handle_t ZeKernel{};
+  UR_CALL(getZeKernel(hQueue, hKernel, &ZeKernel));
+
+  UR_CALL(getSuggestedLocalWorkSize(hQueue, ZeKernel, GlobalWorkSize3D,
+                                    LocalWorkSize));
+
+  std::copy(LocalWorkSize, LocalWorkSize + workDim, pSuggestedLocalWorkSize);
+  return UR_RESULT_SUCCESS;
+}
+
+/// Selects the Level Zero kernel handle of hKernel for hQueue's device.
+///
+/// When hKernel->ZeKernelMap is empty, hKernel->ZeKernel is returned.
+/// Otherwise the map is searched by the queue's ZeDevice, and
+/// UR_RESULT_ERROR_INVALID_QUEUE is returned if no entry is found.
+ur_result_t getZeKernel(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
+                        ze_kernel_handle_t *phZeKernel) {
+  auto ZeDevice = hQueue->Device->ZeDevice;
+
+  if (hKernel->ZeKernelMap.empty()) {
+    *phZeKernel = hKernel->ZeKernel;
+  } else {
+    auto It = hKernel->ZeKernelMap.find(ZeDevice);
+    if (It == hKernel->ZeKernelMap.end()) {
+      /* kernel and queue don't match */
+      return UR_RESULT_ERROR_INVALID_QUEUE;
+    }
+    *phZeKernel = It->second;
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+/// Computes a work-group size for hZeKernel over GlobalWorkSize3D and writes
+/// it to SuggestedLocalWorkSize3D.
+///
+/// zeKernelSuggestGroupSize is used when every global dimension fits in 32
+/// bits. Otherwise each dimension gets the largest divisor of its global size
+/// that does not exceed the device's maximum group size for that dimension.
+/// UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE is returned from that fallback if a
+/// dimension is zero or its group count would still not fit in 32 bits.
+ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_t hQueue,
+                                      ze_kernel_handle_t hZeKernel,
+                                      size_t GlobalWorkSize3D[3],
+                                      uint32_t SuggestedLocalWorkSize3D[3]) {
+  uint32_t *WG = SuggestedLocalWorkSize3D;
+
+  // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize
+  // values do not fit to 32-bit that the API only supports currently.
+  bool SuggestGroupSize = true;
+  for (int I : {0, 1, 2}) {
+    if (GlobalWorkSize3D[I] > UINT32_MAX) {
+      SuggestGroupSize = false;
+    }
+  }
+  if (SuggestGroupSize) {
+    ZE2UR_CALL(zeKernelSuggestGroupSize,
+               (hZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
+                GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
+  } else {
+    for (int I : {0, 1, 2}) {
+      // A zero-sized dimension would clamp the group size below to zero and
+      // turn the modulo and division into a division by zero.
+      if (GlobalWorkSize3D[I] == 0) {
+        logger::error("getSuggestedLocalWorkSize: global work size must be "
+                      "non-zero in every dimension");
+        return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+      }
+      // Try to find a I-dimension WG size that the GlobalWorkSize[I] is
+      // evenly divisible by. Start with the max possible size in
+      // each dimension.
+      uint32_t GroupSize[] = {
+          hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeX,
+          hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeY,
+          hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeZ};
+      GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]);
+      while (GlobalWorkSize3D[I] % GroupSize[I]) {
+        --GroupSize[I];
+      }
+      if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) {
+        logger::error("getSuggestedLocalWorkSize: can't find a WG size "
+                      "suitable for global work size > UINT32_MAX");
+        return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+      }
+      WG[I] = GroupSize[I];
+    }
+    logger::debug(
+        "getSuggestedLocalWorkSize: using computed WG size = {{{}, {}, {}}}",
+        WG[0], WG[1], WG[2]);
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t Queue,   ///< [in] handle of the queue object
     ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object
@@ -43,19 +130,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
         *OutEvent ///< [in,out][optional] return an event object that identifies
                   ///< this particular kernel execution instance.
 ) {
-  auto ZeDevice = Queue->Device->ZeDevice;
-
   ze_kernel_handle_t ZeKernel{};
-  if (Kernel->ZeKernelMap.empty()) {
-    ZeKernel = Kernel->ZeKernel;
-  } else {
-    auto It = Kernel->ZeKernelMap.find(ZeDevice);
-    if (It == Kernel->ZeKernelMap.end()) {
-      /* kernel and queue don't match */
-      return UR_RESULT_ERROR_INVALID_QUEUE;
-    }
-    ZeKernel = It->second;
-  }
+  UR_CALL(getZeKernel(Queue, Kernel, &ZeKernel));
+
   // Lock automatically releases when this goes out of scope.
   std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock(
       Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
@@ -92,54 +169,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);
 
   if (LocalWorkSize) {
-    // L0
-    UR_ASSERT(LocalWorkSize[0] < (std::numeric_limits<uint32_t>::max)(),
-              UR_RESULT_ERROR_INVALID_VALUE);
-    UR_ASSERT(LocalWorkSize[1] < (std::numeric_limits<uint32_t>::max)(),
-              UR_RESULT_ERROR_INVALID_VALUE);
-    UR_ASSERT(LocalWorkSize[2] < (std::numeric_limits<uint32_t>::max)(),
-              UR_RESULT_ERROR_INVALID_VALUE);
-    WG[0] = static_cast<uint32_t>(LocalWorkSize[0]);
-    WG[1] = static_cast<uint32_t>(LocalWorkSize[1]);
-    WG[2] = static_cast<uint32_t>(LocalWorkSize[2]);
-  } else {
-    // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize
-    // values do not fit to 32-bit that the API only supports currently.
-    bool SuggestGroupSize = true;
-    for (int I : {0, 1, 2}) {
-      if (GlobalWorkSize3D[I] > UINT32_MAX) {
-        SuggestGroupSize = false;
-      }
-    }
-    if (SuggestGroupSize) {
-      ZE2UR_CALL(zeKernelSuggestGroupSize,
-                 (ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1],
-                  GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2]));
-    } else {
-      for (int I : {0, 1, 2}) {
-        // Try to find a I-dimension WG size that the GlobalWorkSize[I] is
-        // fully divisable with. Start with the max possible size in
-        // each dimension.
-        uint32_t GroupSize[] = {
-            Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX,
-            Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY,
-            Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ};
-        GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]);
-        while (GlobalWorkSize3D[I] % GroupSize[I]) {
-          --GroupSize[I];
-        }
-
-        if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) {
-          logger::error("urEnqueueKernelLaunch: can't find a WG size "
-                        "suitable for global work size > UINT32_MAX");
-          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-        }
-        WG[I] = GroupSize[I];
-      }
-      logger::debug(
-          "urEnqueueKernelLaunch: using computed WG size = {{{}, {}, {}}}",
-          WG[0], WG[1], WG[2]);
+    for (uint32_t I = 0; I < WorkDim; ++I) {
+      UR_ASSERT(LocalWorkSize[I] < (std::numeric_limits<uint32_t>::max)(),
+                UR_RESULT_ERROR_INVALID_VALUE);
+      WG[I] = static_cast<uint32_t>(LocalWorkSize[I]);
     }
+  } else {
+    UR_CALL(getSuggestedLocalWorkSize(Queue, ZeKernel, GlobalWorkSize3D, WG));
   }
 
   // TODO: assert if sizes do not fit into 32-bit?
diff --git a/source/adapters/level_zero/kernel.hpp b/source/adapters/level_zero/kernel.hpp
index 1cc146d262..2db3af0514 100644
--- a/source/adapters/level_zero/kernel.hpp
+++ b/source/adapters/level_zero/kernel.hpp
@@ -107,3 +107,10 @@ struct ur_kernel_handle_t_ : _ur_object {
   ZeCache<ZeStruct<ze_kernel_properties_t>> ZeKernelProperties;
   ZeCache<std::string> ZeKernelName;
 };
+
+ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_t hQueue,
+                                      ze_kernel_handle_t hZeKernel,
+                                      size_t GlobalWorkSize3D[3],
+                                      uint32_t SuggestedLocalWorkSize3D[3]);
+ur_result_t getZeKernel(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
+                        ze_kernel_handle_t *phZeKernel);
diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp
index 77cb6abb38..42a5d22d47 100644
--- a/source/adapters/level_zero/memory.cpp
+++ b/source/adapters/level_zero/memory.cpp
@@ -1483,14 +1483,14 @@ static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat,
       ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32;
       break;
     default:
-      logger::error("urMemImageCreate: unexpected data type Size");
-      return UR_RESULT_ERROR_INVALID_VALUE;
+      logger::error("urMemImageCreate: unexpected data type Size\n");
+      return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT;
     }
     break;
   }
   default:
     logger::error("format layout = {}", ImageFormat->channelOrder);
-    die("urMemImageCreate: unsupported image format layout\n");
+    return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT;
     break;
   }
 
@@ -1519,7 +1519,7 @@ static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat,
     break;
   default:
     logger::error("urMemImageCreate: unsupported image type");
-    return UR_RESULT_ERROR_INVALID_VALUE;
+    return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR;
   }
 
   ZeImageDesc.arraylevels = ZeImageDesc.flags = 0;
diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp
index d508d85c82..ca162bbad2 100644
--- a/source/adapters/level_zero/platform.cpp
+++ b/source/adapters/level_zero/platform.cpp
@@ -123,6 +123,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle(
 UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     ur_native_handle_t
         NativePlatform, ///< [in] the native handle of the platform.
+    ur_adapter_handle_t,
     const ur_platform_native_properties_t
         *Properties, ///< [in][optional] pointer to native platform properties
                      ///< struct.
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index f47f960cc0..641c11071b 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -1696,15 +1696,9 @@ ur_result_t ur_queue_handle_t_::synchronize() {
     // If event is discarded then it can be in reset state or underlying level
     // zero handle can have device scope, so we can't synchronize the last
     // event.
+    auto savedLastCommandEvent = LastCommandEvent;
     if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) {
-      if (UrL0QueueSyncNonBlocking) {
-        auto SyncZeEvent = LastCommandEvent->ZeEvent;
-        this->Mutex.unlock();
-        ZE2UR_CALL(zeHostSynchronize, (SyncZeEvent));
-        this->Mutex.lock();
-      } else {
-        ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent));
-      }
+      ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent));
 
       // clean up all events known to have been completed as well,
       // so they can be reused later
@@ -1744,7 +1738,12 @@ ur_result_t ur_queue_handle_t_::synchronize() {
         }
       }
     }
-    LastCommandEvent = nullptr;
+    // If the current version of the LastCommandEvent == savedLastCommandEvent,
+    // then LastCommandEvent = nullptr; Otherwise, if LastCommandEvent !=
+    // savedLastCommandEvent, then LastCommandEvent is unchanged.
+    if (LastCommandEvent == savedLastCommandEvent) {
+      LastCommandEvent = nullptr;
+    }
   }
 
   // Since all timestamp recordings should have finished with the
diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp
index d6d6060ea6..45568a7885 100644
--- a/source/adapters/level_zero/ur_interface_loader.cpp
+++ b/source/adapters/level_zero/ur_interface_loader.cpp
@@ -151,6 +151,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgSampler = urKernelSetArgSampler;
   pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return retVal;
 }
 
diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp
index aabe0a3f13..067d149289 100644
--- a/source/adapters/native_cpu/device.cpp
+++ b/source/adapters/native_cpu/device.cpp
@@ -294,10 +294,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     return UR_RESULT_ERROR_INVALID_VALUE;
   case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
     ur_memory_order_capability_flags_t Capabilities =
-        UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
-        UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
-        UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE |
-        UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL;
+        UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED;
     return ReturnValue(Capabilities);
   }
   case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp
index 7ef17b0c28..29b54503eb 100644
--- a/source/adapters/native_cpu/kernel.cpp
+++ b/source/adapters/native_cpu/kernel.cpp
@@ -297,3 +297,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
 
   DIE_NO_IMPLEMENTATION
 }
+
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    [[maybe_unused]] ur_kernel_handle_t hKernel,
+    [[maybe_unused]] ur_queue_handle_t hQueue,
+    [[maybe_unused]] uint32_t workDim,
+    [[maybe_unused]] const size_t *pGlobalWorkOffset,
+    [[maybe_unused]] const size_t *pGlobalWorkSize,
+    [[maybe_unused]] size_t *pSuggestedLocalWorkSize) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; // stub: no argument is read
+}
diff --git a/source/adapters/native_cpu/platform.cpp b/source/adapters/native_cpu/platform.cpp
index 31f35d03b2..8d650764c1 100644
--- a/source/adapters/native_cpu/platform.cpp
+++ b/source/adapters/native_cpu/platform.cpp
@@ -96,13 +96,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption(
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
-    ur_native_handle_t hNativePlatform,
-    const ur_platform_native_properties_t *pProperties,
-    ur_platform_handle_t *phPlatform) {
-  std::ignore = hNativePlatform;
-  std::ignore = pProperties;
-  std::ignore = phPlatform;
-
+    ur_native_handle_t, ur_adapter_handle_t,
+    const ur_platform_native_properties_t *, ur_platform_handle_t *) {
   DIE_NO_IMPLEMENTATION;
 }
 
diff --git a/source/adapters/native_cpu/program.cpp b/source/adapters/native_cpu/program.cpp
index 678bdb8074..ee21a707b7 100644
--- a/source/adapters/native_cpu/program.cpp
+++ b/source/adapters/native_cpu/program.cpp
@@ -14,6 +14,7 @@
 #include "common/ur_util.hpp"
 #include "program.hpp"
 #include <cstdint>
+#include <memory>
 
 UR_APIEXPORT ur_result_t UR_APICALL
 urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
@@ -63,11 +64,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
   UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER);
   UR_ASSERT(pBinary != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER);
 
-  auto hProgram = new ur_program_handle_t_(
+  auto hProgram = std::make_unique<ur_program_handle_t_>(
       hContext, reinterpret_cast<const unsigned char *>(pBinary));
   if (pProperties != nullptr) {
     for (uint32_t i = 0; i < pProperties->count; i++) {
-      auto mdNode = pProperties->pMetadatas[i];
+      const auto &mdNode = pProperties->pMetadatas[i];
       std::string mdName(mdNode.pName);
       auto [Prefix, Tag] = splitMetadataName(mdName);
       if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) {
@@ -89,7 +90,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     nativecpu_it++;
   }
 
-  *phProgram = hProgram;
+  *phProgram = hProgram.release();
 
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/native_cpu/program.hpp b/source/adapters/native_cpu/program.hpp
index 0673b4122c..e85749a7b2 100644
--- a/source/adapters/native_cpu/program.hpp
+++ b/source/adapters/native_cpu/program.hpp
@@ -13,6 +13,8 @@
 #include <ur_api.h>
 
 #include "context.hpp"
+
+#include <array>
 #include <map>
 
 namespace native_cpu {
diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp
index 065012613e..053fc32d9f 100644
--- a/source/adapters/native_cpu/ur_interface_loader.cpp
+++ b/source/adapters/native_cpu/ur_interface_loader.cpp
@@ -123,6 +123,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgValue = urKernelSetArgValue;
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp
index 2278d5907e..19a9c9003b 100644
--- a/source/adapters/null/ur_nullddi.cpp
+++ b/source/adapters/null/ur_nullddi.cpp
@@ -263,6 +263,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle(
 __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     ur_native_handle_t
         hNativePlatform, ///< [in][nocheck] the native handle of the platform.
+    ur_adapter_handle_t
+        hAdapter, ///< [in] handle of the adapter associated with the native backend.
     const ur_platform_native_properties_t *
         pProperties, ///< [in][optional] pointer to native platform properties struct.
     ur_platform_handle_t *
@@ -274,8 +276,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     auto pfnCreateWithNativeHandle =
         d_context.urDdiTable.Platform.pfnCreateWithNativeHandle;
     if (nullptr != pfnCreateWithNativeHandle) {
-        result =
-            pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform);
+        result = pfnCreateWithNativeHandle(hNativePlatform, hAdapter,
+                                           pProperties, phPlatform);
     } else {
         // generic implementation
         *phPlatform = reinterpret_cast<ur_platform_handle_t>(d_context.get());
@@ -2616,6 +2618,43 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in workDim that will execute the
+    ///< kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify
+    ///< suggested local work size that will contain the result of the query
+    ) try {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    // if the driver has created a custom function, then call it instead of using the generic path
+    auto pfnGetSuggestedLocalWorkSize =
+        d_context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize;
+    if (nullptr != pfnGetSuggestedLocalWorkSize) {
+        result = pfnGetSuggestedLocalWorkSize(
+            hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize,
+            pSuggestedLocalWorkSize);
+    } else {
+        // generic implementation: no-op, so result stays UR_RESULT_SUCCESS
+    }
+
+    return result;
+} catch (...) {
+    return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urQueueGetInfo
 __urdlllocal ur_result_t UR_APICALL urQueueGetInfo(
@@ -6248,6 +6287,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
     pDdiTable->pfnCreateWithNativeHandle =
         driver::urKernelCreateWithNativeHandle;
 
+    pDdiTable->pfnGetSuggestedLocalWorkSize =
+        driver::urKernelGetSuggestedLocalWorkSize;
+
     pDdiTable->pfnSetArgValue = driver::urKernelSetArgValue;
 
     pDdiTable->pfnSetArgLocal = driver::urKernelSetArgLocal;
diff --git a/source/adapters/opencl/common.cpp b/source/adapters/opencl/common.cpp
index 750616235d..d557d3a202 100644
--- a/source/adapters/opencl/common.cpp
+++ b/source/adapters/opencl/common.cpp
@@ -87,6 +87,8 @@ ur_result_t mapCLErrorToUR(cl_int Result) {
     return UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE;
   case CL_INVALID_KERNEL_ARGS:
     return UR_RESULT_ERROR_INVALID_KERNEL_ARGS;
+  case CL_INVALID_COMMAND_QUEUE:
+    return UR_RESULT_ERROR_INVALID_QUEUE;
   default:
     return UR_RESULT_ERROR_UNKNOWN;
   }
diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp
index ac79b71876..0c03c727f3 100644
--- a/source/adapters/opencl/device.cpp
+++ b/source/adapters/opencl/device.cpp
@@ -910,7 +910,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION:
   case UR_DEVICE_INFO_BUILT_IN_KERNELS:
   case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES:
-  case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL:
   case UR_DEVICE_INFO_IP_VERSION: {
     /* We can just use the OpenCL outputs because the sizes of OpenCL types
      * are the same as UR.
@@ -929,6 +928,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
 
     return UR_RESULT_SUCCESS;
   }
+  case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
+    // Have to convert size_t to uint32_t
+    size_t SubGroupSizesSize = 0;
+    CL_RETURN_ON_FAILURE(
+        clGetDeviceInfo(cl_adapter::cast<cl_device_id>(hDevice), CLPropName, 0,
+                        nullptr, &SubGroupSizesSize));
+    std::vector<size_t> SubGroupSizes(SubGroupSizesSize / sizeof(size_t));
+    CL_RETURN_ON_FAILURE(
+        clGetDeviceInfo(cl_adapter::cast<cl_device_id>(hDevice), CLPropName,
+                        SubGroupSizesSize, SubGroupSizes.data(), nullptr));
+    return ReturnValue.template operator()<uint32_t>(SubGroupSizes.data(),
+                                                     SubGroupSizes.size());
+  }
   case UR_DEVICE_INFO_EXTENSIONS: {
     cl_device_id Dev = cl_adapter::cast<cl_device_id>(hDevice);
     size_t ExtSize = 0;
diff --git a/source/adapters/opencl/event.cpp b/source/adapters/opencl/event.cpp
index 5141ce8ff0..45550a68e8 100644
--- a/source/adapters/opencl/event.cpp
+++ b/source/adapters/opencl/event.cpp
@@ -46,7 +46,8 @@ convertURProfilingInfoToCL(const ur_profiling_info_t PropName) {
     return CL_PROFILING_COMMAND_SUBMIT;
   case UR_PROFILING_INFO_COMMAND_START:
     return CL_PROFILING_COMMAND_START;
-  // TODO(ur) add UR_PROFILING_INFO_COMMAND_COMPLETE once spec has been updated
+  case UR_PROFILING_INFO_COMMAND_COMPLETE:
+    return CL_PROFILING_COMMAND_COMPLETE;
   case UR_PROFILING_INFO_COMMAND_END:
     return CL_PROFILING_COMMAND_END;
   default:
diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp
index 4fcbdeefa5..3accd84778 100644
--- a/source/adapters/opencl/kernel.cpp
+++ b/source/adapters/opencl/kernel.cpp
@@ -419,3 +419,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler(
   CL_RETURN_ON_FAILURE(RetErr);
   return UR_RESULT_SUCCESS;
 }
+
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    size_t *pSuggestedLocalWorkSize) {
+  cl_device_id Device;
+  cl_platform_id Platform;
+  // Delegates to the OpenCL extension; first get the queue's device.
+  CL_RETURN_ON_FAILURE(clGetCommandQueueInfo(
+      cl_adapter::cast<cl_command_queue>(hQueue), CL_QUEUE_DEVICE,
+      sizeof(cl_device_id), &Device, nullptr));
+  // Then the platform that device belongs to.
+  CL_RETURN_ON_FAILURE(clGetDeviceInfo(
+      Device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &Platform, nullptr));
+  // Look the entry point up by name; a null result means no support.
+  auto GetKernelSuggestedLocalWorkSizeFuncPtr =
+      (clGetKernelSuggestedLocalWorkSizeKHR_fn)
+          clGetExtensionFunctionAddressForPlatform(
+              Platform, "clGetKernelSuggestedLocalWorkSizeKHR");
+  if (!GetKernelSuggestedLocalWorkSizeFuncPtr)
+    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  // Forward the caller's arguments to the extension function unchanged.
+  CL_RETURN_ON_FAILURE(GetKernelSuggestedLocalWorkSizeFuncPtr(
+      cl_adapter::cast<cl_command_queue>(hQueue),
+      cl_adapter::cast<cl_kernel>(hKernel), workDim, pGlobalWorkOffset,
+      pGlobalWorkSize, pSuggestedLocalWorkSize));
+  return UR_RESULT_SUCCESS;
+}
diff --git a/source/adapters/opencl/memory.cpp b/source/adapters/opencl/memory.cpp
index 2397e2b5f9..b2476fc420 100644
--- a/source/adapters/opencl/memory.cpp
+++ b/source/adapters/opencl/memory.cpp
@@ -123,9 +123,6 @@ cl_image_desc mapURImageDescToCL(const ur_image_desc_t *PImageDesc) {
       cl_adapter::cast<cl_mem_object_type>(PImageDesc->type);
 
   switch (PImageDesc->type) {
-  case UR_MEM_TYPE_BUFFER:
-    CLImageDesc.image_type = CL_MEM_OBJECT_BUFFER;
-    break;
   case UR_MEM_TYPE_IMAGE2D:
     CLImageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
     break;
@@ -141,9 +138,6 @@ cl_image_desc mapURImageDescToCL(const ur_image_desc_t *PImageDesc) {
   case UR_MEM_TYPE_IMAGE1D_ARRAY:
     CLImageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
     break;
-  case UR_MEM_TYPE_IMAGE1D_BUFFER:
-    CLImageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    break;
   default:
     CLImageDesc.image_type = -1;
     break;
diff --git a/source/adapters/opencl/platform.cpp b/source/adapters/opencl/platform.cpp
index 7188a3e8f0..9fa5025196 100644
--- a/source/adapters/opencl/platform.cpp
+++ b/source/adapters/opencl/platform.cpp
@@ -106,8 +106,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle(
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
-    ur_native_handle_t hNativePlatform, const ur_platform_native_properties_t *,
-    ur_platform_handle_t *phPlatform) {
+    ur_native_handle_t hNativePlatform, ur_adapter_handle_t,
+    const ur_platform_native_properties_t *, ur_platform_handle_t *phPlatform) {
   *phPlatform = reinterpret_cast<ur_platform_handle_t>(hNativePlatform);
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp
index effb2128c3..687b541911 100644
--- a/source/adapters/opencl/ur_interface_loader.cpp
+++ b/source/adapters/opencl/ur_interface_loader.cpp
@@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
   pDdiTable->pfnSetArgValue = urKernelSetArgValue;
   pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
   pDdiTable->pfnSetSpecializationConstants = nullptr;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize;
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/opencl/usm.cpp b/source/adapters/opencl/usm.cpp
index 8ab868e679..4df99f3a9c 100644
--- a/source/adapters/opencl/usm.cpp
+++ b/source/adapters/opencl/usm.cpp
@@ -261,9 +261,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
     return UR_RESULT_SUCCESS;
   }
 
-  // OpenCL only supports pattern sizes as large as the largest CL type
-  // (double16/long16 - 128 bytes), anything larger we need to do on the host
-  // side and copy it into the target allocation.
+  // OpenCL only supports pattern sizes which are powers of 2 and are as large
+  // as the largest CL type (double16/long16 - 128 bytes), anything larger or
+  // not a power of 2, we need to do on the host side and copy it into the
+  // target allocation.
   clHostMemAllocINTEL_fn HostMemAlloc = nullptr;
   UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clHostMemAllocINTEL_fn>(
       CLContext, cl_ext::ExtFuncPtrCache->clHostMemAllocINTELCache,
@@ -280,14 +281,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
       cl_ext::MemBlockingFreeName, &USMFree));
 
   cl_int ClErr = CL_SUCCESS;
-  auto HostBuffer = static_cast<uint64_t *>(
-      HostMemAlloc(CLContext, nullptr, size, 0, &ClErr));
+  auto HostBuffer =
+      static_cast<uint8_t *>(HostMemAlloc(CLContext, nullptr, size, 0, &ClErr));
   CL_RETURN_ON_FAILURE(ClErr);
 
-  auto NumValues = size / sizeof(uint64_t);
-  auto NumChunks = patternSize / sizeof(uint64_t);
-  for (size_t i = 0; i < NumValues; i++) {
-    HostBuffer[i] = static_cast<const uint64_t *>(pPattern)[i % NumChunks];
+  auto *End = HostBuffer + size;
+  for (auto *Iter = HostBuffer; Iter < End; Iter += patternSize) {
+    std::memcpy(Iter, pPattern, patternSize);
   }
 
   cl_event CopyEvent = nullptr;
diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt
index c884607500..075d9909b0 100644
--- a/source/loader/CMakeLists.txt
+++ b/source/loader/CMakeLists.txt
@@ -15,6 +15,24 @@ add_ur_library(ur_loader
     ${CMAKE_CURRENT_BINARY_DIR}/UrLoaderVersion.rc
 )
 
+if (MSVC)
+    set(TARGET_LIBNAME ur_loader)
+    string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME)
+    
+    set(LOADER_VERSION_SCRIPT ${CMAKE_CURRENT_BINARY_DIR}/ur_loader.def)
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/loader.def.in ${LOADER_VERSION_SCRIPT} @ONLY)
+    set_target_properties(ur_loader PROPERTIES
+        LINK_FLAGS "/DEF:${LOADER_VERSION_SCRIPT}"
+    )
+elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    set(TARGET_LIBNAME libur_loader_${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR})
+    string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME)
+
+    set(LOADER_VERSION_SCRIPT ${CMAKE_CURRENT_BINARY_DIR}/ur_loader.map)
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/loader.map.in ${LOADER_VERSION_SCRIPT} @ONLY)
+    target_link_options(ur_loader PRIVATE "-Wl,--version-script=${LOADER_VERSION_SCRIPT}")
+endif()
+
 set_target_properties(ur_loader PROPERTIES
     LIBRARY_OUTPUT_NAME ur_loader
     RUNTIME_OUTPUT_NAME ur_loader
@@ -108,6 +126,8 @@ if(UR_ENABLE_SANITIZER)
         ${CMAKE_CURRENT_SOURCE_DIR}/../ur/ur.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_allocator.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_allocator.hpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_buffer.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_buffer.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_libdevice.hpp
diff --git a/source/loader/layers/sanitizer/asan_buffer.cpp b/source/loader/layers/sanitizer/asan_buffer.cpp
new file mode 100644
index 0000000000..bb50b53c06
--- /dev/null
+++ b/source/loader/layers/sanitizer/asan_buffer.cpp
@@ -0,0 +1,137 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file asan_buffer.cpp
+ *
+ */
+
+#include "asan_buffer.hpp"
+#include "asan_interceptor.hpp"
+#include "ur_sanitizer_layer.hpp"
+#include "ur_sanitizer_utils.hpp"
+
+namespace ur_sanitizer_layer {
+
+ur_result_t EnqueueMemCopyRectHelper(
+    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
+    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
+    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
+    bool Blocking, uint32_t NumEventsInWaitList,
+    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event) {
+    // Copies a 3D region from pSrc to pDst as Region.depth 2D USM copies.
+    // A pitch of 0 means "unspecified" and is derived from Region below.
+    if (SrcRowPitch == 0) {
+        SrcRowPitch = Region.width;
+    }
+
+    if (SrcSlicePitch == 0) {
+        SrcSlicePitch = SrcRowPitch * Region.height;
+    }
+
+    if (DstRowPitch == 0) {
+        DstRowPitch = Region.width;
+    }
+
+    if (DstSlicePitch == 0) {
+        DstSlicePitch = DstRowPitch * Region.height;
+    }
+
+    // First byte of the region in each buffer: base + x + y*row + z*slice.
+    char *SrcOrigin = pSrc + SrcOffset.x + SrcRowPitch * SrcOffset.y +
+                      SrcSlicePitch * SrcOffset.z;
+    char *DstOrigin = pDst + DstOffset.x + DstRowPitch * DstOffset.y +
+                      DstSlicePitch * DstOffset.z;
+    // Collects one event per slice so a single wait can cover them all.
+    std::vector<ur_event_handle_t> Events;
+    Events.reserve(Region.depth);
+    // For now, USM doesn't support 3D memory copy, so issue one 2D copy per
+    // depth slice; each gets the caller's Blocking flag and wait list.
+    for (size_t i = 0; i < Region.depth; i++) {
+        ur_event_handle_t NewEvent{};
+        UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy2D(
+            Queue, Blocking, DstOrigin + (i * DstSlicePitch), DstRowPitch,
+            SrcOrigin + (i * SrcSlicePitch), SrcRowPitch, Region.width,
+            Region.height, NumEventsInWaitList, EventWaitList, &NewEvent));
+
+        Events.push_back(NewEvent);
+    }
+    // Enqueue one wait on all slice events, handing Event to that call.
+    UR_CALL(context.urDdiTable.Enqueue.pfnEventsWait(Queue, Events.size(),
+                                                     Events.data(), Event));
+    // NOTE(review): the slice events are not released here - confirm owner.
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
+    // Sub-buffers don't maintain own allocations but rely on parent buffer:
+    if (SubBuffer) {
+        UR_CALL(SubBuffer->Parent->getHandle(Device, Handle));
+        Handle += SubBuffer->Origin;
+        return UR_RESULT_SUCCESS;
+    }
+    // Lazily create this buffer's allocation for Device on first request.
+    auto &Allocation = Allocations[Device];
+    if (!Allocation) {
+        ur_usm_desc_t USMDesc{};
+        USMDesc.align = getAlignment();
+        ur_usm_pool_handle_t Pool{};
+        ur_result_t URes = context.interceptor->allocateMemory(
+            Context, Device, &USMDesc, Pool, Size, AllocType::MEM_BUFFER,
+            ur_cast<void **>(&Allocation));
+        if (URes != UR_RESULT_SUCCESS) {
+            context.logger.error(
+                "Failed to allocate {} bytes memory for buffer {}", Size, this);
+            return URes;
+        }
+        // Fresh allocation: fill it from HostPtr when one was provided.
+        if (HostPtr) {
+            ManagedQueue Queue(Context, Device);
+            URes = context.urDdiTable.Enqueue.pfnUSMMemcpy(
+                Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr);
+            if (URes != UR_RESULT_SUCCESS) {
+                context.logger.error("Failed to copy {} bytes data from host "
+                                     "pointer {} to buffer {}",
+                                     Size, HostPtr, this);
+                return URes;
+            }
+        }
+    }
+    // NOTE(review): if the copy above fails, Allocation stays cached - confirm.
+    Handle = Allocation;
+
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t MemBuffer::free() { // releases every per-device allocation
+    for (const auto &[_, Ptr] : Allocations) {
+        ur_result_t URes = context.interceptor->releaseMemory(Context, Ptr);
+        if (URes != UR_RESULT_SUCCESS) {
+            context.logger.error("Failed to free buffer handle {}", Ptr);
+            return URes; // stop at the first failure; map is not cleared
+        }
+    }
+    Allocations.clear(); // reached only when every release succeeded
+    return UR_RESULT_SUCCESS;
+}
+
+size_t MemBuffer::getAlignment() {
+    // Returns the smallest power of 2 strictly greater than Size, capped at
+    // 128 (Size == 0 yields 1). The previous `63 - __builtin_clz(Size)` form
+    // was undefined: that builtin takes a 32-bit unsigned int, so the shift
+    // count computed from it was always out of range for an int.
+    // TODO: If we don't set the alignment size explicitly, the device will
+    // usually choose a very large size (more than 1k). Then sanitizer will
+    // allocate extra unnecessary memory; performance impact is unknown.
+    size_t Alignment = 1;
+    while (Alignment <= Size && Alignment < 128) {
+        Alignment <<= 1;
+    }
+    return Alignment;
+}
+
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/asan_buffer.hpp b/source/loader/layers/sanitizer/asan_buffer.hpp
new file mode 100644
index 0000000000..b4eba4e4ba
--- /dev/null
+++ b/source/loader/layers/sanitizer/asan_buffer.hpp
@@ -0,0 +1,74 @@
+/*
+ *
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See LICENSE.TXT
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ * @file asan_buffer.hpp
+ *
+ */
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <optional>
+
+#include "common.hpp"
+
+namespace ur_sanitizer_layer {
+
+struct MemBuffer { // sanitizer-layer record standing in for a UR buffer
+    // Buffer constructor
+    MemBuffer(ur_context_handle_t Context, size_t Size, char *HostPtr)
+        : Context(Context), Size(Size), HostPtr(HostPtr) {}
+
+    // Sub-buffer constructor
+    MemBuffer(std::shared_ptr<MemBuffer> Parent, size_t Origin, size_t Size)
+        : Context(Parent->Context), Size(Size), SubBuffer{{Parent, Origin}} {}
+    // Produces in Handle the pointer backing this buffer on Device.
+    ur_result_t getHandle(ur_device_handle_t Device, char *&Handle);
+    // Releases every pointer held in Allocations and empties the map.
+    ur_result_t free();
+    // Alignment derived from Size; never more than 128.
+    size_t getAlignment();
+    // Pointer backing the buffer on each device; what free() releases.
+    std::unordered_map<ur_device_handle_t, char *> Allocations;
+
+    enum AccessMode { UNKNOWN, READ_WRITE, READ_ONLY, WRITE_ONLY };
+    // Offset/Size pair stored per entry of Mappings.
+    struct Mapping {
+        size_t Offset;
+        size_t Size;
+    };
+    // NOTE(review): presumably keyed by the pointer returned from a map call.
+    std::unordered_map<void *, Mapping> Mappings;
+
+    ur_context_handle_t Context;
+    // Buffer size in bytes.
+    size_t Size;
+    // Host pointer given at creation (may be null); null for sub-buffers.
+    char *HostPtr{};
+    // Parent buffer plus the offset (Origin) of this sub-buffer inside it.
+    struct SubBuffer_t {
+        std::shared_ptr<MemBuffer> Parent;
+        size_t Origin;
+    };
+    // Engaged only when built through the sub-buffer constructor.
+    std::optional<SubBuffer_t> SubBuffer;
+    // Starts at 1; urMemRetain increments it and urMemRelease decrements it.
+    std::atomic<int32_t> RefCount = 1;
+
+    ur_shared_mutex Mutex;
+};
+
+ur_result_t EnqueueMemCopyRectHelper(
+    ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset,
+    ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch,
+    size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch,
+    bool Blocking, uint32_t NumEventsInWaitList,
+    const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event);
+
+} // namespace ur_sanitizer_layer
diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp
index 37630e5c29..535247e863 100644
--- a/source/loader/layers/sanitizer/asan_interceptor.cpp
+++ b/source/loader/layers/sanitizer/asan_interceptor.cpp
@@ -259,6 +259,9 @@ ur_result_t SanitizerInterceptor::allocateMemory(
     } else if (Type == AllocType::SHARED_USM) {
         UR_CALL(context.urDdiTable.USM.pfnSharedAlloc(
             Context, Device, Properties, Pool, NeededSize, &Allocated));
+    } else if (Type == AllocType::MEM_BUFFER) {
+        UR_CALL(context.urDdiTable.USM.pfnDeviceAlloc(
+            Context, Device, Properties, Pool, NeededSize, &Allocated));
     } else {
         context.logger.error("Unsupport memory type");
         return UR_RESULT_ERROR_INVALID_ARGUMENT;
@@ -662,6 +665,32 @@ ur_result_t SanitizerInterceptor::eraseKernel(ur_kernel_handle_t Kernel) {
     return UR_RESULT_SUCCESS;
 }
 
+ur_result_t
+SanitizerInterceptor::insertMemBuffer(std::shared_ptr<MemBuffer> MemBuffer) {
+    // The raw object address doubles as the ur_mem_handle_t given to callers.
+    const auto Key = reinterpret_cast<ur_mem_handle_t>(MemBuffer.get());
+    std::scoped_lock<ur_shared_mutex> Guard(m_MemBufferMapMutex);
+    assert(m_MemBufferMap.find(Key) == m_MemBufferMap.end());
+    m_MemBufferMap.emplace(Key, MemBuffer);
+    return UR_RESULT_SUCCESS;
+}
+
+ur_result_t SanitizerInterceptor::eraseMemBuffer(ur_mem_handle_t MemHandle) {
+    std::scoped_lock<ur_shared_mutex> Guard(m_MemBufferMapMutex);
+    [[maybe_unused]] const auto Removed = m_MemBufferMap.erase(MemHandle);
+    assert(Removed == 1); // the handle must have been registered before
+    return UR_RESULT_SUCCESS;
+}
+
+std::shared_ptr<MemBuffer>
+SanitizerInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) {
+    // Takes the map mutex in shared mode; the lookup does not modify the map.
+    std::shared_lock<ur_shared_mutex> Guard(m_MemBufferMapMutex);
+    // Yields nullptr for handles that are not present in the map.
+    const auto It = m_MemBufferMap.find(MemHandle);
+    return It != m_MemBufferMap.end() ? It->second : nullptr;
+}
+
 ur_result_t SanitizerInterceptor::prepareLaunch(
     ur_context_handle_t Context, std::shared_ptr<DeviceInfo> &DeviceInfo,
     ur_queue_handle_t Queue, ur_kernel_handle_t Kernel,
@@ -669,6 +698,21 @@ ur_result_t SanitizerInterceptor::prepareLaunch(
     auto Program = GetProgram(Kernel);
 
     do {
+        // Set membuffer arguments
+        auto KernelInfo = getKernelInfo(Kernel);
+        for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) {
+            char *ArgPointer = nullptr;
+            UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer));
+            ur_result_t URes = context.urDdiTable.Kernel.pfnSetArgPointer(
+                Kernel, ArgIndex, nullptr, &ArgPointer);
+            if (URes != UR_RESULT_SUCCESS) {
+                context.logger.error(
+                    "Failed to set buffer {} as the {} arg to kernel {}: {}",
+                    ur_cast<ur_mem_handle_t>(MemBuffer.get()), ArgIndex, Kernel,
+                    URes);
+            }
+        }
+
         // Set launch info argument
         auto ArgNums = GetKernelNumArgs(Kernel);
         if (ArgNums) {
diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp
index f02ca3b370..3d6ba9bdf2 100644
--- a/source/loader/layers/sanitizer/asan_interceptor.hpp
+++ b/source/loader/layers/sanitizer/asan_interceptor.hpp
@@ -13,6 +13,7 @@
 #pragma once
 
 #include "asan_allocator.hpp"
+#include "asan_buffer.hpp"
 #include "asan_libdevice.hpp"
 #include "common.hpp"
 #include "ur_sanitizer_layer.hpp"
@@ -81,8 +82,10 @@ struct QueueInfo {
 
 struct KernelInfo {
     ur_kernel_handle_t Handle;
-
     ur_shared_mutex Mutex;
+    std::atomic<int32_t> RefCount = 1;
+    std::unordered_map<uint32_t, std::shared_ptr<MemBuffer>> BufferArgs;
+
     // Need preserve the order of local arguments
     std::map<uint32_t, LocalArgsInfo> LocalArgs;
 
@@ -128,7 +131,7 @@ struct ContextInfo {
 };
 
 struct USMLaunchInfo {
-    LaunchInfo *Data;
+    LaunchInfo *Data = nullptr;
 
     ur_context_handle_t Context = nullptr;
     ur_device_handle_t Device = nullptr;
@@ -194,6 +197,10 @@ class SanitizerInterceptor {
     ur_result_t insertKernel(ur_kernel_handle_t Kernel);
     ur_result_t eraseKernel(ur_kernel_handle_t Kernel);
 
+    ur_result_t insertMemBuffer(std::shared_ptr<MemBuffer> MemBuffer);
+    ur_result_t eraseMemBuffer(ur_mem_handle_t MemHandle);
+    std::shared_ptr<MemBuffer> getMemBuffer(ur_mem_handle_t MemHandle);
+
     std::optional<AllocationIterator> findAllocInfoByAddress(uptr Address);
 
     std::shared_ptr<ContextInfo> getContextInfo(ur_context_handle_t Context) {
@@ -245,6 +252,10 @@ class SanitizerInterceptor {
         m_KernelMap;
     ur_shared_mutex m_KernelMapMutex;
 
+    std::unordered_map<ur_mem_handle_t, std::shared_ptr<MemBuffer>>
+        m_MemBufferMap;
+    ur_shared_mutex m_MemBufferMapMutex;
+
     /// Assumption: all USM chunks are allocated in one VA
     AllocationMap m_AllocationMap;
     ur_shared_mutex m_AllocationMapMutex;
diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp
index 58f54c9338..e352ad69b8 100644
--- a/source/loader/layers/sanitizer/ur_sanddi.cpp
+++ b/source/loader/layers/sanitizer/ur_sanddi.cpp
@@ -376,6 +376,730 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemBufferCreate
+__urdlllocal ur_result_t UR_APICALL urMemBufferCreate(
+    ur_context_handle_t hContext, ///< [in] handle of the context object
+    ur_mem_flags_t flags, ///< [in] allocation and usage information flags
+    size_t size, ///< [in] size in bytes of the memory object to be allocated
+    const ur_buffer_properties_t
+        *pProperties, ///< [in][optional] pointer to buffer creation properties
+    ur_mem_handle_t
+        *phBuffer ///< [out] pointer to handle of the memory buffer created
+) {
+    // The original entry point is only probed, never called: the buffer is
+    // represented by a MemBuffer record whose address becomes the handle.
+    if (nullptr == context.urDdiTable.Mem.pfnBufferCreate) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    if (nullptr == phBuffer) {
+        return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+
+    context.logger.debug("==== urMemBufferCreate");
+
+    // A host pointer is kept only when the caller asked for
+    // UR_MEM_FLAG_USE_HOST_POINTER; any other flag combination stores null.
+    char *HostPtr = nullptr;
+    if (pProperties && (flags & UR_MEM_FLAG_USE_HOST_POINTER)) {
+        HostPtr = ur_cast<char *>(pProperties->pHost);
+    }
+
+    auto Buffer = std::make_shared<MemBuffer>(hContext, size, HostPtr);
+
+    // Register first, then publish the handle; the handle is written even
+    // if registration reports an error, and that result is returned as is.
+    ur_result_t result = context.interceptor->insertMemBuffer(Buffer);
+    *phBuffer = ur_cast<ur_mem_handle_t>(Buffer.get());
+
+    return result;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemGetInfo
+__urdlllocal ur_result_t UR_APICALL urMemGetInfo(
+    ur_mem_handle_t
+        hMemory,            ///< [in] handle to the memory object being queried.
+    ur_mem_info_t propName, ///< [in] type of the info to retrieve.
+    size_t
+        propSize, ///< [in] the number of bytes of memory pointed to by pPropValue.
+    void *
+        pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding
+                    ///< the info.
+    ///< If propSize is less than the real number of bytes needed to return
+    ///< the info then the ::UR_RESULT_ERROR_INVALID_SIZE error is returned and
+    ///< pPropValue is not used.
+    size_t *
+        pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName.
+) {
+    auto pfnGetInfo = context.urDdiTable.Mem.pfnGetInfo;
+
+    if (nullptr == pfnGetInfo) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urMemGetInfo");
+
+    auto Buffer = context.interceptor->getMemBuffer(hMemory);
+    if (!Buffer) {
+        // Handle is not in the interceptor's buffer map: forward unchanged.
+        UR_CALL(
+            pfnGetInfo(hMemory, propName, propSize, pPropValue, pPropSizeRet));
+        return UR_RESULT_SUCCESS;
+    }
+
+    // Tracked buffers are answered from the MemBuffer record itself; only
+    // the context and size queries are implemented.
+    UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+    if (propName == UR_MEM_INFO_CONTEXT) {
+        return ReturnValue(Buffer->Context);
+    }
+    if (propName == UR_MEM_INFO_SIZE) {
+        return ReturnValue(size_t{Buffer->Size});
+    }
+
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemRetain
+__urdlllocal ur_result_t UR_APICALL urMemRetain(
+    ur_mem_handle_t hMem ///< [in] handle of the memory object to get access
+) {
+    auto pfnRetain = context.urDdiTable.Mem.pfnRetain;
+
+    if (nullptr == pfnRetain) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urMemRetain");
+
+    auto Buffer = context.interceptor->getMemBuffer(hMem);
+    if (!Buffer) {
+        UR_CALL(pfnRetain(hMem)); // untracked handle: forward unchanged
+        return UR_RESULT_SUCCESS;
+    }
+    Buffer->RefCount.fetch_add(1); // tracked buffers are counted right here
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemRelease
+__urdlllocal ur_result_t UR_APICALL urMemRelease(
+    ur_mem_handle_t hMem ///< [in] handle of the memory object to release
+) {
+    auto pfnRelease = context.urDdiTable.Mem.pfnRelease;
+
+    if (nullptr == pfnRelease) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urMemRelease");
+
+    auto Buffer = context.interceptor->getMemBuffer(hMem);
+    if (!Buffer) {
+        UR_CALL(pfnRelease(hMem)); // untracked handle: forward unchanged
+        return UR_RESULT_SUCCESS;
+    }
+    // Only the release that takes the count to zero frees and unregisters.
+    if (Buffer->RefCount.fetch_sub(1) == 1) {
+        UR_CALL(Buffer->free());
+        UR_CALL(context.interceptor->eraseMemBuffer(hMem));
+    }
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemBufferPartition
+__urdlllocal ur_result_t UR_APICALL urMemBufferPartition(
+    ur_mem_handle_t hBuffer, ///< [in] handle of the buffer to allocate from
+    ur_mem_flags_t flags,    ///< [in] allocation and usage information flags
+    ur_buffer_create_type_t bufferCreateType, ///< [in] buffer creation type
+    const ur_buffer_region_t *pRegion, ///< [in] region of the new sub-buffer
+    ur_mem_handle_t *phMem ///< [out] receives the handle of the sub-buffer
+) {
+    auto pfnBufferPartition = context.urDdiTable.Mem.pfnBufferPartition;
+    if (nullptr == pfnBufferPartition) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urMemBufferPartition");
+
+    if (auto ParentBuffer = context.interceptor->getMemBuffer(hBuffer)) {
+        // Both pointers are dereferenced below, so reject null up front.
+        if (nullptr == pRegion || nullptr == phMem) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+        // Phrased without `origin + size`, which could wrap and pass the test.
+        if (pRegion->origin > ParentBuffer->Size ||
+            pRegion->size > ParentBuffer->Size - pRegion->origin) {
+            return UR_RESULT_ERROR_INVALID_BUFFER_SIZE;
+        }
+        std::shared_ptr<MemBuffer> SubBuffer = std::make_shared<MemBuffer>(
+            ParentBuffer, pRegion->origin, pRegion->size);
+        UR_CALL(context.interceptor->insertMemBuffer(SubBuffer));
+        *phMem = reinterpret_cast<ur_mem_handle_t>(SubBuffer.get());
+    } else {
+        UR_CALL(pfnBufferPartition(hBuffer, flags, bufferCreateType, pRegion,
+                                   phMem));
+    }
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urMemGetNativeHandle
+__urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle(
+    ur_mem_handle_t hMem, ///< [in] handle of the mem.
+    ur_device_handle_t hDevice,
+    ur_native_handle_t *phNativeMem ///< [out] native handle of the mem.
+) {
+    auto pfnGetNativeHandle = context.urDdiTable.Mem.pfnGetNativeHandle;
+
+    if (nullptr == pfnGetNativeHandle) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urMemGetNativeHandle");
+
+    auto Buffer = context.interceptor->getMemBuffer(hMem);
+    if (!Buffer) {
+        UR_CALL(pfnGetNativeHandle(hMem, hDevice, phNativeMem));
+        return UR_RESULT_SUCCESS;
+    }
+    // Tracked buffer: its pointer for hDevice is reported as native handle.
+    char *DevicePtr = nullptr;
+    UR_CALL(Buffer->getHandle(hDevice, DevicePtr));
+    *phNativeMem = ur_cast<ur_native_handle_t>(DevicePtr);
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferRead
+__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object
+    bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false)
+    size_t offset,     ///< [in] offset in bytes in the buffer object
+    size_t size,       ///< [in] size in bytes of data being read
+    void *pDst, ///< [in] pointer to host memory where data is to be read into
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferRead = context.urDdiTable.Enqueue.pfnMemBufferRead;
+
+    if (nullptr == pfnMemBufferRead) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urEnqueueMemBufferRead");
+
+    auto Buffer = context.interceptor->getMemBuffer(hBuffer);
+    if (!Buffer) {
+        UR_CALL(pfnMemBufferRead(hQueue, hBuffer, blockingRead, offset, size,
+                                 pDst, numEventsInWaitList, phEventWaitList,
+                                 phEvent));
+        return UR_RESULT_SUCCESS;
+    }
+    // Tracked buffer: read via USM memcpy from the pointer getHandle yields.
+    char *DevicePtr = nullptr;
+    UR_CALL(Buffer->getHandle(GetDevice(hQueue), DevicePtr));
+    UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy(
+        hQueue, blockingRead, pDst, DevicePtr + offset, size,
+        numEventsInWaitList, phEventWaitList, phEvent));
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferWrite
+__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object
+    bool
+        blockingWrite, ///< [in] indicates blocking (true), non-blocking (false)
+    size_t offset,     ///< [in] offset in bytes in the buffer object
+    size_t size,       ///< [in] size in bytes of data being written
+    const void
+        *pSrc, ///< [in] pointer to host memory where data is to be written from
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferWrite = context.urDdiTable.Enqueue.pfnMemBufferWrite;
+
+    if (nullptr == pfnMemBufferWrite) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urEnqueueMemBufferWrite");
+
+    auto Buffer = context.interceptor->getMemBuffer(hBuffer);
+    if (!Buffer) {
+        UR_CALL(pfnMemBufferWrite(hQueue, hBuffer, blockingWrite, offset, size,
+                                  pSrc, numEventsInWaitList, phEventWaitList,
+                                  phEvent));
+        return UR_RESULT_SUCCESS;
+    }
+    // Tracked buffer: write via USM memcpy into the pointer getHandle yields.
+    char *DevicePtr = nullptr;
+    UR_CALL(Buffer->getHandle(GetDevice(hQueue), DevicePtr));
+    UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy(
+        hQueue, blockingWrite, DevicePtr + offset, pSrc, size,
+        numEventsInWaitList, phEventWaitList, phEvent));
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferReadRect
+__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(bufferOrigin, region)] handle of the buffer object
+    bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false)
+    ur_rect_offset_t bufferOrigin, ///< [in] 3D offset in the buffer
+    ur_rect_offset_t hostOrigin,   ///< [in] 3D offset in the host region
+    ur_rect_region_t
+        region, ///< [in] 3D rectangular region descriptor: width, height, depth
+    size_t
+        bufferRowPitch, ///< [in] length of each row in bytes in the buffer object
+    size_t
+        bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the buffer object being read
+    size_t
+        hostRowPitch, ///< [in] length of each row in bytes in the host memory region pointed by
+                      ///< dst
+    size_t
+        hostSlicePitch, ///< [in] length of each 2D slice in bytes in the host memory region
+                        ///< pointed by dst
+    void *pDst, ///< [in] pointer to host memory where data is to be read into
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferReadRect = context.urDdiTable.Enqueue.pfnMemBufferReadRect;
+
+    if (nullptr == pfnMemBufferReadRect) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urEnqueueMemBufferReadRect");
+
+    auto Buffer = context.interceptor->getMemBuffer(hBuffer);
+    if (!Buffer) {
+        UR_CALL(pfnMemBufferReadRect(
+            hQueue, hBuffer, blockingRead, bufferOrigin, hostOrigin, region,
+            bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch,
+            pDst, numEventsInWaitList, phEventWaitList, phEvent));
+        return UR_RESULT_SUCCESS;
+    }
+
+    // Tracked buffer: the rectangle is copied by EnqueueMemCopyRectHelper,
+    // with the buffer's pointer as source and pDst as destination.
+    char *SrcPtr = nullptr;
+    UR_CALL(Buffer->getHandle(GetDevice(hQueue), SrcPtr));
+    UR_CALL(EnqueueMemCopyRectHelper(
+        hQueue, SrcPtr, ur_cast<char *>(pDst), bufferOrigin, hostOrigin,
+        region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch,
+        blockingRead, numEventsInWaitList, phEventWaitList, phEvent));
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferWriteRect
+__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(bufferOrigin, region)] handle of the buffer object
+    bool
+        blockingWrite, ///< [in] indicates blocking (true), non-blocking (false)
+    ur_rect_offset_t bufferOrigin, ///< [in] 3D offset in the buffer
+    ur_rect_offset_t hostOrigin,   ///< [in] 3D offset in the host region
+    ur_rect_region_t
+        region, ///< [in] 3D rectangular region descriptor: width, height, depth
+    size_t
+        bufferRowPitch, ///< [in] length of each row in bytes in the buffer object
+    size_t
+        bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the buffer object being
+                          ///< written
+    size_t
+        hostRowPitch, ///< [in] length of each row in bytes in the host memory region pointed by
+                      ///< src
+    size_t
+        hostSlicePitch, ///< [in] length of each 2D slice in bytes in the host memory region
+                        ///< pointed by src
+    void
+        *pSrc, ///< [in] pointer to host memory where data is to be written from
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] points to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferWriteRect =
+        context.urDdiTable.Enqueue.pfnMemBufferWriteRect;
+
+    if (nullptr == pfnMemBufferWriteRect) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urEnqueueMemBufferWriteRect");
+
+    auto Buffer = context.interceptor->getMemBuffer(hBuffer);
+    if (!Buffer) {
+        UR_CALL(pfnMemBufferWriteRect(
+            hQueue, hBuffer, blockingWrite, bufferOrigin, hostOrigin, region,
+            bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch,
+            pSrc, numEventsInWaitList, phEventWaitList, phEvent));
+        return UR_RESULT_SUCCESS;
+    }
+
+    // Tracked buffer: the rectangle is copied by EnqueueMemCopyRectHelper,
+    // with pSrc as source and the buffer's pointer as destination.
+    char *DstPtr = nullptr;
+    UR_CALL(Buffer->getHandle(GetDevice(hQueue), DstPtr));
+    UR_CALL(EnqueueMemCopyRectHelper(
+        hQueue, ur_cast<char *>(pSrc), DstPtr, hostOrigin, bufferOrigin,
+        region, hostRowPitch, hostSlicePitch, bufferRowPitch, bufferSlicePitch,
+        blockingWrite, numEventsInWaitList, phEventWaitList, phEvent));
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferCopy
+__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBufferSrc, ///< [in][bounds(srcOffset, size)] handle of the src buffer object
+    ur_mem_handle_t
+        hBufferDst, ///< [in][bounds(dstOffset, size)] handle of the dest buffer object
+    size_t srcOffset, ///< [in] offset into hBufferSrc to begin copying from
+    size_t dstOffset, ///< [in] offset into hBufferDst to begin copying into
+    size_t size,      ///< [in] size in bytes of data being copied
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferCopy = context.urDdiTable.Enqueue.pfnMemBufferCopy;
+
+    if (nullptr == pfnMemBufferCopy) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urEnqueueMemBufferCopy");
+
+    auto SrcBuffer = context.interceptor->getMemBuffer(hBufferSrc);
+    auto DstBuffer = context.interceptor->getMemBuffer(hBufferDst);
+
+    // A tracked/untracked mix is handed to UR_ASSERT as an invalid mem object.
+    UR_ASSERT((SrcBuffer && DstBuffer) || (!SrcBuffer && !DstBuffer),
+              UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+
+    if (!(SrcBuffer && DstBuffer)) {
+        UR_CALL(pfnMemBufferCopy(hQueue, hBufferSrc, hBufferDst, srcOffset,
+                                 dstOffset, size, numEventsInWaitList,
+                                 phEventWaitList, phEvent));
+        return UR_RESULT_SUCCESS;
+    }
+
+    // Both tracked: a non-blocking USM memcpy between their two pointers.
+    const ur_device_handle_t Device = GetDevice(hQueue);
+    char *SrcPtr = nullptr;
+    UR_CALL(SrcBuffer->getHandle(Device, SrcPtr));
+    char *DstPtr = nullptr;
+    UR_CALL(DstBuffer->getHandle(Device, DstPtr));
+    UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy(
+        hQueue, false, DstPtr + dstOffset, SrcPtr + srcOffset, size,
+        numEventsInWaitList, phEventWaitList, phEvent));
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferCopyRect
+__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBufferSrc, ///< [in][bounds(srcOrigin, region)] handle of the source buffer object
+    ur_mem_handle_t
+        hBufferDst, ///< [in][bounds(dstOrigin, region)] handle of the dest buffer object
+    ur_rect_offset_t srcOrigin, ///< [in] 3D offset in the source buffer
+    ur_rect_offset_t dstOrigin, ///< [in] 3D offset in the destination buffer
+    ur_rect_region_t
+        region, ///< [in] source 3D rectangular region descriptor: width, height, depth
+    size_t
+        srcRowPitch, ///< [in] length of each row in bytes in the source buffer object
+    size_t
+        srcSlicePitch, ///< [in] length of each 2D slice in bytes in the source buffer object
+    size_t
+        dstRowPitch, ///< [in] length of each row in bytes in the destination buffer object
+    size_t
+        dstSlicePitch, ///< [in] length of each 2D slice in bytes in the destination buffer object
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferCopyRect = context.urDdiTable.Enqueue.pfnMemBufferCopyRect;
+
+    if (nullptr == pfnMemBufferCopyRect) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urEnqueueMemBufferCopyRect");
+
+    auto SrcBuffer = context.interceptor->getMemBuffer(hBufferSrc);
+    auto DstBuffer = context.interceptor->getMemBuffer(hBufferDst);
+
+    // A tracked/untracked mix is handed to UR_ASSERT as an invalid mem object.
+    UR_ASSERT((SrcBuffer && DstBuffer) || (!SrcBuffer && !DstBuffer),
+              UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+
+    if (!(SrcBuffer && DstBuffer)) {
+        UR_CALL(pfnMemBufferCopyRect(
+            hQueue, hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region,
+            srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch,
+            numEventsInWaitList, phEventWaitList, phEvent));
+        return UR_RESULT_SUCCESS;
+    }
+
+    // Both tracked: a non-blocking rectangular copy between their pointers.
+    const ur_device_handle_t Device = GetDevice(hQueue);
+    char *SrcPtr = nullptr;
+    UR_CALL(SrcBuffer->getHandle(Device, SrcPtr));
+    char *DstPtr = nullptr;
+    UR_CALL(DstBuffer->getHandle(Device, DstPtr));
+    UR_CALL(EnqueueMemCopyRectHelper(
+        hQueue, SrcPtr, DstPtr, srcOrigin, dstOrigin, region, srcRowPitch,
+        srcSlicePitch, dstRowPitch, dstSlicePitch, false, numEventsInWaitList,
+        phEventWaitList, phEvent));
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferFill
+__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object
+    const void *pPattern, ///< [in] pointer to the fill pattern
+    size_t patternSize,   ///< [in] size in bytes of the pattern
+    size_t offset,        ///< [in] offset into the buffer
+    size_t size, ///< [in] fill size in bytes, must be a multiple of patternSize
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemBufferFill = context.urDdiTable.Enqueue.pfnMemBufferFill;
+
+    if (nullptr == pfnMemBufferFill) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urEnqueueMemBufferFill");
+
+    auto Buffer = context.interceptor->getMemBuffer(hBuffer);
+    if (!Buffer) {
+        UR_CALL(pfnMemBufferFill(hQueue, hBuffer, pPattern, patternSize, offset,
+                                 size, numEventsInWaitList, phEventWaitList,
+                                 phEvent));
+        return UR_RESULT_SUCCESS;
+    }
+    // Tracked buffer: fill via USM fill on the pointer getHandle yields.
+    char *DevicePtr = nullptr;
+    UR_CALL(Buffer->getHandle(GetDevice(hQueue), DevicePtr));
+    UR_CALL(context.urDdiTable.Enqueue.pfnUSMFill(
+        hQueue, DevicePtr + offset, patternSize, pPattern, size,
+        numEventsInWaitList, phEventWaitList, phEvent));
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemBufferMap: for buffers known to the interceptor, copies the requested region into host memory and records the mapping; all others are forwarded to pfnMemBufferMap
+__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object
+    bool blockingMap, ///< [in] indicates blocking (true), non-blocking (false)
+    ur_map_flags_t mapFlags, ///< [in] flags for read, write, readwrite mapping
+    size_t offset, ///< [in] offset in bytes of the buffer region being mapped
+    size_t size,   ///< [in] size in bytes of the buffer region being mapped
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent, ///< [out][optional] return an event object that identifies this particular
+                 ///< command instance.
+    void **ppRetMap ///< [out] return mapped pointer.  TODO: move it before
+                    ///< numEventsInWaitList?
+) {
+    auto pfnMemBufferMap = context.urDdiTable.Enqueue.pfnMemBufferMap;
+
+    if (nullptr == pfnMemBufferMap) { // no DDI entry to forward to
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urEnqueueMemBufferMap");
+
+    if (auto MemBuffer = context.interceptor->getMemBuffer(hBuffer)) {
+
+        // Translate mapFlags into a host access mode; it is only validated below.
+        MemBuffer::AccessMode AccessMode = MemBuffer::UNKNOWN;
+        if (mapFlags & UR_MAP_FLAG_WRITE_INVALIDATE_REGION) {
+            AccessMode = MemBuffer::WRITE_ONLY;
+        } else {
+            if (mapFlags & UR_MAP_FLAG_READ) {
+                AccessMode = MemBuffer::READ_ONLY;
+                if (mapFlags & UR_MAP_FLAG_WRITE) {
+                    AccessMode = MemBuffer::READ_WRITE;
+                }
+            } else if (mapFlags & UR_MAP_FLAG_WRITE) {
+                AccessMode = MemBuffer::WRITE_ONLY;
+            }
+        }
+
+        UR_ASSERT(AccessMode != MemBuffer::UNKNOWN,
+                  UR_RESULT_ERROR_INVALID_ARGUMENT); // none of the three recognised flags was set
+
+        ur_device_handle_t Device = GetDevice(hQueue);
+        // If the buffer has a host pointer, reuse it (advanced by offset).
+        // Otherwise allocate a fresh host USM region of `size` bytes.
+        if (MemBuffer->HostPtr) {
+            *ppRetMap = MemBuffer->HostPtr + offset;
+        } else {
+            ur_context_handle_t Context = GetContext(hQueue);
+            ur_usm_desc_t USMDesc{};
+            USMDesc.align = MemBuffer->getAlignment();
+            ur_usm_pool_handle_t Pool{};
+            UR_CALL(context.interceptor->allocateMemory(
+                Context, nullptr, &USMDesc, Pool, size, AllocType::HOST_USM,
+                ppRetMap));
+        }
+
+        // Strictly speaking, a write-only mapping does not need this copy.
+        // However, the copy is what produces the event handed back to the
+        // user, so we always perform it.
+        char *SrcHandle = nullptr;
+        UR_CALL(MemBuffer->getHandle(Device, SrcHandle));
+        UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy(
+            hQueue, blockingMap, *ppRetMap, SrcHandle + offset, size,
+            numEventsInWaitList, phEventWaitList, phEvent));
+
+        { // record the mapping while holding the buffer's mutex
+            std::scoped_lock<ur_shared_mutex> Guard(MemBuffer->Mutex);
+            UR_ASSERT(MemBuffer->Mappings.find(*ppRetMap) ==
+                          MemBuffer->Mappings.end(),
+                      UR_RESULT_ERROR_INVALID_VALUE); // the same host address must not already be mapped
+            MemBuffer->Mappings[*ppRetMap] = {offset, size}; // looked up again by urEnqueueMemUnmap
+        }
+    } else { // handle not known to the interceptor: forward the call unchanged
+        UR_CALL(pfnMemBufferMap(hQueue, hBuffer, blockingMap, mapFlags, offset,
+                                size, numEventsInWaitList, phEventWaitList,
+                                phEvent, ppRetMap));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urEnqueueMemUnmap: for buffers known to the interceptor, writes the mapped region back with a blocking pfnUSMMemcpy and drops the mapping; all others are forwarded to pfnMemUnmap
+__urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap(
+    ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+    ur_mem_handle_t
+        hMem,         ///< [in] handle of the memory (buffer or image) object
+    void *pMappedPtr, ///< [in] mapped host address
+    uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t *
+        phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+    ///< events that must be complete before this command can be executed.
+    ///< If nullptr, the numEventsInWaitList must be 0, indicating that this
+    ///< command does not wait on any event to complete.
+    ur_event_handle_t *
+        phEvent ///< [out][optional] return an event object that identifies this particular
+                ///< command instance.
+) {
+    auto pfnMemUnmap = context.urDdiTable.Enqueue.pfnMemUnmap;
+
+    if (nullptr == pfnMemUnmap) { // no DDI entry to forward to
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urEnqueueMemUnmap");
+
+    if (auto MemBuffer = context.interceptor->getMemBuffer(hMem)) {
+        MemBuffer::Mapping Mapping{};
+        { // look up and remove the mapping while holding the buffer's mutex
+            std::scoped_lock<ur_shared_mutex> Guard(MemBuffer->Mutex);
+            auto It = MemBuffer->Mappings.find(pMappedPtr);
+            UR_ASSERT(It != MemBuffer->Mappings.end(),
+                      UR_RESULT_ERROR_INVALID_VALUE); // pMappedPtr was not recorded for this buffer
+            Mapping = It->second; // copy out before erasing the entry
+            MemBuffer->Mappings.erase(It);
+        }
+
+        // Write the mapped host data back to the device and, if the mapping
+        // was backed by a host USM allocation, release that allocation. UR has
+        // no event callbacks yet, so the copy below has to be blocking.
+        char *DstHandle = nullptr;
+        ur_context_handle_t Context = GetContext(hQueue);
+        ur_device_handle_t Device = GetDevice(hQueue);
+        UR_CALL(MemBuffer->getHandle(Device, DstHandle));
+        UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy(
+            hQueue, true, DstHandle + Mapping.Offset, pMappedPtr, Mapping.Size,
+            numEventsInWaitList, phEventWaitList, phEvent));
+
+        if (!MemBuffer->HostPtr) { // no host pointer: pMappedPtr came from allocateMemory in urEnqueueMemBufferMap
+            UR_CALL(context.interceptor->releaseMemory(Context, pMappedPtr));
+        }
+    } else { // handle not known to the interceptor: forward the call unchanged
+        UR_CALL(pfnMemUnmap(hQueue, hMem, pMappedPtr, numEventsInWaitList,
+                            phEventWaitList, phEvent));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urKernelCreate
 __urdlllocal ur_result_t UR_APICALL urKernelCreate(
@@ -398,6 +1122,28 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreate(
     return UR_RESULT_SUCCESS;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelRetain: forwards the retain, then bumps the layer's own RefCount for the kernel if it has a KernelInfo entry
+__urdlllocal ur_result_t UR_APICALL urKernelRetain(
+    ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to retain
+) {
+    auto pfnRetain = context.urDdiTable.Kernel.pfnRetain;
+
+    if (nullptr == pfnRetain) { // no DDI entry to forward to
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urKernelRetain");
+
+    UR_CALL(pfnRetain(hKernel)); // forward first; the count below is only touched afterwards
+
+    if (auto KernelInfo = context.interceptor->getKernelInfo(hKernel)) { // kernels without an entry are left alone
+        KernelInfo->RefCount++; // counterpart of the decrement in urKernelRelease
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urKernelRelease
 __urdlllocal ur_result_t urKernelRelease(
@@ -413,13 +1159,72 @@ __urdlllocal ur_result_t urKernelRelease(
     UR_CALL(pfnRelease(hKernel));
 
     if (auto KernelInfo = context.interceptor->getKernelInfo(hKernel)) {
-        uint32_t RefCount;
-        UR_CALL(context.urDdiTable.Kernel.pfnGetInfo(
-            hKernel, UR_KERNEL_INFO_REFERENCE_COUNT, sizeof(RefCount),
-            &RefCount, nullptr));
-        if (RefCount == 1) {
-            UR_CALL(context.interceptor->eraseKernel(hKernel));
+        if (--KernelInfo->RefCount != 0) {
+            return UR_RESULT_SUCCESS;
         }
+        UR_CALL(context.interceptor->eraseKernel(hKernel));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelSetArgValue: a handle-sized value that resolves to an interceptor-known buffer is recorded in the kernel's BufferArgs instead of being forwarded
+__urdlllocal ur_result_t UR_APICALL urKernelSetArgValue(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t argIndex, ///< [in] argument index in range [0, num args - 1]
+    size_t argSize,    ///< [in] size of argument type
+    const ur_kernel_arg_value_properties_t
+        *pProperties, ///< [in][optional] pointer to value properties.
+    const void
+        *pArgValue ///< [in] argument value represented as matching arg type.
+) {
+    auto pfnSetArgValue = context.urDdiTable.Kernel.pfnSetArgValue;
+
+    if (nullptr == pfnSetArgValue) { // no DDI entry to forward to
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urKernelSetArgValue");
+
+    std::shared_ptr<MemBuffer> MemBuffer; // assigned inside the condition below
+    if (argSize == sizeof(ur_mem_handle_t) &&
+        (MemBuffer = context.interceptor->getMemBuffer(
+             *ur_cast<const ur_mem_handle_t *>(pArgValue)))) { // pArgValue is read as a mem handle only when the size matches
+        auto KernelInfo = context.interceptor->getKernelInfo(hKernel); // NOTE(review): not null-checked here, unlike urKernelRetain - confirm it cannot be null
+        std::scoped_lock<ur_shared_mutex> Guard(KernelInfo->Mutex);
+        KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); // recorded only; pfnSetArgValue is not called on this path
+    } else { // ordinary value: forward the call unchanged
+        UR_CALL(
+            pfnSetArgValue(hKernel, argIndex, argSize, pProperties, pArgValue));
+    }
+
+    return UR_RESULT_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelSetArgMemObj
+__urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
+    uint32_t argIndex, ///< [in] argument index in range [0, num args - 1]
+    const ur_kernel_arg_mem_obj_properties_t
+        *pProperties, ///< [in][optional] pointer to Memory object properties.
+    ur_mem_handle_t hArgValue ///< [in][optional] handle of Memory object.
+) {
+    auto pfnSetArgMemObj = context.urDdiTable.Kernel.pfnSetArgMemObj;
+
+    if (nullptr == pfnSetArgMemObj) {
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    context.logger.debug("==== urKernelSetArgMemObj");
+
+    if (auto MemBuffer = context.interceptor->getMemBuffer(hArgValue)) {
+        auto KernelInfo = context.interceptor->getKernelInfo(hKernel);
+        std::scoped_lock<ur_shared_mutex> Guard(KernelInfo->Mutex);
+        KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer);
+    } else {
+        UR_CALL(pfnSetArgMemObj(hKernel, argIndex, pProperties, hArgValue));
     }
 
     return UR_RESULT_SUCCESS;
@@ -525,16 +1330,16 @@ __urdlllocal ur_result_t UR_APICALL urGetProgramProcAddrTable(
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Exported function for filling application's ProgramExp table
+/// @brief Exported function for filling application's Kernel table
 ///        with current process' addresses
 ///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION
-__urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable(
+__urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable(
     ur_api_version_t version, ///< [in] API version requested
-    ur_program_exp_dditable_t
+    ur_kernel_dditable_t
         *pDdiTable ///< [in,out] pointer to table of DDI function pointers
 ) {
     if (nullptr == pDdiTable) {
@@ -550,23 +1355,26 @@ __urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable(
 
     ur_result_t result = UR_RESULT_SUCCESS;
 
-    pDdiTable->pfnBuildExp = ur_sanitizer_layer::urProgramBuildExp;
-    pDdiTable->pfnLinkExp = ur_sanitizer_layer::urProgramLinkExp;
+    pDdiTable->pfnCreate = ur_sanitizer_layer::urKernelCreate;
+    pDdiTable->pfnRetain = ur_sanitizer_layer::urKernelRetain;
+    pDdiTable->pfnRelease = ur_sanitizer_layer::urKernelRelease;
+    pDdiTable->pfnSetArgValue = ur_sanitizer_layer::urKernelSetArgValue;
+    pDdiTable->pfnSetArgMemObj = ur_sanitizer_layer::urKernelSetArgMemObj;
+    pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal;
 
     return result;
 }
-
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Exported function for filling application's Kernel table
+/// @brief Exported function for filling application's Mem table
 ///        with current process' addresses
 ///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION
-__urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable(
+__urdlllocal ur_result_t UR_APICALL urGetMemProcAddrTable(
     ur_api_version_t version, ///< [in] API version requested
-    ur_kernel_dditable_t
+    ur_mem_dditable_t
         *pDdiTable ///< [in,out] pointer to table of DDI function pointers
 ) {
     if (nullptr == pDdiTable) {
@@ -582,9 +1390,42 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable(
 
     ur_result_t result = UR_RESULT_SUCCESS;
 
-    pDdiTable->pfnCreate = ur_sanitizer_layer::urKernelCreate;
-    pDdiTable->pfnRelease = ur_sanitizer_layer::urKernelRelease;
-    pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal;
+    pDdiTable->pfnBufferCreate = ur_sanitizer_layer::urMemBufferCreate;
+    pDdiTable->pfnRetain = ur_sanitizer_layer::urMemRetain;
+    pDdiTable->pfnRelease = ur_sanitizer_layer::urMemRelease;
+    pDdiTable->pfnBufferPartition = ur_sanitizer_layer::urMemBufferPartition;
+    pDdiTable->pfnGetNativeHandle = ur_sanitizer_layer::urMemGetNativeHandle;
+    pDdiTable->pfnGetInfo = ur_sanitizer_layer::urMemGetInfo;
+
+    return result;
+}
+/// @brief Exported function for filling application's ProgramExp table
+///        with the sanitizer layer's urProgramBuildExp / urProgramLinkExp intercepts
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION
+__urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable(
+    ur_api_version_t version, ///< [in] API version requested
+    ur_program_exp_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+    if (nullptr == pDdiTable) {
+        return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    }
+
+    if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) !=
+            UR_MAJOR_VERSION(version) ||
+        UR_MINOR_VERSION(ur_sanitizer_layer::context.version) >
+            UR_MINOR_VERSION(version)) { // major must match; the layer's minor must not exceed the requested minor
+        return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
+    }
+
+    ur_result_t result = UR_RESULT_SUCCESS; // never reassigned below
+
+    pDdiTable->pfnBuildExp = ur_sanitizer_layer::urProgramBuildExp; // overwrite the table entries with the layer's intercepts
+    pDdiTable->pfnLinkExp = ur_sanitizer_layer::urProgramLinkExp;
 
     return result;
 }
@@ -614,6 +1455,18 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
 
     ur_result_t result = UR_RESULT_SUCCESS;
 
+    pDdiTable->pfnMemBufferRead = ur_sanitizer_layer::urEnqueueMemBufferRead;
+    pDdiTable->pfnMemBufferWrite = ur_sanitizer_layer::urEnqueueMemBufferWrite;
+    pDdiTable->pfnMemBufferReadRect =
+        ur_sanitizer_layer::urEnqueueMemBufferReadRect;
+    pDdiTable->pfnMemBufferWriteRect =
+        ur_sanitizer_layer::urEnqueueMemBufferWriteRect;
+    pDdiTable->pfnMemBufferCopy = ur_sanitizer_layer::urEnqueueMemBufferCopy;
+    pDdiTable->pfnMemBufferCopyRect =
+        ur_sanitizer_layer::urEnqueueMemBufferCopyRect;
+    pDdiTable->pfnMemBufferFill = ur_sanitizer_layer::urEnqueueMemBufferFill;
+    pDdiTable->pfnMemBufferMap = ur_sanitizer_layer::urEnqueueMemBufferMap;
+    pDdiTable->pfnMemUnmap = ur_sanitizer_layer::urEnqueueMemUnmap;
     pDdiTable->pfnKernelLaunch = ur_sanitizer_layer::urEnqueueKernelLaunch;
 
     return result;
@@ -698,6 +1551,16 @@ ur_result_t context_t::init(ur_dditable_t *dditable,
             UR_API_VERSION_CURRENT, &dditable->Program);
     }
 
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::urGetKernelProcAddrTable(
+            UR_API_VERSION_CURRENT, &dditable->Kernel);
+    }
+
+    if (UR_RESULT_SUCCESS == result) {
+        result = ur_sanitizer_layer::urGetMemProcAddrTable(
+            UR_API_VERSION_CURRENT, &dditable->Mem);
+    }
+
     if (UR_RESULT_SUCCESS == result) {
         result = ur_sanitizer_layer::urGetProgramExpProcAddrTable(
             UR_API_VERSION_CURRENT, &dditable->ProgramExp);
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
index 56f270a9d9..e5cc8f81e2 100644
--- a/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/source/loader/layers/tracing/ur_trcddi.cpp
@@ -352,6 +352,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle(
 __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     ur_native_handle_t
         hNativePlatform, ///< [in][nocheck] the native handle of the platform.
+    ur_adapter_handle_t
+        hAdapter, ///< [in] handle of the adapter associated with the native backend.
     const ur_platform_native_properties_t *
         pProperties, ///< [in][optional] pointer to native platform properties struct.
     ur_platform_handle_t *
@@ -365,15 +367,15 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     }
 
     ur_platform_create_with_native_handle_params_t params = {
-        &hNativePlatform, &pProperties, &phPlatform};
+        &hNativePlatform, &hAdapter, &pProperties, &phPlatform};
     uint64_t instance =
         context.notify_begin(UR_FUNCTION_PLATFORM_CREATE_WITH_NATIVE_HANDLE,
                              "urPlatformCreateWithNativeHandle", &params);
 
     context.logger.info("---> urPlatformCreateWithNativeHandle");
 
-    ur_result_t result =
-        pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform);
+    ur_result_t result = pfnCreateWithNativeHandle(hNativePlatform, hAdapter,
+                                                   pProperties, phPlatform);
 
     context.notify_end(UR_FUNCTION_PLATFORM_CREATE_WITH_NATIVE_HANDLE,
                        "urPlatformCreateWithNativeHandle", &params, &result,
@@ -3420,6 +3422,57 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize: brackets the forwarded call with notify_begin/notify_end and logs the parameters and result
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in workDim that will execute the
+    ///< kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify
+    ///< suggested local work size that will contain the result of the query
+) {
+    auto pfnGetSuggestedLocalWorkSize =
+        context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize;
+
+    if (nullptr == pfnGetSuggestedLocalWorkSize) { // no DDI entry to forward to
+        return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    }
+
+    ur_kernel_get_suggested_local_work_size_params_t params = { // holds the address of each argument, in signature order
+        &hKernel,           &hQueue,          &numWorkDim,
+        &pGlobalWorkOffset, &pGlobalWorkSize, &pSuggestedLocalWorkSize};
+    uint64_t instance = // value returned by notify_begin, handed back to notify_end below
+        context.notify_begin(UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE,
+                             "urKernelGetSuggestedLocalWorkSize", &params);
+
+    context.logger.info("---> urKernelGetSuggestedLocalWorkSize");
+
+    ur_result_t result = pfnGetSuggestedLocalWorkSize( // the actual forwarded call
+        hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pSuggestedLocalWorkSize);
+
+    context.notify_end(UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE,
+                       "urKernelGetSuggestedLocalWorkSize", &params, &result,
+                       instance);
+
+    std::ostringstream args_str; // receives the formatted parameter list for the log line
+    ur::extras::printFunctionParams(
+        args_str, UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, &params);
+    context.logger.info("({}) -> {};\n", args_str.str(), result);
+
+    return result; // the forwarded call's result is returned unchanged
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urQueueGetInfo
 __urdlllocal ur_result_t UR_APICALL urQueueGetInfo(
@@ -8348,6 +8401,11 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable(
     pDdiTable->pfnCreateWithNativeHandle =
         ur_tracing_layer::urKernelCreateWithNativeHandle;
 
+    dditable.pfnGetSuggestedLocalWorkSize =
+        pDdiTable->pfnGetSuggestedLocalWorkSize;
+    pDdiTable->pfnGetSuggestedLocalWorkSize =
+        ur_tracing_layer::urKernelGetSuggestedLocalWorkSize;
+
     dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue;
     pDdiTable->pfnSetArgValue = ur_tracing_layer::urKernelSetArgValue;
 
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index 4bdd801c1a..5ac97dfc1d 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -347,6 +347,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle(
 __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     ur_native_handle_t
         hNativePlatform, ///< [in][nocheck] the native handle of the platform.
+    ur_adapter_handle_t
+        hAdapter, ///< [in] handle of the adapter associated with the native backend.
     const ur_platform_native_properties_t *
         pProperties, ///< [in][optional] pointer to native platform properties struct.
     ur_platform_handle_t *
@@ -360,13 +362,22 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     }
 
     if (context.enableParameterValidation) {
+        if (NULL == hAdapter) {
+            return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+        }
+
         if (NULL == phPlatform) {
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
     }
 
-    ur_result_t result =
-        pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform);
+    if (context.enableLifetimeValidation &&
+        !refCountContext.isReferenceValid(hAdapter)) {
+        refCountContext.logInvalidReference(hAdapter);
+    }
+
+    ur_result_t result = pfnCreateWithNativeHandle(hNativePlatform, hAdapter,
+                                                   pProperties, phPlatform);
 
     return result;
 }
@@ -1067,7 +1078,7 @@ __urdlllocal ur_result_t UR_APICALL urMemImageCreate(
             return UR_RESULT_ERROR_INVALID_ENUMERATION;
         }
 
-        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) {
+        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) {
             return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR;
         }
 
@@ -3857,6 +3868,71 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize: optional null-handle/null-pointer checks and handle lifetime logging, then forwards the call
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in workDim that will execute the
+    ///< kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify
+    ///< suggested local work size that will contain the result of the query
+) {
+    auto pfnGetSuggestedLocalWorkSize =
+        context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize;
+
+    if (nullptr == pfnGetSuggestedLocalWorkSize) { // no DDI entry to forward to
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    if (context.enableParameterValidation) { // NOTE(review): numWorkDim's documented 1..3 range is not checked here
+        if (NULL == hKernel) {
+            return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+        }
+
+        if (NULL == hQueue) {
+            return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+        }
+
+        if (NULL == pGlobalWorkOffset) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+
+        if (NULL == pGlobalWorkSize) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+
+        if (NULL == pSuggestedLocalWorkSize) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+    }
+
+    if (context.enableLifetimeValidation &&
+        !refCountContext.isReferenceValid(hKernel)) { // only logged; the call still proceeds
+        refCountContext.logInvalidReference(hKernel);
+    }
+
+    if (context.enableLifetimeValidation &&
+        !refCountContext.isReferenceValid(hQueue)) { // only logged; the call still proceeds
+        refCountContext.logInvalidReference(hQueue);
+    }
+
+    ur_result_t result = pfnGetSuggestedLocalWorkSize(
+        hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pSuggestedLocalWorkSize);
+
+    return result; // the forwarded call's result is returned unchanged
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urQueueGetInfo
 __urdlllocal ur_result_t UR_APICALL urQueueGetInfo(
@@ -6887,7 +6963,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
 
-        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) {
+        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) {
             return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR;
         }
     }
@@ -6996,7 +7072,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
 
-        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) {
+        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) {
             return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR;
         }
     }
@@ -7067,7 +7143,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
 
-        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) {
+        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) {
             return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR;
         }
     }
@@ -7159,7 +7235,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
             return UR_RESULT_ERROR_INVALID_ENUMERATION;
         }
 
-        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) {
+        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) {
             return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR;
         }
 
@@ -7412,7 +7488,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
 
-        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) {
+        if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) {
             return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR;
         }
     }
@@ -10006,6 +10082,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
     pDdiTable->pfnCreateWithNativeHandle =
         ur_validation_layer::urKernelCreateWithNativeHandle;
 
+    dditable.pfnGetSuggestedLocalWorkSize =
+        pDdiTable->pfnGetSuggestedLocalWorkSize;
+    pDdiTable->pfnGetSuggestedLocalWorkSize =
+        ur_validation_layer::urKernelGetSuggestedLocalWorkSize;
+
     dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue;
     pDdiTable->pfnSetArgValue = ur_validation_layer::urKernelSetArgValue;
 
diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in
new file mode 100644
index 0000000000..b68c1ab6c1
--- /dev/null
+++ b/source/loader/loader.def.in
@@ -0,0 +1,538 @@
+LIBRARY @TARGET_LIBNAME@
+EXPORTS
+	urAdapterGet
+	urAdapterGetInfo
+	urAdapterGetLastError
+	urAdapterRelease
+	urAdapterRetain
+	urBindlessImagesDestroyExternalSemaphoreExp
+	urBindlessImagesImageAllocateExp
+	urBindlessImagesImageCopyExp
+	urBindlessImagesImageFreeExp
+	urBindlessImagesImageGetInfoExp
+	urBindlessImagesImportExternalSemaphoreOpaqueFDExp
+	urBindlessImagesImportOpaqueFDExp
+	urBindlessImagesMapExternalArrayExp
+	urBindlessImagesMipmapFreeExp
+	urBindlessImagesMipmapGetLevelExp
+	urBindlessImagesReleaseInteropExp
+	urBindlessImagesSampledImageCreateExp
+	urBindlessImagesSampledImageHandleDestroyExp
+	urBindlessImagesSignalExternalSemaphoreExp
+	urBindlessImagesUnsampledImageCreateExp
+	urBindlessImagesUnsampledImageHandleDestroyExp
+	urBindlessImagesWaitExternalSemaphoreExp
+	urCommandBufferAppendKernelLaunchExp
+	urCommandBufferAppendMemBufferCopyExp
+	urCommandBufferAppendMemBufferCopyRectExp
+	urCommandBufferAppendMemBufferFillExp
+	urCommandBufferAppendMemBufferReadExp
+	urCommandBufferAppendMemBufferReadRectExp
+	urCommandBufferAppendMemBufferWriteExp
+	urCommandBufferAppendMemBufferWriteRectExp
+	urCommandBufferAppendUSMAdviseExp
+	urCommandBufferAppendUSMFillExp
+	urCommandBufferAppendUSMMemcpyExp
+	urCommandBufferAppendUSMPrefetchExp
+	urCommandBufferCommandGetInfoExp
+	urCommandBufferCreateExp
+	urCommandBufferEnqueueExp
+	urCommandBufferFinalizeExp
+	urCommandBufferGetInfoExp
+	urCommandBufferReleaseCommandExp
+	urCommandBufferReleaseExp
+	urCommandBufferRetainCommandExp
+	urCommandBufferRetainExp
+	urCommandBufferUpdateKernelLaunchExp
+	urContextCreate
+	urContextCreateWithNativeHandle
+	urContextGetInfo
+	urContextGetNativeHandle
+	urContextRelease
+	urContextRetain
+	urContextSetExtendedDeleter
+	urDeviceCreateWithNativeHandle
+	urDeviceGet
+	urDeviceGetGlobalTimestamps
+	urDeviceGetInfo
+	urDeviceGetNativeHandle
+	urDeviceGetSelected
+	urDevicePartition
+	urDeviceRelease
+	urDeviceRetain
+	urDeviceSelectBinary
+	urEnqueueCooperativeKernelLaunchExp
+	urEnqueueDeviceGlobalVariableRead
+	urEnqueueDeviceGlobalVariableWrite
+	urEnqueueEventsWait
+	urEnqueueEventsWaitWithBarrier
+	urEnqueueKernelLaunch
+	urEnqueueKernelLaunchCustomExp
+	urEnqueueMemBufferCopy
+	urEnqueueMemBufferCopyRect
+	urEnqueueMemBufferFill
+	urEnqueueMemBufferMap
+	urEnqueueMemBufferRead
+	urEnqueueMemBufferReadRect
+	urEnqueueMemBufferWrite
+	urEnqueueMemBufferWriteRect
+	urEnqueueMemImageCopy
+	urEnqueueMemImageRead
+	urEnqueueMemImageWrite
+	urEnqueueMemUnmap
+	urEnqueueReadHostPipe
+	urEnqueueTimestampRecordingExp
+	urEnqueueUSMAdvise
+	urEnqueueUSMFill
+	urEnqueueUSMFill2D
+	urEnqueueUSMMemcpy
+	urEnqueueUSMMemcpy2D
+	urEnqueueUSMPrefetch
+	urEnqueueWriteHostPipe
+	urEventCreateWithNativeHandle
+	urEventGetInfo
+	urEventGetNativeHandle
+	urEventGetProfilingInfo
+	urEventRelease
+	urEventRetain
+	urEventSetCallback
+	urEventWait
+	urGetBindlessImagesExpProcAddrTable
+	urGetCommandBufferExpProcAddrTable
+	urGetContextProcAddrTable
+	urGetDeviceProcAddrTable
+	urGetEnqueueExpProcAddrTable
+	urGetEnqueueProcAddrTable
+	urGetEventProcAddrTable
+	urGetGlobalProcAddrTable
+	urGetKernelExpProcAddrTable
+	urGetKernelProcAddrTable
+	urGetMemProcAddrTable
+	urGetPhysicalMemProcAddrTable
+	urGetPlatformProcAddrTable
+	urGetProgramExpProcAddrTable
+	urGetProgramProcAddrTable
+	urGetQueueProcAddrTable
+	urGetSamplerProcAddrTable
+	urGetUSMExpProcAddrTable
+	urGetUSMProcAddrTable
+	urGetUsmP2PExpProcAddrTable
+	urGetVirtualMemProcAddrTable
+	urKernelCreate
+	urKernelCreateWithNativeHandle
+	urKernelGetGroupInfo
+	urKernelGetInfo
+	urKernelGetNativeHandle
+	urKernelGetSubGroupInfo
+	urKernelGetSuggestedLocalWorkSize
+	urKernelRelease
+	urKernelRetain
+	urKernelSetArgLocal
+	urKernelSetArgMemObj
+	urKernelSetArgPointer
+	urKernelSetArgSampler
+	urKernelSetArgValue
+	urKernelSetExecInfo
+	urKernelSetSpecializationConstants
+	urKernelSuggestMaxCooperativeGroupCountExp
+	urLoaderConfigCreate
+	urLoaderConfigEnableLayer
+	urLoaderConfigGetInfo
+	urLoaderConfigRelease
+	urLoaderConfigRetain
+	urLoaderConfigSetCodeLocationCallback
+	urLoaderInit
+	urLoaderTearDown
+	urMemBufferCreate
+	urMemBufferCreateWithNativeHandle
+	urMemBufferPartition
+	urMemGetInfo
+	urMemGetNativeHandle
+	urMemImageCreate
+	urMemImageCreateWithNativeHandle
+	urMemImageGetInfo
+	urMemRelease
+	urMemRetain
+	urPhysicalMemCreate
+	urPhysicalMemRelease
+	urPhysicalMemRetain
+	urPlatformCreateWithNativeHandle
+	urPlatformGet
+	urPlatformGetApiVersion
+	urPlatformGetBackendOption
+	urPlatformGetInfo
+	urPlatformGetNativeHandle
+	urPrintAdapterBackend
+	urPrintAdapterGetInfoParams
+	urPrintAdapterGetLastErrorParams
+	urPrintAdapterGetParams
+	urPrintAdapterInfo
+	urPrintAdapterReleaseParams
+	urPrintAdapterRetainParams
+	urPrintApiVersion
+	urPrintBaseDesc
+	urPrintBaseProperties
+	urPrintBindlessImagesDestroyExternalSemaphoreExpParams
+	urPrintBindlessImagesImageAllocateExpParams
+	urPrintBindlessImagesImageCopyExpParams
+	urPrintBindlessImagesImageFreeExpParams
+	urPrintBindlessImagesImageGetInfoExpParams
+	urPrintBindlessImagesImportExternalSemaphoreOpaqueFdExpParams
+	urPrintBindlessImagesImportOpaqueFdExpParams
+	urPrintBindlessImagesMapExternalArrayExpParams
+	urPrintBindlessImagesMipmapFreeExpParams
+	urPrintBindlessImagesMipmapGetLevelExpParams
+	urPrintBindlessImagesReleaseInteropExpParams
+	urPrintBindlessImagesSampledImageCreateExpParams
+	urPrintBindlessImagesSampledImageHandleDestroyExpParams
+	urPrintBindlessImagesSignalExternalSemaphoreExpParams
+	urPrintBindlessImagesUnsampledImageCreateExpParams
+	urPrintBindlessImagesUnsampledImageHandleDestroyExpParams
+	urPrintBindlessImagesWaitExternalSemaphoreExpParams
+	urPrintBufferAllocLocationProperties
+	urPrintBufferChannelProperties
+	urPrintBufferCreateType
+	urPrintBufferProperties
+	urPrintBufferRegion
+	urPrintCodeLocation
+	urPrintCommand
+	urPrintCommandBufferAppendKernelLaunchExpParams
+	urPrintCommandBufferAppendMemBufferCopyExpParams
+	urPrintCommandBufferAppendMemBufferCopyRectExpParams
+	urPrintCommandBufferAppendMemBufferFillExpParams
+	urPrintCommandBufferAppendMemBufferReadExpParams
+	urPrintCommandBufferAppendMemBufferReadRectExpParams
+	urPrintCommandBufferAppendMemBufferWriteExpParams
+	urPrintCommandBufferAppendMemBufferWriteRectExpParams
+	urPrintCommandBufferAppendUsmAdviseExpParams
+	urPrintCommandBufferAppendUsmFillExpParams
+	urPrintCommandBufferAppendUsmMemcpyExpParams
+	urPrintCommandBufferAppendUsmPrefetchExpParams
+	urPrintCommandBufferCommandGetInfoExpParams
+	urPrintCommandBufferCreateExpParams
+	urPrintCommandBufferEnqueueExpParams
+	urPrintCommandBufferFinalizeExpParams
+	urPrintCommandBufferGetInfoExpParams
+	urPrintCommandBufferReleaseCommandExpParams
+	urPrintCommandBufferReleaseExpParams
+	urPrintCommandBufferRetainCommandExpParams
+	urPrintCommandBufferRetainExpParams
+	urPrintCommandBufferUpdateKernelLaunchExpParams
+	urPrintContextCreateParams
+	urPrintContextCreateWithNativeHandleParams
+	urPrintContextFlags
+	urPrintContextGetInfoParams
+	urPrintContextGetNativeHandleParams
+	urPrintContextInfo
+	urPrintContextNativeProperties
+	urPrintContextProperties
+	urPrintContextReleaseParams
+	urPrintContextRetainParams
+	urPrintContextSetExtendedDeleterParams
+	urPrintDeviceAffinityDomainFlags
+	urPrintDeviceBinary
+	urPrintDeviceCreateWithNativeHandleParams
+	urPrintDeviceExecCapabilityFlags
+	urPrintDeviceFpCapabilityFlags
+	urPrintDeviceGetGlobalTimestampsParams
+	urPrintDeviceGetInfoParams
+	urPrintDeviceGetNativeHandleParams
+	urPrintDeviceGetParams
+	urPrintDeviceGetSelectedParams
+	urPrintDeviceInfo
+	urPrintDeviceInitFlags
+	urPrintDeviceLocalMemType
+	urPrintDeviceMemCacheType
+	urPrintDeviceNativeProperties
+	urPrintDevicePartition
+	urPrintDevicePartitionParams
+	urPrintDevicePartitionProperties
+	urPrintDevicePartitionProperty
+	urPrintDeviceReleaseParams
+	urPrintDeviceRetainParams
+	urPrintDeviceSelectBinaryParams
+	urPrintDeviceType
+	urPrintDeviceUsmAccessCapabilityFlags
+	urPrintEnqueueCooperativeKernelLaunchExpParams
+	urPrintEnqueueDeviceGlobalVariableReadParams
+	urPrintEnqueueDeviceGlobalVariableWriteParams
+	urPrintEnqueueEventsWaitParams
+	urPrintEnqueueEventsWaitWithBarrierParams
+	urPrintEnqueueKernelLaunchCustomExpParams
+	urPrintEnqueueKernelLaunchParams
+	urPrintEnqueueMemBufferCopyParams
+	urPrintEnqueueMemBufferCopyRectParams
+	urPrintEnqueueMemBufferFillParams
+	urPrintEnqueueMemBufferMapParams
+	urPrintEnqueueMemBufferReadParams
+	urPrintEnqueueMemBufferReadRectParams
+	urPrintEnqueueMemBufferWriteParams
+	urPrintEnqueueMemBufferWriteRectParams
+	urPrintEnqueueMemImageCopyParams
+	urPrintEnqueueMemImageReadParams
+	urPrintEnqueueMemImageWriteParams
+	urPrintEnqueueMemUnmapParams
+	urPrintEnqueueReadHostPipeParams
+	urPrintEnqueueTimestampRecordingExpParams
+	urPrintEnqueueUsmAdviseParams
+	urPrintEnqueueUsmFillParams
+	urPrintEnqueueUsmFill_2dParams
+	urPrintEnqueueUsmMemcpyParams
+	urPrintEnqueueUsmMemcpy_2dParams
+	urPrintEnqueueUsmPrefetchParams
+	urPrintEnqueueWriteHostPipeParams
+	urPrintEventCreateWithNativeHandleParams
+	urPrintEventGetInfoParams
+	urPrintEventGetNativeHandleParams
+	urPrintEventGetProfilingInfoParams
+	urPrintEventInfo
+	urPrintEventNativeProperties
+	urPrintEventReleaseParams
+	urPrintEventRetainParams
+	urPrintEventSetCallbackParams
+	urPrintEventStatus
+	urPrintEventWaitParams
+	urPrintExecutionInfo
+	urPrintExpCommandBufferCommandInfo
+	urPrintExpCommandBufferDesc
+	urPrintExpCommandBufferInfo
+	urPrintExpCommandBufferUpdateKernelLaunchDesc
+	urPrintExpCommandBufferUpdateMemobjArgDesc
+	urPrintExpCommandBufferUpdatePointerArgDesc
+	urPrintExpCommandBufferUpdateValueArgDesc
+	urPrintExpFileDescriptor
+	urPrintExpImageCopyFlags
+	urPrintExpInteropMemDesc
+	urPrintExpInteropSemaphoreDesc
+	urPrintExpLaunchProperty
+	urPrintExpLaunchPropertyId
+	urPrintExpPeerInfo
+	urPrintExpSamplerAddrModes
+	urPrintExpSamplerCubemapFilterMode
+	urPrintExpSamplerCubemapProperties
+	urPrintExpSamplerMipProperties
+	urPrintExpWin32Handle
+	urPrintFunction
+	urPrintFunctionParams
+	urPrintImageChannelOrder
+	urPrintImageChannelType
+	urPrintImageDesc
+	urPrintImageFormat
+	urPrintImageInfo
+	urPrintKernelArgLocalProperties
+	urPrintKernelArgMemObjProperties
+	urPrintKernelArgPointerProperties
+	urPrintKernelArgSamplerProperties
+	urPrintKernelArgValueProperties
+	urPrintKernelCacheConfig
+	urPrintKernelCreateParams
+	urPrintKernelCreateWithNativeHandleParams
+	urPrintKernelExecInfo
+	urPrintKernelExecInfoProperties
+	urPrintKernelGetGroupInfoParams
+	urPrintKernelGetInfoParams
+	urPrintKernelGetNativeHandleParams
+	urPrintKernelGetSubGroupInfoParams
+	urPrintKernelGetSuggestedLocalWorkSizeParams
+	urPrintKernelGroupInfo
+	urPrintKernelInfo
+	urPrintKernelNativeProperties
+	urPrintKernelReleaseParams
+	urPrintKernelRetainParams
+	urPrintKernelSetArgLocalParams
+	urPrintKernelSetArgMemObjParams
+	urPrintKernelSetArgPointerParams
+	urPrintKernelSetArgSamplerParams
+	urPrintKernelSetArgValueParams
+	urPrintKernelSetExecInfoParams
+	urPrintKernelSetSpecializationConstantsParams
+	urPrintKernelSubGroupInfo
+	urPrintKernelSuggestMaxCooperativeGroupCountExpParams
+	urPrintLoaderConfigCreateParams
+	urPrintLoaderConfigEnableLayerParams
+	urPrintLoaderConfigGetInfoParams
+	urPrintLoaderConfigInfo
+	urPrintLoaderConfigReleaseParams
+	urPrintLoaderConfigRetainParams
+	urPrintLoaderConfigSetCodeLocationCallbackParams
+	urPrintLoaderInitParams
+	urPrintLoaderTearDownParams
+	urPrintMapFlags
+	urPrintMemBufferCreateParams
+	urPrintMemBufferCreateWithNativeHandleParams
+	urPrintMemBufferPartitionParams
+	urPrintMemFlags
+	urPrintMemGetInfoParams
+	urPrintMemGetNativeHandleParams
+	urPrintMemImageCreateParams
+	urPrintMemImageCreateWithNativeHandleParams
+	urPrintMemImageGetInfoParams
+	urPrintMemInfo
+	urPrintMemNativeProperties
+	urPrintMemReleaseParams
+	urPrintMemRetainParams
+	urPrintMemType
+	urPrintMemoryOrderCapabilityFlags
+	urPrintMemoryScopeCapabilityFlags
+	urPrintPhysicalMemCreateParams
+	urPrintPhysicalMemFlags
+	urPrintPhysicalMemProperties
+	urPrintPhysicalMemReleaseParams
+	urPrintPhysicalMemRetainParams
+	urPrintPlatformBackend
+	urPrintPlatformCreateWithNativeHandleParams
+	urPrintPlatformGetApiVersionParams
+	urPrintPlatformGetBackendOptionParams
+	urPrintPlatformGetInfoParams
+	urPrintPlatformGetNativeHandleParams
+	urPrintPlatformGetParams
+	urPrintPlatformInfo
+	urPrintPlatformNativeProperties
+	urPrintProfilingInfo
+	urPrintProgramBinaryType
+	urPrintProgramBuildExpParams
+	urPrintProgramBuildInfo
+	urPrintProgramBuildParams
+	urPrintProgramBuildStatus
+	urPrintProgramCompileExpParams
+	urPrintProgramCompileParams
+	urPrintProgramCreateWithBinaryParams
+	urPrintProgramCreateWithIlParams
+	urPrintProgramCreateWithNativeHandleParams
+	urPrintProgramGetBuildInfoParams
+	urPrintProgramGetFunctionPointerParams
+	urPrintProgramGetGlobalVariablePointerParams
+	urPrintProgramGetInfoParams
+	urPrintProgramGetNativeHandleParams
+	urPrintProgramInfo
+	urPrintProgramLinkExpParams
+	urPrintProgramLinkParams
+	urPrintProgramMetadata
+	urPrintProgramMetadataType
+	urPrintProgramNativeProperties
+	urPrintProgramProperties
+	urPrintProgramReleaseParams
+	urPrintProgramRetainParams
+	urPrintProgramSetSpecializationConstantsParams
+	urPrintQueueCreateParams
+	urPrintQueueCreateWithNativeHandleParams
+	urPrintQueueFinishParams
+	urPrintQueueFlags
+	urPrintQueueFlushParams
+	urPrintQueueGetInfoParams
+	urPrintQueueGetNativeHandleParams
+	urPrintQueueIndexProperties
+	urPrintQueueInfo
+	urPrintQueueNativeDesc
+	urPrintQueueNativeProperties
+	urPrintQueueProperties
+	urPrintQueueReleaseParams
+	urPrintQueueRetainParams
+	urPrintRectOffset
+	urPrintRectRegion
+	urPrintResult
+	urPrintSamplerAddressingMode
+	urPrintSamplerCreateParams
+	urPrintSamplerCreateWithNativeHandleParams
+	urPrintSamplerDesc
+	urPrintSamplerFilterMode
+	urPrintSamplerGetInfoParams
+	urPrintSamplerGetNativeHandleParams
+	urPrintSamplerInfo
+	urPrintSamplerNativeProperties
+	urPrintSamplerReleaseParams
+	urPrintSamplerRetainParams
+	urPrintSpecializationConstantInfo
+	urPrintStructureType
+	urPrintUsmAdviceFlags
+	urPrintUsmAllocInfo
+	urPrintUsmAllocLocationDesc
+	urPrintUsmDesc
+	urPrintUsmDeviceAllocParams
+	urPrintUsmDeviceDesc
+	urPrintUsmDeviceMemFlags
+	urPrintUsmFreeParams
+	urPrintUsmGetMemAllocInfoParams
+	urPrintUsmHostAllocParams
+	urPrintUsmHostDesc
+	urPrintUsmHostMemFlags
+	urPrintUsmImportExpParams
+	urPrintUsmMigrationFlags
+	urPrintUsmP2pDisablePeerAccessExpParams
+	urPrintUsmP2pEnablePeerAccessExpParams
+	urPrintUsmP2pPeerAccessGetInfoExpParams
+	urPrintUsmPitchedAllocExpParams
+	urPrintUsmPoolCreateParams
+	urPrintUsmPoolDesc
+	urPrintUsmPoolFlags
+	urPrintUsmPoolGetInfoParams
+	urPrintUsmPoolInfo
+	urPrintUsmPoolLimitsDesc
+	urPrintUsmPoolReleaseParams
+	urPrintUsmPoolRetainParams
+	urPrintUsmReleaseExpParams
+	urPrintUsmSharedAllocParams
+	urPrintUsmType
+	urPrintVirtualMemAccessFlags
+	urPrintVirtualMemFreeParams
+	urPrintVirtualMemGetInfoParams
+	urPrintVirtualMemGranularityGetInfoParams
+	urPrintVirtualMemGranularityInfo
+	urPrintVirtualMemInfo
+	urPrintVirtualMemMapParams
+	urPrintVirtualMemReserveParams
+	urPrintVirtualMemSetAccessParams
+	urPrintVirtualMemUnmapParams
+	urProgramBuild
+	urProgramBuildExp
+	urProgramCompile
+	urProgramCompileExp
+	urProgramCreateWithBinary
+	urProgramCreateWithIL
+	urProgramCreateWithNativeHandle
+	urProgramGetBuildInfo
+	urProgramGetFunctionPointer
+	urProgramGetGlobalVariablePointer
+	urProgramGetInfo
+	urProgramGetNativeHandle
+	urProgramLink
+	urProgramLinkExp
+	urProgramRelease
+	urProgramRetain
+	urProgramSetSpecializationConstants
+	urQueueCreate
+	urQueueCreateWithNativeHandle
+	urQueueFinish
+	urQueueFlush
+	urQueueGetInfo
+	urQueueGetNativeHandle
+	urQueueRelease
+	urQueueRetain
+	urSamplerCreate
+	urSamplerCreateWithNativeHandle
+	urSamplerGetInfo
+	urSamplerGetNativeHandle
+	urSamplerRelease
+	urSamplerRetain
+	urUSMDeviceAlloc
+	urUSMFree
+	urUSMGetMemAllocInfo
+	urUSMHostAlloc
+	urUSMImportExp
+	urUSMPitchedAllocExp
+	urUSMPoolCreate
+	urUSMPoolGetInfo
+	urUSMPoolRelease
+	urUSMPoolRetain
+	urUSMReleaseExp
+	urUSMSharedAlloc
+	urUsmP2PDisablePeerAccessExp
+	urUsmP2PEnablePeerAccessExp
+	urUsmP2PPeerAccessGetInfoExp
+	urVirtualMemFree
+	urVirtualMemGetInfo
+	urVirtualMemGranularityGetInfo
+	urVirtualMemMap
+	urVirtualMemReserve
+	urVirtualMemSetAccess
+	urVirtualMemUnmap
diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in
new file mode 100644
index 0000000000..fd390f7fc4
--- /dev/null
+++ b/source/loader/loader.map.in
@@ -0,0 +1,541 @@
+@TARGET_LIBNAME@ {
+	global:
+		urAdapterGet;
+		urAdapterGetInfo;
+		urAdapterGetLastError;
+		urAdapterRelease;
+		urAdapterRetain;
+		urBindlessImagesDestroyExternalSemaphoreExp;
+		urBindlessImagesImageAllocateExp;
+		urBindlessImagesImageCopyExp;
+		urBindlessImagesImageFreeExp;
+		urBindlessImagesImageGetInfoExp;
+		urBindlessImagesImportExternalSemaphoreOpaqueFDExp;
+		urBindlessImagesImportOpaqueFDExp;
+		urBindlessImagesMapExternalArrayExp;
+		urBindlessImagesMipmapFreeExp;
+		urBindlessImagesMipmapGetLevelExp;
+		urBindlessImagesReleaseInteropExp;
+		urBindlessImagesSampledImageCreateExp;
+		urBindlessImagesSampledImageHandleDestroyExp;
+		urBindlessImagesSignalExternalSemaphoreExp;
+		urBindlessImagesUnsampledImageCreateExp;
+		urBindlessImagesUnsampledImageHandleDestroyExp;
+		urBindlessImagesWaitExternalSemaphoreExp;
+		urCommandBufferAppendKernelLaunchExp;
+		urCommandBufferAppendMemBufferCopyExp;
+		urCommandBufferAppendMemBufferCopyRectExp;
+		urCommandBufferAppendMemBufferFillExp;
+		urCommandBufferAppendMemBufferReadExp;
+		urCommandBufferAppendMemBufferReadRectExp;
+		urCommandBufferAppendMemBufferWriteExp;
+		urCommandBufferAppendMemBufferWriteRectExp;
+		urCommandBufferAppendUSMAdviseExp;
+		urCommandBufferAppendUSMFillExp;
+		urCommandBufferAppendUSMMemcpyExp;
+		urCommandBufferAppendUSMPrefetchExp;
+		urCommandBufferCommandGetInfoExp;
+		urCommandBufferCreateExp;
+		urCommandBufferEnqueueExp;
+		urCommandBufferFinalizeExp;
+		urCommandBufferGetInfoExp;
+		urCommandBufferReleaseCommandExp;
+		urCommandBufferReleaseExp;
+		urCommandBufferRetainCommandExp;
+		urCommandBufferRetainExp;
+		urCommandBufferUpdateKernelLaunchExp;
+		urContextCreate;
+		urContextCreateWithNativeHandle;
+		urContextGetInfo;
+		urContextGetNativeHandle;
+		urContextRelease;
+		urContextRetain;
+		urContextSetExtendedDeleter;
+		urDeviceCreateWithNativeHandle;
+		urDeviceGet;
+		urDeviceGetGlobalTimestamps;
+		urDeviceGetInfo;
+		urDeviceGetNativeHandle;
+		urDeviceGetSelected;
+		urDevicePartition;
+		urDeviceRelease;
+		urDeviceRetain;
+		urDeviceSelectBinary;
+		urEnqueueCooperativeKernelLaunchExp;
+		urEnqueueDeviceGlobalVariableRead;
+		urEnqueueDeviceGlobalVariableWrite;
+		urEnqueueEventsWait;
+		urEnqueueEventsWaitWithBarrier;
+		urEnqueueKernelLaunch;
+		urEnqueueKernelLaunchCustomExp;
+		urEnqueueMemBufferCopy;
+		urEnqueueMemBufferCopyRect;
+		urEnqueueMemBufferFill;
+		urEnqueueMemBufferMap;
+		urEnqueueMemBufferRead;
+		urEnqueueMemBufferReadRect;
+		urEnqueueMemBufferWrite;
+		urEnqueueMemBufferWriteRect;
+		urEnqueueMemImageCopy;
+		urEnqueueMemImageRead;
+		urEnqueueMemImageWrite;
+		urEnqueueMemUnmap;
+		urEnqueueReadHostPipe;
+		urEnqueueTimestampRecordingExp;
+		urEnqueueUSMAdvise;
+		urEnqueueUSMFill;
+		urEnqueueUSMFill2D;
+		urEnqueueUSMMemcpy;
+		urEnqueueUSMMemcpy2D;
+		urEnqueueUSMPrefetch;
+		urEnqueueWriteHostPipe;
+		urEventCreateWithNativeHandle;
+		urEventGetInfo;
+		urEventGetNativeHandle;
+		urEventGetProfilingInfo;
+		urEventRelease;
+		urEventRetain;
+		urEventSetCallback;
+		urEventWait;
+		urGetBindlessImagesExpProcAddrTable;
+		urGetCommandBufferExpProcAddrTable;
+		urGetContextProcAddrTable;
+		urGetDeviceProcAddrTable;
+		urGetEnqueueExpProcAddrTable;
+		urGetEnqueueProcAddrTable;
+		urGetEventProcAddrTable;
+		urGetGlobalProcAddrTable;
+		urGetKernelExpProcAddrTable;
+		urGetKernelProcAddrTable;
+		urGetMemProcAddrTable;
+		urGetPhysicalMemProcAddrTable;
+		urGetPlatformProcAddrTable;
+		urGetProgramExpProcAddrTable;
+		urGetProgramProcAddrTable;
+		urGetQueueProcAddrTable;
+		urGetSamplerProcAddrTable;
+		urGetUSMExpProcAddrTable;
+		urGetUSMProcAddrTable;
+		urGetUsmP2PExpProcAddrTable;
+		urGetVirtualMemProcAddrTable;
+		urKernelCreate;
+		urKernelCreateWithNativeHandle;
+		urKernelGetGroupInfo;
+		urKernelGetInfo;
+		urKernelGetNativeHandle;
+		urKernelGetSubGroupInfo;
+		urKernelGetSuggestedLocalWorkSize;
+		urKernelRelease;
+		urKernelRetain;
+		urKernelSetArgLocal;
+		urKernelSetArgMemObj;
+		urKernelSetArgPointer;
+		urKernelSetArgSampler;
+		urKernelSetArgValue;
+		urKernelSetExecInfo;
+		urKernelSetSpecializationConstants;
+		urKernelSuggestMaxCooperativeGroupCountExp;
+		urLoaderConfigCreate;
+		urLoaderConfigEnableLayer;
+		urLoaderConfigGetInfo;
+		urLoaderConfigRelease;
+		urLoaderConfigRetain;
+		urLoaderConfigSetCodeLocationCallback;
+		urLoaderInit;
+		urLoaderTearDown;
+		urMemBufferCreate;
+		urMemBufferCreateWithNativeHandle;
+		urMemBufferPartition;
+		urMemGetInfo;
+		urMemGetNativeHandle;
+		urMemImageCreate;
+		urMemImageCreateWithNativeHandle;
+		urMemImageGetInfo;
+		urMemRelease;
+		urMemRetain;
+		urPhysicalMemCreate;
+		urPhysicalMemRelease;
+		urPhysicalMemRetain;
+		urPlatformCreateWithNativeHandle;
+		urPlatformGet;
+		urPlatformGetApiVersion;
+		urPlatformGetBackendOption;
+		urPlatformGetInfo;
+		urPlatformGetNativeHandle;
+		urPrintAdapterBackend;
+		urPrintAdapterGetInfoParams;
+		urPrintAdapterGetLastErrorParams;
+		urPrintAdapterGetParams;
+		urPrintAdapterInfo;
+		urPrintAdapterReleaseParams;
+		urPrintAdapterRetainParams;
+		urPrintApiVersion;
+		urPrintBaseDesc;
+		urPrintBaseProperties;
+		urPrintBindlessImagesDestroyExternalSemaphoreExpParams;
+		urPrintBindlessImagesImageAllocateExpParams;
+		urPrintBindlessImagesImageCopyExpParams;
+		urPrintBindlessImagesImageFreeExpParams;
+		urPrintBindlessImagesImageGetInfoExpParams;
+		urPrintBindlessImagesImportExternalSemaphoreOpaqueFdExpParams;
+		urPrintBindlessImagesImportOpaqueFdExpParams;
+		urPrintBindlessImagesMapExternalArrayExpParams;
+		urPrintBindlessImagesMipmapFreeExpParams;
+		urPrintBindlessImagesMipmapGetLevelExpParams;
+		urPrintBindlessImagesReleaseInteropExpParams;
+		urPrintBindlessImagesSampledImageCreateExpParams;
+		urPrintBindlessImagesSampledImageHandleDestroyExpParams;
+		urPrintBindlessImagesSignalExternalSemaphoreExpParams;
+		urPrintBindlessImagesUnsampledImageCreateExpParams;
+		urPrintBindlessImagesUnsampledImageHandleDestroyExpParams;
+		urPrintBindlessImagesWaitExternalSemaphoreExpParams;
+		urPrintBufferAllocLocationProperties;
+		urPrintBufferChannelProperties;
+		urPrintBufferCreateType;
+		urPrintBufferProperties;
+		urPrintBufferRegion;
+		urPrintCodeLocation;
+		urPrintCommand;
+		urPrintCommandBufferAppendKernelLaunchExpParams;
+		urPrintCommandBufferAppendMemBufferCopyExpParams;
+		urPrintCommandBufferAppendMemBufferCopyRectExpParams;
+		urPrintCommandBufferAppendMemBufferFillExpParams;
+		urPrintCommandBufferAppendMemBufferReadExpParams;
+		urPrintCommandBufferAppendMemBufferReadRectExpParams;
+		urPrintCommandBufferAppendMemBufferWriteExpParams;
+		urPrintCommandBufferAppendMemBufferWriteRectExpParams;
+		urPrintCommandBufferAppendUsmAdviseExpParams;
+		urPrintCommandBufferAppendUsmFillExpParams;
+		urPrintCommandBufferAppendUsmMemcpyExpParams;
+		urPrintCommandBufferAppendUsmPrefetchExpParams;
+		urPrintCommandBufferCommandGetInfoExpParams;
+		urPrintCommandBufferCreateExpParams;
+		urPrintCommandBufferEnqueueExpParams;
+		urPrintCommandBufferFinalizeExpParams;
+		urPrintCommandBufferGetInfoExpParams;
+		urPrintCommandBufferReleaseCommandExpParams;
+		urPrintCommandBufferReleaseExpParams;
+		urPrintCommandBufferRetainCommandExpParams;
+		urPrintCommandBufferRetainExpParams;
+		urPrintCommandBufferUpdateKernelLaunchExpParams;
+		urPrintContextCreateParams;
+		urPrintContextCreateWithNativeHandleParams;
+		urPrintContextFlags;
+		urPrintContextGetInfoParams;
+		urPrintContextGetNativeHandleParams;
+		urPrintContextInfo;
+		urPrintContextNativeProperties;
+		urPrintContextProperties;
+		urPrintContextReleaseParams;
+		urPrintContextRetainParams;
+		urPrintContextSetExtendedDeleterParams;
+		urPrintDeviceAffinityDomainFlags;
+		urPrintDeviceBinary;
+		urPrintDeviceCreateWithNativeHandleParams;
+		urPrintDeviceExecCapabilityFlags;
+		urPrintDeviceFpCapabilityFlags;
+		urPrintDeviceGetGlobalTimestampsParams;
+		urPrintDeviceGetInfoParams;
+		urPrintDeviceGetNativeHandleParams;
+		urPrintDeviceGetParams;
+		urPrintDeviceGetSelectedParams;
+		urPrintDeviceInfo;
+		urPrintDeviceInitFlags;
+		urPrintDeviceLocalMemType;
+		urPrintDeviceMemCacheType;
+		urPrintDeviceNativeProperties;
+		urPrintDevicePartition;
+		urPrintDevicePartitionParams;
+		urPrintDevicePartitionProperties;
+		urPrintDevicePartitionProperty;
+		urPrintDeviceReleaseParams;
+		urPrintDeviceRetainParams;
+		urPrintDeviceSelectBinaryParams;
+		urPrintDeviceType;
+		urPrintDeviceUsmAccessCapabilityFlags;
+		urPrintEnqueueCooperativeKernelLaunchExpParams;
+		urPrintEnqueueDeviceGlobalVariableReadParams;
+		urPrintEnqueueDeviceGlobalVariableWriteParams;
+		urPrintEnqueueEventsWaitParams;
+		urPrintEnqueueEventsWaitWithBarrierParams;
+		urPrintEnqueueKernelLaunchCustomExpParams;
+		urPrintEnqueueKernelLaunchParams;
+		urPrintEnqueueMemBufferCopyParams;
+		urPrintEnqueueMemBufferCopyRectParams;
+		urPrintEnqueueMemBufferFillParams;
+		urPrintEnqueueMemBufferMapParams;
+		urPrintEnqueueMemBufferReadParams;
+		urPrintEnqueueMemBufferReadRectParams;
+		urPrintEnqueueMemBufferWriteParams;
+		urPrintEnqueueMemBufferWriteRectParams;
+		urPrintEnqueueMemImageCopyParams;
+		urPrintEnqueueMemImageReadParams;
+		urPrintEnqueueMemImageWriteParams;
+		urPrintEnqueueMemUnmapParams;
+		urPrintEnqueueReadHostPipeParams;
+		urPrintEnqueueTimestampRecordingExpParams;
+		urPrintEnqueueUsmAdviseParams;
+		urPrintEnqueueUsmFillParams;
+		urPrintEnqueueUsmFill_2dParams;
+		urPrintEnqueueUsmMemcpyParams;
+		urPrintEnqueueUsmMemcpy_2dParams;
+		urPrintEnqueueUsmPrefetchParams;
+		urPrintEnqueueWriteHostPipeParams;
+		urPrintEventCreateWithNativeHandleParams;
+		urPrintEventGetInfoParams;
+		urPrintEventGetNativeHandleParams;
+		urPrintEventGetProfilingInfoParams;
+		urPrintEventInfo;
+		urPrintEventNativeProperties;
+		urPrintEventReleaseParams;
+		urPrintEventRetainParams;
+		urPrintEventSetCallbackParams;
+		urPrintEventStatus;
+		urPrintEventWaitParams;
+		urPrintExecutionInfo;
+		urPrintExpCommandBufferCommandInfo;
+		urPrintExpCommandBufferDesc;
+		urPrintExpCommandBufferInfo;
+		urPrintExpCommandBufferUpdateKernelLaunchDesc;
+		urPrintExpCommandBufferUpdateMemobjArgDesc;
+		urPrintExpCommandBufferUpdatePointerArgDesc;
+		urPrintExpCommandBufferUpdateValueArgDesc;
+		urPrintExpFileDescriptor;
+		urPrintExpImageCopyFlags;
+		urPrintExpInteropMemDesc;
+		urPrintExpInteropSemaphoreDesc;
+		urPrintExpLaunchProperty;
+		urPrintExpLaunchPropertyId;
+		urPrintExpPeerInfo;
+		urPrintExpSamplerAddrModes;
+		urPrintExpSamplerCubemapFilterMode;
+		urPrintExpSamplerCubemapProperties;
+		urPrintExpSamplerMipProperties;
+		urPrintExpWin32Handle;
+		urPrintFunction;
+		urPrintFunctionParams;
+		urPrintImageChannelOrder;
+		urPrintImageChannelType;
+		urPrintImageDesc;
+		urPrintImageFormat;
+		urPrintImageInfo;
+		urPrintKernelArgLocalProperties;
+		urPrintKernelArgMemObjProperties;
+		urPrintKernelArgPointerProperties;
+		urPrintKernelArgSamplerProperties;
+		urPrintKernelArgValueProperties;
+		urPrintKernelCacheConfig;
+		urPrintKernelCreateParams;
+		urPrintKernelCreateWithNativeHandleParams;
+		urPrintKernelExecInfo;
+		urPrintKernelExecInfoProperties;
+		urPrintKernelGetGroupInfoParams;
+		urPrintKernelGetInfoParams;
+		urPrintKernelGetNativeHandleParams;
+		urPrintKernelGetSubGroupInfoParams;
+		urPrintKernelGetSuggestedLocalWorkSizeParams;
+		urPrintKernelGroupInfo;
+		urPrintKernelInfo;
+		urPrintKernelNativeProperties;
+		urPrintKernelReleaseParams;
+		urPrintKernelRetainParams;
+		urPrintKernelSetArgLocalParams;
+		urPrintKernelSetArgMemObjParams;
+		urPrintKernelSetArgPointerParams;
+		urPrintKernelSetArgSamplerParams;
+		urPrintKernelSetArgValueParams;
+		urPrintKernelSetExecInfoParams;
+		urPrintKernelSetSpecializationConstantsParams;
+		urPrintKernelSubGroupInfo;
+		urPrintKernelSuggestMaxCooperativeGroupCountExpParams;
+		urPrintLoaderConfigCreateParams;
+		urPrintLoaderConfigEnableLayerParams;
+		urPrintLoaderConfigGetInfoParams;
+		urPrintLoaderConfigInfo;
+		urPrintLoaderConfigReleaseParams;
+		urPrintLoaderConfigRetainParams;
+		urPrintLoaderConfigSetCodeLocationCallbackParams;
+		urPrintLoaderInitParams;
+		urPrintLoaderTearDownParams;
+		urPrintMapFlags;
+		urPrintMemBufferCreateParams;
+		urPrintMemBufferCreateWithNativeHandleParams;
+		urPrintMemBufferPartitionParams;
+		urPrintMemFlags;
+		urPrintMemGetInfoParams;
+		urPrintMemGetNativeHandleParams;
+		urPrintMemImageCreateParams;
+		urPrintMemImageCreateWithNativeHandleParams;
+		urPrintMemImageGetInfoParams;
+		urPrintMemInfo;
+		urPrintMemNativeProperties;
+		urPrintMemReleaseParams;
+		urPrintMemRetainParams;
+		urPrintMemType;
+		urPrintMemoryOrderCapabilityFlags;
+		urPrintMemoryScopeCapabilityFlags;
+		urPrintPhysicalMemCreateParams;
+		urPrintPhysicalMemFlags;
+		urPrintPhysicalMemProperties;
+		urPrintPhysicalMemReleaseParams;
+		urPrintPhysicalMemRetainParams;
+		urPrintPlatformBackend;
+		urPrintPlatformCreateWithNativeHandleParams;
+		urPrintPlatformGetApiVersionParams;
+		urPrintPlatformGetBackendOptionParams;
+		urPrintPlatformGetInfoParams;
+		urPrintPlatformGetNativeHandleParams;
+		urPrintPlatformGetParams;
+		urPrintPlatformInfo;
+		urPrintPlatformNativeProperties;
+		urPrintProfilingInfo;
+		urPrintProgramBinaryType;
+		urPrintProgramBuildExpParams;
+		urPrintProgramBuildInfo;
+		urPrintProgramBuildParams;
+		urPrintProgramBuildStatus;
+		urPrintProgramCompileExpParams;
+		urPrintProgramCompileParams;
+		urPrintProgramCreateWithBinaryParams;
+		urPrintProgramCreateWithIlParams;
+		urPrintProgramCreateWithNativeHandleParams;
+		urPrintProgramGetBuildInfoParams;
+		urPrintProgramGetFunctionPointerParams;
+		urPrintProgramGetGlobalVariablePointerParams;
+		urPrintProgramGetInfoParams;
+		urPrintProgramGetNativeHandleParams;
+		urPrintProgramInfo;
+		urPrintProgramLinkExpParams;
+		urPrintProgramLinkParams;
+		urPrintProgramMetadata;
+		urPrintProgramMetadataType;
+		urPrintProgramNativeProperties;
+		urPrintProgramProperties;
+		urPrintProgramReleaseParams;
+		urPrintProgramRetainParams;
+		urPrintProgramSetSpecializationConstantsParams;
+		urPrintQueueCreateParams;
+		urPrintQueueCreateWithNativeHandleParams;
+		urPrintQueueFinishParams;
+		urPrintQueueFlags;
+		urPrintQueueFlushParams;
+		urPrintQueueGetInfoParams;
+		urPrintQueueGetNativeHandleParams;
+		urPrintQueueIndexProperties;
+		urPrintQueueInfo;
+		urPrintQueueNativeDesc;
+		urPrintQueueNativeProperties;
+		urPrintQueueProperties;
+		urPrintQueueReleaseParams;
+		urPrintQueueRetainParams;
+		urPrintRectOffset;
+		urPrintRectRegion;
+		urPrintResult;
+		urPrintSamplerAddressingMode;
+		urPrintSamplerCreateParams;
+		urPrintSamplerCreateWithNativeHandleParams;
+		urPrintSamplerDesc;
+		urPrintSamplerFilterMode;
+		urPrintSamplerGetInfoParams;
+		urPrintSamplerGetNativeHandleParams;
+		urPrintSamplerInfo;
+		urPrintSamplerNativeProperties;
+		urPrintSamplerReleaseParams;
+		urPrintSamplerRetainParams;
+		urPrintSpecializationConstantInfo;
+		urPrintStructureType;
+		urPrintUsmAdviceFlags;
+		urPrintUsmAllocInfo;
+		urPrintUsmAllocLocationDesc;
+		urPrintUsmDesc;
+		urPrintUsmDeviceAllocParams;
+		urPrintUsmDeviceDesc;
+		urPrintUsmDeviceMemFlags;
+		urPrintUsmFreeParams;
+		urPrintUsmGetMemAllocInfoParams;
+		urPrintUsmHostAllocParams;
+		urPrintUsmHostDesc;
+		urPrintUsmHostMemFlags;
+		urPrintUsmImportExpParams;
+		urPrintUsmMigrationFlags;
+		urPrintUsmP2pDisablePeerAccessExpParams;
+		urPrintUsmP2pEnablePeerAccessExpParams;
+		urPrintUsmP2pPeerAccessGetInfoExpParams;
+		urPrintUsmPitchedAllocExpParams;
+		urPrintUsmPoolCreateParams;
+		urPrintUsmPoolDesc;
+		urPrintUsmPoolFlags;
+		urPrintUsmPoolGetInfoParams;
+		urPrintUsmPoolInfo;
+		urPrintUsmPoolLimitsDesc;
+		urPrintUsmPoolReleaseParams;
+		urPrintUsmPoolRetainParams;
+		urPrintUsmReleaseExpParams;
+		urPrintUsmSharedAllocParams;
+		urPrintUsmType;
+		urPrintVirtualMemAccessFlags;
+		urPrintVirtualMemFreeParams;
+		urPrintVirtualMemGetInfoParams;
+		urPrintVirtualMemGranularityGetInfoParams;
+		urPrintVirtualMemGranularityInfo;
+		urPrintVirtualMemInfo;
+		urPrintVirtualMemMapParams;
+		urPrintVirtualMemReserveParams;
+		urPrintVirtualMemSetAccessParams;
+		urPrintVirtualMemUnmapParams;
+		urProgramBuild;
+		urProgramBuildExp;
+		urProgramCompile;
+		urProgramCompileExp;
+		urProgramCreateWithBinary;
+		urProgramCreateWithIL;
+		urProgramCreateWithNativeHandle;
+		urProgramGetBuildInfo;
+		urProgramGetFunctionPointer;
+		urProgramGetGlobalVariablePointer;
+		urProgramGetInfo;
+		urProgramGetNativeHandle;
+		urProgramLink;
+		urProgramLinkExp;
+		urProgramRelease;
+		urProgramRetain;
+		urProgramSetSpecializationConstants;
+		urQueueCreate;
+		urQueueCreateWithNativeHandle;
+		urQueueFinish;
+		urQueueFlush;
+		urQueueGetInfo;
+		urQueueGetNativeHandle;
+		urQueueRelease;
+		urQueueRetain;
+		urSamplerCreate;
+		urSamplerCreateWithNativeHandle;
+		urSamplerGetInfo;
+		urSamplerGetNativeHandle;
+		urSamplerRelease;
+		urSamplerRetain;
+		urUSMDeviceAlloc;
+		urUSMFree;
+		urUSMGetMemAllocInfo;
+		urUSMHostAlloc;
+		urUSMImportExp;
+		urUSMPitchedAllocExp;
+		urUSMPoolCreate;
+		urUSMPoolGetInfo;
+		urUSMPoolRelease;
+		urUSMPoolRetain;
+		urUSMReleaseExp;
+		urUSMSharedAlloc;
+		urUsmP2PDisablePeerAccessExp;
+		urUsmP2PEnablePeerAccessExp;
+		urUsmP2PPeerAccessGetInfoExp;
+		urVirtualMemFree;
+		urVirtualMemGetInfo;
+		urVirtualMemGranularityGetInfo;
+		urVirtualMemMap;
+		urVirtualMemReserve;
+		urVirtualMemSetAccess;
+		urVirtualMemUnmap;
+	local:
+		*;
+};
diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp
index d7a9447b06..ed7e4299b1 100644
--- a/source/loader/ur_ldrddi.cpp
+++ b/source/loader/ur_ldrddi.cpp
@@ -22,7 +22,6 @@ ur_event_factory_t ur_event_factory;
 ur_program_factory_t ur_program_factory;
 ur_kernel_factory_t ur_kernel_factory;
 ur_queue_factory_t ur_queue_factory;
-ur_native_factory_t ur_native_factory;
 ur_sampler_factory_t ur_sampler_factory;
 ur_mem_factory_t ur_mem_factory;
 ur_physical_mem_factory_t ur_physical_mem_factory;
@@ -364,6 +363,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle(
 __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     ur_native_handle_t
         hNativePlatform, ///< [in][nocheck] the native handle of the platform.
+    ur_adapter_handle_t
+        hAdapter, ///< [in] handle of the adapter associated with the native backend.
     const ur_platform_native_properties_t *
         pProperties, ///< [in][optional] pointer to native platform properties struct.
     ur_platform_handle_t *
@@ -372,8 +373,7 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     ur_result_t result = UR_RESULT_SUCCESS;
 
     // extract platform's function pointer table
-    auto dditable =
-        reinterpret_cast<ur_native_object_t *>(hNativePlatform)->dditable;
+    auto dditable = reinterpret_cast<ur_adapter_object_t *>(hAdapter)->dditable;
     auto pfnCreateWithNativeHandle =
         dditable->ur.Platform.pfnCreateWithNativeHandle;
     if (nullptr == pfnCreateWithNativeHandle) {
@@ -381,12 +381,11 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     }
 
     // convert loader handle to platform handle
-    hNativePlatform =
-        reinterpret_cast<ur_native_object_t *>(hNativePlatform)->handle;
+    hAdapter = reinterpret_cast<ur_adapter_object_t *>(hAdapter)->handle;
 
     // forward to device-platform
-    result =
-        pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform);
+    result = pfnCreateWithNativeHandle(hNativePlatform, hAdapter, pProperties,
+                                       phPlatform);
 
     if (UR_RESULT_SUCCESS != result) {
         return result;
@@ -3449,6 +3448,49 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize
+__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in each of the numWorkDim dimensions
+    ///< that will execute the kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that
+    ///< receives the suggested local work size produced by the query
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+
+    // extract the function pointer table stored in the kernel's loader object
+    auto dditable = reinterpret_cast<ur_kernel_object_t *>(hKernel)->dditable;
+    auto pfnGetSuggestedLocalWorkSize =
+        dditable->ur.Kernel.pfnGetSuggestedLocalWorkSize;
+    if (nullptr == pfnGetSuggestedLocalWorkSize) {
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    // convert the loader kernel handle to the underlying platform handle
+    hKernel = reinterpret_cast<ur_kernel_object_t *>(hKernel)->handle;
+
+    // convert the loader queue handle to the underlying platform handle
+    hQueue = reinterpret_cast<ur_queue_object_t *>(hQueue)->handle;
+
+    // forward the call, now carrying unwrapped handles, to the device-platform
+    result = pfnGetSuggestedLocalWorkSize(hKernel, hQueue, numWorkDim,
+                                          pGlobalWorkOffset, pGlobalWorkSize,
+                                          pSuggestedLocalWorkSize);
+
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Intercept function for urQueueGetInfo
 __urdlllocal ur_result_t UR_APICALL urQueueGetInfo(
@@ -8599,6 +8641,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
             pDdiTable->pfnGetNativeHandle = ur_loader::urKernelGetNativeHandle;
             pDdiTable->pfnCreateWithNativeHandle =
                 ur_loader::urKernelCreateWithNativeHandle;
+            pDdiTable->pfnGetSuggestedLocalWorkSize =
+                ur_loader::urKernelGetSuggestedLocalWorkSize;
             pDdiTable->pfnSetArgValue = ur_loader::urKernelSetArgValue;
             pDdiTable->pfnSetArgLocal = ur_loader::urKernelSetArgLocal;
             pDdiTable->pfnSetArgPointer = ur_loader::urKernelSetArgPointer;
diff --git a/source/loader/ur_ldrddi.hpp b/source/loader/ur_ldrddi.hpp
index d98b99a655..af5b5d49b5 100644
--- a/source/loader/ur_ldrddi.hpp
+++ b/source/loader/ur_ldrddi.hpp
@@ -49,10 +49,6 @@ using ur_queue_object_t = object_t<ur_queue_handle_t>;
 using ur_queue_factory_t =
     singleton_factory_t<ur_queue_object_t, ur_queue_handle_t>;
 
-using ur_native_object_t = object_t<ur_native_handle_t>;
-using ur_native_factory_t =
-    singleton_factory_t<ur_native_object_t, ur_native_handle_t>;
-
 using ur_sampler_object_t = object_t<ur_sampler_handle_t>;
 using ur_sampler_factory_t =
     singleton_factory_t<ur_sampler_object_t, ur_sampler_handle_t>;
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 35e5d68e36..1bc1fc82f0 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -653,6 +653,8 @@ ur_result_t UR_APICALL urPlatformGetNativeHandle(
 ///     - ::UR_RESULT_ERROR_UNINITIALIZED
 ///     - ::UR_RESULT_ERROR_DEVICE_LOST
 ///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hAdapter`
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `NULL == phPlatform`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
@@ -660,6 +662,8 @@ ur_result_t UR_APICALL urPlatformGetNativeHandle(
 ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     ur_native_handle_t
         hNativePlatform, ///< [in][nocheck] the native handle of the platform.
+    ur_adapter_handle_t
+        hAdapter, ///< [in] handle of the adapter associated with the native backend.
     const ur_platform_native_properties_t *
         pProperties, ///< [in][optional] pointer to native platform properties struct.
     ur_platform_handle_t *
@@ -671,7 +675,8 @@ ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
         return UR_RESULT_ERROR_UNINITIALIZED;
     }
 
-    return pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform);
+    return pfnCreateWithNativeHandle(hNativePlatform, hAdapter, pProperties,
+                                     phPlatform);
 } catch (...) {
     return exceptionToResult(std::current_exception());
 }
@@ -1470,6 +1475,22 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter(
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Create an image object
 ///
+/// @details
+///     - The primary ::ur_image_format_t values that must be supported by
+///       all adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA,
+///       UR_IMAGE_CHANNEL_TYPE_UNORM_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}.
+///
 /// @remarks
 ///   _Analogues_
 ///     - **clCreateImage**
@@ -1490,12 +1511,13 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ///     - ::UR_RESULT_ERROR_INVALID_HOST_PTR
 ///         + `pHost == NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) != 0`
 ///         + `pHost != NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) == 0`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT
 ///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
 ur_result_t UR_APICALL urMemImageCreate(
@@ -4096,6 +4118,58 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return exceptionToResult(std::current_exception());
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///       each dimension.
+///     - The application may call this function from simultaneous threads for
+///       the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in each of the numWorkDim dimensions
+    ///< that will execute the kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that
+    ///< receives the suggested local work size produced by the query
+    ) try {
+    auto pfnGetSuggestedLocalWorkSize =
+        ur_lib::context->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize;
+    if (nullptr == pfnGetSuggestedLocalWorkSize) {
+        return UR_RESULT_ERROR_UNINITIALIZED;
+    }
+
+    return pfnGetSuggestedLocalWorkSize(hKernel, hQueue, numWorkDim,
+                                        pGlobalWorkOffset, pGlobalWorkSize,
+                                        pSuggestedLocalWorkSize);
+} catch (...) {
+    return exceptionToResult(std::current_exception());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Query information about a command queue
 ///
@@ -4120,7 +4194,7 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `propSize != 0 && pPropValue == NULL`
 ///         + `pPropValue == NULL && pPropSizeRet == NULL`
-///     - ::UR_RESULT_ERROR_INVALID_QUEUE
+///     - ::UR_RESULT_ERROR_INVALID_QUEUE - "If `hQueue` isn't a valid queue handle or if `propName` isn't supported by `hQueue`."
 ///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
 ur_result_t UR_APICALL urQueueGetInfo(
@@ -6525,7 +6599,7 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageHandleDestroyExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
@@ -6607,7 +6681,7 @@ ur_result_t UR_APICALL urBindlessImagesImageFreeExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
@@ -6658,7 +6732,7 @@ ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_SAMPLER
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
@@ -6713,7 +6787,7 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
 ///     - ::UR_RESULT_ERROR_INVALID_QUEUE
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
@@ -6936,7 +7010,7 @@ ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
@@ -8065,7 +8139,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Update a kernel launch command in a finalized command-buffer.
+/// @brief Update a kernel launch command in a finalized command-buffer. This
+///        entry-point is synchronous and may block if the command-buffer is
+///        executing when the entry-point is called.
 ///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp
index 5af2165ea4..79107c733d 100644
--- a/source/loader/ur_print.cpp
+++ b/source/loader/ur_print.cpp
@@ -1821,6 +1821,14 @@ ur_result_t urPrintKernelCreateWithNativeHandleParams(
     return str_copy(&ss, buffer, buff_size, out_size);
 }
 
+ur_result_t urPrintKernelGetSuggestedLocalWorkSizeParams(
+    const struct ur_kernel_get_suggested_local_work_size_params_t *params,
+    char *buffer, const size_t buff_size, size_t *out_size) {
+    std::stringstream stream;
+    stream << params;
+    return str_copy(&stream, buffer, buff_size, out_size);
+}
+
 ur_result_t urPrintKernelSetArgValueParams(
     const struct ur_kernel_set_arg_value_params_t *params, char *buffer,
     const size_t buff_size, size_t *out_size) {
diff --git a/source/loader/windows/adapter_search.cpp b/source/loader/windows/adapter_search.cpp
index 609d5576f2..b514897d91 100644
--- a/source/loader/windows/adapter_search.cpp
+++ b/source/loader/windows/adapter_search.cpp
@@ -23,8 +23,17 @@ namespace fs = filesystem;
 namespace ur_loader {
 
 std::optional<fs::path> getLoaderLibPath() {
+    HMODULE hModule = NULL;
     char pathStr[MAX_PATH_LEN_WIN];
-    if (GetModuleFileNameA(nullptr, pathStr, MAX_PATH_LEN_WIN)) {
+
+    // Resolve the module that contains this function (the loader library)
+    // instead of the host executable. The explicit ANSI entry point keeps the
+    // LPCSTR cast well-typed even when the build defines UNICODE.
+    if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                               GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                           reinterpret_cast<LPCSTR>(&getLoaderLibPath),
+                           &hModule) &&
+        GetModuleFileNameA(hModule, pathStr, MAX_PATH_LEN_WIN)) {
         auto libPath = fs::path(pathStr);
         if (fs::exists(libPath)) {
             return fs::absolute(libPath).parent_path();
diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp
index ccca6a756b..629ba277c8 100644
--- a/source/ur/ur.hpp
+++ b/source/ur/ur.hpp
@@ -337,7 +337,9 @@ roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
 
-// Returns whether or not Value is a power of 2
+// Returns true only when Value is a positive power of 2. Zero and negative
+// values yield false; testing the sign first also keeps `Value - 1` from
+// overflowing when Value is the minimum of a signed type.
 template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
-bool isPowerOf2(const T &Value) {
-  return Value && !(Value & (Value - 1));
+bool isPowerOf2(const T Value) {
+  return Value > 0 && !(Value & (Value - 1));
 }
 
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index bf90700e7d..07202a8aad 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -569,6 +569,8 @@ ur_result_t UR_APICALL urPlatformGetNativeHandle(
 ///     - ::UR_RESULT_ERROR_UNINITIALIZED
 ///     - ::UR_RESULT_ERROR_DEVICE_LOST
 ///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hAdapter`
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `NULL == phPlatform`
 ///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
@@ -576,6 +578,8 @@ ur_result_t UR_APICALL urPlatformGetNativeHandle(
 ur_result_t UR_APICALL urPlatformCreateWithNativeHandle(
     ur_native_handle_t
         hNativePlatform, ///< [in][nocheck] the native handle of the platform.
+    ur_adapter_handle_t
+        hAdapter, ///< [in] handle of the adapter associated with the native backend.
     const ur_platform_native_properties_t *
         pProperties, ///< [in][optional] pointer to native platform properties struct.
     ur_platform_handle_t *
@@ -1265,6 +1269,22 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter(
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Create an image object
 ///
+/// @details
+///     - The primary ::ur_image_format_t values that must be supported by
+///       all adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA,
+///       UR_IMAGE_CHANNEL_TYPE_UNORM_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT},
+///       {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}.
+///
 /// @remarks
 ///   _Analogues_
 ///     - **clCreateImage**
@@ -1285,12 +1305,13 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ///     - ::UR_RESULT_ERROR_INVALID_HOST_PTR
 ///         + `pHost == NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) != 0`
 ///         + `pHost != NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) == 0`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT
 ///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
 ur_result_t UR_APICALL urMemImageCreate(
@@ -3472,6 +3493,49 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
     return result;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get the suggested local work size for a kernel.
+///
+/// @details
+///     - Query a suggested local work size for a kernel given a global size for
+///       each dimension.
+///     - The application may call this function from simultaneous threads for
+///       the same context.
+///
+/// @returns
+///     - ::UR_RESULT_SUCCESS
+///     - ::UR_RESULT_ERROR_UNINITIALIZED
+///     - ::UR_RESULT_ERROR_DEVICE_LOST
+///     - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+///     - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `NULL == hKernel`
+///         + `NULL == hQueue`
+///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
+///         + `NULL == pGlobalWorkSize`
+///         + `NULL == pSuggestedLocalWorkSize`
+///     - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE
+ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize(
+    ur_kernel_handle_t hKernel, ///< [in] handle of the kernel
+    ur_queue_handle_t hQueue,   ///< [in] handle of the queue object
+    uint32_t
+        numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global
+                    ///< and work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the offset used to calculate the global ID of a work-item
+    const size_t *
+        pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify
+    ///< the number of global work-items in each of the numWorkDim dimensions
+    ///< that will execute the kernel function
+    size_t *
+        pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that
+    ///< receives the suggested local work size produced by the query
+) {
+    ur_result_t result = UR_RESULT_SUCCESS;
+    return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Query information about a command queue
 ///
@@ -3496,7 +3560,7 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `propSize != 0 && pPropValue == NULL`
 ///         + `pPropValue == NULL && pPropSizeRet == NULL`
-///     - ::UR_RESULT_ERROR_INVALID_QUEUE
+///     - ::UR_RESULT_ERROR_INVALID_QUEUE - "If `hQueue` isn't a valid queue handle or if `propName` isn't supported by `hQueue`."
 ///     - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
 ur_result_t UR_APICALL urQueueGetInfo(
@@ -5572,7 +5636,7 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageHandleDestroyExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
@@ -5639,7 +5703,7 @@ ur_result_t UR_APICALL urBindlessImagesImageFreeExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
@@ -5681,7 +5745,7 @@ ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_SAMPLER
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
@@ -5728,7 +5792,7 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
 ///     - ::UR_RESULT_ERROR_INVALID_QUEUE
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
@@ -5911,7 +5975,7 @@ ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp(
 ///     - ::UR_RESULT_ERROR_INVALID_CONTEXT
 ///     - ::UR_RESULT_ERROR_INVALID_VALUE
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR
-///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`
+///         + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`
 ///     - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE
 ///     - ::UR_RESULT_ERROR_INVALID_OPERATION
 ///     - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
@@ -6825,7 +6889,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Update a kernel launch command in a finalized command-buffer.
+/// @brief Update a kernel launch command in a finalized command-buffer. This
+///        entry-point is synchronous and may block if the command-buffer is
+///        executing when the entry-point is called.
 ///
 /// @returns
 ///     - ::UR_RESULT_SUCCESS
diff --git a/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp b/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp
index 19e4de21ec..62c667b242 100644
--- a/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp
+++ b/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp
@@ -3,9 +3,9 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "level_zero/ze_api.h"
 #include "ur_api.h"
 #include "uur/checks.h"
+#include "ze_api.h"
 #include <uur/fixtures.h>
 
 using urLevelZeroKernelNativeHandleTest = uur::urContextTest;
diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt
index 79cefdd06f..439b9b7a79 100644
--- a/test/conformance/CMakeLists.txt
+++ b/test/conformance/CMakeLists.txt
@@ -5,6 +5,10 @@
 
 set(UR_CONFORMANCE_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
+set(UR_CONFORMANCE_DEVICE_BINARIES_DIR
+    "${CMAKE_CURRENT_BINARY_DIR}/device_binaries" CACHE INTERNAL
+    "Internal cache variable for device binaries directory")
+
 function(add_test_adapter name adapter)
     set(TEST_TARGET_NAME test-${name})
     set(TEST_NAME ${name}-${adapter})
@@ -140,6 +144,7 @@ if(UR_DPCXX)
     add_subdirectory(exp_command_buffer)
     add_subdirectory(exp_usm_p2p)
     add_subdirectory(exp_launch_properties)
+    add_subdirectory(memory-migrate)
 else()
     message(WARNING
         "UR_DPCXX is not defined, the following conformance test executables \
diff --git a/test/conformance/context/context_adapter_hip.match b/test/conformance/context/context_adapter_hip.match
index 0c9d1530b7..cecf5c8747 100644
--- a/test/conformance/context/context_adapter_hip.match
+++ b/test/conformance/context/context_adapter_hip.match
@@ -1,4 +1,3 @@
 urContextCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_
 urContextCreateWithNativeHandleTest.SuccessWithOwnedNativeHandle/AMD_HIP_BACKEND___{{.*}}_
 urContextCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle/AMD_HIP_BACKEND___{{.*}}_
-urContextGetInfoTestWithInfoParam.Success/AMD_HIP_BACKEND___{{.*}}
diff --git a/test/conformance/context/urContextGetInfo.cpp b/test/conformance/context/urContextGetInfo.cpp
index 1d75718d87..f9f699d511 100644
--- a/test/conformance/context/urContextGetInfo.cpp
+++ b/test/conformance/context/urContextGetInfo.cpp
@@ -2,7 +2,6 @@
 // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#include <map>
 #include <uur/fixtures.h>
 
 struct urContextGetInfoTestWithInfoParam
@@ -14,9 +13,7 @@ struct urContextGetInfoTestWithInfoParam
 
         ctx_info_size_map = {
             {UR_CONTEXT_INFO_NUM_DEVICES, sizeof(uint32_t)},
-            {UR_CONTEXT_INFO_DEVICES,
-             sizeof(ur_device_handle_t) *
-                 uur::DevicesEnvironment::instance->devices.size()},
+            {UR_CONTEXT_INFO_DEVICES, sizeof(ur_device_handle_t)},
             {UR_CONTEXT_INFO_REFERENCE_COUNT, sizeof(uint32_t)},
             {UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT, sizeof(bool)},
             {UR_CONTEXT_INFO_USM_FILL2D_SUPPORT, sizeof(bool)},
diff --git a/test/conformance/device/device_adapter_hip.match b/test/conformance/device/device_adapter_hip.match
index f64efa4bac..9989fbd774 100644
--- a/test/conformance/device/device_adapter_hip.match
+++ b/test/conformance/device/device_adapter_hip.match
@@ -1,4 +1,2 @@
-urDeviceCreateWithNativeHandleTest.Success
-urDeviceCreateWithNativeHandleTest.SuccessWithOwnedNativeHandle
 urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle
 {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime
diff --git a/test/conformance/device/device_adapter_level_zero.match b/test/conformance/device/device_adapter_level_zero.match
index 1f735d7a77..b65a51b9cb 100644
--- a/test/conformance/device/device_adapter_level_zero.match
+++ b/test/conformance/device/device_adapter_level_zero.match
@@ -1,10 +1,7 @@
 urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle
+{{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_UNIFIED_MEMORY
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_AVAILABLE
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH
-urDeviceGetInfoTest.Success/UR_DEVICE_INFO_BUILD_ON_SUBDEVICE
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ASYNC_BARRIER
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED
 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP
diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt
index 1419604b9d..24c437e853 100644
--- a/test/conformance/device_code/CMakeLists.txt
+++ b/test/conformance/device_code/CMakeLists.txt
@@ -135,6 +135,7 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_3d.cpp)
 add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm.cpp)
 add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/foo.cpp)
 add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/image_copy.cpp)
+add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/inc.cpp)
 add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/mean.cpp)
 add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult.cpp)
 add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult_usm.cpp)
diff --git a/test/conformance/device_code/cpy_and_mult.cpp b/test/conformance/device_code/cpy_and_mult.cpp
index a2bdaccf55..644b151fd3 100644
--- a/test/conformance/device_code/cpy_and_mult.cpp
+++ b/test/conformance/device_code/cpy_and_mult.cpp
@@ -3,24 +3,24 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
     size_t array_size = 16;
-    cl::sycl::queue sycl_queue;
+    sycl::queue sycl_queue;
     std::vector<uint32_t> src(array_size, 1);
     std::vector<uint32_t> dst(array_size, 1);
     auto src_buff =
-        cl::sycl::buffer<uint32_t>(src.data(), cl::sycl::range<1>(array_size));
+        sycl::buffer<uint32_t>(src.data(), sycl::range<1>(array_size));
     auto dst_buff =
-        cl::sycl::buffer<uint32_t>(dst.data(), cl::sycl::range<1>(array_size));
+        sycl::buffer<uint32_t>(dst.data(), sycl::range<1>(array_size));
 
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
-        auto src_acc = src_buff.get_access<cl::sycl::access::mode::read>(cgh);
-        auto dst_acc = dst_buff.get_access<cl::sycl::access::mode::write>(cgh);
+    sycl_queue.submit([&](sycl::handler &cgh) {
+        auto src_acc = src_buff.get_access<sycl::access::mode::read>(cgh);
+        auto dst_acc = dst_buff.get_access<sycl::access::mode::write>(cgh);
         cgh.parallel_for<class cpy_and_mult>(
-            cl::sycl::range<1>{array_size},
-            [src_acc, dst_acc](cl::sycl::item<1> itemId) {
+            sycl::range<1>{array_size},
+            [src_acc, dst_acc](sycl::item<1> itemId) {
                 auto id = itemId.get_id(0);
                 dst_acc[id] = src_acc[id] * 2;
             });
diff --git a/test/conformance/device_code/cpy_and_mult_usm.cpp b/test/conformance/device_code/cpy_and_mult_usm.cpp
index e253dfe14b..f34041b6c2 100644
--- a/test/conformance/device_code/cpy_and_mult_usm.cpp
+++ b/test/conformance/device_code/cpy_and_mult_usm.cpp
@@ -3,17 +3,16 @@
 // See LICENSE.TXT
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int main() {
     size_t array_size = 16;
-    cl::sycl::queue sycl_queue;
-    uint32_t *src = cl::sycl::malloc_device<uint32_t>(array_size, sycl_queue);
-    uint32_t *dst = cl::sycl::malloc_device<uint32_t>(array_size, sycl_queue);
-    sycl_queue.submit([&](cl::sycl::handler &cgh) {
+    sycl::queue sycl_queue;
+    uint32_t *src = sycl::malloc_device<uint32_t>(array_size, sycl_queue);
+    uint32_t *dst = sycl::malloc_device<uint32_t>(array_size, sycl_queue);
+    sycl_queue.submit([&](sycl::handler &cgh) {
         cgh.parallel_for<class cpy_and_mult_usm>(
-            cl::sycl::range<1>{array_size},
-            [src, dst](cl::sycl::item<1> itemId) {
+            sycl::range<1>{array_size}, [src, dst](sycl::item<1> itemId) {
                 auto id = itemId.get_id(0);
                 dst[id] = src[id] * 2;
             });
diff --git a/test/conformance/device_code/inc.cpp b/test/conformance/device_code/inc.cpp
new file mode 100644
index 0000000000..d41c07366d
--- /dev/null
+++ b/test/conformance/device_code/inc.cpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2024 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <sycl/sycl.hpp>
+
+// Kernel name type for the increment kernel submitted in main().
+class inc;
+
+// Submits a single-work-item kernel that increments the one uint32_t element
+// of a SYCL buffer.
+int main() {
+    // Back the buffer with real host storage. Handing the buffer an
+    // uninitialized pointer is undefined behaviour. `value` is declared
+    // before `buf` so that it is still alive when `buf` is destroyed.
+    uint32_t value = 0;
+    sycl::buffer<uint32_t> buf{&value, 1};
+    sycl::queue{}.submit([&](sycl::handler &cgh) {
+        sycl::accessor acc{buf, cgh};
+        auto kernel = [acc](sycl::item<1> it) { acc[it]++; };
+        cgh.parallel_for<inc>(sycl::range<1>{1}, kernel);
+    });
+}
diff --git a/test/conformance/enqueue/enqueue_adapter_hip.match b/test/conformance/enqueue/enqueue_adapter_hip.match
index 56376ff37d..443628e36e 100644
--- a/test/conformance/enqueue/enqueue_adapter_hip.match
+++ b/test/conformance/enqueue/enqueue_adapter_hip.match
@@ -1,44 +1,15 @@
+# HIP can't check kernel arguments
 urEnqueueKernelLaunchTest.InvalidKernelArgs/AMD_HIP_BACKEND___{{.*}}_
 urEnqueueKernelLaunchKernelWgSizeTest.NonMatchingLocalSize/AMD_HIP_BACKEND___{{.*}}_
 urEnqueueKernelLaunchKernelSubGroupTest.Success/AMD_HIP_BACKEND___{{.*}}_
 urEnqueueKernelLaunchUSMLinkedList.Success/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled
 {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___copy_row_2D
 {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___copy_3d_2d
-urEnqueueMemBufferFillTest.Success/AMD_HIP_BACKEND___{{.*}}___size__256__patternSize__256
-urEnqueueMemBufferFillTest.Success/AMD_HIP_BACKEND___{{.*}}___size__1024__patternSize__256
 {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_row_2D
 {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_3d_2d
-urEnqueueMemImageCopyTest.Success/AMD_HIP_BACKEND___{{.*}}___1D
-urEnqueueMemImageCopyTest.Success/AMD_HIP_BACKEND___{{.*}}___2D
-urEnqueueMemImageCopyTest.Success/AMD_HIP_BACKEND___{{.*}}___3D
-urEnqueueMemImageCopyTest.SuccessPartialCopy/AMD_HIP_BACKEND___{{.*}}___1D
-urEnqueueMemImageCopyTest.SuccessPartialCopy/AMD_HIP_BACKEND___{{.*}}___2D
-urEnqueueMemImageCopyTest.SuccessPartialCopy/AMD_HIP_BACKEND___{{.*}}___3D
-urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/AMD_HIP_BACKEND___{{.*}}___1D
-urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/AMD_HIP_BACKEND___{{.*}}___2D
-urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/AMD_HIP_BACKEND___{{.*}}___3D
-urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/AMD_HIP_BACKEND___{{.*}}___1D
-urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/AMD_HIP_BACKEND___{{.*}}___2D
-urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/AMD_HIP_BACKEND___{{.*}}___3D
-urEnqueueMemImageCopyTest.InvalidNullHandleQueue/AMD_HIP_BACKEND___{{.*}}___1D
-urEnqueueMemImageCopyTest.InvalidNullHandleQueue/AMD_HIP_BACKEND___{{.*}}___3D
-urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/AMD_HIP_BACKEND___{{.*}}___1D
-urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/AMD_HIP_BACKEND___{{.*}}___3D
-urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/AMD_HIP_BACKEND___{{.*}}___1D
-urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/AMD_HIP_BACKEND___{{.*}}___3D
-urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/AMD_HIP_BACKEND___{{.*}}___1D
-urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/AMD_HIP_BACKEND___{{.*}}___3D
-urEnqueueMemImageCopyTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}___1D
-urEnqueueMemImageCopyTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}___3D
-urEnqueueMemImageReadTest.Success1D/AMD_HIP_BACKEND___{{.*}}_
-urEnqueueMemImageReadTest.Success3D/AMD_HIP_BACKEND___{{.*}}_
-urEnqueueMemImageWriteTest.Success1D/AMD_HIP_BACKEND___{{.*}}_
-urEnqueueMemImageWriteTest.Success3D/AMD_HIP_BACKEND___{{.*}}_
-urEnqueueUSMFill2DNegativeTest.OutOfBounds/AMD_HIP_BACKEND___{{.*}}_
 urEnqueueUSMAdviseWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ADVICE_FLAG_DEFAULT
 urEnqueueUSMAdviseTest.MultipleParamsSuccess/AMD_HIP_BACKEND___{{.*}}_
 urEnqueueUSMAdviseTest.NonCoherentDeviceMemorySuccessOrWarning/AMD_HIP_BACKEND___{{.*}}_
-urEnqueueUSMMemcpy2DNegativeTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
 urEnqueueUSMPrefetchWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT
 urEnqueueUSMPrefetchWithParamTest.CheckWaitEvent/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT
 urEnqueueTimestampRecordingExpTest.Success/AMD_HIP_BACKEND___{{.*}}
diff --git a/test/conformance/enqueue/enqueue_adapter_native_cpu.match b/test/conformance/enqueue/enqueue_adapter_native_cpu.match
index 155a400e89..8d6cf18c3e 100644
--- a/test/conformance/enqueue/enqueue_adapter_native_cpu.match
+++ b/test/conformance/enqueue/enqueue_adapter_native_cpu.match
@@ -46,4 +46,328 @@
 {{OPT}}urEnqueueKernelLaunchTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D_256_79_8
 {{OPT}}urEnqueueKernelLaunchWithVirtualMemory.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 {{OPT}}urEnqueueKernelLaunchMultiDeviceTest.KernelLaunchReadDifferentQueues/SYCL_NATIVE_CPU___SYCL_Native_CPU_
-{{Segmentation fault|Aborted}}
+{{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UsePoolEnabled
+{{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UsePoolDisabled
+{{OPT}}urEnqueueMemBufferCopyRectTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueMemBufferCopyRectTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024
+{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500
+{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096
+{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000
+{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024
+{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500
+{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096
+{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000
+{{OPT}}urEnqueueMemBufferFillNegativeTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferReadRectTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueMemBufferReadRectTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemBufferWriteRectTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueMemBufferWriteRectTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueMemImageCopyTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D
+{{OPT}}urEnqueueMemImageCopyTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D
+{{OPT}}urEnqueueMemImageCopyTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopy/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopy/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopy/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D
+{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER
+{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__1__patternSize__1
+{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__256
+{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__1024__patternSize__256
+{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__4
+{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__8
+{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__16
+{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__32
+{{OPT}}urEnqueueUSMFillNegativeTest.InvalidNullQueueHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFillNegativeTest.InvalidNullPtr/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFillNegativeTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFillNegativeTest.OutOfBounds/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFillNegativeTest.invalidPatternSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFillNegativeTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidNullQueueHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidNullPtr/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidPitch/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidWidth/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidHeight/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFill2DNegativeTest.OutOfBounds/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFill2DNegativeTest.invalidPatternSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMAdviseWithParamTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_USM_ADVICE_FLAG_DEFAULT
+{{OPT}}urEnqueueUSMAdviseTest.MultipleParamsSuccess/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMAdviseTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMAdviseTest.InvalidNullPointerMem/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMAdviseTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMAdviseTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMAdviseTest.InvalidSizeTooLarge/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMAdviseTest.NonCoherentDeviceMemorySuccessOrWarning/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMMemcpyTest.Blocking/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMMemcpyTest.BlockingWithEvent/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMMemcpyTest.NonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMMemcpyTest.WaitForDependencies/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMMemcpyTest.InvalidNullQueueHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMMemcpyTest.InvalidNullDst/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMMemcpyTest.InvalidNullSrc/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMMemcpyTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED
+{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidNullPointer/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE
+{{OPT}}urEnqueueUSMPrefetchWithParamTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_USM_MIGRATION_FLAG_DEFAULT
+{{OPT}}urEnqueueUSMPrefetchWithParamTest.CheckWaitEvent/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_USM_MIGRATION_FLAG_DEFAULT
+{{OPT}}urEnqueueUSMPrefetchTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMPrefetchTest.InvalidNullPointerMem/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMPrefetchTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMPrefetchTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMPrefetchTest.InvalidSizeTooLarge/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueUSMPrefetchTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueReadHostPipeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueReadHostPipeTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueReadHostPipeTest.InvalidNullPointerPipeSymbol/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueReadHostPipeTest.InvalidNullPointerBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueReadHostPipeTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullPointerPipeSymbol/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullPointerBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}urEnqueueWriteHostPipeTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}{{Segmentation fault|Aborted}}
diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
index 79c8ac14da..3f9d6cb996 100644
--- a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
+++ b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
@@ -480,7 +480,7 @@ struct urEnqueueKernelLaunchUSMLinkedList
     const int num_nodes = 4;
     bool use_pool = false;
     ur_usm_pool_handle_t pool = nullptr;
-    ur_queue_handle_t queue;
+    ur_queue_handle_t queue = nullptr;
 };
 
 UUR_TEST_SUITE_P(
diff --git a/test/conformance/event/event_adapter_cuda.match b/test/conformance/event/event_adapter_cuda.match
index e40ea36db6..3cffb24c5f 100644
--- a/test/conformance/event/event_adapter_cuda.match
+++ b/test/conformance/event/event_adapter_cuda.match
@@ -1,3 +1,5 @@
+urEventGetProfilingInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROFILING_INFO_COMMAND_COMPLETE
+urEventGetProfilingInfoWithTimingComparisonTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_
 urEventSetCallbackTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_
 urEventSetCallbackTest.ValidateParameters/NVIDIA_CUDA_BACKEND___{{.*}}_
 urEventSetCallbackTest.AllStates/NVIDIA_CUDA_BACKEND___{{.*}}_
diff --git a/test/conformance/event/event_adapter_hip.match b/test/conformance/event/event_adapter_hip.match
index 8682cdf4a6..b25428a187 100644
--- a/test/conformance/event/event_adapter_hip.match
+++ b/test/conformance/event/event_adapter_hip.match
@@ -1,3 +1,5 @@
+urEventGetProfilingInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROFILING_INFO_COMMAND_COMPLETE
+urEventGetProfilingInfoWithTimingComparisonTest.Success/AMD_HIP_BACKEND___{{.*}}_
 urEventSetCallbackTest.Success/AMD_HIP_BACKEND___{{.*}}_
 urEventSetCallbackTest.ValidateParameters/AMD_HIP_BACKEND___{{.*}}_
 urEventSetCallbackTest.AllStates/AMD_HIP_BACKEND___{{.*}}_
diff --git a/test/conformance/event/event_adapter_level_zero.match b/test/conformance/event/event_adapter_level_zero.match
index c29f67cbc6..32ffbeaf1e 100644
--- a/test/conformance/event/event_adapter_level_zero.match
+++ b/test/conformance/event/event_adapter_level_zero.match
@@ -1,4 +1,6 @@
 {{OPT}}urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_COMMAND_TYPE
 {{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_QUEUED
 {{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_SUBMIT
+{{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_COMPLETE
+{{OPT}}urEventGetProfilingInfoWithTimingComparisonTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 {{OPT}}{{Segmentation fault|Aborted}}
diff --git a/test/conformance/event/event_adapter_native_cpu.match b/test/conformance/event/event_adapter_native_cpu.match
index f5769d271f..9d43898b09 100644
--- a/test/conformance/event/event_adapter_native_cpu.match
+++ b/test/conformance/event/event_adapter_native_cpu.match
@@ -13,6 +13,8 @@ urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFI
 urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFILING_INFO_COMMAND_SUBMIT
 urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFILING_INFO_COMMAND_START
 urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFILING_INFO_COMMAND_END
+urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFILING_INFO_COMMAND_COMPLETE
+urEventGetProfilingInfoWithTimingComparisonTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urEventGetProfilingInfoNegativeTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urEventGetProfilingInfoNegativeTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urEventGetProfilingInfoNegativeTest.InvalidValue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp
index b7ec52ffca..7b91679dad 100644
--- a/test/conformance/event/urEventGetProfilingInfo.cpp
+++ b/test/conformance/event/urEventGetProfilingInfo.cpp
@@ -30,9 +30,58 @@ UUR_TEST_SUITE_P(urEventGetProfilingInfoTest,
                  ::testing::Values(UR_PROFILING_INFO_COMMAND_QUEUED,
                                    UR_PROFILING_INFO_COMMAND_SUBMIT,
                                    UR_PROFILING_INFO_COMMAND_START,
-                                   UR_PROFILING_INFO_COMMAND_END),
+                                   UR_PROFILING_INFO_COMMAND_END,
+                                   UR_PROFILING_INFO_COMMAND_COMPLETE),
                  uur::deviceTestWithParamPrinter<ur_profiling_info_t>);
 
+using urEventGetProfilingInfoWithTimingComparisonTest = uur::event::urEventTest;
+
+// Verifies that all profiling timestamps of an event are non-zero and ordered:
+// queued <= submit < start < end <= complete.
+TEST_P(urEventGetProfilingInfoWithTimingComparisonTest, Success) {
+    // Read each timestamp directly as uint64_t: viewing a uint8_t buffer
+    // through size_t * breaks aliasing rules and truncates on 32-bit targets.
+    uint64_t queued_timing = 0;
+    ASSERT_SUCCESS(urEventGetProfilingInfo(
+        event, UR_PROFILING_INFO_COMMAND_QUEUED, sizeof(queued_timing),
+        &queued_timing, nullptr));
+    ASSERT_NE(queued_timing, 0u);
+
+    uint64_t submit_timing = 0;
+    ASSERT_SUCCESS(urEventGetProfilingInfo(
+        event, UR_PROFILING_INFO_COMMAND_SUBMIT, sizeof(submit_timing),
+        &submit_timing, nullptr));
+    ASSERT_NE(submit_timing, 0u);
+
+    uint64_t start_timing = 0;
+    ASSERT_SUCCESS(urEventGetProfilingInfo(
+        event, UR_PROFILING_INFO_COMMAND_START, sizeof(start_timing),
+        &start_timing, nullptr));
+    ASSERT_NE(start_timing, 0u);
+
+    uint64_t end_timing = 0;
+    ASSERT_SUCCESS(urEventGetProfilingInfo(
+        event, UR_PROFILING_INFO_COMMAND_END, sizeof(end_timing), &end_timing,
+        nullptr));
+    ASSERT_NE(end_timing, 0u);
+
+    uint64_t complete_timing = 0;
+    ASSERT_SUCCESS(urEventGetProfilingInfo(
+        event, UR_PROFILING_INFO_COMMAND_COMPLETE, sizeof(complete_timing),
+        &complete_timing, nullptr));
+    ASSERT_NE(complete_timing, 0u);
+
+    // Each stage must not precede the previous one; submission, start and end
+    // are required to be strictly increasing.
+    ASSERT_LE(queued_timing, submit_timing);
+    ASSERT_LT(submit_timing, start_timing);
+    ASSERT_LT(start_timing, end_timing);
+    ASSERT_LE(end_timing, complete_timing);
+}
+
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(
+    urEventGetProfilingInfoWithTimingComparisonTest);
+
 using urEventGetProfilingInfoNegativeTest = uur::event::urEventTest;
 
 TEST_P(urEventGetProfilingInfoNegativeTest, InvalidNullHandle) {
diff --git a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp
index 80561cc9e1..78e1ffd009 100644
--- a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp
+++ b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp
@@ -148,10 +148,6 @@ TEST_P(BufferFillCommandTest, UpdateParameters) {
 
 // Test updating the global size so that the fill outputs to a larger buffer
 TEST_P(BufferFillCommandTest, UpdateGlobalSize) {
-    if (!updatable_execution_range_support) {
-        GTEST_SKIP() << "Execution range update is not supported.";
-    }
-
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
@@ -208,8 +204,7 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) {
     ASSERT_SUCCESS(urQueueFinish(queue));
     ValidateBuffer(buffer, sizeof(val) * global_size, val);
 
-    size_t new_global_size =
-        updatable_execution_range_support ? 64 : global_size;
+    size_t new_global_size = global_size * 2;
     const size_t new_buffer_size = sizeof(val) * new_global_size;
     ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE,
                                      new_buffer_size, nullptr, &new_buffer));
@@ -272,26 +267,24 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) {
     ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(command_handle,
                                                         &input_update_desc));
 
-    if (updatable_execution_range_support) {
-        ur_exp_command_buffer_update_kernel_launch_desc_t
-            global_size_update_desc = {
-                UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
-                nullptr,          // pNext
-                0,                // numNewMemObjArgs
-                0,                // numNewPointerArgs
-                0,                // numNewValueArgs
-                0,                // newWorkDim
-                nullptr,          // pNewMemObjArgList
-                nullptr,          // pNewPointerArgList
-                nullptr,          // pNewValueArgList
-                nullptr,          // pNewGlobalWorkOffset
-                &new_global_size, // pNewGlobalWorkSize
-                nullptr,          // pNewLocalWorkSize
-            };
-
-        ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(
-            command_handle, &global_size_update_desc));
-    }
+    size_t new_local_size = local_size;
+    ur_exp_command_buffer_update_kernel_launch_desc_t global_size_update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                                                        // pNext
+        0,                                   // numNewMemObjArgs
+        0,                                   // numNewPointerArgs
+        0,                                   // numNewValueArgs
+        static_cast<uint32_t>(n_dimensions), // newWorkDim
+        nullptr,                             // pNewMemObjArgList
+        nullptr,                             // pNewPointerArgList
+        nullptr,                             // pNewValueArgList
+        nullptr,                             // pNewGlobalWorkOffset
+        &new_global_size,                    // pNewGlobalWorkSize
+        &new_local_size,                     // pNewLocalWorkSize
+    };
+
+    ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(
+        command_handle, &global_size_update_desc));
 
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                              nullptr, nullptr));
diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match
index f9c9025ea1..8422cc5d2f 100644
--- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match
+++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match
@@ -4,9 +4,12 @@
 {{OPT}}BufferFillCommandTest.OverrideUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 {{OPT}}BufferFillCommandTest.OverrideArgList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 {{OPT}}USMFillCommandTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}USMFillCommandTest.UpdateBeforeEnqueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 {{OPT}}USMMultipleFillCommandTest.UpdateAllKernels/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 {{OPT}}BufferSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 {{OPT}}USMSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}USMMultiSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+{{OPT}}USMMultiSaxpyKernelTest.UpdateWithoutBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 {{OPT}}NDRangeUpdateTest.Update3D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 {{OPT}}NDRangeUpdateTest.Update2D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 {{OPT}}NDRangeUpdateTest.Update1D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h
index 7e5367aa9c..eeb0a5d5d8 100644
--- a/test/conformance/exp_command_buffer/fixtures.h
+++ b/test/conformance/exp_command_buffer/fixtures.h
@@ -112,11 +112,6 @@ struct urUpdatableCommandBufferExpExecutionTest
             GTEST_SKIP() << "Updating EXP command-buffers is not supported.";
         }
 
-        // Currently level zero driver doesn't support updating execution range.
-        if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) {
-            updatable_execution_range_support = false;
-        }
-
         // Create a command-buffer with update enabled.
         ur_exp_command_buffer_desc_t desc{
             UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, true};
@@ -154,7 +149,6 @@ struct urUpdatableCommandBufferExpExecutionTest
     }
 
     ur_exp_command_buffer_handle_t updatable_cmd_buf_handle = nullptr;
-    ur_bool_t updatable_execution_range_support = true;
     ur_queue_handle_t queue = nullptr;
 };
 
diff --git a/test/conformance/exp_command_buffer/ndrange_update.cpp b/test/conformance/exp_command_buffer/ndrange_update.cpp
index 486837df85..8f4edad095 100644
--- a/test/conformance/exp_command_buffer/ndrange_update.cpp
+++ b/test/conformance/exp_command_buffer/ndrange_update.cpp
@@ -15,10 +15,6 @@ struct NDRangeUpdateTest
         UUR_RETURN_ON_FATAL_FAILURE(
             urUpdatableCommandBufferExpExecutionTest::SetUp());
 
-        if (!updatable_execution_range_support) {
-            GTEST_SKIP() << "Execution range update is not supported.";
-        }
-
         ur_device_usm_access_capability_flags_t shared_usm_flags;
         ASSERT_SUCCESS(
             uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags));
diff --git a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp
index cf0259c7ab..413b555623 100644
--- a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp
+++ b/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp
@@ -88,8 +88,7 @@ TEST_P(USMFillCommandTest, UpdateParameters) {
     Validate((uint32_t *)shared_ptr, global_size, val);
 
     // Allocate a new USM pointer of larger size if feature is supported.
-    size_t new_global_size =
-        updatable_execution_range_support ? 64 : global_size;
+    size_t new_global_size = global_size * 2;
     const size_t new_allocation_size = sizeof(val) * new_global_size;
     ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
                                     new_allocation_size, &new_shared_ptr));
@@ -116,6 +115,60 @@ TEST_P(USMFillCommandTest, UpdateParameters) {
         &new_val, // hArgValue
     };
 
+    size_t new_local_size = local_size;
+    ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                                                        // pNext
+        0,                                   // numNewMemObjArgs
+        1,                                   // numNewPointerArgs
+        1,                                   // numNewValueArgs
+        static_cast<uint32_t>(n_dimensions), // newWorkDim
+        nullptr,                             // pNewMemObjArgList
+        &new_output_desc,                    // pNewPointerArgList
+        &new_input_desc,                     // pNewValueArgList
+        nullptr,                             // pNewGlobalWorkOffset
+        &new_global_size,                    // pNewGlobalWorkSize
+        &new_local_size,                     // pNewLocalWorkSize
+    };
+
+    // Update kernel and enqueue command-buffer again
+    ASSERT_SUCCESS(
+        urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // Verify that update occurred correctly
+    Validate((uint32_t *)new_shared_ptr, new_global_size, new_val);
+}
+
+// Test updating a command-buffer which hasn't been enqueued yet
+TEST_P(USMFillCommandTest, UpdateBeforeEnqueue) {
+    ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
+                                    allocation_size, &new_shared_ptr));
+    ASSERT_NE(new_shared_ptr, nullptr);
+    std::memset(new_shared_ptr, 0, allocation_size);
+
+    // Set new USM pointer as kernel output at index 0
+    ur_exp_command_buffer_update_pointer_arg_desc_t new_output_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        0,               // argIndex
+        nullptr,         // pProperties
+        &new_shared_ptr, // pArgValue
+    };
+
+    // Set new value to use for fill at kernel index 1
+    uint32_t new_val = 33;
+    ur_exp_command_buffer_update_value_arg_desc_t new_input_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        1,                                                          // argIndex
+        sizeof(new_val),                                            // argSize
+        nullptr,  // pProperties
+        &new_val, // hArgValue
+    };
+
     ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
         UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
         nullptr,                                                        // pNext
@@ -127,12 +180,11 @@ TEST_P(USMFillCommandTest, UpdateParameters) {
         &new_output_desc, // pNewPointerArgList
         &new_input_desc,  // pNewValueArgList
         nullptr,          // pNewGlobalWorkOffset
-        updatable_execution_range_support ? &new_global_size
-                                          : nullptr, // pNewGlobalWorkSize
-        nullptr,                                     // pNewLocalWorkSize
+        nullptr,          // pNewGlobalWorkSize
+        nullptr,          // pNewLocalWorkSize
     };
 
-    // Update kernel and enqueue command-buffer again
+    // Update kernel and enqueue command-buffer
     ASSERT_SUCCESS(
         urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc));
     ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
@@ -140,7 +192,7 @@ TEST_P(USMFillCommandTest, UpdateParameters) {
     ASSERT_SUCCESS(urQueueFinish(queue));
 
     // Verify that update occurred correctly
-    Validate((uint32_t *)new_shared_ptr, new_global_size, new_val);
+    Validate((uint32_t *)new_shared_ptr, global_size, new_val);
 }
 
 // Test updating a command-buffer with multiple USM fill kernel commands
diff --git a/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp b/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp
index 8f213e8b24..d44fef3011 100644
--- a/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp
+++ b/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp
@@ -8,9 +8,10 @@
 
 // Test that updating a command-buffer with a single kernel command
 // taking USM & scalar arguments works correctly.
-struct USMSaxpyKernelTest
+
+struct USMSaxpyKernelTestBase
     : uur::command_buffer::urUpdatableCommandBufferExpExecutionTest {
-    void SetUp() override {
+    virtual void SetUp() override {
         program_name = "saxpy_usm";
         UUR_RETURN_ON_FATAL_FAILURE(
             urUpdatableCommandBufferExpExecutionTest::SetUp());
@@ -44,14 +45,6 @@ struct USMSaxpyKernelTest
         // Index 3 is Y
         ASSERT_SUCCESS(
             urKernelSetArgPointer(kernel, 3, nullptr, &shared_ptrs[2]));
-
-        // Append kernel command to command-buffer and close command-buffer
-        ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(
-            updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset,
-            &global_size, &local_size, 0, nullptr, nullptr, &command_handle));
-        ASSERT_NE(command_handle, nullptr);
-
-        ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle));
     }
 
     void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A,
@@ -62,17 +55,13 @@ struct USMSaxpyKernelTest
         }
     }
 
-    void TearDown() override {
+    virtual void TearDown() override {
         for (auto &shared_ptr : shared_ptrs) {
             if (shared_ptr) {
                 EXPECT_SUCCESS(urUSMFree(context, shared_ptr));
             }
         }
 
-        if (command_handle) {
-            EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle));
-        }
-
         UUR_RETURN_ON_FATAL_FAILURE(
             urUpdatableCommandBufferExpExecutionTest::TearDown());
     }
@@ -83,6 +72,29 @@ struct USMSaxpyKernelTest
     static constexpr size_t n_dimensions = 1;
     static constexpr uint32_t A = 42;
     std::array<void *, 5> shared_ptrs = {nullptr, nullptr, nullptr, nullptr};
+};
+
+struct USMSaxpyKernelTest : USMSaxpyKernelTestBase {
+    void SetUp() override {
+        UUR_RETURN_ON_FATAL_FAILURE(USMSaxpyKernelTestBase::SetUp());
+
+        // Append kernel command to command-buffer and close command-buffer
+        ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(
+            updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset,
+            &global_size, &local_size, 0, nullptr, nullptr, &command_handle));
+        ASSERT_NE(command_handle, nullptr);
+
+        ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle));
+    }
+
+    void TearDown() override {
+        if (command_handle) {
+            EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle));
+        }
+
+        UUR_RETURN_ON_FATAL_FAILURE(USMSaxpyKernelTestBase::TearDown());
+    }
+
     ur_exp_command_buffer_command_handle_t command_handle = nullptr;
 };
 
@@ -160,3 +172,179 @@ TEST_P(USMSaxpyKernelTest, UpdateParameters) {
     uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
     Validate(new_output, new_X, new_Y, new_A, global_size);
 }
+
+struct USMMultiSaxpyKernelTest : USMSaxpyKernelTestBase {
+    void SetUp() override {
+        UUR_RETURN_ON_FATAL_FAILURE(USMSaxpyKernelTestBase::SetUp());
+
+        // Append one kernel command per node, then close the command-buffer
+        for (unsigned node = 0; node < nodes; node++) {
+            ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(
+                updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset,
+                &global_size, &local_size, 0, nullptr, nullptr,
+                &command_handles[node]));
+            ASSERT_NE(command_handles[node], nullptr);
+        }
+
+        ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle));
+    }
+
+    void TearDown() override {
+        for (auto &handle : command_handles) {
+            if (handle) {
+                EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(handle));
+            }
+        }
+        UUR_RETURN_ON_FATAL_FAILURE(USMSaxpyKernelTestBase::TearDown());
+    }
+
+    static constexpr size_t nodes = 1024;
+    static constexpr uint32_t A = 42;
+    std::array<ur_exp_command_buffer_command_handle_t, nodes> command_handles{};
+};
+
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(USMMultiSaxpyKernelTest);
+
+TEST_P(USMMultiSaxpyKernelTest, UpdateParameters) {
+    // Run command-buffer prior to update and verify output
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    uint32_t *output = (uint32_t *)shared_ptrs[0];
+    uint32_t *X = (uint32_t *)shared_ptrs[1];
+    uint32_t *Y = (uint32_t *)shared_ptrs[2];
+    Validate(output, X, Y, A, global_size);
+
+    // Update inputs
+    ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2];
+
+    // New X at index 2
+    new_input_descs[0] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        2,               // argIndex
+        nullptr,         // pProperties
+        &shared_ptrs[3], // pArgValue
+    };
+
+    // New Y at index 3
+    new_input_descs[1] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        3,               // argIndex
+        nullptr,         // pProperties
+        &shared_ptrs[4], // pArgValue
+    };
+
+    // New A at index 1
+    uint32_t new_A = 33;
+    ur_exp_command_buffer_update_value_arg_desc_t new_A_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        1,                                                          // argIndex
+        sizeof(new_A),                                              // argSize
+        nullptr, // pProperties
+        &new_A,  // hArgValue
+    };
+
+    // Update kernel inputs
+    ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                                                        // pNext
+        0,               // numNewMemObjArgs
+        2,               // numNewPointerArgs
+        1,               // numNewValueArgs
+        0,               // newWorkDim
+        nullptr,         // pNewMemObjArgList
+        new_input_descs, // pNewPointerArgList
+        &new_A_desc,     // pNewValueArgList
+        nullptr,         // pNewGlobalWorkOffset
+        nullptr,         // pNewGlobalWorkSize
+        nullptr,         // pNewLocalWorkSize
+    };
+
+    // Update kernel and enqueue command-buffer again
+    for (auto &handle : command_handles) {
+        ASSERT_SUCCESS(
+            urCommandBufferUpdateKernelLaunchExp(handle, &update_desc));
+    }
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // Verify that update occurred correctly
+    uint32_t *new_output = (uint32_t *)shared_ptrs[0];
+    uint32_t *new_X = (uint32_t *)shared_ptrs[3];
+    uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
+    Validate(new_output, new_X, new_Y, new_A, global_size);
+}
+
+TEST_P(USMMultiSaxpyKernelTest, UpdateWithoutBlocking) {
+    // Prepare new inputs
+    ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2];
+
+    // New X at index 2
+    new_input_descs[0] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        2,               // argIndex
+        nullptr,         // pProperties
+        &shared_ptrs[3], // pArgValue
+    };
+
+    // New Y at index 3
+    new_input_descs[1] = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
+        nullptr,                                                      // pNext
+        3,               // argIndex
+        nullptr,         // pProperties
+        &shared_ptrs[4], // pArgValue
+    };
+
+    // New A at index 1
+    uint32_t new_A = 33;
+    ur_exp_command_buffer_update_value_arg_desc_t new_A_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
+        nullptr,                                                    // pNext
+        1,                                                          // argIndex
+        sizeof(new_A),                                              // argSize
+        nullptr, // pProperties
+        &new_A,  // hArgValue
+    };
+
+    // Update kernel inputs
+    ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = {
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
+        nullptr,                                                        // pNext
+        0,               // numNewMemObjArgs
+        2,               // numNewPointerArgs
+        1,               // numNewValueArgs
+        0,               // newWorkDim
+        nullptr,         // pNewMemObjArgList
+        new_input_descs, // pNewPointerArgList
+        &new_A_desc,     // pNewValueArgList
+        nullptr,         // pNewGlobalWorkOffset
+        nullptr,         // pNewGlobalWorkSize
+        nullptr,         // pNewLocalWorkSize
+    };
+
+    // Run command-buffer prior to update without doing a blocking wait after
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+
+    // Update kernel and enqueue command-buffer again
+    for (auto &handle : command_handles) {
+        ASSERT_SUCCESS(
+            urCommandBufferUpdateKernelLaunchExp(handle, &update_desc));
+    }
+    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
+                                             nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    // Verify that update occurred correctly
+    uint32_t *new_output = (uint32_t *)shared_ptrs[0];
+    uint32_t *new_X = (uint32_t *)shared_ptrs[3];
+    uint32_t *new_Y = (uint32_t *)shared_ptrs[4];
+    Validate(new_output, new_X, new_Y, new_A, global_size);
+}
diff --git a/test/conformance/kernel/CMakeLists.txt b/test/conformance/kernel/CMakeLists.txt
index df19ba2550..73ab3f1101 100644
--- a/test/conformance/kernel/CMakeLists.txt
+++ b/test/conformance/kernel/CMakeLists.txt
@@ -18,4 +18,5 @@ add_conformance_test_with_kernels_environment(kernel
     urKernelSetArgSampler.cpp
     urKernelSetArgValue.cpp
     urKernelSetExecInfo.cpp
-    urKernelSetSpecializationConstants.cpp)
+    urKernelSetSpecializationConstants.cpp
+    urKernelGetSuggestedLocalWorkSize.cpp)
diff --git a/test/conformance/kernel/kernel_adapter_hip.match b/test/conformance/kernel/kernel_adapter_hip.match
index 2c728224f2..eb023ff6a5 100644
--- a/test/conformance/kernel/kernel_adapter_hip.match
+++ b/test/conformance/kernel/kernel_adapter_hip.match
@@ -12,7 +12,3 @@ urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/AMD_HIP_BACKEND___{{.*}}_
 urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/AMD_HIP_BACKEND___{{.*}}_
 urKernelSetArgValueTest.InvalidKernelArgumentIndex/AMD_HIP_BACKEND___{{.*}}_
 urKernelSetArgValueTest.InvalidKernelArgumentSize/AMD_HIP_BACKEND___{{.*}}_
-urKernelSetSpecializationConstantsTest.Success/AMD_HIP_BACKEND___{{.*}}_
-urKernelSetSpecializationConstantsTest.InvalidNullHandleKernel/AMD_HIP_BACKEND___{{.*}}_
-urKernelSetSpecializationConstantsTest.InvalidNullPointerSpecConstants/AMD_HIP_BACKEND___{{.*}}_
-urKernelSetSpecializationConstantsTest.InvalidSizeCount/AMD_HIP_BACKEND___{{.*}}_
diff --git a/test/conformance/kernel/kernel_adapter_level_zero.match b/test/conformance/kernel/kernel_adapter_level_zero.match
index 2668b6821a..82c92e3f28 100644
--- a/test/conformance/kernel/kernel_adapter_level_zero.match
+++ b/test/conformance/kernel/kernel_adapter_level_zero.match
@@ -8,9 +8,6 @@ urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_
 urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS
 urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
-urKernelSetArgPointerTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
-urKernelSetArgPointerTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
-urKernelSetArgPointerTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
 urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_
diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match
index 93e3ddd67d..818c625e92 100644
--- a/test/conformance/kernel/kernel_adapter_native_cpu.match
+++ b/test/conformance/kernel/kernel_adapter_native_cpu.match
@@ -162,3 +162,12 @@ urKernelSetSpecializationConstantsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU
 urKernelSetSpecializationConstantsTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urKernelSetSpecializationConstantsTest.InvalidNullPointerSpecConstants/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urKernelSetSpecializationConstantsTest.InvalidSizeCount/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.Success2D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.Success3D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidWorkDimension/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidGlobalOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidGlobalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urKernelGetSuggestedLocalWorkSizeTest.InvalidSuggestedLocalWorkSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
diff --git a/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp b/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp
new file mode 100644
index 0000000000..4eeabf5573
--- /dev/null
+++ b/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp
@@ -0,0 +1,112 @@
+// Copyright (C) 2023 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <uur/fixtures.h>
+
+struct urKernelGetSuggestedLocalWorkSizeTest : uur::urKernelExecutionTest {
+    void SetUp() override {
+        program_name = "bar";
+        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+    }
+    size_t global_size = 32;
+    size_t global_offset = 0;
+    size_t n_dimensions = 1;
+    // Output slot for the 1D queries; zeroed so it never holds garbage.
+    size_t suggested_local_work_size = 0;
+};
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelGetSuggestedLocalWorkSizeTest);
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success) {
+    suggested_local_work_size = SIZE_MAX;
+    auto result = urKernelGetSuggestedLocalWorkSize(
+        kernel, queue, n_dimensions, &global_offset, &global_size,
+        &suggested_local_work_size);
+    if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+        GTEST_SKIP();
+    }
+    ASSERT_SUCCESS(result);
+    ASSERT_LE(suggested_local_work_size, global_size);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success2D) {
+    size_t global_size_2d[2] = {32, 32};
+    size_t global_offset_2d[2] = {0, 0};
+    size_t suggested_local_work_size_2d[2] = {SIZE_MAX, SIZE_MAX};
+    auto result = urKernelGetSuggestedLocalWorkSize(
+        kernel, queue, 2, global_offset_2d, global_size_2d,
+        suggested_local_work_size_2d);
+    if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+        GTEST_SKIP();
+    }
+    ASSERT_SUCCESS(result);
+    for (int I = 0; I < 2; ++I) {
+        ASSERT_LE(suggested_local_work_size_2d[I], global_size_2d[I]);
+    }
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success3D) {
+    size_t global_size_3d[3] = {32, 32, 32};
+    size_t global_offset_3d[3] = {0, 0, 0};
+    size_t suggested_local_work_size_3d[3] = {SIZE_MAX, SIZE_MAX, SIZE_MAX};
+    auto result = urKernelGetSuggestedLocalWorkSize(
+        kernel, queue, 3, global_offset_3d, global_size_3d,
+        suggested_local_work_size_3d);
+    if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+        GTEST_SKIP();
+    }
+    ASSERT_SUCCESS(result);
+    for (int I = 0; I < 3; ++I) {
+        ASSERT_LE(suggested_local_work_size_3d[I], global_size_3d[I]);
+    }
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidNullHandleKernel) {
+    ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize(
+                         nullptr, queue, n_dimensions, &global_offset,
+                         &global_size, &suggested_local_work_size),
+                     UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidNullHandleQueue) {
+    ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize(
+                         kernel, nullptr, n_dimensions, &global_offset,
+                         &global_size, &suggested_local_work_size),
+                     UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidWorkDimension) {
+    uint32_t max_work_item_dimensions = 0;
+    ASSERT_SUCCESS(urDeviceGetInfo(
+        device, UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS,
+        sizeof(max_work_item_dimensions), &max_work_item_dimensions, nullptr));
+    auto result = urKernelGetSuggestedLocalWorkSize(
+        kernel, queue, max_work_item_dimensions + 1, &global_offset,
+        &global_size, &suggested_local_work_size);
+    if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+        GTEST_SKIP();
+    }
+    ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidGlobalOffset) {
+    ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize(
+                         kernel, queue, n_dimensions, nullptr, &global_size,
+                         &suggested_local_work_size),
+                     UR_RESULT_ERROR_INVALID_NULL_POINTER);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidGlobalSize) {
+    ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize(
+                         kernel, queue, n_dimensions, &global_offset,
+                         /*global size*/ nullptr, &suggested_local_work_size),
+                     UR_RESULT_ERROR_INVALID_NULL_POINTER);
+}
+
+TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidSuggestedLocalWorkSize) {
+    ASSERT_EQ_RESULT(
+        urKernelGetSuggestedLocalWorkSize(
+            kernel, queue, n_dimensions, &global_offset, &global_size, nullptr),
+        UR_RESULT_ERROR_INVALID_NULL_POINTER);
+}
diff --git a/test/conformance/memory-migrate/CMakeLists.txt b/test/conformance/memory-migrate/CMakeLists.txt
new file mode 100644
index 0000000000..785a99aa84
--- /dev/null
+++ b/test/conformance/memory-migrate/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Copyright (C) 2022-2024 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+add_conformance_test_with_kernels_environment(memory-migrate
+    urMemBufferMigrateAcrossDevices.cpp)
diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_cuda.match b/test/conformance/memory-migrate/memory-migrate_adapter_cuda.match
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_hip.match b/test/conformance/memory-migrate/memory-migrate_adapter_hip.match
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_level_zero.match b/test/conformance/memory-migrate/memory-migrate_adapter_level_zero.match
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_native_cpu.match b/test/conformance/memory-migrate/memory-migrate_adapter_native_cpu.match
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_opencl.match b/test/conformance/memory-migrate/memory-migrate_adapter_opencl.match
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp
new file mode 100644
index 0000000000..2e8856ac97
--- /dev/null
+++ b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp
@@ -0,0 +1,263 @@
+// Copyright (C) 2023 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Some tests to ensure implicit memory migration of buffers across devices
+// in the same context.
+
+#include "uur/fixtures.h"
+
+using T = uint32_t;
+
+// Fixture: a context spanning all platform devices and one queue per device.
+struct urMultiDeviceContextTest : uur::urPlatformTest {
+    void SetUp() {
+        UUR_RETURN_ON_FATAL_FAILURE(uur::urPlatformTest::SetUp());
+        ASSERT_SUCCESS(urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr,
+                                   &num_devices));
+        ASSERT_NE(num_devices, 0u);
+        if (num_devices == 1) {
+            return; // single device: nothing is created, tests skip themselves
+        }
+        std::vector<ur_device_handle_t> found(num_devices);
+        ASSERT_SUCCESS(urDeviceGet(platform, UR_DEVICE_TYPE_ALL, num_devices,
+                                   found.data(), nullptr));
+        devices.swap(found); // published only after the query succeeded
+        ASSERT_SUCCESS(
+            urContextCreate(num_devices, devices.data(), nullptr, &context));
+        for (auto device : devices) {
+            ur_queue_handle_t queue = nullptr;
+            ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, &queue));
+            queues.push_back(queue); // stored only once successfully created
+        }
+    }
+
+    void TearDown() { // releases only what SetUp created, dependents first
+        for (auto queue : queues) {
+            urQueueRelease(queue);
+        }
+        if (context) { // stays null when SetUp stopped before creating it
+            urContextRelease(context);
+        }
+        for (auto device : devices) {
+            urDeviceRelease(device);
+        }
+        uur::urPlatformTest::TearDown(); // base fixture goes last
+    }
+    uint32_t num_devices = 0;
+    ur_context_handle_t context = nullptr;
+    std::vector<ur_device_handle_t> devices;
+    std::vector<ur_queue_handle_t> queues;
+};
+
+// Adds a buffer shared by all devices plus a built "inc" kernel per device.
+struct urMultiDeviceContextMemBufferTest : urMultiDeviceContextTest {
+    void SetUp() {
+        UUR_RETURN_ON_FATAL_FAILURE(urMultiDeviceContextTest::SetUp());
+        if (num_devices == 1) {
+            return;
+        }
+        ASSERT_SUCCESS(urMemBufferCreate(context, 0 /*flags=*/,
+                                         buffer_size_bytes,
+                                         nullptr /*pProperties=*/, &buffer));
+        UUR_RETURN_ON_FATAL_FAILURE(
+            uur::KernelsEnvironment::instance->LoadSource(program_name,
+                                                          il_binary));
+        // Every device runs the same entry point, so resolve its name once.
+        auto kernel_names =
+            uur::KernelsEnvironment::instance->GetEntryPointNames(
+                program_name);
+        ASSERT_FALSE(kernel_names.empty());
+        kernel_name = kernel_names[0];
+        ASSERT_FALSE(kernel_name.empty());
+
+        const ur_program_properties_t properties = {
+            UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr,
+            static_cast<uint32_t>(metadatas.size()),
+            metadatas.empty() ? nullptr : metadatas.data()};
+        // Store each handle as soon as it exists so TearDown can release it.
+        for (auto device : devices) {
+            ur_program_handle_t program = nullptr;
+            ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram(
+                platform, context, device, *il_binary, &properties, &program));
+            programs.push_back(program);
+            ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+            ur_kernel_handle_t kernel = nullptr;
+            ASSERT_SUCCESS(
+                urKernelCreate(program, kernel_name.data(), &kernel));
+            kernels.push_back(kernel);
+        }
+    }
+
+    // Sets `buffer` as a SYCL 1D-range buffer argument of `kernel`: the memory
+    // object at `current_arg_index`, then its accessor offset argument(s).
+    void AddBuffer1DArg(ur_kernel_handle_t kernel, size_t current_arg_index,
+                        ur_mem_handle_t buffer) {
+        ASSERT_SUCCESS(
+            urKernelSetArgMemObj(kernel, current_arg_index, nullptr, buffer));
+        // SYCL kernel interfaces differ between backends. A buffer argument
+        // is typically a pointer to the start of the buffer followed by a
+        // sycl::id struct encoding the accessor; the AMD backend instead uses
+        // three separate arguments, one per dimension of the accessor.
+        ur_platform_backend_t backend;
+        ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND,
+                                         sizeof(backend), &backend, nullptr));
+        if (backend == UR_PLATFORM_BACKEND_HIP) {
+            // This emulates the three offset params for buffer accessor on AMD.
+            size_t val = 0;
+            for (size_t dim = 1; dim <= 3; ++dim) {
+                ASSERT_SUCCESS(
+                    urKernelSetArgValue(kernel, current_arg_index + dim,
+                                        sizeof(val), nullptr, &val));
+            }
+        } else {
+            // This emulates the offset struct sycl adds for a 1D buffer accessor.
+            struct {
+                size_t offsets[1] = {0};
+            } accessor;
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1,
+                                               sizeof(accessor), nullptr,
+                                               &accessor));
+        }
+    }
+
+    // Releases what SetUp created; EXPECT so one failure cannot skip the rest.
+    void TearDown() {
+        for (auto kernel : kernels) {
+            EXPECT_SUCCESS(urKernelRelease(kernel));
+        }
+        for (auto program : programs) {
+            EXPECT_SUCCESS(urProgramRelease(program));
+        }
+        if (buffer) {
+            urMemRelease(buffer);
+        }
+        urMultiDeviceContextTest::TearDown();
+    }
+
+    size_t buffer_size = 4096;
+    size_t buffer_size_bytes = 4096 * sizeof(T);
+    ur_mem_handle_t buffer = nullptr;
+
+    // Program stuff so we can launch kernels
+    std::shared_ptr<std::vector<char>> il_binary;
+    std::string program_name = "inc";
+    std::string kernel_name;
+    std::vector<ur_program_handle_t> programs;
+    std::vector<ur_kernel_handle_t> kernels;
+    std::vector<ur_program_metadata_t> metadatas{};
+};
+
+// Data written through device 0's queue must be visible to a read on device 1.
+TEST_F(urMultiDeviceContextMemBufferTest, WriteRead) {
+    if (num_devices == 1) {
+        GTEST_SKIP();
+    }
+    T fill_val = 42;
+    std::vector<T> in_vec(buffer_size, fill_val);
+    std::vector<T> out_vec(buffer_size, 0);
+    ur_event_handle_t e1 = nullptr;
+
+    ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0,
+                                           buffer_size_bytes, in_vec.data(), 0,
+                                           nullptr, &e1));
+    // Blocking read: out_vec must be complete before it is inspected below.
+    ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0,
+                                          buffer_size_bytes, out_vec.data(), 1,
+                                          &e1, nullptr));
+    ASSERT_SUCCESS(urEventRelease(e1)); // the write's event is ours to release
+    ASSERT_EQ(out_vec, in_vec);
+}
+
+// A fill enqueued on device 0 must be visible to a read on device 1.
+TEST_F(urMultiDeviceContextMemBufferTest, FillRead) {
+    if (num_devices == 1) {
+        GTEST_SKIP();
+    }
+    T fill_val = 42;
+    std::vector<T> out_vec(buffer_size);
+    ur_event_handle_t e1 = nullptr;
+
+    ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &fill_val,
+                                          sizeof(fill_val), 0,
+                                          buffer_size_bytes, 0, nullptr, &e1));
+
+    // Blocking read: out_vec must be complete before it is inspected below.
+    ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0,
+                                          buffer_size_bytes, out_vec.data(), 1,
+                                          &e1, nullptr));
+    ASSERT_SUCCESS(urEventRelease(e1)); // the fill's event is ours to release
+    ASSERT_EQ(out_vec, std::vector<T>(buffer_size, fill_val));
+}
+
+// Write on device 0, increment on device 1, then read back on device 0.
+TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelRead) {
+    if (num_devices == 1) {
+        GTEST_SKIP();
+    }
+    // Kernel to run on queues[1]; stop here if binding its arguments failed.
+    UUR_RETURN_ON_FATAL_FAILURE(AddBuffer1DArg(kernels[1], 0, buffer));
+
+    T fill_val = 42;
+    std::vector<T> in_vec(buffer_size, fill_val);
+    std::vector<T> out_vec(buffer_size);
+    ur_event_handle_t e1 = nullptr, e2 = nullptr;
+
+    ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0,
+                                           buffer_size_bytes, in_vec.data(), 0,
+                                           nullptr, &e1));
+
+    size_t work_dims[3] = {buffer_size, 1, 1};
+    size_t offset[3] = {0, 0, 0};
+
+    // Kernel increments the fill val by 1
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/,
+                                         offset, work_dims, nullptr, 1, &e1,
+                                         &e2));
+    // Blocking read: out_vec must be complete before it is inspected below.
+    ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, true, 0,
+                                          buffer_size_bytes, out_vec.data(), 1,
+                                          &e2, nullptr));
+    ASSERT_SUCCESS(urEventRelease(e1));
+    ASSERT_SUCCESS(urEventRelease(e2));
+    ASSERT_EQ(out_vec, std::vector<T>(buffer_size, fill_val + 1));
+}
+
+// Write on device 0, increment on device 1 then device 0, read on device 1.
+TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelKernelRead) {
+    if (num_devices == 1) {
+        GTEST_SKIP();
+    }
+    UUR_RETURN_ON_FATAL_FAILURE(AddBuffer1DArg(kernels[0], 0, buffer));
+    UUR_RETURN_ON_FATAL_FAILURE(AddBuffer1DArg(kernels[1], 0, buffer));
+
+    T fill_val = 42;
+    std::vector<T> in_vec(buffer_size, fill_val);
+    std::vector<T> out_vec(buffer_size);
+    ur_event_handle_t e1 = nullptr, e2 = nullptr, e3 = nullptr;
+
+    ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0,
+                                           buffer_size_bytes, in_vec.data(), 0,
+                                           nullptr, &e1));
+
+    size_t work_dims[3] = {buffer_size, 1, 1};
+    size_t offset[3] = {0, 0, 0};
+
+    // First increment by 1: runs on device 1 once the write (e1) is done.
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/,
+                                         offset, work_dims, nullptr, 1, &e1,
+                                         &e2));
+    // Second increment by 1: runs on device 0 once the first kernel (e2) is done.
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernels[0], 1 /*workDim=*/,
+                                         offset, work_dims, nullptr, 1, &e2,
+                                         &e3));
+    // Blocking read: out_vec must be complete before it is inspected below.
+    ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0,
+                                          buffer_size_bytes, out_vec.data(), 1,
+                                          &e3, nullptr));
+    ASSERT_SUCCESS(urEventRelease(e1));
+    ASSERT_SUCCESS(urEventRelease(e2));
+    ASSERT_SUCCESS(urEventRelease(e3));
+    ASSERT_EQ(out_vec, std::vector<T>(buffer_size, fill_val + 2));
+}
diff --git a/test/conformance/memory/CMakeLists.txt b/test/conformance/memory/CMakeLists.txt
index 64de8ef059..041f73a079 100644
--- a/test/conformance/memory/CMakeLists.txt
+++ b/test/conformance/memory/CMakeLists.txt
@@ -10,6 +10,7 @@ add_conformance_test_with_devices_environment(memory
     urMemGetInfo.cpp 
     urMemGetNativeHandle.cpp 
     urMemImageCreate.cpp
+    urMemImageCreateWithImageFormatParam.cpp
     urMemImageCreateWithNativeHandle.cpp 
     urMemImageGetInfo.cpp 
     urMemRelease.cpp 
diff --git a/test/conformance/memory/memory_adapter_cuda.match b/test/conformance/memory/memory_adapter_cuda.match
index b9353f20be..7d2e6a1c01 100644
--- a/test/conformance/memory/memory_adapter_cuda.match
+++ b/test/conformance/memory/memory_adapter_cuda.match
@@ -1 +1,3 @@
 urMemImageCreateTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_
+{{OPT}}urMemImageCreateTestWith1DMemoryTypeParam.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_MEM_TYPE_IMAGE1D_ARRAY
+{{OPT}}urMemImageCreateTestWith2DMemoryTypeParam.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_MEM_TYPE_IMAGE2D_ARRAY
diff --git a/test/conformance/memory/memory_adapter_native_cpu.match b/test/conformance/memory/memory_adapter_native_cpu.match
index d0cb7644d9..27e3d859e0 100644
--- a/test/conformance/memory/memory_adapter_native_cpu.match
+++ b/test/conformance/memory/memory_adapter_native_cpu.match
@@ -4,14 +4,243 @@ urMemGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_INFO_SIZE
 urMemGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_INFO_CONTEXT
 urMemGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_INFO_SIZE
 urMemGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_INFO_CONTEXT
-urMemImageCreateTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urMemImageCreateTest.SuccessWith3DImageType/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urMemImageCreateTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urMemImageCreateTest.InvalidImageDescStype/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urMemImageCreateTest.InvalidImageDescNumMipLevel/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urMemImageCreateTest.InvalidImageDescNumSamples/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urMemImageCreateTest.InvalidImageDescRowPitch/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urMemImageCreateTest.InvalidImageDescSlicePitch/SYCL_NATIVE_CPU___SYCL_Native_CPU_
+urMemImageCreateTestWith1DMemoryTypeParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_TYPE_IMAGE1D
+urMemImageCreateTestWith1DMemoryTypeParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_TYPE_IMAGE1D_ARRAY
+urMemImageCreateTestWith2DMemoryTypeParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_TYPE_IMAGE2D
+urMemImageCreateTestWith2DMemoryTypeParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_TYPE_IMAGE2D_ARRAY
 urMemImageCreateWithHostPtrFlagsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER
 urMemImageCreateWithHostPtrFlagsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_FLAG_USE_HOST_POINTER
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_INT_101010
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_FLOAT
 urMemReleaseTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urMemRetainTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
diff --git a/test/conformance/memory/urMemImageCreate.cpp b/test/conformance/memory/urMemImageCreate.cpp
index 7b19ed7232..482bff466e 100644
--- a/test/conformance/memory/urMemImageCreate.cpp
+++ b/test/conformance/memory/urMemImageCreate.cpp
@@ -25,11 +25,87 @@ static ur_image_desc_t image_desc{
     0                    ///< [in] number of samples
 };
 
-TEST_P(urMemImageCreateTest, Success) {
+using urMemImageCreateTestWith1DMemoryTypeParam =
+    uur::urContextTestWithParam<ur_mem_type_t>;
+
+UUR_TEST_SUITE_P(urMemImageCreateTestWith1DMemoryTypeParam,
+                 ::testing::Values(UR_MEM_TYPE_IMAGE1D,
+                                   UR_MEM_TYPE_IMAGE1D_ARRAY),
+                 uur::deviceTestWithParamPrinter<ur_mem_type_t>);
+
+TEST_P(urMemImageCreateTestWith1DMemoryTypeParam, Success) {
+    ur_image_desc_t image_desc_with_param{
+        UR_STRUCTURE_TYPE_IMAGE_DESC, ///< [in] type of this structure
+        nullptr,    ///< [in][optional] pointer to extension-specific structure
+        getParam(), ///< [in] memory object type
+        1,          ///< [in] image width
+        0,          ///< [in] image height
+        0,          ///< [in] image depth
+        1,          ///< [in] image array size
+        0,          ///< [in] image row pitch
+        0,          ///< [in] image slice pitch
+        0,          ///< [in] number of MIP levels
+        0           ///< [in] number of samples
+    };
+
     ur_mem_handle_t image_handle = nullptr;
     ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE,
-                                    &image_format, &image_desc, nullptr,
-                                    &image_handle));
+                                    &image_format, &image_desc_with_param,
+                                    nullptr, &image_handle));
+    ASSERT_NE(nullptr, image_handle);
+    ASSERT_SUCCESS(urMemRelease(image_handle));
+}
+
+using urMemImageCreateTestWith2DMemoryTypeParam =
+    uur::urContextTestWithParam<ur_mem_type_t>;
+
+UUR_TEST_SUITE_P(urMemImageCreateTestWith2DMemoryTypeParam,
+                 ::testing::Values(UR_MEM_TYPE_IMAGE2D,
+                                   UR_MEM_TYPE_IMAGE2D_ARRAY),
+                 uur::deviceTestWithParamPrinter<ur_mem_type_t>);
+
+TEST_P(urMemImageCreateTestWith2DMemoryTypeParam, Success) {
+    // Positional initializers; each trailing note names the field being set.
+    ur_image_desc_t desc_2d{
+        UR_STRUCTURE_TYPE_IMAGE_DESC, // type of this structure
+        nullptr,                      // optional extension-specific structure
+        getParam(),                   // memory object type under test
+        1,                            // image width
+        1,                            // image height
+        0,                            // image depth
+        1,                            // image array size
+        0,                            // image row pitch
+        0,                            // image slice pitch
+        0,                            // number of MIP levels
+        0                             // number of samples
+    };
+
+    ur_mem_handle_t image = nullptr;
+    ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE,
+                                    &image_format, &desc_2d, nullptr, &image));
+    ASSERT_NE(nullptr, image);
+    ASSERT_SUCCESS(urMemRelease(image));
+}
+
+TEST_P(urMemImageCreateTest, SuccessWith3DImageType) {
+    ur_image_desc_t image_desc_with_param{
+        UR_STRUCTURE_TYPE_IMAGE_DESC, ///< [in] type of this structure
+        nullptr, ///< [in][optional] pointer to extension-specific structure
+        UR_MEM_TYPE_IMAGE3D, ///< [in] memory object type
+        1,                   ///< [in] image width
+        1,                   ///< [in] image height
+        1,                   ///< [in] image depth
+        1,                   ///< [in] image array size
+        0,                   ///< [in] image row pitch
+        0,                   ///< [in] image slice pitch
+        0,                   ///< [in] number of MIP levels
+        0                    ///< [in] number of samples
+    };
+
+    ur_mem_handle_t image_handle = nullptr;
+    ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE,
+                                    &image_format, &image_desc_with_param,
+                                    nullptr, &image_handle));
     ASSERT_NE(nullptr, image_handle);
     ASSERT_SUCCESS(urMemRelease(image_handle));
 }
diff --git a/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp b/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp
new file mode 100644
index 0000000000..c305f58f00
--- /dev/null
+++ b/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp
@@ -0,0 +1,120 @@
+// Copyright (C) 2024 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include <uur/fixtures.h>
+#include <vector>
+
+static ur_image_desc_t image_desc{
+    UR_STRUCTURE_TYPE_IMAGE_DESC, ///< [in] type of this structure
+    nullptr, ///< [in][optional] pointer to extension-specific structure
+    UR_MEM_TYPE_IMAGE3D, ///< [in] memory object type
+    1,                   ///< [in] image width
+    1,                   ///< [in] image height
+    1,                   ///< [in] image depth
+    1,                   ///< [in] image array size
+    0,                   ///< [in] image row pitch
+    0,                   ///< [in] image slice pitch
+    0,                   ///< [in] number of MIP levels
+    0                    ///< [in] number of samples
+};
+
+const std::vector<ur_image_format_t> primary_image_formats = {
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT8},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT},
+    {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}};
+
+const std::vector<ur_image_channel_order_t> channel_orders = {
+    UR_IMAGE_CHANNEL_ORDER_A,         UR_IMAGE_CHANNEL_ORDER_R,
+    UR_IMAGE_CHANNEL_ORDER_RG,        UR_IMAGE_CHANNEL_ORDER_RA,
+    UR_IMAGE_CHANNEL_ORDER_RGB,       UR_IMAGE_CHANNEL_ORDER_RGBA,
+    UR_IMAGE_CHANNEL_ORDER_BGRA,      UR_IMAGE_CHANNEL_ORDER_ARGB,
+    UR_IMAGE_CHANNEL_ORDER_ABGR,      UR_IMAGE_CHANNEL_ORDER_INTENSITY,
+    UR_IMAGE_CHANNEL_ORDER_LUMINANCE, UR_IMAGE_CHANNEL_ORDER_RX,
+    UR_IMAGE_CHANNEL_ORDER_RGX,       UR_IMAGE_CHANNEL_ORDER_RGBX,
+    UR_IMAGE_CHANNEL_ORDER_SRGBA};
+
+const std::vector<ur_image_channel_type_t> channel_types = {
+    UR_IMAGE_CHANNEL_TYPE_SNORM_INT8,
+    UR_IMAGE_CHANNEL_TYPE_SNORM_INT16,
+    UR_IMAGE_CHANNEL_TYPE_UNORM_INT8,
+    UR_IMAGE_CHANNEL_TYPE_UNORM_INT16,
+    UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565,
+    UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555,
+    UR_IMAGE_CHANNEL_TYPE_INT_101010,
+    UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8,
+    UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16,
+    UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32,
+    UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8,
+    UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16,
+    UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32,
+    UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT,
+    UR_IMAGE_CHANNEL_TYPE_FLOAT};
+
+std::vector<ur_image_format_t> all_image_formats;
+
+struct urMemImageCreateTestWithImageFormatParam
+    : uur::urContextTestWithParam<ur_image_format_t> {
+    void SetUp() override {
+        UUR_RETURN_ON_FATAL_FAILURE(
+            uur::urContextTestWithParam<ur_image_format_t>::SetUp());
+    }
+    void TearDown() override {
+        UUR_RETURN_ON_FATAL_FAILURE(
+            uur::urContextTestWithParam<ur_image_format_t>::TearDown());
+    }
+
+    // Order x type product; clears the global first so repeat calls match.
+    static std::vector<ur_image_format_t> makeImageFormats() {
+        all_image_formats.clear();
+        for (auto channel_order : channel_orders)
+            for (auto channel_type : channel_types)
+                all_image_formats.push_back({channel_order, channel_type});
+        return all_image_formats;
+    }
+};
+
+UUR_TEST_SUITE_P(
+    urMemImageCreateTestWithImageFormatParam,
+    ::testing::ValuesIn(
+        urMemImageCreateTestWithImageFormatParam::makeImageFormats()),
+    uur::deviceTestWithParamPrinter<ur_image_format_t>);
+
+TEST_P(urMemImageCreateTestWithImageFormatParam, Success) {
+    // The format under test is element 1 of the parameter tuple.
+    ur_image_format_t image_format = std::get<1>(GetParam());
+
+    ur_mem_handle_t image_handle = nullptr;
+    ur_result_t res =
+        urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format,
+                         &image_desc, nullptr, &image_handle);
+
+    // Only formats listed in primary_image_formats are required to succeed.
+    bool is_primary_image_format = false;
+    for (const auto &candidate : primary_image_formats) {
+        is_primary_image_format =
+            candidate.channelOrder == image_format.channelOrder &&
+            candidate.channelType == image_format.channelType;
+        if (is_primary_image_format) {
+            break;
+        }
+    }
+
+    // Any other format may be reported as unsupported: skip rather than fail.
+    if (res == UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT &&
+        !is_primary_image_format) {
+        GTEST_SKIP();
+    }
+    ASSERT_SUCCESS(res);
+    ASSERT_NE(nullptr, image_handle);
+    ASSERT_SUCCESS(urMemRelease(image_handle));
+}
diff --git a/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp b/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp
index 435ac23a00..4f6684c432 100644
--- a/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp
+++ b/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp
@@ -20,8 +20,8 @@ TEST_F(urPlatformCreateWithNativeHandleTest, Success) {
         // We can however convert the native_handle back into a unified-runtime
         // handle and perform some query on it to verify that it works.
         ur_platform_handle_t plat = nullptr;
-        UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(
-            urPlatformCreateWithNativeHandle(native_handle, nullptr, &plat));
+        UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle(
+            native_handle, adapters[0], nullptr, &plat));
         ASSERT_NE(plat, nullptr);
 
         std::string input_platform_name = uur::GetPlatformName(platform);
@@ -45,8 +45,8 @@ TEST_F(urPlatformCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) {
         ur_platform_native_properties_t props = {
             UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, true};
         ur_platform_handle_t plat = nullptr;
-        UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(
-            urPlatformCreateWithNativeHandle(native_handle, &props, &plat));
+        UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle(
+            native_handle, adapters[0], &props, &plat));
         ASSERT_NE(plat, nullptr);
 
         std::string input_platform_name = uur::GetPlatformName(platform);
@@ -70,8 +70,8 @@ TEST_F(urPlatformCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) {
         ur_platform_native_properties_t props = {
             UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, false};
         ur_platform_handle_t plat = nullptr;
-        UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(
-            urPlatformCreateWithNativeHandle(native_handle, &props, &plat));
+        UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle(
+            native_handle, adapters[0], &props, &plat));
         ASSERT_NE(plat, nullptr);
 
         std::string input_platform_name = uur::GetPlatformName(platform);
@@ -84,8 +84,8 @@ TEST_F(urPlatformCreateWithNativeHandleTest, InvalidNullPointerPlatform) {
     for (auto platform : platforms) {
         ur_native_handle_t native_handle = nullptr;
         ASSERT_SUCCESS(urPlatformGetNativeHandle(platform, &native_handle));
-        ASSERT_EQ_RESULT(
-            UR_RESULT_ERROR_INVALID_NULL_POINTER,
-            urPlatformCreateWithNativeHandle(native_handle, nullptr, nullptr));
+        ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER,
+                         urPlatformCreateWithNativeHandle(
+                             native_handle, adapters[0], nullptr, nullptr));
     }
 }
diff --git a/test/conformance/program/program_adapter_hip.match b/test/conformance/program/program_adapter_hip.match
index 27bd6831a5..498d19f0b9 100644
--- a/test/conformance/program/program_adapter_hip.match
+++ b/test/conformance/program/program_adapter_hip.match
@@ -1,30 +1,16 @@
 urProgramBuildTest.BuildFailure/AMD_HIP_BACKEND___{{.*}}_
+# HIP hasn't implemented urProgramCreateWithNativeHandle
 {{OPT}}urProgramCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urProgramGetBuildInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_OPTIONS
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_LOG
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_OPTIONS
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_LOG
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE
+# This test flakily fails
 {{OPT}}urProgramGetBuildInfoSingleTest.LogIsNullTerminated/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urProgramGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS
-{{OPT}}urProgramGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_REFERENCE_COUNT
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_CONTEXT
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_DEVICES
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_DEVICES
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_SOURCE
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_BINARY_SIZES
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_BINARIES
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES
+# HIP doesn't expose kernel numbers or names
+urProgramGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS
+urProgramGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES
+
+# HIP hasn't implemented urProgramLink
 {{OPT}}urProgramLinkTest.Success/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urProgramSetSpecializationConstantsTest.Success/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urProgramSetSpecializationConstantsTest.UseDefaultValue/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urProgramSetMultipleSpecializationConstantsTest.SingleCall/AMD_HIP_BACKEND___{{.*}}_
+
+urProgramSetSpecializationConstantsTest.Success/AMD_HIP_BACKEND___{{.*}}_
+urProgramSetSpecializationConstantsTest.UseDefaultValue/AMD_HIP_BACKEND___{{.*}}_
+urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/AMD_HIP_BACKEND___{{.*}}_
+urProgramSetMultipleSpecializationConstantsTest.SingleCall/AMD_HIP_BACKEND___{{.*}}_
diff --git a/test/conformance/queue/queue_adapter_hip.match b/test/conformance/queue/queue_adapter_hip.match
index d39b30aa73..e69de29bb2 100644
--- a/test/conformance/queue/queue_adapter_hip.match
+++ b/test/conformance/queue/queue_adapter_hip.match
@@ -1,2 +0,0 @@
-urQueueGetInfoTestWithInfoParam.Success/AMD_HIP_BACKEND___{{.*}}___UR_QUEUE_INFO_DEVICE_DEFAULT
-urQueueGetInfoTestWithInfoParam.Success/AMD_HIP_BACKEND___{{.*}}___UR_QUEUE_INFO_SIZE
diff --git a/test/conformance/queue/queue_adapter_native_cpu.match b/test/conformance/queue/queue_adapter_native_cpu.match
index a4c2c502a4..90bc25b23b 100644
--- a/test/conformance/queue/queue_adapter_native_cpu.match
+++ b/test/conformance/queue/queue_adapter_native_cpu.match
@@ -26,11 +26,16 @@ urQueueFinishTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urQueueFlushTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_CONTEXT
 urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_DEVICE
-urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_DEVICE_DEFAULT
 urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_FLAGS
 urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_REFERENCE_COUNT
-urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_SIZE
 urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_EMPTY
+urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_CONTEXT
+urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_DEVICE
+urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_DEVICE_DEFAULT
+urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_FLAGS
+urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_REFERENCE_COUNT
+urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_SIZE
+urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_EMPTY
 urQueueGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urQueueRetainTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
 urQueueReleaseTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_
diff --git a/test/conformance/queue/queue_adapter_opencl.match b/test/conformance/queue/queue_adapter_opencl.match
index a374e0b4b1..e69de29bb2 100644
--- a/test/conformance/queue/queue_adapter_opencl.match
+++ b/test/conformance/queue/queue_adapter_opencl.match
@@ -1,2 +0,0 @@
-urQueueGetInfoTestWithInfoParam.Success/Intel_R__OpenCL___{{.*}}___UR_QUEUE_INFO_DEVICE_DEFAULT
-urQueueGetInfoTestWithInfoParam.Success/Intel_R__OpenCL___{{.*}}___UR_QUEUE_INFO_SIZE
diff --git a/test/conformance/queue/urQueueGetInfo.cpp b/test/conformance/queue/urQueueGetInfo.cpp
index 9704c72f64..9ffb97e1ff 100644
--- a/test/conformance/queue/urQueueGetInfo.cpp
+++ b/test/conformance/queue/urQueueGetInfo.cpp
@@ -20,10 +20,9 @@ using urQueueGetInfoTestWithInfoParam =
 
 UUR_TEST_SUITE_P(urQueueGetInfoTestWithInfoParam,
                  ::testing::Values(UR_QUEUE_INFO_CONTEXT, UR_QUEUE_INFO_DEVICE,
-                                   UR_QUEUE_INFO_DEVICE_DEFAULT,
                                    UR_QUEUE_INFO_FLAGS,
                                    UR_QUEUE_INFO_REFERENCE_COUNT,
-                                   UR_QUEUE_INFO_SIZE, UR_QUEUE_INFO_EMPTY),
+                                   UR_QUEUE_INFO_EMPTY),
                  uur::deviceTestWithParamPrinter<ur_queue_info_t>);
 
 TEST_P(urQueueGetInfoTestWithInfoParam, Success) {
@@ -70,6 +69,76 @@ TEST_P(urQueueGetInfoTestWithInfoParam, Success) {
     }
 }
 
+// Fixture that runs urQueueGetInfo against a queue created with the
+// on-device queue flags. SetUp skips the test when the device reports no
+// on-device queue properties; otherwise it creates `queue`.
+struct urQueueGetInfoDeviceQueueTestWithInfoParam
+    : public uur::urContextTestWithParam<ur_queue_info_t> {
+    void SetUp() override {
+        urContextTestWithParam<ur_queue_info_t>::SetUp();
+        // Do not touch `device`/`context` if the base fixture already
+        // failed or skipped the test.
+        if (HasFatalFailure() || IsSkipped()) {
+            return;
+        }
+        ur_queue_flags_t deviceQueueCapabilities = 0;
+        ASSERT_SUCCESS(
+            urDeviceGetInfo(device, UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES,
+                            sizeof(deviceQueueCapabilities),
+                            &deviceQueueCapabilities, nullptr));
+        if (!deviceQueueCapabilities) {
+            GTEST_SKIP() << "Queue on device is not supported.";
+        }
+        ASSERT_SUCCESS(
+            urQueueCreate(context, device, &queueProperties, &queue));
+    }
+
+    void TearDown() override {
+        // `queue` stays null when SetUp returned before creating it.
+        if (queue) {
+            ASSERT_SUCCESS(urQueueRelease(queue));
+        }
+        urContextTestWithParam<ur_queue_info_t>::TearDown();
+    }
+
+    ur_queue_handle_t queue = nullptr;
+    ur_queue_properties_t queueProperties = {
+        UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr,
+        UR_QUEUE_FLAG_ON_DEVICE | UR_QUEUE_FLAG_ON_DEVICE_DEFAULT |
+            UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE};
+};
+
+UUR_TEST_SUITE_P(urQueueGetInfoDeviceQueueTestWithInfoParam,
+                 ::testing::Values(UR_QUEUE_INFO_CONTEXT, UR_QUEUE_INFO_DEVICE,
+                                   UR_QUEUE_INFO_DEVICE_DEFAULT,
+                                   UR_QUEUE_INFO_FLAGS,
+                                   UR_QUEUE_INFO_REFERENCE_COUNT,
+                                   UR_QUEUE_INFO_SIZE, UR_QUEUE_INFO_EMPTY),
+                 uur::deviceTestWithParamPrinter<ur_queue_info_t>);
+
+// Queries the parameterised info type for its size and then for its value.
+// A size query that fails must report UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION.
+TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, Success) {
+    ur_queue_info_t info_type = getParam();
+    size_t size = 0;
+    auto result = urQueueGetInfo(queue, info_type, 0, nullptr, &size);
+
+    if (result == UR_RESULT_SUCCESS) {
+        ASSERT_NE(size, 0);
+
+        if (const auto expected_size = queue_info_size_map.find(info_type);
+            expected_size != queue_info_size_map.end()) {
+            ASSERT_EQ(expected_size->second, size);
+        }
+
+        std::vector<uint8_t> data(size);
+        ASSERT_SUCCESS(
+            urQueueGetInfo(queue, info_type, size, data.data(), nullptr));
+    } else {
+        ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
+    }
+}
+
 using urQueueGetInfoTest = uur::urQueueTest;
 UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urQueueGetInfoTest);
 
diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h
index cf64aa13d3..bcdc94d524 100644
--- a/test/conformance/testing/include/uur/fixtures.h
+++ b/test/conformance/testing/include/uur/fixtures.h
@@ -406,7 +406,7 @@ template <class T> struct urQueueTestWithParam : urContextTestWithParam<T> {
     }
     ur_queue_properties_t queue_properties = {
         UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0};
-    ur_queue_handle_t queue;
+    ur_queue_handle_t queue = nullptr;
 };
 
 template <class T>
@@ -1098,6 +1098,11 @@ std::string deviceTestWithParamPrinter(
     return uur::GetPlatformAndDeviceName(device) + "__" + ss.str();
 }
 
+template <> // Explicit specialization; defined in source/fixtures.cpp.
+std::string deviceTestWithParamPrinter<ur_image_format_t>(
+    const ::testing::TestParamInfo<
+        std::tuple<ur_device_handle_t, ur_image_format_t>> &info);
+
 // Helper struct to allow bool param tests with meaningful names.
 struct BoolTestParam {
     std::string name;
diff --git a/test/conformance/testing/include/uur/utils.h b/test/conformance/testing/include/uur/utils.h
index 34e5f7768b..e9a1efce8e 100644
--- a/test/conformance/testing/include/uur/utils.h
+++ b/test/conformance/testing/include/uur/utils.h
@@ -189,9 +189,18 @@ inline std::string GetPlatformName(ur_platform_handle_t hPlatform) {
 }
 
+// Returns the device's name passed through GTestSanitizeString. If the
+// device reports a non-empty UUID, it is appended to the name after a "____"
+// separator.
 inline std::string GetDeviceName(ur_device_handle_t device) {
-    std::string device_name;
+    std::string device_name, device_uuid;
     GetDeviceInfo<std::string>(device, UR_DEVICE_INFO_NAME, device_name);
+    GetDeviceInfo<std::string>(device, UR_DEVICE_INFO_UUID, device_uuid);
+    if (!device_uuid.empty()) {
+        // Put the separator between the name and the UUID so the two parts
+        // stay distinguishable in the generated test name.
+        device_name += "____" + device_uuid;
+    }
     return GTestSanitizeString(device_name);
 }
 
 inline std::string GetPlatformAndDeviceName(ur_device_handle_t device) {
diff --git a/test/conformance/testing/source/fixtures.cpp b/test/conformance/testing/source/fixtures.cpp
index a0349181eb..bdb80c60be 100644
--- a/test/conformance/testing/source/fixtures.cpp
+++ b/test/conformance/testing/source/fixtures.cpp
@@ -39,4 +39,18 @@ std::string deviceTestWithParamPrinter<SamplerCreateParamT>(
     ss << addr_mode << "_" << filter_mode;
     return uur::GetPlatformAndDeviceName(device) + "__" + ss.str();
 }
+
+// Test-name printer for (device, image format) parameters: the platform and
+// device name followed by "__<channel order>__<channel type>".
+template <>
+std::string deviceTestWithParamPrinter<ur_image_format_t>(
+    const ::testing::TestParamInfo<
+        std::tuple<ur_device_handle_t, ur_image_format_t>> &info) {
+    const auto &format = std::get<1>(info.param);
+
+    std::stringstream name_suffix;
+    name_suffix << format.channelOrder << "__" << format.channelType;
+    return uur::GetPlatformAndDeviceName(std::get<0>(info.param)) + "__" +
+           name_suffix.str();
+}
 } // namespace uur
diff --git a/test/conformance/usm/urUSMFree.cpp b/test/conformance/usm/urUSMFree.cpp
index 6dc1f9ffd5..f5502c89a6 100644
--- a/test/conformance/usm/urUSMFree.cpp
+++ b/test/conformance/usm/urUSMFree.cpp
@@ -42,7 +42,7 @@ TEST_P(urUSMFreeTest, SuccessHostAlloc) {
     void *ptr = nullptr;
     size_t allocation_size = sizeof(int);
     ASSERT_SUCCESS(
-        urUSMHostAlloc(context, nullptr, nullptr, sizeof(int), &ptr));
+        urUSMHostAlloc(context, nullptr, nullptr, allocation_size, &ptr));
 
     ur_event_handle_t event = nullptr;
     uint8_t pattern = 0;
diff --git a/test/conformance/usm/usm_adapter_hip.match b/test/conformance/usm/usm_adapter_hip.match
index 19b88b8b75..2dfdaf7253 100644
--- a/test/conformance/usm/usm_adapter_hip.match
+++ b/test/conformance/usm/usm_adapter_hip.match
@@ -21,8 +21,6 @@ urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/AMD_HIP_BACKEND___{{.*}}
 urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled_64_8
 urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled_64_512
 urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled_64_2048
-urUSMGetMemAllocInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ALLOC_INFO_BASE_PTR
-urUSMGetMemAllocInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ALLOC_INFO_SIZE
 urUSMGetMemAllocInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ALLOC_INFO_POOL
 urUSMHostAllocTest.Success/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled
 urUSMHostAllocTest.SuccessWithDescriptors/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled